### Import required libraries

In [1]:
import os
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Get the response from the url

In [2]:
# Listing page that links to every Adobe license / terms document.
url = "https://www.adobe.com/legal/licenses-terms.html"

# Browser-like User-Agent sent with every request in this notebook.
# NOTE(review): presumably the site rejects the default requests agent — confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# requests has no default timeout, so an unresponsive server would hang the
# kernel forever; bound the wait explicitly.
response = requests.get(url, headers=headers, timeout=30)

# Stop here on 4xx/5xx instead of silently parsing an error page downstream.
response.raise_for_status()

### Parse the HTML content of the response

In [3]:
# Build the parse tree for the listing page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.content, features="html.parser")

### Retrieving resource names

In [4]:
# Container <div> that holds the resource paragraphs. The id looks
# auto-generated, so it may change when the page is republished.
resource_name = soup.find("div", id="root_content_position_2fc2_position-par_position_2fc2_0")

# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError in a later cell.
if resource_name is None:
    raise ValueError(
        "Resource container <div> not found; the page layout or element id may have changed."
    )

In [5]:
# Every <p> element nested inside the resource container.
resource_names = resource_name.find_all(name="p")

In [6]:
# Text content of each paragraph, in document order.
resource_name_list = [paragraph.text for paragraph in resource_names]

In [7]:
# Reduce the raw paragraph texts to the list of resource names.
#
# Steps:
#   1. keep the first paragraph unconditionally,
#   2. drop paragraphs that are just a number (e.g. "12" or "3.5"),
#   3. drop paragraphs containing any of the removal words (case-insensitive
#      substring match),
#   4. strip newlines / non-breaking spaces and discard blank results.

# A paragraph containing any of these words is not treated as a resource name.
removal_words = ["General", "License", "Terms", "EULA", "Product", "Available", "Agreement", "contract", "2023"]

# Matches an optionally negative integer or decimal and nothing else.
numeric_pattern = re.compile(r'^-?\d+\.?\d*$')

# The first paragraph bypasses the numeric and removal-word filters.
first_sentence = resource_name_list[0]

# Remaining paragraphs that are not purely numeric.
non_digit_sentences = [
    sentence
    for sentence in resource_name_list[1:]
    if not numeric_pattern.match(sentence.strip())
]

# First paragraph plus every non-numeric paragraph free of removal words.
filtered_resource_names = [first_sentence] + [
    sentence
    for sentence in non_digit_sentences
    if not any(word.lower() in sentence.lower() for word in removal_words)
]

# Remove embedded newlines and non-breaking spaces, then drop entries that are
# empty or whitespace-only after cleaning.
cleaned_names = (
    sentence.replace('\n', '').replace('\xa0', '')
    for sentence in filtered_resource_names
)
filtered_text_list = [sentence for sentence in cleaned_names if sentence.strip()]


### Retrieving resource data

In [8]:
# Every <p> element on the whole page (searched from the document root, not
# from the resource container used for the names).
resource_urls = soup.find_all(name="p")

In [9]:
# Collect one link target per paragraph, skipping the first paragraph and any
# paragraph that carries no usable link.
resource_url_list = []
for paragraph in resource_urls[1:]:
    # href=True selects only anchors that actually have an href attribute, so
    # a bare <a> (e.g. a named anchor) cannot raise KeyError here.
    link = paragraph.find("a", href=True)
    if link is not None:
        resource_url_list.append(link["href"])

In [10]:
# Drop the first collected link in place.
# NOTE(review): presumably the first link is not a license resource — confirm.
# This cell is not idempotent: every re-run removes one more entry, so re-run
# the cell that builds resource_url_list before running this one again.
del resource_url_list[0]

In [11]:
# Number of cleaned resource names (last expression, so the notebook displays it).
len(filtered_text_list)

101

In [12]:
# Names and URLs are paired by position with zip() further down, and zip()
# silently truncates to the shorter list, so fail loudly on a mismatch.
assert len(resource_url_list) == len(filtered_text_list), (
    f"{len(resource_url_list)} URLs vs {len(filtered_text_list)} names"
)
len(resource_url_list)

101

In [13]:
# Display the collected hrefs for inspection.
# NOTE(review): this shows every entry; a slice such as resource_url_list[:10]
# would keep the cell output short.
resource_url_list

['https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://www.adobe.com/legal/terms.html',
 'https://w

In [14]:
# Turn every scraped href into an absolute URL.
#
# urljoin leaves already-absolute URLs untouched and resolves relative ones
# (PDF or HTML alike) against the listing page, so an absolute PDF link is no
# longer corrupted by a blind host prefix and a relative HTML link no longer
# reaches requests without a scheme.
LISTING_PAGE_URL = "https://www.adobe.com/legal/licenses-terms.html"

modified_url_list = [
    urljoin(LISTING_PAGE_URL, resource_url) for resource_url in resource_url_list
]

In [16]:
# Create one folder per resource name and store the resource in it:
#   * URLs ending in .html -> text of the "root_content_flex" div(s), saved as
#     License_guidelines.txt
#   * anything else        -> treated as a PDF and saved under its own file name
#
# Loop-local names (resource_url, page_response, page_soup) are deliberately
# distinct from the notebook-level url / response / soup so that re-running
# earlier cells after this one still sees the listing page.
REQUEST_TIMEOUT_SECONDS = 30

for name, resource_url in zip(filtered_text_list, modified_url_list):
    # Folder name: drop punctuation, then replace spaces with underscores.
    sanitized_name = re.sub(r'[^\w\s]', '', name).replace(' ', '_')
    folder_path = os.path.join(os.getcwd(), sanitized_name)
    os.makedirs(folder_path, exist_ok=True)

    is_html = resource_url.lower().endswith('.html')

    # A network failure on one resource must not abort the remaining downloads.
    try:
        page_response = requests.get(
            resource_url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS
        )
    except requests.RequestException as exc:
        print(f"Request failed for {resource_url}: {exc}")
        continue

    if page_response.status_code != 200:
        if is_html:
            print(f"Failed to retrieve text from HTML: {resource_url}")
        else:
            print(f"Failed to download PDF: {resource_url}")
        continue

    if is_html:
        page_soup = BeautifulSoup(page_response.content, 'html.parser')
        sections = page_soup.find_all("div", id="root_content_flex")
        if not sections:
            # Report the miss instead of leaving an empty folder without notice.
            print(f"No license text found in HTML: {resource_url}")
            continue

        # Write all matching sections in one go; reopening the file in 'w'
        # mode per section would keep only the last one.
        text_file_path = os.path.join(folder_path, "License_guidelines.txt")
        with open(text_file_path, 'w', encoding='utf-8') as text_file:
            text_file.write("\n".join(section.text for section in sections))
        print(f"Retrieved text from HTML: {resource_url}")
    else:
        pdf_file_name = os.path.basename(resource_url)
        simplified_pdf_file_name = re.sub(r'[^\w\s.-]', '', pdf_file_name)
        # Add the extension only when it is missing, to avoid "name.pdf.pdf".
        if not simplified_pdf_file_name.lower().endswith('.pdf'):
            simplified_pdf_file_name += '.pdf'

        pdf_file_path = os.path.join(folder_path, simplified_pdf_file_name)
        with open(pdf_file_path, 'wb') as pdf_file:
            pdf_file.write(page_response.content)
        print(f"Downloaded PDF: {pdf_file_path}")
        
    



Retrieved text from HTML: https://www.adobe.com/legal/terms.html
Retrieved text from HTML: https://www.adobe.com/legal/terms.html
Retrieved text from HTML: https://www.adobe.com/legal/terms.html
Retrieved text from HTML: https://www.adobe.com/legal/terms.html
Retrieved text from HTML: https://www.adobe.com/legal/terms.html
Retrieved text from HTML: https://www.adobe.com/legal/terms.html
Retrieved text from HTML: https://www.adobe.com/legal/terms.html
Retrieved text from HTML: https://www.adobe.com/legal/terms.html
Retrieved text from HTML: https://www.adobe.com/legal/terms.html
Retrieved text from HTML: https://www.adobe.com/legal/terms.html
Retrieved text from HTML: https://www.adobe.com/legal/terms.html
Retrieved text from HTML: https://www.adobe.com/legal/terms.html
Retrieved text from HTML: https://www.adobe.com/legal/terms.html
Retrieved text from HTML: https://www.adobe.com/legal/terms.html
Retrieved text from HTML: https://www.adobe.com/legal/terms.html
Retrieved text from HTML: