# In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd

def is_number(variable):
    """Return True if *variable* is a plain, unsigned decimal number string.

    A string qualifies when it consists only of decimal digits and at most
    one '.', and contains at least one digit (e.g. "12", "3.5", ".5", "5.").
    Signs, exponents, whitespace and trailing decoration such as '%' are
    rejected, so callers must strip those before calling.

    Every string accepted here can be passed to ``float()`` without raising:
    the empty string and a lone "." are rejected, and digits are tested with
    ``str.isdecimal`` because ``str.isdigit`` also accepts characters such
    as superscript digits that ``float()`` cannot parse.
    """
    dot_count = 0
    digit_count = 0
    for char in variable:
        if char == '.':
            dot_count += 1
            if dot_count > 1:
                return False
        elif char.isdecimal():
            digit_count += 1
        else:
            return False
    # Without at least one digit ("" or ".") there is no number to parse.
    return digit_count > 0
    
# Scrape the essential-oil product listing of aromatics.com (pages 1-10),
# visit every product page with Selenium, collect its properties, suggested
# uses and GCMS analysis table, and save everything to a CSV file.

# Per-product columns: index i of every list describes the same product.
all_product_urls = []
all_product_titles = []
all_countries = []
all_cultivations = []
all_botanical_families = []
all_plant_parts = []
all_chemical_families = []
all_extraction_methods = []
all_therapeutic_properties = []
all_aromas = []
all_notes = []
all_suggested_uses = []
all_sorted_sections = []

# Titles of the property cards read from each product page (kept for
# reference; the loops below match on these exact strings).
properties = ["Country", "Cultivation", "Botanical Families", "Plant Part", "Chemical Families", "Extraction Method"]
properties_therapeutic = ["Therapeutic Properties", "Notes", "Aromas"]

# Initialize Selenium WebDriver
driver = webdriver.Chrome()

# try/finally guarantees the browser is closed even when a page fails to
# load or parse; otherwise a crash would leave the Chrome process running.
try:
    # Loop through each listing page
    for page_num in range(1, 11):
        # URL of the current page
        url = f'https://www.aromatics.com/collections/essential-oil-singles?page={page_num}'

        # Fetch the listing HTML; the timeout keeps a stalled connection
        # from hanging the whole run indefinitely.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all the <a> elements containing product information
        product_links = soup.find_all('a', class_='product-name')

        # Extract product details from each product page
        for link in product_links:
            product_url = 'https://www.aromatics.com' + link['href']
            driver.get(product_url)
            # Wait for the dynamic content to load (adjust the wait time as needed)
            time.sleep(10)
            product_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Extract product title; either wrapper may be absent on a page.
            product_title_div = product_soup.find('div', class_='product__title')
            product_title_tag = (
                product_title_div.find('h1', class_='h2 m-0')
                if product_title_div is not None
                else None
            )
            product_title = product_title_tag.text.strip() if product_title_tag else "Product title not found"

            # The page is expected to hold two property grids: [0] general
            # properties, [1] therapeutic & aromatic properties.
            product_properties_div = product_soup.find_all('div', class_='product-properties flex grid grid--3-col-tablet grid--3-col-desktop grid--3-col')

            # Each variable needs its OWN list: a chained `a = b = []` would
            # make them aliases, so extending chemical_families would also
            # change every property that is missing from the page.
            country = []
            cultivation = []
            botanical_families = []
            plant_part = []
            chemical_families = []
            extraction_method = []

            # Find all items with class 'product-properties__item'
            items = (
                product_properties_div[0].find_all(class_="product-properties__item")
                if product_properties_div
                else []
            )

            for item in items:
                title_element = item.find(class_="product-properties__title")
                if title_element is None:
                    # An item without a title cannot be mapped to a property.
                    continue
                content_elements = item.find_all(class_="product-properties__content")

                title = title_element.get_text(strip=True)
                content = content_elements[0].get_text(strip=True) if content_elements else ""

                # Assign the content to the variable matching the title
                if title == "Country":
                    country = content
                elif title == "Cultivation":
                    cultivation = content
                elif title == "Botanical Families":
                    botanical_families = content
                elif title == "Plant Part":
                    plant_part = content
                elif title == "Chemical Families":
                    # Multi-valued: keep every content element, not just the first.
                    chemical_families.extend(
                        element.get_text(strip=True) for element in content_elements
                    )
                elif title == "Extraction Method":
                    extraction_method = content

            # Extract therapeutic & aromatic properties
            therapeutic_properties = []
            notes = []
            aromas = []

            therapeutic_items = (
                product_properties_div[1].find_all(class_="product-properties__item")
                if len(product_properties_div) > 1
                else []
            )

            for item in therapeutic_items:
                title_element = item.find(class_="product-properties__title")
                if title_element is None:
                    # Skip instead of reusing the title left over from the
                    # previous item, which would file values under the wrong key.
                    continue
                content_elements = item.find_all(class_="product-properties__content")
                title = title_element.get_text(strip=True)
                values = [element.get_text(strip=True) for element in content_elements]

                if title == "Therapeutic Properties":
                    therapeutic_properties.extend(values)
                elif title == "Notes":
                    notes.extend(values)
                elif title == "Aromas":
                    aromas.extend(values)

            # Extract suggested uses: the <strong> headings inside the third
            # accordion panel (index 2), when the page has that many panels.
            suggested_uses_div = product_soup.find_all('div', class_='accordion__content body-big rte')
            suggested_uses = []
            if len(suggested_uses_div) > 2:
                suggested_uses = [
                    strong.get_text(strip=True)
                    for strong in suggested_uses_div[2].find_all('strong')
                ]

            # Extract GCMS data. Reset per product so a product without GCMS
            # data gets an empty dict rather than the previous product's
            # analysis (or a NameError on the very first product).
            sorted_sections = {}
            gcms_div = product_soup.find('div', class_='gcms-data__container')
            # Separate name: rebinding `link` would clobber the loop variable.
            gcms_link = gcms_div.find('a') if gcms_div is not None else None
            if gcms_link is not None and gcms_link.get('href'):
                gcms_href = 'https://www.aromatics.com' + gcms_link['href']
                driver.get(gcms_href)
                time.sleep(5)

                # Press "compare batches" when the toggle is present;
                # find_elements returns an empty list instead of raising.
                compare_toggles = driver.find_elements(By.CSS_SELECTOR, 'div.inline-block.align-top')
                if compare_toggles:
                    compare_toggles[0].click()
                    time.sleep(5)

                # Extract the rendered text of every table body
                gcms_table = driver.find_elements(By.TAG_NAME, "tbody")
                gcms_table_text = [element.text for element in gcms_table]

                for section in gcms_table_text:
                    # First line is the section name; the rest alternate
                    # between an asset name and its run of numeric values.
                    lines = section.split('\n')
                    section_name = lines[0]
                    assets = lines[1:]
                    sorted_assets = []
                    i = 0
                    while i < len(assets):
                        asset_name = assets[i]
                        numeric_values = []
                        i += 1  # Move to the first candidate value line
                        # Consume value lines until the next asset name.
                        while i < len(assets):
                            raw_value = assets[i].rstrip('%')
                            if assets[i] == '-':
                                # A dash marks "not detected"; record as 0.
                                numeric_values.append(0)
                            elif is_number(raw_value):
                                numeric_values.append(float(raw_value))
                            else:
                                break
                            i += 1
                        sorted_assets.append((asset_name, numeric_values))
                    sorted_sections[section_name] = sorted_assets

            # Append data to respective lists
            all_product_urls.append(product_url)
            all_product_titles.append(product_title)
            all_countries.append(country)
            all_cultivations.append(cultivation)
            all_botanical_families.append(botanical_families)
            all_plant_parts.append(plant_part)
            all_chemical_families.append(chemical_families)
            all_extraction_methods.append(extraction_method)
            all_therapeutic_properties.append(therapeutic_properties)
            all_aromas.append(aromas)
            all_notes.append(notes)
            all_suggested_uses.append(suggested_uses)
            all_sorted_sections.append(sorted_sections)
finally:
    # Close the WebDriver
    driver.quit()

# Create a DataFrame from the extracted data
df = pd.DataFrame({
    'Product URL': all_product_urls,
    'Product Title': all_product_titles,
    'Country': all_countries,
    'Cultivation': all_cultivations,
    'Botanical Families': all_botanical_families,
    'Plant Part': all_plant_parts,
    'Chemical Families': all_chemical_families,
    'Extraction Method': all_extraction_methods,
    'Therapeutic & Aromatic Properties': all_therapeutic_properties,
    'Notes': all_notes,
    'Aroma': all_aromas,
    'Suggested uses': all_suggested_uses,
    'GCMS analysis': all_sorted_sections
    })

# Specify the file path where the CSV file is saved
file_path = 'aromatics_data.csv'

# Save the DataFrame to a CSV file
df.to_csv(file_path, index=False)

# Print a message to confirm that the file has been saved
print(f"DataFrame has been saved to '{file_path}'")

# Output: DataFrame has been saved to 'aromatics_data.csv'
