# In [None]:
import pandas as pd
import os
import requests
import regex as re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
# Read the chemical table and keep only its first 50 rows.
# The processing loop in this file reads the 'name' and 'synonym' columns of each row.
df = pd.read_csv('chemical.csv').head(50)
def process_chemical(chemical_name):
    """Scrape PubChem text strings for one compound and save them to a file.

    Opens the PubChem compound page for ``chemical_name`` in Chrome, clicks
    through the page's download dialog (which opens the record in a second
    browser window), fetches that window's URL with ``requests`` and extracts
    the contents of every ``<String>`` element that ends in a period. Strings
    mentioning "LLC" or "Inc" are dropped; the rest are written, one per line,
    to ``<chemical_name>_filtered_data.txt`` in the current working directory.

    Args:
        chemical_name: Compound name or synonym. Used (URL-quoted) in the
            PubChem URL and verbatim in the output file name.

    Returns:
        None. If the page shows "Page Not Found", or the record download
        answers with an HTTP error status, a message is printed and no file
        is written.

    Raises:
        selenium.common.exceptions.TimeoutException: If the download button,
            the record link, or the second window does not appear in time.
        requests.RequestException: If fetching the record URL fails at the
            network level (including the request timeout).
    """
    # Local import keeps this function self-contained; the file's import
    # block does not provide urllib.
    from urllib.parse import quote

    wait_seconds = 15
    request_timeout = 30

    driver = webdriver.Chrome()
    try:
        # quote() escapes spaces as %20 (as before) and also characters such
        # as '#', '?' and '%' that would otherwise corrupt the URL.
        url = f'https://pubchem.ncbi.nlm.nih.gov/compound/{quote(chemical_name)}'
        driver.get(url)
        driver.minimize_window()

        # Check if the page exists
        try:
            driver.find_element(By.XPATH, '//h1[contains(text(), "Page Not Found")]')
        except NoSuchElementException:
            pass
        else:
            print(f"Compound {chemical_name} not found")
            return

        wait = WebDriverWait(driver, wait_seconds)

        # Open the download dialog, then follow the record link inside it.
        # Explicit waits avoid racing the page while it is still rendering.
        wait.until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="page-download-btn"]'))
        ).click()
        wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="root-modal"]/div[2]/div/div[2]/div/div/div[2]/div/div[1]/ul/li[2]/div/a[2]')
            )
        ).click()

        # The link opens a second window; wait for it instead of indexing
        # window_handles blindly, then wait until it has a real address.
        wait.until(EC.number_of_windows_to_be(2))
        driver.switch_to.window(driver.window_handles[1])
        wait.until(lambda d: d.current_url not in ("", "about:blank"))
        data_url = driver.current_url
    finally:
        # Always release the browser, including on errors and early return.
        driver.quit()

    response = requests.get(data_url, timeout=request_timeout)
    if not response.ok:
        # Do not parse an error page as if it were the record.
        print(f"Download for {chemical_name} failed with HTTP status {response.status_code}")
        return

    # Extract data between <String> tags using regex
    pattern = r"<String>([A-Za-z0-9\-.,() ]*)\.</String>"
    file_contents = response.content.decode("utf-8")
    matches = re.findall(pattern, file_contents, re.DOTALL)

    # Drop strings that mention "LLC" or "Inc"
    filtered_lines = [line for line in matches if not re.search(r"\bLLC\b|\bInc\b", line)]

    # Write the filtered data to a new text file, restoring the trailing
    # period that the capture group leaves out.
    output_file = f"{chemical_name}_filtered_data.txt"
    with open(output_file, "w", encoding="utf-8") as file:
        file.writelines(f"{line}.\n" for line in filtered_lines)
    print("Filtered data saved to", output_file)
# Absolute path of the working directory; process_chemical() writes its
# "<synonym>_filtered_data.txt" output here.
current_dir = os.path.abspath('.')

# For each row: create a folder named after the chemical, scrape every
# synonym, and keep the first 6 scraped lines as that synonym's description.
for _, row in df.iterrows():
    folder_name = str(row['name'])
    folder_path = os.path.join(current_dir, folder_name)
    os.makedirs(folder_path, exist_ok=True)

    raw_synonyms = row['synonym']
    if pd.isna(raw_synonyms):
        # A missing cell would otherwise be scraped as the literal text "nan".
        print(f"No synonyms listed for {folder_name}")
        continue

    # Synonyms are separated by '; '; ignore blank entries and stray spaces.
    synonym_list = [
        entry.strip()
        for entry in str(raw_synonyms).strip().split('; ')
        if entry.strip()
    ]

    for synonym in synonym_list:
        process_chemical(synonym)

        filtered_data_file_name = f"{synonym}_filtered_data.txt"
        filtered_data_file_path = os.path.join(current_dir, filtered_data_file_name)

        if not os.path.exists(filtered_data_file_path):
            print(f"Filtered data file {filtered_data_file_name} not found for {synonym}")
            continue

        # Sanitize only the synonym part. Sanitizing the whole file name
        # would also strip the '_' and '.' and lose the .txt extension.
        safe_synonym = ''.join(c for c in synonym if c.isalnum() or c.isspace()).rstrip()
        description_file_path = os.path.join(folder_path, f"{safe_synonym}_description.txt")

        # Extract the first 6 lines from the filtered data file
        with open(filtered_data_file_path, 'r', encoding="utf-8") as file:
            filtered_data = file.readlines()[:6]

        # Write them to the description file, using the same encoding as the
        # input so non-ASCII text survives on every platform.
        with open(description_file_path, 'w', encoding="utf-8") as file:
            file.writelines(filtered_data)

        # Delete the intermediate filtered data file
        os.remove(filtered_data_file_path)