In [11]:
import requests
from bs4 import BeautifulSoup
import json
import re
import csv

In [12]:

url = "https://cymitquimica.com/categories/1828/nicotine-and-nicotine-derivatives/?srsltid=AfmBOor5CHkEY17td7i8alPNqfsjPX-VKsd6igxeoJFVzukYf576WD9_"
response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

json_ld = None
for script in soup.find_all('script', type='application/ld+json'):
    try:
        json_ld = json.loads(script.string)
        break
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        continue

if json_ld:
    if isinstance(json_ld, dict):
        print("json_ld is a dictionary")
        print(json.dumps(json_ld, indent=2))
    else:
        print(f"Unexpected JSON-LD structure: {type(json_ld)}")
else:
    print("No valid JSON-LD found")


json_ld is a dictionary
{
  "@context": "http://schema.org",
  "@type": "WebSite",
  "url": "https://cymitquimica.com",
  "dateModified": "2019-02-15T00:00",
  "image": "https://static.cymitquimica.com/public/img/logo-cymit.png",
  "potentialAction": {
    "@type": "SearchAction",
    "target": "https://cymitquimica.com/search/{search_term_string}/",
    "query-input": "required name=search_term_string"
  },
  "sameAs": [
    "https://www.facebook.com/cymitquimica/",
    "https://twitter.com/cymitquimica",
    "https://es.linkedin.com/company/cymit-quimica-s-l-",
    "https://www.instagram.com/cymitquimica/",
    "https://www.pinterest.es/cymit/",
    "https://cymit.tumblr.com/"
  ]
}


In [13]:
response = requests.get(url)

soup = BeautifulSoup(response.text, "html.parser")

product_links = soup.find_all("a", class_="js-product-link")

for link in product_links:
    product_url = link.get("href")
    if product_url:
        print(f"https://example.com{product_url}")


https://example.com/products/IN-DA00DBPO/494-97-3/nornicotine-dl-rg/
https://example.com/products/IN-DA0034OC/485-35-8/15-methano-8h-pyrido12-a15diazocin-8-one-123456-hexahydro-1r5s/
https://example.com/products/IN-DA00IIQD/95091-91-1/3-pyridinecarboxamide-n-methoxy-n-methyl/
https://example.com/products/IN-DA003EIR/6456-44-6/4-iodo-1-methyl-12-dihydropyridine-3-carboxamide/
https://example.com/products/IN-DA003S6S/532-12-7/3-34-dihydro-2h-pyrrol-5-ylpyridine/
https://example.com/products/IN-DA0028CD/20260-53-1/3-pyridinecarbonyl-chloride-hydrochloride-11/
https://example.com/products/IN-DA00BCE9/2743-90-0/rs-anatabine/
https://example.com/products/IN-DA0033CY/609-71-2/2-hydroxynicotinic-acid/
https://example.com/products/IN-DA00EEI3/59288-43-6/5-hydroxy-6-nitropyridine-3-carboxylic-acid/
https://example.com/products/IN-DA00358E/5470-70-2/methyl-6-methylnicotinate/
https://example.com/products/IN-DA0039B2/3562-11-6/nornicotine-2-carboxylic-acid/
https://example.com/products/IN-DA0025JG

In [14]:
def scrape_page(page_number):
    url_new = url + str(page_number)
    response = requests.get(url_new)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        product_links = soup.find_all("a", class_="js-product-link")
        
        for link in product_links:
            product_url = link.get("href")
            if product_url:
                # Extract last part (after the last "/")
                last_part = product_url.strip("/").split("/")[-1]
                product_urls.append(last_part)
    else:
        print(f"Failed to retrieve page {page_number}")

In [15]:
product_urls = []

# Loop through all pages (adjust the range if needed)
for page in range(1, 14):  # Adjust the range as needed
    print(f"Scraping page {page}...")
    scrape_page(page)

# Save the results to a CSV file
csv_filename = "nicotine_derivative_names.csv"
with open(csv_filename, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Nicotine Derivative Name"])  # Write header
    for url in product_urls:
        writer.writerow([url])  # Write each product part

# Print the number of entries in the CSV file
print(f"Number of entries saved in {csv_filename}: {len(product_urls)}")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Number of entries saved in nicotine_derivative_names.csv: 260


In [16]:
import cirpy
from pubchempy import get_compounds
import pandas as pd
from tqdm.notebook import tqdm_notebook
import openbabel
from rdkit import Chem
from chemspipy import ChemSpider

In [17]:
def name_to_smiles_pubchem(name):
    try:
        compounds = get_compounds(name, 'name')
        if compounds:
            return compounds[0].canonical_smiles
    except:
        pass
    return None

def name_to_smiles_cirpy(name):
    try:
        smiles = cirpy.resolve(name, 'smiles')
        if smiles:
            return smiles
    except:
        pass
    return None

In [18]:
def name_to_smiles(name):
    smiles = name_to_smiles_pubchem(name)
    if smiles:
        return smiles
    
    smiles = name_to_smiles_cirpy(name)
    if smiles:
        return smiles
    
    return None

In [None]:
df = pd.read_csv('nicotine_derivative_names.csv')
print(df.columns)

tqdm_notebook.pandas()
df['SMILES'] = df['Nicotine Derivative Name'].progress_apply(name_to_smiles)

total = len(df)
converted = df['SMILES'].notna().sum()
print(f"Total compounds: {total}")
print(f"Successfully converted: {converted}")
print(f"Conversion rate: {converted/total:.2%}")

df = df.dropna(subset=['SMILES'])
print(df)

df.to_csv('nicotine_derivatives_with_smiles.csv', index=False)

Index(['Nicotine Derivative Name'], dtype='object')


  0%|          | 0/260 [00:00<?, ?it/s]

                        Nicotine Derivative Name  \
7                        2-hydroxynicotinic-acid   
8    5-hydroxy-6-nitropyridine-3-carboxylic-acid   
9                      methyl-6-methylnicotinate   
10                 nornicotine-2-carboxylic-acid   
13                          6-methylnicotinamide   
..                                           ...   
249                    methyl-6-methylnicotinate   
250                nornicotine-2-carboxylic-acid   
253                         6-methylnicotinamide   
255                      cp-809101-hydrochloride   
259                        3-pyridinecarboxamide   

                                          SMILES  
7                         C1=CNC(=O)C(=C1)C(=O)O  
8                OC(=O)c1cnc(c(O)c1)[N+]([O-])=O  
9                          CC1=NC=C(C=C1)C(=O)OC  
10                    C1CC(NC1C2=CN=CC=C2)C(=O)O  
13                          CC1=NC=C(C=C1)C(=O)N  
..                                           ...  
249               