In [43]:
import requests
from bs4 import BeautifulSoup
import json
import re
import csv
import pandas as pd

In [44]:

url = "https://cymitquimica.com/categories/1828/nicotine-and-nicotine-derivatives/?srsltid=AfmBOor5CHkEY17td7i8alPNqfsjPX-VKsd6igxeoJFVzukYf576WD9_&page="
response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

json_ld = None
for script in soup.find_all('script', type='application/ld+json'):
    try:
        json_ld = json.loads(script.string)
        break
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        continue

if json_ld:
    if isinstance(json_ld, dict):
        print("json_ld is a dictionary")
        print(json.dumps(json_ld, indent=2))
    else:
        print(f"Unexpected JSON-LD structure: {type(json_ld)}")
else:
    print("No valid JSON-LD found")


json_ld is a dictionary
{
  "@context": "http://schema.org",
  "@type": "WebSite",
  "url": "https://cymitquimica.com",
  "dateModified": "2019-02-15T00:00",
  "image": "https://static.cymitquimica.com/public/img/logo-cymit.png",
  "potentialAction": {
    "@type": "SearchAction",
    "target": "https://cymitquimica.com/search/{search_term_string}/",
    "query-input": "required name=search_term_string"
  },
  "sameAs": [
    "https://www.facebook.com/cymitquimica/",
    "https://twitter.com/cymitquimica",
    "https://es.linkedin.com/company/cymit-quimica-s-l-",
    "https://www.instagram.com/cymitquimica/",
    "https://www.pinterest.es/cymit/",
    "https://cymit.tumblr.com/"
  ]
}


In [45]:
response = requests.get(url)

soup = BeautifulSoup(response.text, "html.parser")

product_links = soup.find_all("a", class_="js-product-link")

for link in product_links:
    product_url = link.get("href")
    if product_url:
        print(f"https://example.com{product_url}")


In [46]:
all_cas_numbers = []

for page in range(1, 14):
    urln = url + str(page)
    response = requests.get(urln)
    soup = BeautifulSoup(response.content, 'html.parser')

    for a_tag in soup.find_all('a', href=True):
        if "/cas/" in a_tag['href']:
            cas_number = a_tag.text.strip()
            if cas_number != "CAS list": print(cas_number)
            all_cas_numbers.append(cas_number)

df = pd.DataFrame(all_cas_numbers, columns = ["cas"])
unique_cas = df["cas"].drop_duplicates()
unique_cas_cleaned = unique_cas[~unique_cas.str.contains(r'[a-zA-Z]', regex=True)]

unique_cas_df = pd.DataFrame(unique_cas_cleaned, columns=["cas"])

total_original = len(df)
duplicates_removed = total_original - len(unique_cas)
invalid_rows_removed = len(unique_cas) - len(unique_cas_cleaned)

print(f"Total CAS Numbers: {total_original}")
print(f"Duplicates Removed: {duplicates_removed}")
print(f"Invalid Rows Removed (with letters): {invalid_rows_removed}")
print(f"Final Unique CAS Numbers: {len(unique_cas_cleaned)}")

494-97-3
485-35-8
95091-91-1
6456-44-6
532-12-7
20260-53-1
2743-90-0
609-71-2
59288-43-6
5470-70-2
3562-11-6
177785-14-7
29681-45-6
6960-22-1
1802-30-8
1215721-40-6
132334-98-6
89690-09-5
494-52-0
98-92-0
494-97-3
485-35-8
95091-91-1
6456-44-6
532-12-7
20260-53-1
2743-90-0
609-71-2
59288-43-6
5470-70-2
3562-11-6
177785-14-7
29681-45-6
6960-22-1
1802-30-8
1215721-40-6
132334-98-6
89690-09-5
494-52-0
98-92-0
6197-39-3
5398-44-7
61445-55-4
5006-66-6
5746-86-1
38496-18-3
39178-35-3
129747-52-0
38806-38-1
64091-91-4
486-56-6
1094-61-7
27828-71-3
78348-28-4
66093-90-1
1364663-27-3
13190-97-1
3222-56-8
1986-81-8
90872-72-3
6197-39-3
5398-44-7
61445-55-4
5006-66-6
5746-86-1
38496-18-3
39178-35-3
129747-52-0
38806-38-1
64091-91-4
486-56-6
1094-61-7
27828-71-3
78348-28-4
66093-90-1
1364663-27-3
13190-97-1
3222-56-8
1986-81-8
90872-72-3
59578-62-0
2047-49-6
6311-35-9
499-81-0
7076-23-5
614-00-6
2004-06-0
27247-34-3
3569-99-1
3222-47-7
321-02-8
13078-04-1
1314217-69-0
4314-66-3
5176-27-2
924-16-3


In [47]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd
from tqdm import tqdm
import time
from selenium.webdriver.chrome.options import Options
import pubchempy as pcp

options = Options()
options.add_argument("--headless")  # Run in headless mode for speed
options.add_argument("--disable-gpu")  # Avoid GPU rendering issues
options.add_argument("--no-sandbox")
driver = webdriver.Chrome(options=options)

# Set a page load timeout (e.g., 180 seconds)
driver.set_page_load_timeout(180)

In [48]:
def get_smiles_from_cas(cas_number):
    print(cas_number)
    try:
        compound = pcp.get_compounds(f'{cas_number}', 'name')[0]
        smiles = compound.isomeric_smiles
        print(f"SMILES for {cas_number}: {smiles}")
        return smiles
    except IndexError:
        print(f"No compound found for CAS: {cas_number}")
    except Exception as e:
        print(f"Error occurred: {e}")
        return None

In [49]:
df = unique_cas_df

tqdm.pandas()
df['SMILES'] = df["cas"].progress_apply(get_smiles_from_cas)

total = len(df)
converted = df['SMILES'].notna().sum()
print(f"Total compounds: {total}")
print(f"Successfully converted: {converted}")
print(f"Conversion rate: {converted/total:.2%}")

df = df.dropna(subset=['SMILES'])
print(df)

df.to_csv('Unique_nicotine_derivatives_with_smiles.csv', index=False)

  0%|          | 0/172 [00:00<?, ?it/s]

494-97-3


  1%|          | 2/172 [00:00<00:39,  4.34it/s]

SMILES for 494-97-3: C1C[C@H](NC1)C2=CN=CC=C2
485-35-8


  2%|▏         | 3/172 [00:00<00:49,  3.43it/s]

SMILES for 485-35-8: C1[C@H]2CNC[C@@H]1C3=CC=CC(=O)N3C2
95091-91-1


  2%|▏         | 4/172 [00:01<01:02,  2.67it/s]

SMILES for 95091-91-1: CN(C(=O)C1=CN=CC=C1)OC
6456-44-6


  3%|▎         | 5/172 [00:01<01:06,  2.50it/s]

SMILES for 6456-44-6: C[N+]1=CC=CC(=C1)C(=O)N.[I-]
532-12-7


  3%|▎         | 6/172 [00:02<01:03,  2.60it/s]

SMILES for 532-12-7: C1CC(=NC1)C2=CN=CC=C2
20260-53-1


  4%|▍         | 7/172 [00:02<01:11,  2.31it/s]

SMILES for 20260-53-1: C1=CC(=CN=C1)C(=O)Cl.Cl
2743-90-0


  5%|▍         | 8/172 [00:03<01:09,  2.35it/s]

SMILES for 2743-90-0: C1C=CCNC1C2=CN=CC=C2
609-71-2


  5%|▌         | 9/172 [00:03<01:06,  2.44it/s]

SMILES for 609-71-2: C1=CNC(=O)C(=C1)C(=O)O
59288-43-6


  6%|▌         | 10/172 [00:03<01:10,  2.31it/s]

SMILES for 59288-43-6: C1=C(C=NC(=C1O)[N+](=O)[O-])C(=O)O
5470-70-2


  6%|▋         | 11/172 [00:04<01:07,  2.40it/s]

SMILES for 5470-70-2: CC1=NC=C(C=C1)C(=O)OC
3562-11-6


  7%|▋         | 12/172 [00:04<01:07,  2.37it/s]

SMILES for 3562-11-6: C1CC(NC1C2=CN=CC=C2)C(=O)O
177785-14-7


  8%|▊         | 13/172 [00:05<01:11,  2.22it/s]

SMILES for 177785-14-7: COC(=O)C1=C(N=CC=C1)CCl
29681-45-6


  8%|▊         | 14/172 [00:05<01:08,  2.30it/s]

SMILES for 29681-45-6: CC1=CC(=CN=C1)C(=O)OC
6960-22-1


  9%|▊         | 15/172 [00:06<01:07,  2.33it/s]

SMILES for 6960-22-1: CC1=NC=C(C=C1)C(=O)N
1802-30-8


  9%|▉         | 16/172 [00:06<01:05,  2.39it/s]

SMILES for 1802-30-8: C1=CC(=NC=C1C(=O)O)C2=NC=C(C=C2)C(=O)O
1215721-40-6


 10%|▉         | 17/172 [00:07<01:10,  2.21it/s]

SMILES for 1215721-40-6: C1CN(CCN1)C2=CN=CC(=N2)OCC3=CC(=CC=C3)Cl.Cl
132334-98-6


 10%|█         | 18/172 [00:07<01:13,  2.08it/s]

SMILES for 132334-98-6: CCOC(=O)C1=CN=C(C=C1)Br
89690-09-5


 11%|█         | 19/172 [00:08<01:12,  2.10it/s]

SMILES for 89690-09-5: CNCCCCN.Cl.Cl
494-52-0


 12%|█▏        | 20/172 [00:08<01:08,  2.22it/s]

SMILES for 494-52-0: C1CCN[C@@H](C1)C2=CN=CC=C2
98-92-0


 12%|█▏        | 21/172 [00:08<01:04,  2.36it/s]

SMILES for 98-92-0: C1=CC(=CN=C1)C(=O)N
6197-39-3


 13%|█▎        | 22/172 [00:09<01:06,  2.26it/s]

SMILES for 6197-39-3: COC(=O)C1=CCCNC1.Cl
5398-44-7


 13%|█▎        | 23/172 [00:09<01:04,  2.30it/s]

SMILES for 5398-44-7: C1=C(C=C(N=C1Cl)Cl)C(=O)O
61445-55-4


 14%|█▍        | 24/172 [00:10<01:02,  2.38it/s]

SMILES for 61445-55-4: CN(CCCC(=O)O)N=O
5006-66-6


 15%|█▍        | 25/172 [00:10<01:17,  1.90it/s]

SMILES for 5006-66-6: C1=CC(=O)NC=C1C(=O)O
5746-86-1


 15%|█▌        | 26/172 [00:11<01:09,  2.09it/s]

SMILES for 5746-86-1: C1CC(NC1)C2=CN=CC=C2
38496-18-3


 16%|█▌        | 27/172 [00:11<01:04,  2.25it/s]

SMILES for 38496-18-3: C1=CC(=NC(=C1C(=O)O)Cl)Cl
39178-35-3


 16%|█▋        | 28/172 [00:11<01:02,  2.30it/s]

SMILES for 39178-35-3: C1=CN=CC=C1C(=O)Cl.Cl
129747-52-0


 17%|█▋        | 29/172 [00:12<01:03,  2.26it/s]

SMILES for 129747-52-0: COC(=O)C1=CN=CC(=C1)CO
38806-38-1


 17%|█▋        | 30/172 [00:12<01:02,  2.28it/s]

SMILES for 38806-38-1: C1=CC(=C[N+](=C1)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)([O-])OP(=O)(O)OC[C@@H]3[C@H]([C@H]([C@@H](O3)N4C=NC5=C4N=CN6C5=NC=C6)O)O)O)O)C(=O)N
64091-91-4


 18%|█▊        | 31/172 [00:13<01:02,  2.25it/s]

SMILES for 64091-91-4: CN(CCCC(=O)C1=CN=CC=C1)N=O
486-56-6


 19%|█▊        | 32/172 [00:13<01:00,  2.33it/s]

SMILES for 486-56-6: CN1[C@@H](CCC1=O)C2=CN=CC=C2
1094-61-7


 19%|█▉        | 33/172 [00:14<01:02,  2.23it/s]

SMILES for 1094-61-7: C1=CC(=C[N+](=C1)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)[O-])O)O)C(=O)N
27828-71-3


 20%|█▉        | 34/172 [00:14<01:00,  2.29it/s]

SMILES for 27828-71-3: C1=C(C=NC=C1O)C(=O)O
78348-28-4


 20%|██        | 35/172 [00:15<00:57,  2.38it/s]

SMILES for 78348-28-4: C1CC(=O)N(C1=O)OC(=O)C2=CN=CC=C2
66093-90-1


 21%|██        | 36/172 [00:15<00:54,  2.50it/s]

SMILES for 66093-90-1: CNCCCC(=O)C1=CN=CC=C1.Cl.Cl
1364663-27-3


 22%|██▏       | 37/172 [00:15<00:56,  2.38it/s]

SMILES for 1364663-27-3: COC(=O)C1=CN=C(C=C1Br)Br
13190-97-1


 22%|██▏       | 38/172 [00:16<00:57,  2.33it/s]

SMILES for 13190-97-1: CC(=CCC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CO)/C)/C)/C)/C)/C)/C)/C)/C)C
3222-56-8


 23%|██▎       | 39/172 [00:16<00:59,  2.24it/s]

SMILES for 3222-56-8: CC1=C(C=CC=N1)C(=O)O
1986-81-8


 23%|██▎       | 40/172 [00:17<00:55,  2.39it/s]

SMILES for 1986-81-8: C1=CC(=C[N+](=C1)[O-])C(=O)N
90872-72-3


 24%|██▍       | 41/172 [00:17<00:58,  2.23it/s]

SMILES for 90872-72-3: CC1=NC=C(C=C1)C2CCCN2
59578-62-0


 24%|██▍       | 42/172 [00:18<00:59,  2.17it/s]

SMILES for 59578-62-0: C1=CC(=CN=C1)C(=O)CCCO
2047-49-6


 25%|██▌       | 43/172 [00:18<01:00,  2.12it/s]

SMILES for 2047-49-6: C1=C(C=NC=C1[N+](=O)[O-])C(=O)O
6311-35-9


 26%|██▌       | 44/172 [00:19<01:01,  2.09it/s]

SMILES for 6311-35-9: C1=CC(=NC=C1C(=O)O)Br
499-81-0


 26%|██▌       | 45/172 [00:19<01:11,  1.77it/s]

SMILES for 499-81-0: C1=C(C=NC=C1C(=O)O)C(=O)O
7076-23-5


 27%|██▋       | 46/172 [00:20<01:06,  1.90it/s]

SMILES for 7076-23-5: C1C[C@@H](NC1)C2=CN=CC=C2
614-00-6


 27%|██▋       | 47/172 [00:20<00:59,  2.08it/s]

SMILES for 614-00-6: CN(C1=CC=CC=C1)N=O
2004-06-0


 28%|██▊       | 48/172 [00:21<01:06,  1.87it/s]

SMILES for 2004-06-0: C1=NC2=C(C(=N1)Cl)N=CN2[C@H]3[C@@H]([C@@H]([C@H](O3)CO)O)O
27247-34-3


 28%|██▊       | 49/172 [00:22<01:11,  1.73it/s]

SMILES for 27247-34-3: COC(=O)C1=CN=CC(=C1)C(=O)[O-].[K+]
3569-99-1


 29%|██▉       | 50/172 [00:22<01:03,  1.92it/s]

SMILES for 3569-99-1: C1=CC(=CN=C1)C(=O)NCO
3222-47-7


 30%|██▉       | 51/172 [00:22<00:59,  2.03it/s]

SMILES for 3222-47-7: CC1=NC=C(C=C1)C(=O)O
321-02-8


 30%|███       | 52/172 [00:23<01:00,  1.98it/s]

SMILES for 321-02-8: C1=CC(=C[N+](=C1)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)O)O)O)C(=O)[O-]
13078-04-1


 31%|███       | 53/172 [00:23<00:55,  2.13it/s]

SMILES for 13078-04-1: C1CCNC(C1)C2=CN=CC=C2
1314217-69-0


 31%|███▏      | 54/172 [00:24<00:55,  2.14it/s]

SMILES for 1314217-69-0: CC1=CC(=CC2=C1C(=O)N(C2)CC3=CC=C(C=C3)OC(F)(F)F)C4=NC(=NO4)CN5CCNCC5.CS(=O)(=O)O
4314-66-3


 32%|███▏      | 55/172 [00:24<00:52,  2.25it/s]

SMILES for 4314-66-3: CCNC(=O)C1=CN=CC=C1
5176-27-2


 33%|███▎      | 56/172 [00:25<00:49,  2.33it/s]

SMILES for 5176-27-2: CC(C)(C)OC(=O)N1C=CC=C1
924-16-3


 33%|███▎      | 57/172 [00:25<00:48,  2.37it/s]

SMILES for 924-16-3: CCCCN(CCCC)N=O
1207384-47-1


 34%|███▎      | 58/172 [00:25<00:48,  2.36it/s]

SMILES for 1207384-47-1: [2H]C([2H])([2H])N1C=CC(=O)C(=C1)C(=O)N
51095-86-4


 34%|███▍      | 59/172 [00:26<00:48,  2.33it/s]

SMILES for 51095-86-4: C[N@@+]1(CCC[C@H]1C2=CN=CC=C2)[O-]
51020-67-8


 35%|███▍      | 60/172 [00:26<00:46,  2.42it/s]

SMILES for 51020-67-8: C[N@+]1(CCC[C@H]1C2=CN=CC=C2)[O-]
1216737-36-8


 35%|███▌      | 61/172 [00:27<00:47,  2.33it/s]

SMILES for 1216737-36-8: [2H]C1=C(C(=C(N=C1[2H])[2H])C(=O)NCC(=O)O)[2H]
5654-86-4


 36%|███▌      | 62/172 [00:27<00:48,  2.29it/s]

SMILES for 5654-86-4: CC(C)CC1C(=O)N2CCCC2C(=O)N1
491-26-9


 37%|███▋      | 63/172 [00:28<00:46,  2.34it/s]

SMILES for 491-26-9: C[N+]1(CCC[C@H]1C2=CN=CC=C2)[O-]
63551-14-4


 37%|███▋      | 64/172 [00:28<00:43,  2.47it/s]

SMILES for 63551-14-4: C[N+]1(CCCC1C2=CN=CC=C2)[O-]
3612-80-4


 38%|███▊      | 65/172 [00:28<00:44,  2.43it/s]

SMILES for 3612-80-4: C1=CC(=CN=C1)C(=O)OCCO
98491-81-7


 38%|███▊      | 66/172 [00:29<00:42,  2.48it/s]

SMILES for 98491-81-7: C1=CC(=CN=C1)C(=O)OCCO.Cl
54-11-5


 39%|███▉      | 67/172 [00:29<00:43,  2.40it/s]

SMILES for 54-11-5: CN1CCC[C@H]1C2=CN=CC=C2
3719-45-7


 40%|███▉      | 68/172 [00:29<00:42,  2.47it/s]

SMILES for 3719-45-7: CN1C=C(C=CC1=O)C(=O)O
701-44-0


 40%|████      | 69/172 [00:30<00:41,  2.49it/s]

SMILES for 701-44-0: CN1C=C(C=CC1=O)C(=O)N
66148-19-4


 41%|████      | 70/172 [00:30<00:40,  2.50it/s]

SMILES for 66148-19-4: [2H]C1=C(C(=C(N=C1[2H])[2H])C2CCCN2N=O)[2H]
1219805-86-3


 41%|████▏     | 71/172 [00:31<00:39,  2.54it/s]

SMILES for 1219805-86-3: [2H]C1=C(C(=C(N=C1[2H])[2H])C2CCCN2C([2H])([2H])[2H])[2H]
69980-24-1


 42%|████▏     | 72/172 [00:31<00:40,  2.48it/s]

SMILES for 69980-24-1: [2H]C([2H])([2H])N1CCCC1C2=CN=CC=C2
501-81-5


 42%|████▏     | 73/172 [00:31<00:39,  2.52it/s]

SMILES for 501-81-5: C1=CC(=CN=C1)CC(=O)O
6419-36-9


 43%|████▎     | 74/172 [00:32<00:39,  2.51it/s]

SMILES for 6419-36-9: C1=CC(=CN=C1)CC(=O)O.Cl
769-49-3


 44%|████▎     | 75/172 [00:32<00:38,  2.51it/s]

SMILES for 769-49-3: CN1C=CC(=O)C(=C1)C(=O)N
1207384-48-2


 44%|████▍     | 76/172 [00:33<00:38,  2.46it/s]

SMILES for 1207384-48-2: [2H]C([2H])([2H])N1C=C(C=CC1=O)C(=O)N
350818-69-8


 45%|████▍     | 77/172 [00:33<00:37,  2.52it/s]

SMILES for 350818-69-8: [2H]C1=C(C(=C(N=C1[2H])[2H])C2CCCN2C)[2H]
59-67-6


 45%|████▌     | 78/172 [00:33<00:36,  2.56it/s]

SMILES for 59-67-6: C1=CC(=CN=C1)C(=O)O
487-19-4


 46%|████▌     | 79/172 [00:34<00:35,  2.60it/s]

SMILES for 487-19-4: CN1C=CC=C1C2=CN=CC=C2
66148-15-0


 47%|████▋     | 80/172 [00:34<00:36,  2.52it/s]

SMILES for 66148-15-0: [2H]C1=C(C(=C(N=C1[2H])[2H])C(=O)O)[2H]
94-44-0


 47%|████▋     | 81/172 [00:35<00:37,  2.45it/s]

SMILES for 94-44-0: C1=CC=C(C=C1)COC(=O)C2=CN=CC=C2
65550-28-9


 48%|████▊     | 82/172 [00:35<00:34,  2.62it/s]

No compound found for CAS: 65550-28-9
1020719-11-2


 48%|████▊     | 83/172 [00:35<00:36,  2.44it/s]

SMILES for 1020719-11-2: [2H]C1=C(C(=C(N=C1[2H])[2H])C2CC=CCN2)[2H]
153536-53-9


 49%|████▉     | 84/172 [00:36<00:36,  2.44it/s]

SMILES for 153536-53-9: CN1CCC[C@H]1C2=C[N+](=CC=C2)[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)C(=O)[O-])O)O)O
138946-42-6


 49%|████▉     | 85/172 [00:36<00:36,  2.35it/s]

SMILES for 138946-42-6: [2H]C1=C(C(=C(N=C1[2H])C(=O)O)C(=O)O)[2H]
887355-56-8


 50%|█████     | 86/172 [00:37<00:36,  2.38it/s]

SMILES for 887355-56-8: C1=CC(=CN=C1)C(=O)CCCNC=O
25162-00-9


 51%|█████     | 87/172 [00:37<00:35,  2.39it/s]

SMILES for 25162-00-9: CN1CCC[C@@H]1C2=CN=CC=C2
125630-26-4


 51%|█████     | 88/172 [00:38<00:35,  2.34it/s]

SMILES for 125630-26-4: CN1CCC(C1)C(=O)C2=CN=CC=C2
494-98-4


 52%|█████▏    | 89/172 [00:38<00:34,  2.44it/s]

SMILES for 494-98-4: C1=CC(=CN=C1)C2=CC=CN2
16543-55-8


 52%|█████▏    | 90/172 [00:38<00:32,  2.51it/s]

SMILES for 16543-55-8: C1C[C@H](N(C1)N=O)C2=CN=CC=C2
2055-29-0


 53%|█████▎    | 91/172 [00:39<00:30,  2.64it/s]

No compound found for CAS: 2055-29-0
17708-87-1


 53%|█████▎    | 92/172 [00:39<00:31,  2.54it/s]

SMILES for 17708-87-1: C1CC(=O)NC1C2=CN=CC=C2
73057-36-0


 54%|█████▍    | 93/172 [00:40<00:30,  2.56it/s]

No compound found for CAS: 73057-36-0
1824020-12-3


 55%|█████▍    | 94/172 [00:40<00:30,  2.59it/s]

SMILES for 1824020-12-3: CN1CCCC1(C2=CN=CC=C2)O
1060802-34-7


 55%|█████▌    | 95/172 [00:40<00:29,  2.63it/s]

SMILES for 1060802-34-7: C1=C(C(=C(C=N1)F)Cl)C=O
535-83-1


 56%|█████▌    | 96/172 [00:41<00:28,  2.68it/s]

SMILES for 535-83-1: C[N+]1=CC=CC(=C1)C(=O)[O-]
6138-41-6


 56%|█████▋    | 97/172 [00:41<00:28,  2.66it/s]

SMILES for 6138-41-6: C[N+]1=CC=CC(=C1)C(=O)O.[Cl-]
320386-54-7


 57%|█████▋    | 98/172 [00:41<00:29,  2.54it/s]

SMILES for 320386-54-7: C1=CC(=C(N=C1)C(=O)O)S.Cl
92761-98-3


 58%|█████▊    | 99/172 [00:42<00:29,  2.45it/s]

SMILES for 92761-98-3: [2H]C1=C(C(=C(N=C1[2H])[2H])[C@@H]2CCCN2)[2H]
15268-31-2


 58%|█████▊    | 100/172 [00:42<00:29,  2.48it/s]

SMILES for 15268-31-2: C1=CC(=CN=C1)N=C=O
1393569-52-2


 59%|█████▊    | 101/172 [00:43<00:27,  2.54it/s]

SMILES for 1393569-52-2: C1=C(C=C(N=C1C(=O)O)[N+](=O)[O-])Cl
29790-52-1


 59%|█████▉    | 102/172 [00:43<00:27,  2.54it/s]

SMILES for 29790-52-1: CN1CCC[C@H]1C2=CN=CC=C2.C1=CC=C(C(=C1)C(=O)O)O
60138-76-3


 60%|█████▉    | 103/172 [00:43<00:28,  2.45it/s]

SMILES for 60138-76-3: CN(C)C1=C(C=CC=N1)C#N
2873-36-1


 60%|██████    | 104/172 [00:44<00:34,  1.96it/s]

SMILES for 2873-36-1: CC(C)C[C@H]1C(=O)N2CCC[C@H]2C(=O)N1
104809-30-5


 61%|██████    | 105/172 [00:45<00:33,  1.99it/s]

SMILES for 104809-30-5: C1=CC(=C[N+](=C1)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)([O-])OP(=O)([O-])OC[C@@H]3[C@H]([C@H]([C@@H](O3)N4C=NC5=C(N=CN=C54)N)O)O)O)O)C(=O)O.[Na+]
339155-13-4


 62%|██████▏   | 106/172 [00:45<00:30,  2.14it/s]

SMILES for 339155-13-4: C1=CC(=C(N=C1)C(=O)O)C(=O)O
525-74-6


 62%|██████▏   | 107/172 [00:46<00:30,  2.13it/s]

SMILES for 525-74-6: CN1CCC=C1C2=CN=CC=C2
75043-32-2


 63%|██████▎   | 108/172 [00:46<00:28,  2.27it/s]

No compound found for CAS: 75043-32-2
3705-26-8


 63%|██████▎   | 109/172 [00:46<00:26,  2.38it/s]

SMILES for 3705-26-8: C1C[C@H]2C(=O)N[C@H](C(=O)N2C1)CC3=CC=CC=C3
39642-60-9


 64%|██████▍   | 110/172 [00:47<00:27,  2.28it/s]

SMILES for 39642-60-9: C1=CC(=CN=C1)NC(=O)NC2=CN=CC=C2
3222-49-9


 65%|██████▍   | 111/172 [00:47<00:25,  2.38it/s]

SMILES for 3222-49-9: CC1=CC(=CN=C1)C(=O)O
75195-76-5


 65%|██████▌   | 112/172 [00:48<00:24,  2.45it/s]

SMILES for 75195-76-5: C1C[C@H](N(C1)N=O)C2=C[N+](=CC=C2)[O-]
21446-46-8


 66%|██████▌   | 113/172 [00:48<00:25,  2.32it/s]

SMILES for 21446-46-8: CN1CCC[C@H]1C2=C[N+](=CC=C2)C.[I-]
6019-06-3


 66%|██████▋   | 114/172 [00:48<00:25,  2.30it/s]

SMILES for 6019-06-3: CN1CCC[C@H]1C2=CN=CC=C2.[C@@H]([C@H](C(=O)O)O)(C(=O)O)O.[C@@H]([C@H](C(=O)O)O)(C(=O)O)O.O.O
65-31-6


 67%|██████▋   | 115/172 [00:49<00:25,  2.21it/s]

SMILES for 65-31-6: CN1CCC[C@H]1C2=CN=CC=C2.[C@@H]([C@H](C(=O)O)O)(C(=O)O)O.[C@@H]([C@H](C(=O)O)O)(C(=O)O)O
942922-74-9


 67%|██████▋   | 116/172 [00:49<00:25,  2.16it/s]

SMILES for 942922-74-9: COC(=O)C1=NC=CC(=C1)C2=CC(=NC=C2)C(=O)OC
66148-18-3


 68%|██████▊   | 117/172 [00:50<00:23,  2.33it/s]

SMILES for 66148-18-3: [2H]C1=C(C(=C(N=C1[2H])[2H])C2CCCN2)[2H]
2854-40-2


 69%|██████▊   | 118/172 [00:50<00:21,  2.47it/s]

SMILES for 2854-40-2: CC(C)[C@H]1C(=O)N2CCC[C@H]2C(=O)N1
88660-53-1


 69%|██████▉   | 119/172 [00:51<00:22,  2.32it/s]

SMILES for 88660-53-1: CN1CCC[C@H]1C2=CN=CC=C2.C1=CC=C(C=C1)C(=O)O
870975-59-0


 70%|██████▉   | 120/172 [00:51<00:20,  2.52it/s]

No compound found for CAS: 870975-59-0
23950-04-1


 70%|███████   | 121/172 [00:51<00:20,  2.53it/s]

SMILES for 23950-04-1: CN1CCCC1C2=CC=CC=N2
98-98-6


 71%|███████   | 122/172 [00:52<00:19,  2.58it/s]

SMILES for 98-98-6: C1=CC=NC(=C1)C(=O)O
102074-19-1


 72%|███████▏  | 123/172 [00:52<00:20,  2.44it/s]

SMILES for 102074-19-1: CC1=CC(=CN=C1)CO
1657-32-5


 72%|███████▏  | 124/172 [00:53<00:19,  2.47it/s]

SMILES for 1657-32-5: C1=CC(=C[N+](=C1)[O-])N
1426174-36-8


 73%|███████▎  | 125/172 [00:53<00:18,  2.56it/s]

SMILES for 1426174-36-8: [2H]C1=C(C(=C(N=C1[2H])[2H])[C@@H]2CCCN2N=O)[2H]
129547-84-8


 73%|███████▎  | 126/172 [00:53<00:17,  2.63it/s]

SMILES for 129547-84-8: C[N+]1(CCC[C@H]1C2=C[N+](=CC=C2)[O-])[O-]
2055-23-4


 74%|███████▍  | 127/172 [00:54<00:16,  2.66it/s]

SMILES for 2055-23-4: CNCCCC(=O)C1=CN=CC=C1
591-22-0


 74%|███████▍  | 128/172 [00:54<00:16,  2.63it/s]

SMILES for 591-22-0: CC1=CC(=CN=C1)C
347841-88-7


 75%|███████▌  | 129/172 [00:54<00:17,  2.48it/s]

SMILES for 347841-88-7: [2H]C1=C(C(=C(N=C1[2H])[2H])C(=O)N)[2H]
1346601-08-8


 76%|███████▌  | 130/172 [00:55<00:16,  2.54it/s]

SMILES for 1346601-08-8: [2H]C([2H])([2H])N1CCC(C1)C2=CN=CC=C2
71267-22-6


 76%|███████▌  | 131/172 [00:55<00:16,  2.45it/s]

SMILES for 71267-22-6: C1C=CCN([C@@H]1C2=CN=CC=C2)N=O
1020719-70-3


 77%|███████▋  | 132/172 [00:56<00:16,  2.41it/s]

SMILES for 1020719-70-3: [2H]C1=C(C(=C(N=C1[2H])[2H])C2CCC(=O)N2)[2H]
1469367-99-4


 77%|███████▋  | 133/172 [00:56<00:15,  2.49it/s]

No compound found for CAS: 1469367-99-4
494-04-2


 78%|███████▊  | 134/172 [00:57<00:15,  2.46it/s]

SMILES for 494-04-2: C1=CC(=CN=C1)C2=CC(=NC=C2)C3=CN=CC=C3
114-33-0


 78%|███████▊  | 135/172 [00:57<00:14,  2.56it/s]

SMILES for 114-33-0: CNC(=O)C1=CN=CC=C1
34834-67-8


 79%|███████▉  | 136/172 [00:57<00:14,  2.48it/s]

SMILES for 34834-67-8: CN1[C@@H](C[C@H](C1=O)O)C2=CN=CC=C2
15569-85-4


 80%|███████▉  | 137/172 [00:58<00:14,  2.43it/s]

SMILES for 15569-85-4: CN1C(CCC1=O)C2=CN=CC=C2
66148-17-2


 80%|████████  | 138/172 [00:58<00:14,  2.30it/s]

SMILES for 66148-17-2: [2H]C1=C(C(=C(N=C1[2H])[2H])C2=NCCC2)[2H]
139427-57-9


 81%|████████  | 139/172 [00:59<00:13,  2.39it/s]

SMILES for 139427-57-9: CN1[C@@H](CCC1=O)C2=C[N+](=CC=C2)[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)C(=O)[O-])O)O)O
53-84-9


 81%|████████▏ | 140/172 [00:59<00:14,  2.17it/s]

SMILES for 53-84-9: C1=CC(=C[N+](=C1)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)([O-])OP(=O)(O)OC[C@@H]3[C@H]([C@H]([C@@H](O3)N4C=NC5=C(N=CN=C54)N)O)O)O)O)C(=O)N
34366-21-7


 82%|████████▏ | 141/172 [01:00<00:13,  2.25it/s]

SMILES for 34366-21-7: C1CCN[C@H](C1)C2=CN=CC=C2
764661-23-6


 83%|████████▎ | 142/172 [01:00<00:12,  2.34it/s]

SMILES for 764661-23-6: [2H]C1=C(C(=C(N=C1[2H])[2H])C(=O)CCCNC)[2H]
146275-18-5


 83%|████████▎ | 143/172 [01:00<00:12,  2.41it/s]

SMILES for 146275-18-5: CN1[C@@H](C[C@H](C1=O)O)C2=C[N+](=CC=C2)[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)C(=O)[O-])O)O)O
159956-78-2


 84%|████████▎ | 144/172 [01:01<00:11,  2.40it/s]

SMILES for 159956-78-2: [2H]C([2H])([2H])N1[C@@H](C[C@H](C1=O)O)C2=CN=CC=C2
909014-86-4


 84%|████████▍ | 145/172 [01:01<00:11,  2.31it/s]

SMILES for 909014-86-4: [2H][13C]([2H])([2H])N1CCCC1C2=CN=CC=C2
110952-70-0


 85%|████████▍ | 146/172 [01:02<00:11,  2.27it/s]

SMILES for 110952-70-0: [2H]C([2H])([2H])N1C(CCC1=O)C2=CN=CC=C2
1020719-08-7


 85%|████████▌ | 147/172 [01:02<00:10,  2.27it/s]

SMILES for 1020719-08-7: [2H]C1=C(C(=C(N=C1[2H])[2H])C2CCCCN2)[2H]
1040920-61-3


 86%|████████▌ | 148/172 [01:02<00:09,  2.41it/s]

No compound found for CAS: 1040920-61-3
1020719-68-9


 87%|████████▋ | 149/172 [01:03<00:09,  2.42it/s]

SMILES for 1020719-68-9: [2H]C1=C(C(=C(N=C1[2H])[2H])C2CCCCN2N=O)[2H]
284685-07-0


 87%|████████▋ | 150/172 [01:03<00:09,  2.44it/s]

SMILES for 284685-07-0: [2H]C1=C(C(=C(N=C1[2H])[2H])[C@@H]2CCCN2C)[2H]
581-49-7


 88%|████████▊ | 151/172 [01:04<00:08,  2.54it/s]

SMILES for 581-49-7: C1C=CCN[C@@H]1C2=CN=CC=C2
1246819-72-6


 88%|████████▊ | 152/172 [01:04<00:07,  2.64it/s]

SMILES for 1246819-72-6: CNCCC[13C](=O)[13C]1=[13CH]N=[13CH][13CH]=[13CH]1
1215842-75-3


 89%|████████▉ | 153/172 [01:04<00:07,  2.65it/s]

SMILES for 1215842-75-3: [2H][13C]([2H])([2H])N1C(CCC1=O)C2=CN=CC=C2
1189727-40-9


 90%|████████▉ | 154/172 [01:05<00:07,  2.48it/s]

SMILES for 1189727-40-9: [2H]C([2H])([2H])NCCCC(=O)C1=CN=CC=C1
350818-68-7


 90%|█████████ | 155/172 [01:06<00:08,  2.05it/s]

SMILES for 350818-68-7: [2H]C1=C(C(=C(N=C1[2H])[2H])C2CCC(=O)N2C)[2H]
871894-35-8


 91%|█████████ | 156/172 [01:06<00:08,  1.78it/s]

SMILES for 871894-35-8: CN1CCCC1C2=C(N=CC=C2)Cl
132929-88-5


 91%|█████████▏| 157/172 [01:07<00:08,  1.75it/s]

SMILES for 132929-88-5: CN1[C@@H](C[C@H](C1=O)O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)C(=O)O)O)O)O)C3=CN=CC=C3
581-50-0


 92%|█████████▏| 158/172 [01:07<00:07,  1.88it/s]

SMILES for 581-50-0: C1=CC=NC(=C1)C2=CN=CC=C2
764661-24-7


 92%|█████████▏| 159/172 [01:08<00:06,  1.98it/s]

SMILES for 764661-24-7: [2H]C1=C(C(=C(N=C1[2H])[2H])C(=O)CCCN(C)N=O)[2H]
80508-23-2


 93%|█████████▎| 160/172 [01:08<00:05,  2.13it/s]

SMILES for 80508-23-2: C1CC(N(C1)N=O)C2=CN=CC=C2
887407-16-1


 94%|█████████▎| 161/172 [01:09<00:04,  2.22it/s]

SMILES for 887407-16-1: C1C=CCN(C1C2=CN=CC=C2)N=O
612-64-6


 94%|█████████▍| 162/172 [01:09<00:04,  2.34it/s]

SMILES for 612-64-6: CCN(C1=CC=CC=C1)N=O
918625-36-2


 95%|█████████▍| 163/172 [01:09<00:04,  2.12it/s]

SMILES for 918625-36-2: C1=CC(=CN=C1)[C@H](CC=CCO)N
82111-06-6


 95%|█████████▌| 164/172 [01:10<00:03,  2.21it/s]

SMILES for 82111-06-6: CC1=CC(=CN=C1)C2CCCN2C
1217540-34-5


 96%|█████████▌| 165/172 [01:11<00:03,  1.94it/s]

SMILES for 1217540-34-5: CN1[C@@H](C[C@H](C1=O)O[C@H]2[C@@H]([C@H](C=C(O2)C(=O)OC)O)O)C3=CN=CC=C3
1246812-39-4


 97%|█████████▋| 166/172 [01:11<00:02,  2.09it/s]

No compound found for CAS: 1246812-39-4
1076199-53-5


 97%|█████████▋| 167/172 [01:11<00:02,  2.03it/s]

SMILES for 1076199-53-5: CC(C)(C)OC(=O)N1CCCC1C2=CN=CC=C2
147732-32-9


 98%|█████████▊| 168/172 [01:12<00:01,  2.01it/s]

SMILES for 147732-32-9: CN1C(CCC1=O)C2=CC=CC=N2.OCl(=O)(=O)=O
887406-85-1


 98%|█████████▊| 169/172 [01:12<00:01,  2.03it/s]

SMILES for 887406-85-1: COC1=CC=C(C=C1)CN2C(CCC2=O)C3=CN=CC=C3
4315-37-1


 99%|█████████▉| 170/172 [01:13<00:00,  2.01it/s]

SMILES for 4315-37-1: CN1C=CC=C1C2=CN=CC=C2.[C@@H]([C@H](C(=O)O)O)(C(=O)O)O
107971-06-2


 99%|█████████▉| 171/172 [01:13<00:00,  2.09it/s]

SMILES for 107971-06-2: CC1=[N+](C=C(C=C1)C(=O)N)C.[I-]
857146-29-3


100%|██████████| 172/172 [01:14<00:00,  2.22it/s]

SMILES for 857146-29-3: CC1=CC(=CN=C1C)CO
20971-79-3


100%|██████████| 172/172 [01:14<00:00,  2.30it/s]

SMILES for 20971-79-3: C1CC(=O)OC1C2=CN=CC=C2
Total compounds: 172
Successfully converted: 164
Conversion rate: 95.35%
             cas                                             SMILES
0       494-97-3                           C1C[C@H](NC1)C2=CN=CC=C2
1       485-35-8                 C1[C@H]2CNC[C@@H]1C3=CC=CC(=O)N3C2
2     95091-91-1                             CN(C(=O)C1=CN=CC=C1)OC
3      6456-44-6                       C[N+]1=CC=CC(=C1)C(=O)N.[I-]
4       532-12-7                              C1CC(=NC1)C2=CN=CC=C2
..           ...                                                ...
486  887406-85-1             COC1=CC=C(C=C1)CN2C(CCC2=O)C3=CN=CC=C3
487    4315-37-1  CN1C=CC=C1C2=CN=CC=C2.[C@@H]([C@H](C(=O)O)O)(C...
488  107971-06-2                    CC1=[N+](C=C(C=C1)C(=O)N)C.[I-]
492  857146-29-3                                  CC1=CC(=CN=C1C)CO
493   20971-79-3                             C1CC(=O)OC1C2=CN=CC=C2

[164 rows x 2 columns]





In [50]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader as GeoDataLoader
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from gnn import GNNModel, smiles_to_graph, MoleculeDataset

In [51]:
df = pd.read_csv('Unique_nicotine_derivatives_with_smiles.csv')
df_cleaned = df[df['SMILES'].notna() & (df['SMILES'] != '')]
df_cleaned.to_csv('cleaned_nicotine_derivatives_with_smiles.csv', index=False)

In [None]:
df = pd.read_csv('Unique_nicotine_derivatives_with_smiles.csv')
df_cleaned = df[df['SMILES'].notna() & (df['SMILES'] != '')]
df_cleaned.to_csv('cleaned_nicotine_derivatives_with_smiles.csv', index=False)

In [52]:
df = pd.read_csv('cleaned_nicotine_derivatives_with_smiles.csv')
nicotine_smiles = df['SMILES']

nicotine_graphs = [graph for graph in [smiles_to_graph(smile) for smile in nicotine_smiles] if graph is not None]

model_path = 'gnn_model.pth'
model = GNNModel(num_node_features=26, hidden_dim=128, output_dim=2)
model.load_state_dict(torch.load(model_path))
model.eval()

predictions = []

with torch.no_grad():
    for graph in nicotine_graphs:
        output = model(graph)
        _, predicted = torch.max(output, dim=1)
        predictions.append(predicted.item())

df['Predicted Label'] = predictions

df.to_csv('nicotine_derivatives_with_predictions.csv', index=False)

total = len(df)
predicted_count = len(predictions)
print(f"Total rows: {total}")
print(f"Predictions made: {predicted_count}")
print(f"Updated CSV saved with predictions.")

Total rows: 164
Predictions made: 164
Updated CSV saved with predictions.
