In [1]:
import os 
import pandas as pd
import zipfile
import requests
import time
from tqdm import tqdm
import pubchempy as pcp

In [None]:
def smile_generator(data_path="../data_csv",data_file='Products.csv'):
    """Fetch a SMILES string from PubChem for every active ingredient and save them.

    Reads ``data_file`` from ``data_path``, splits each ``ActiveIngredient``
    cell on ``'; '``, looks every name up with ``pcp.get_compounds`` and
    writes the hits to ``Smiles.csv`` in the same directory.

    Args:
        data_path: Directory holding the input CSV; also receives ``Smiles.csv``.
        data_file: Name of the products CSV. It must have an
            ``ActiveIngredient`` column.

    Returns:
        None. The result is written to ``<data_path>/Smiles.csv`` with the
        columns ``ProductID`` (0-based row position of the product in the
        input file) and ``smiles`` (``isomeric_smiles`` of the first match).
        Names with no match are reported through ``tqdm.write`` and skipped.
    """
    # str.join accepts a single iterable, so "/".join(a, b) raises TypeError;
    # os.path.join builds the path portably.
    df = pd.read_csv(os.path.join(data_path, data_file))
    ingredient_cells = df['ActiveIngredient'].tolist()

    results = {'ProductID': [], 'smiles': []}

    # Loops through products
    for index, ingredients in tqdm(enumerate(ingredient_cells), total=len(ingredient_cells), desc="Fetching SMILES"):
        # An empty cell is read back by pandas as float NaN, which has no .split
        if not isinstance(ingredients, str):
            continue

        # Loops through many active ingredients in 1 product
        for name in ingredients.split('; '):
            # Time each lookup on its own so the reported duration is not
            # inflated by the earlier ingredients of the same product.
            start = time.time()
            compounds = pcp.get_compounds(name, 'name')
            duration = time.time() - start

            if compounds:
                results['smiles'].append(compounds[0].isomeric_smiles)
                results['ProductID'].append(index)
            else:
                tqdm.write(f"{index}: {name} -> Not found (took {duration:.2f}s)")

    df_smile = pd.DataFrame(results)
    df_smile.to_csv(os.path.join(data_path, 'Smiles.csv'), index=False)

In [17]:
# Look 'SULFADIAZINE' up on PubChem by name and display the CID of the first match
compounds = pcp.get_compounds('SULFADIAZINE', 'name')
compounds[0].cid

5215

In [16]:
# Name lookup for 'SULFACETAMIDE'; display the whole list that comes back
compounds = pcp.get_compounds('SULFACETAMIDE', 'name')
compounds

[Compound(5320)]

In [37]:
# Sample ingredient strings in three different shapes.
# a: a group name followed by a parenthesised, ';'-separated list
a = 'TRIPLE SULFA (SULFABENZAMIDE;SULFACETAMIDE;SULFATHIAZOLE)'
# b: a plain '; '-separated list without parentheses
b = 'AMPHETAMINE ASPARTATE; AMPHETAMINE SULFATE; DEXTROAMPHETAMINE SACCHARATE; DEXTROAMPHETAMINE SULFATE'
# c: a single name with no separator at all
c = 'active ingredient'

In [29]:
# Take the last space-separated token, drop both parentheses, then split on ';'
(
    a.split(' ')[-1]
    .replace('(', '')
    .replace(')', '')
    .split(';')
)

['SULFABENZAMIDE', 'SULFACETAMIDE', 'SULFATHIAZOLE']

In [None]:
def extract_ingredients(text):
    """Split an active-ingredient string into individual ingredient names.

    When the text contains a parenthesised group, e.g. ``'NAME (A;B;C)'``,
    only the part between the first ``(`` and the first ``)`` after it is
    used. Otherwise the whole text is used. The chosen part is split on
    ``;`` and each piece is stripped of surrounding whitespace.

    Args:
        text: Raw ingredient string.

    Returns:
        A list of non-empty ingredient names, in order of appearance. An
        empty or separator-only input yields an empty list.
    """
    open_at = text.find('(')
    # Search for the closing parenthesis only after the opening one, so a
    # stray ')' earlier in the text cannot produce an empty or reversed slice.
    close_at = text.find(')', open_at + 1) if open_at != -1 else -1

    if close_at != -1:
        text = text[open_at + 1:close_at]

    # Drop empty fragments (trailing ';', '()' and the like) so callers never
    # receive '' as an ingredient name.
    pieces = (item.strip() for item in text.split(';'))
    return [item for item in pieces if item]


In [40]:
# Check the no-parentheses, no-';' case: the input comes back as a one-item list
extract_ingredients(c)

['active ingredient']

In [13]:
from data_processor import DataProcessor

In [15]:
# Create a DataProcessor (imported from data_processor) for 'Products_draft.csv'
processor = DataProcessor(data_file='Products_draft.csv')

In [17]:
# Run the SMILES lookup. The recorded run covered 346 items in about 3.5 minutes
# and printed one line for every name that was not found.
processor.smile_generator()

Fetching SMILES:   9%|▉         | 32/346 [00:17<02:47,  1.88it/s]

186: PROTAMINE SULFATE -> Not found (took 0.37s)


Fetching SMILES:  44%|████▍     | 152/346 [01:32<01:29,  2.17it/s]

473: ETHIODIZED OIL -> Not found (took 0.39s)


Fetching SMILES:  75%|███████▍  | 259/346 [02:28<00:51,  1.71it/s]

835: SODIUM POLYSTYRENE SULFONATE -> Not found (took 0.38s)


Fetching SMILES:  77%|███████▋  | 267/346 [02:35<01:02,  1.27it/s]

868: DEXTROAMPHETAMINE SACCHARATE -> Not found (took 1.36s)


Fetching SMILES:  77%|███████▋  | 268/346 [02:37<01:27,  1.13s/it]

869: DEXTROAMPHETAMINE SACCHARATE -> Not found (took 1.29s)


Fetching SMILES:  78%|███████▊  | 269/346 [02:39<01:45,  1.37s/it]

870: DEXTROAMPHETAMINE SACCHARATE -> Not found (took 1.27s)


Fetching SMILES:  78%|███████▊  | 270/346 [02:41<01:55,  1.52s/it]

871: DEXTROAMPHETAMINE SACCHARATE -> Not found (took 1.81s)


Fetching SMILES:  78%|███████▊  | 271/346 [02:43<02:14,  1.80s/it]

872: DEXTROAMPHETAMINE SACCHARATE -> Not found (took 1.35s)


Fetching SMILES:  79%|███████▊  | 272/346 [02:46<02:15,  1.84s/it]

873: DEXTROAMPHETAMINE SACCHARATE -> Not found (took 1.78s)


Fetching SMILES:  79%|███████▉  | 273/346 [02:48<02:25,  1.99s/it]

874: DEXTROAMPHETAMINE SACCHARATE -> Not found (took 1.31s)


Fetching SMILES: 100%|██████████| 346/346 [03:25<00:00,  1.68it/s]


In [20]:
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'DataProcessor' object has no attribute 'smile_standardizer'".
# Confirm the intended attribute/method name in data_processor, or remove this cell
# so the notebook survives Restart & Run All.
processor.smile_standardizer

AttributeError: 'DataProcessor' object has no attribute 'smile_standardizer'

In [21]:
# Display the SMILES table stored on the processor; the recorded output shows
# the columns ProductID and smiles.
processor.df_smile

Unnamed: 0,ProductID,smiles
0,19,CCC(C)C1(C(=O)NC(=NC1=O)[O-])CC.[Na+]
1,35,C1C[C@H](N(C1)C(=O)[C@@H]2CSSC[C@@H](C(=O)N[C@...
2,36,C[C@H](/C=C/[C@H](C)C(C)C)[C@H]1CC[C@@H]\2[C@@...
3,72,C1=CN=C(N=C1)NS(=O)(=O)C2=CC=C(C=C2)N
4,78,C[C@]12CC[C@H]3C(=CCC4=C3C=CC(=C4)OS(=O)(=O)[O...
...,...,...
384,995,C[C@H]1C[C@H]2[C@@H]3CC[C@@]([C@]3(C[C@@H]([C@...
385,996,C[C@H]1C[C@H]2[C@@H]3CC[C@@]([C@]3(C[C@@H]([C@...
386,997,C[C@H]1C[C@H]2[C@@H]3CC[C@@]([C@]3(C[C@@H]([C@...
387,998,C[C@H]1C[C@H]2[C@@H]3CC[C@@]([C@]3(C[C@@H]([C@...
