In [30]:
import os, sys
from datetime import date
import pubchempy as pcp 
import pandas as pd
from rdkit import Chem


In [16]:
today = date.today()
print ( "Today's date:", today)

Today's date: 2023-11-21


In [17]:
all_cids = [ x for x in open("../data/compunds_cids.txt", "r")]

In [18]:
def get_the_rdkit_features(df_to_edit):
    mol = Chem.MolFromSmiles(df_to_edit['canonical_smiles'][0])
    df_to_edit['Num_of_Rings'] = mol.GetRingInfo().NumRings()
    df_to_edit["heavy_atoms"] = mol.GetNumAtoms()
    return df_to_edit,mol
    
    

In [19]:
def get_the_type_of_bond(mol):
    counter_ar , counter_single, counter_double, counter_triple = 0,0,0,0
    for num ,m in enumerate ( mol.GetAtoms() ):
        print ( num, m.GetSymbol(), mol.GetAtomWithIdx(num).IsInRing(), mol.GetBonds()[num].GetBondType() )
        if str ( mol.GetBonds()[num].GetBondType() ) == "AROMATIC":
            counter_ar+=1
        elif str ( mol.GetBonds()[num].GetBondType() )  == "SINGLE":
            counter_single+=1
        elif str ( mol.GetBonds()[num].GetBondType() )== "DOUBLE":
            counter_double+=1
        else :
            counter_triple+=1
    df_temp = pd.DataFrame([counter_ar , counter_single, counter_double, counter_triple],
             index=['aromatic_bond','single_bond', 'double_bond', 'triple_bond' ])
    return df_temp.T
    # print (num, m.GetAtomicNum() , m.GetSymbol() , mol.GetAtomWithIdx(num).IsInRing() ,  mol.GetAtomWithIdx(num).IsInRingSize(5) , mol.GetAtomWithIdx(num).IsInRingSize(6)

In [20]:
my_properties_list = [ 
    'cid',
    'molecular_formula',
    'molecular_weight',
    'exact_mass',
    'monoisotopic_mass' ,
    'charge' ,
    'heavy_atom_count',
    'h_bond_donor_count',
    'h_bond_acceptor_count',
    'xlogp',
    'tpsa',
    'canonical_smiles',
    'complexity',
    'covalent_unit_count',
    'bonds',
    'elements'
                      ] 

In [21]:
line_number = sys.argv[1]
my_cids = all_cids[0]

In [22]:
c = pcp.Compound.from_cid(my_cids)
# get the information on a dictionary 
temp_dict = c.to_dict( properties= my_properties_list) 
# transform the dict into pandas dataframe 
df_to_edit = pd.DataFrame.from_dict(data=temp_dict,orient='index').T

In [23]:
element_type = pd.get_dummies( pd.Series ( df_to_edit['elements'].iloc[0]) ,dtype=float ).sum()

In [24]:
df_to_edit['all_atoms_count']= df_to_edit['elements'].apply( lambda x:len(x))
df_to_edit['all_atoms_count_unique'] = df_to_edit['elements'].apply( lambda x: len ( list(dict.fromkeys(x)) ) )
# df_to_edit.drop(['bonds','elements'],axis=1 ,inplace=True)

In [25]:
# pd.concat( [df_to_edit ,element_type.to_frame().T],axis=1)

In [26]:
# for m in mol.GetAromaticAtoms():
#     print (m.GetSymbol())

In [27]:
df_to_edit_2 , mol  = get_the_rdkit_features(df_to_edit=df_to_edit)
df_to_edit_3 = get_the_type_of_bond(mol)

0 C True AROMATIC
1 C True AROMATIC
2 S True AROMATIC
3 C True AROMATIC
4 C True AROMATIC
5 N True AROMATIC
6 C True AROMATIC
7 C True SINGLE
8 C False DOUBLE
9 O False SINGLE
10 O False AROMATIC


In [32]:
final_dataframe = pd.concat( [df_to_edit_2, df_to_edit_3,element_type.to_frame().T],axis=1)
final_dataframe.drop(['bonds','elements'],axis=1 ,inplace=True)

In [33]:
final_dataframe.to_csv(f"../data/premilinar_data_{my_cids}.csv", index=None )