In [1]:
import pandas as pd     
from classyfire import *
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as rdMD

In [2]:
df = pd.read_csv("Euphorbia_Example_inhouse_database.txt",  sep='\t', header=None)
df.shape

(1508, 2)

In [3]:
df.head()

Unnamed: 0,0,1
0,CC1=C(O)C=C(OC)C(C(C)=O)=C1O,C10H12O4_Euphorbia_ebracteolata_1
1,"CC(C1=C(O)C=C(OC)C=C1OC)=O,COC1=CC(OC)=CC(O)=C...","C10H12O4_Euphorbia_quinquecostata,portulacoides_2"
2,OC1=C(OC)C=C2C(OC(C=C2)=O)=C1,C10H8O4_Euphorbia_lunulata_3
3,O=C1OC2=CC(OC)=C(O)C=C2C=C1,C10H8O4_Euphorbia_quinquecostata_4
4,CC(C1=C(OC)C=C(OC)C=C1OC)=O,C11H14O4_Euphorbia_portulacoides_5


In [9]:
smi = list(df[0])
m = [Chem.MolFromSmiles(x) for x in smi]
inchi = []
ikeys = []
ikey1 = []
ikey2 = []
form = []
exmass = []
for i in range(len(m)):
    try:
        inchi.append(Chem.rdinchi.MolToInchi(m[i])[0])
        ikey = Chem.rdinchi.InchiToInchiKey(inchi[i])
        ikeys.append(ikey)
        ikey1.append(ikey.split('-')[0])
        ikey2.append(ikey.split('-')[1])
    except:
        ikeys.append('')
        inchi.append('')
        ikey1.append('')
        ikey2.append('')
    try:
        form.append(rdMD.CalcMolFormula(m[i]))
    except:
        form.append('')
    try:
        exmass.append(rdMD.CalcExactMolWt(m[i]))
    except:
        exmass.append('')


In [10]:
data = {'inchikey': ikeys, 'MonoisotopicMass': exmass, 'InChI': inchi, 'SMILES': list(df[0]),
              'Identifier': list(df[1]), 'InChIKey2': ikey2, 'InChIKey1': ikey1, 'MolecularFormula': form}

cn = ["inchikey", "MonoisotopicMass", "InChI", "SMILES", "Identifier", "InChIKey2", "InChIKey1", "MolecularFormula"]
formdata = pd.DataFrame(data, columns=cn)
formdata.head()

Unnamed: 0,inchikey,MonoisotopicMass,InChI,SMILES,Identifier,InChIKey2,InChIKey1,MolecularFormula
0,RFKMWWMZUHXFBA-UHFFFAOYSA-N,196.074,InChI=1S/C10H12O4/c1-5-7(12)4-8(14-3)9(6(2)11)...,CC1=C(O)C=C(OC)C(C(C)=O)=C1O,C10H12O4_Euphorbia_ebracteolata_1,UHFFFAOYSA,RFKMWWMZUHXFBA,C10H12O4
1,,,,"CC(C1=C(O)C=C(OC)C=C1OC)=O,COC1=CC(OC)=CC(O)=C...","C10H12O4_Euphorbia_quinquecostata,portulacoides_2",,,
2,RODXRVNMMDRFIK-UHFFFAOYSA-N,192.042,InChI=1S/C10H8O4/c1-13-9-4-6-2-3-10(12)14-8(6)...,OC1=C(OC)C=C2C(OC(C=C2)=O)=C1,C10H8O4_Euphorbia_lunulata_3,UHFFFAOYSA,RODXRVNMMDRFIK,C10H8O4
3,SYTYLPHCLSSCOJ-UHFFFAOYSA-N,192.042,InChI=1S/C10H8O4/c1-13-9-5-8-6(4-7(9)11)2-3-10...,O=C1OC2=CC(OC)=C(O)C=C2C=C1,C10H8O4_Euphorbia_quinquecostata_4,UHFFFAOYSA,SYTYLPHCLSSCOJ,C10H8O4
4,KPZWHZSIXZXDMW-UHFFFAOYSA-N,210.089,InChI=1S/C11H14O4/c1-7(12)11-9(14-3)5-8(13-2)6...,CC(C1=C(OC)C=C(OC)C=C1OC)=O,C11H14O4_Euphorbia_portulacoides_5,UHFFFAOYSA,KPZWHZSIXZXDMW,C11H14O4


In [11]:
classy = query_inchikey(ikeys)

In [None]:
# If the structure do not show a classification, try query
#in_process = get_class(list(df[0]), chunksize=100)
#classy = poll(in_process)

In [12]:
classy.shape

(1508, 7)

In [13]:
classy.head()

Unnamed: 0,class,direct_parent,inchikey,kingdom,molecular_framework,subclass,superclass
0,Organooxygen compounds,Alkyl-phenylketones,RFKMWWMZUHXFBA-UHFFFAOYSA-N,Organic compounds,Aromatic homomonocyclic compounds,Carbonyl compounds,Organic oxygen compounds
1,,,,,,,
2,Coumarins and derivatives,7-hydroxycoumarins,RODXRVNMMDRFIK-UHFFFAOYSA-N,Organic compounds,Aromatic heteropolycyclic compounds,Hydroxycoumarins,Phenylpropanoids and polyketides
3,Coumarins and derivatives,Hydroxycoumarins,SYTYLPHCLSSCOJ-UHFFFAOYSA-N,Organic compounds,Aromatic heteropolycyclic compounds,Hydroxycoumarins,Phenylpropanoids and polyketides
4,Organooxygen compounds,Alkyl-phenylketones,KPZWHZSIXZXDMW-UHFFFAOYSA-N,Organic compounds,Aromatic homomonocyclic compounds,Carbonyl compounds,Organic oxygen compounds


In [15]:
classy = classy[['inchikey', 'kingdom', 'superclass', 'class', 'subclass']]
classy.columns = ['inchikey', 'kingdom_name', 'superclass_name', 'class_name', 'subclass_name']

In [17]:
formfinal = pd.merge(formdata, classy, how='left', on=['inchikey'])
formfinal.tail()

Unnamed: 0,inchikey,MonoisotopicMass,InChI,SMILES,Identifier,InChIKey2,InChIKey1,MolecularFormula,kingdom_name,superclass_name,class_name,subclass_name
1505,BWXDELRNNYLLKB-UHFFFAOYSA-N,153.115,InChI=1S/C9H15NO/c1-9-4-2-3-7(10-9)5-8(11)6-9/...,CC12CCCC(CC(=O)C1)N2,C9H15NO_Euphorbia_atoto_1504,UHFFFAOYSA,BWXDELRNNYLLKB,C9H15NO,Organic compounds,Organoheterocyclic compounds,Piperidines,Piperidinones
1506,YEGNHFRGBNUOLZ-UHFFFAOYSA-N,185.105,InChI=1S/C9H15NO3/c1-3-4-5-13-9-7(11)6(2)8(12)...,CCCCOC1C(C(=C)C(=N1)O)O,C9H15NO3_Euphorbia_humifusa_1505,UHFFFAOYSA,YEGNHFRGBNUOLZ,C9H15NO3,Organic compounds,Organoheterocyclic compounds,Pyrrolidines,Pyrrolidones
1507,JWUXJYZVKZKLTJ-UHFFFAOYSA-N,155.131,"InChI=1S/C9H17NO/c1-8(2)5-7(11)6-9(3,4)10-8/h1...",CC1(C)CC(=O)CC(C)(C)N1,C9H17NO_Multiple__1506,UHFFFAOYSA,JWUXJYZVKZKLTJ,C9H17NO,Organic compounds,Organoheterocyclic compounds,Piperidines,Piperidinones
1508,ORHBXUUXSCNDEV-UHFFFAOYSA-N,162.032,InChI=1S/C9H6O3/c10-7-3-1-6-2-4-9(11)12-8(6)5-...,c1cc(cc2c1ccc(=O)o2)O,C9H6O3_Multiple__1507,UHFFFAOYSA,ORHBXUUXSCNDEV,C9H6O3,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Hydroxycoumarins
1509,GLEMZFGSPKRSOO-UHFFFAOYSA-N,178.027,InChI=1S/C9H6O4/c10-6-2-1-5-3-7(11)9(12)13-8(5...,c1cc(cc2c1cc(c(=O)o2)O)O,C9H6O4_Euphorbia_terracina_1508,UHFFFAOYSA,GLEMZFGSPKRSOO,C9H6O4,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Hydroxycoumarins


In [20]:
formfinal = formfinal.fillna('')
formfinal.drop('inchikey', axis=1, inplace=True)
formfinal.head()

Unnamed: 0,MonoisotopicMass,InChI,SMILES,Identifier,InChIKey2,InChIKey1,MolecularFormula,kingdom_name,superclass_name,class_name,subclass_name
0,196.074,InChI=1S/C10H12O4/c1-5-7(12)4-8(14-3)9(6(2)11)...,CC1=C(O)C=C(OC)C(C(C)=O)=C1O,C10H12O4_Euphorbia_ebracteolata_1,UHFFFAOYSA,RFKMWWMZUHXFBA,C10H12O4,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds
1,,,"CC(C1=C(O)C=C(OC)C=C1OC)=O,COC1=CC(OC)=CC(O)=C...","C10H12O4_Euphorbia_quinquecostata,portulacoides_2",,,,,,,
2,192.042,InChI=1S/C10H8O4/c1-13-9-4-6-2-3-10(12)14-8(6)...,OC1=C(OC)C=C2C(OC(C=C2)=O)=C1,C10H8O4_Euphorbia_lunulata_3,UHFFFAOYSA,RODXRVNMMDRFIK,C10H8O4,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Hydroxycoumarins
3,192.042,InChI=1S/C10H8O4/c1-13-9-5-8-6(4-7(9)11)2-3-10...,O=C1OC2=CC(OC)=C(O)C=C2C=C1,C10H8O4_Euphorbia_quinquecostata_4,UHFFFAOYSA,SYTYLPHCLSSCOJ,C10H8O4,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Hydroxycoumarins
4,210.089,InChI=1S/C11H14O4/c1-7(12)11-9(14-3)5-8(13-2)6...,CC(C1=C(OC)C=C(OC)C=C1OC)=O,C11H14O4_Euphorbia_portulacoides_5,UHFFFAOYSA,KPZWHZSIXZXDMW,C11H14O4,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds


In [25]:
id = [x for x in range(len(ikeys)) if ikeys[x]=='']
formfinal.drop(formfinal.index[id], inplace=True)
formfinal.head()

Unnamed: 0,MonoisotopicMass,InChI,SMILES,Identifier,InChIKey2,InChIKey1,MolecularFormula,kingdom_name,superclass_name,class_name,subclass_name
0,196.074,InChI=1S/C10H12O4/c1-5-7(12)4-8(14-3)9(6(2)11)...,CC1=C(O)C=C(OC)C(C(C)=O)=C1O,C10H12O4_Euphorbia_ebracteolata_1,UHFFFAOYSA,RFKMWWMZUHXFBA,C10H12O4,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds
2,192.042,InChI=1S/C10H8O4/c1-13-9-4-6-2-3-10(12)14-8(6)...,OC1=C(OC)C=C2C(OC(C=C2)=O)=C1,C10H8O4_Euphorbia_lunulata_3,UHFFFAOYSA,RODXRVNMMDRFIK,C10H8O4,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Hydroxycoumarins
3,192.042,InChI=1S/C10H8O4/c1-13-9-5-8-6(4-7(9)11)2-3-10...,O=C1OC2=CC(OC)=C(O)C=C2C=C1,C10H8O4_Euphorbia_quinquecostata_4,UHFFFAOYSA,SYTYLPHCLSSCOJ,C10H8O4,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Hydroxycoumarins
4,210.089,InChI=1S/C11H14O4/c1-7(12)11-9(14-3)5-8(13-2)6...,CC(C1=C(OC)C=C(OC)C=C1OC)=O,C11H14O4_Euphorbia_portulacoides_5,UHFFFAOYSA,KPZWHZSIXZXDMW,C11H14O4,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds
5,210.089,InChI=1S/C11H14O4/c1-6-8(14-3)5-9(15-4)10(7(2)...,CC(C1=C(O)C(C)=C(OC)C=C1OC)=O,C11H14O4_Euphorbia_portulacoides_6,UHFFFAOYSA,AAOFJKLTRKOQTQ,C11H14O4,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds


In [26]:
formfinal.to_csv('Euphorbia_Example_inhouse_database_FORMATED.txt', index=False, sep='\t')