In [4]:
!pip install rdkit
!pip install mordred
!pip install pubchempy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit
  Downloading rdkit-2022.9.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2022.9.5
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import pandas as pd
import numpy as np
import pubchempy as pcp
from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import rdMolDescriptors
from urllib.request import urlopen
from urllib.parse import quote

For homework, we chose 8 datasets

In [6]:
df = pd.read_csv('/content/8.csv')

## Data Preparation

For now, we remove all lines where there is nothing in `Name`

In [7]:
df = df[~df['Name'].isna()]
df = df.drop_duplicates().reset_index(drop=True)
df

Unnamed: 0,DOI,Date,Journal,Title,Name,measurement_error,measurement_wavelength,measurement_method,normalised_name,raw_value,specifier
0,10.1039/C6RA04584G_© The Royal Society of Chem...,,,,silicate,0.04,,snowball,,1.21 – 1.13,refractive index
1,10.1016/j.mssp.2013.07.031,8/24/2013,Materials Science in Semiconductor Processing,PREPARATIONCHARACTERIZATIONAPPLICATIONRFSPUTTE...,BSG,0.00,,el_mylogic,"[['B', 1.0], ['SG', 1.0]]",1.41,Refractive index (n)
2,10.1039/C7NR03175K,,,,silica,0.00,,snowball,O=[Si]=O,1.48,refraction index
3,10.1016/j.saa.2012.10.018,10/24/2012,Spectrochimica Acta Part A: Molecular and Biom...,SOLVENTEFFECTABSORPTIONFLUORESCENCESPECTRA7ACE...,DMF,0.00,,el_mylogic,CN(C)C=O,1.4305,n
4,10.1016/j.conbuildmat.2018.05.016Construction ...,,,,silica,0.00,,snowball,O=[Si]=O,1.46,refractive index
...,...,...,...,...,...,...,...,...,...,...,...
4975,10.1016/j.poly.2011.03.003,3/17/2011,Polyhedron,PHOTOPHYSICALPROPERTIESPHOTOCYTOTOXICITYNOVELP...,2-Propanol,0.00,,el_mylogic,,1.377,Refractive index (20 °C)
4976,10.1016/j.optlastec.2019.01.028,2/6/2019,Optics & Laser Technology,SPECTROCHEMICALSTUDYMOLDAVITESRIESIMPACTSTRUCT...,MgO,0.00,,el_cde_tables,"[['Mg', 1.0], ['O', 1.0]]",1.75,n
4977,10.1016/j.ssc.2010.03.031,4/8/2010,Solid State Communications,BROADBAND15MEMISSIONHIGHERBIUMDOPEDBI2O3B2O3GA...,Silicate,0.00,,el_cde_tables,,1.585,Refractive index
4978,10.1016/j.molliq.2017.05.096,5/23/2017,Journal of Molecular Liquids,SOLVATOCHROMICANALYSISDFTCOMPUTATIONALSTUDYAZO...,Diethylether,0.00,,el_mylogic,CCOCC,1.3526,n


With the help of PubChem, we will add smiles that are not in the table.

In [9]:
def smiles(x):
  try:
    return pcp.get_properties('CanonicalSMILES', x, 'name')[0]['CanonicalSMILES']
  except IndexError:
    return np.nan

df.loc[df['normalised_name'].isna(), 'normalised_name'] = df.loc[df['normalised_name'].isna(),'Name'].apply(smiles)

It takes a long time to fill, save the dataset for further work.

In [10]:
df.to_csv('/content/df_1.csv', index=False)

In [14]:
df = pd.read_csv('/content/df_1.csv')

Substances can often be repeated, take unique

In [11]:
df_names = df.loc[df['Name'].drop_duplicates().index, ['Name', 'normalised_name']]

We have several problems:

* some smiles are missing
* some smiles are incorrect and libraries won't parse them
* and some smiles are wrong, but the libraries count them

With the first one, we can’t do anything yet, we’ll work with them further.

With the second ones, you can try to run the translation of the substance into smiles in pubchem and see if they are the same if the substance is found.

In [12]:
df_names['smiles'] = np.nan
df_names.loc[~df_names['normalised_name'].isna(), 'smiles'] = df_names.loc[~df_names['normalised_name'].isna(),'Name'].apply(smiles)

There are obviously a lot of discrepancies, some are rather a variant of the smiley difference, some are clearly a mistake, we will replace them

In [13]:
d = df_names[df_names['smiles'] != df_names['normalised_name']]
d[~d['smiles'].isna()]

Unnamed: 0,Name,normalised_name,smiles
1,BSG,"[['B', 1.0], ['SG', 1.0]]",[B-](CNS(=O)(=O)CC1=CC=C(C=C1)C(=O)O)(O)(O)O
5,ErF3,"[['Er', 1.0], ['F', 3.0]]",F[Er](F)F
25,CO2,O=C=O,C(=O)=O
29,TOL,COc1ccc2c(cccc2c1C(F)(F)F)C(=S)N(C)CC(O)=O,CN(CC(=O)O)C(=S)C1=CC=CC2=C1C=CC(=C2C(F)(F)F)OC
32,UREA,NC(N)=O,C(=O)(N)N
...,...,...,...
4868,IrO2,"[['Ir', 1.0], ['O', 2.0]]",O=[Ir]=O
4906,Trona,[Na+].[Na+].[O-]C([O-])=O,C(=O)(O)[O-].C(=O)([O-])[O-].O.O.[Na+].[Na+].[...
4946,Fluoranthene,c1ccc2c(c1)c3cccc4cccc2c34,C1=CC=C2C(=C1)C3=CC=CC4=C3C2=CC=C4
4962,hydroxyl,[OH-],[OH]


In [14]:
df_names.loc[d[~d['smiles'].isna()].index, 'normalised_name'] = df_names.loc[d[~d['smiles'].isna()].index, 'smiles']
df_names = df_names.drop('smiles', axis=1)

With the third option, you can run rdkit so that it tries to build an instance of the smiles molecule

In [15]:
def check_smiles(smile):
    mol = Chem.MolFromSmiles(smile)
    if mol:
      return True
    else:
      return False

df_names['check_smiles'] = np.nan
df_names.loc[~df_names['normalised_name'].isna(), 'check_smiles'] = df_names.loc[~df_names['normalised_name'].isna(),'normalised_name'].apply(check_smiles)

[12:44:13] SMILES Parse Error: syntax error while parsing: [['Al',
[12:44:13] SMILES Parse Error: Failed parsing SMILES '[['Al',' for input: '[['Al','
[12:44:13] SMILES Parse Error: syntax error while parsing: [['Cn',
[12:44:13] SMILES Parse Error: Failed parsing SMILES '[['Cn',' for input: '[['Cn','
[12:44:13] SMILES Parse Error: syntax error while parsing: [['IL',
[12:44:13] SMILES Parse Error: Failed parsing SMILES '[['IL',' for input: '[['IL','
[12:44:13] SMILES Parse Error: syntax error while parsing: [['Al',
[12:44:13] SMILES Parse Error: Failed parsing SMILES '[['Al',' for input: '[['Al','
[12:44:13] SMILES Parse Error: syntax error while parsing: [['P',
[12:44:13] SMILES Parse Error: Failed parsing SMILES '[['P',' for input: '[['P','
[12:44:13] SMILES Parse Error: syntax error while parsing: [['Mg',
[12:44:13] SMILES Parse Error: Failed parsing SMILES '[['Mg',' for input: '[['Mg','
[12:44:13] SMILES Parse Error: syntax error while parsing: [['B',
[12:44:13] SMILES Parse Error: 

Some can be filled in by searching from `cactus.nci.nih.gov/`

In [16]:
def CIRconvert(x):
    try:
        url = 'http://cactus.nci.nih.gov/chemical/structure/' + quote(x) + '/smiles'
        ans = urlopen(url).read().decode('utf8')
        print(x, ans)
        return ans
    except:
        return np.nan

df_names.loc[df_names['normalised_name'].isna(), 'normalised_name'] = df_names.loc[df_names['normalised_name'].isna(),'Name'].apply(CIRconvert)

silicate [O-][Si]([O-])([O-])[O-]
MPa COc1c(C)c2COC(=O)c2c(O)c1C\C=C(C)\CCC(O)=O
germanium [GeH4]
CoO CC(C)CO[P](O)(=O)O[P](O)(=O)OC[C@H]1O[C@H]([C@H](O)[C@@H]1O[P](O)(O)=O)n2cnc3c(N)ncnc23
Glycerol OCC(O)CO
TiO2 O=[Ti]=O
quartz O=[Si]=O
Cysteamine NCCS
Ibuprofen CC(C)Cc1ccc(cc1)C(C)C(O)=O
1-Iodonaphthalene sulfur S.Ic1cccc2ccccc12
flavonoid Oc1ccc(cc1)C2=CC(=O)c3c(O)c(O)c(O)cc3O2
SnS2nSnS2 SN[S]1NSNS1
Octamethyl trisiloxane C[Si](C)(C)O[Si](C)(C)O[Si](C)(C)C
germaniumsulphide S=[GeH2]
SiC C=[SiH2]
Tetrahydrofurane C1CCOC1
alcohols CCO
Tetramethylammonium hydroxide [OH-].C[N+](C)(C)C
n-Nonane CCCCCCCCC
1-Hexanol CCCCCCO
KRI N[C@@H](CCSC[C@@H](O)[C@@H](O)C(=O)CO)C(O)=O
polypropylene CC=C
WO3 [OH2-].[OH2-].[OH2-].[OH2-].[W+4]
1-Bromonaphthalene Brc1cccc2ccccc12
Olivine [Mg].[Mg].[Mg].[Fe].O[Si](O)(O)O.O[Si](O)(O)O
Acetyl C[C]=O
Dakar Cc1c(C[S](=O)c2[nH]c3ccccc3n2)nccc1OCC(F)(F)F
SrO O=[Sr]
CsI [I-].[Cs+]
Aurum [Au]
silicon oxide O=[Si]=O
thiols S
PSS PSS
Cholesterol CC(C)CCC[C@@H](C)[C@H

It also replaced some incorrectly, we will fix it manually

In [55]:
df_names.loc[df_names[df_names["Name"] == 'germanium'].index, 'normalised_name'] = '[Ge]'
df_names.loc[df_names[df_names["Name"] == 'CoO'].index, 'normalised_name'] = 'O=[Co] '
df_names.loc[df_names[df_names["Name"] == 'SnS2nSnS2'].index, 'normalised_name'] = 'S=[Sn]'
df_names.loc[df_names[df_names["Name"] == 'SiC'].index, 'normalised_name'] = '[C-]#[Si+]'
df_names.loc[df_names[df_names["Name"] == 'ethers'].index, 'normalised_name'] = 'not found'
df_names.loc[df_names[df_names["Name"] == 'TiC'].index, 'normalised_name'] = '[C-]#[Ti+]'
df_names.loc[df_names[df_names["Name"] == 'Tb(C)'].index, 'normalised_name'] = '[Tb]'
df_names.loc[df_names[df_names["Name"] == 'NaC'].index, 'normalised_name'] = '[C-]#[Na+]' 
df_names.loc[df_names[df_names["Name"] == 'Selenium'].index, 'normalised_name'] = '[Se]' 
df_names.loc[df_names[df_names["Name"] == 'CdO'].index, 'normalised_name'] = 'O=[Cd]'
df_names.loc[df_names[df_names["Name"] == 'KI'].index, 'normalised_name'] = '[K+].[I-]'

In [56]:
df_names.to_csv('df_names.csv', index=False)

Returning to the first problem, when we simply do not have smiles in various libraries. We use a parser and find the smiles in the puchem search page, save them and add them here.

In [19]:
pubchem_smiles = pd.read_csv('/content/pubchem_smiles.csv').rename(columns={'0': 'smiles'})

In [20]:
pubchem_smiles

Unnamed: 0,Name,smiles
0,PbO-SiO2,
1,PVAc,
2,MoS2,S=[Mo]=S
3,CdTe,[Cd]=[Te]
4,N2 PDA,
...,...,...
1036,Zn2S28Se70,
1037,MgO,O=[Mg]
1038,K2O,[O-2].[K+].[K+]
1039,FeO,O=[Fe]


In [57]:
d = pd.merge(df_names, pubchem_smiles, how = 'left', on='Name')

In [58]:
df_names = df_names.reset_index(drop=True)

In [59]:
df_names.loc[d[~d['smiles'].isna()].index, 'normalised_name'] = d[~d['smiles'].isna()]['smiles'].values

After all the edits, we are left with names that are either not really present in the selected libraries or search engines, or are not displayed correctly. It is better to correct the names of the substances first.
Potentially in the future it is possible to correct the following options:
* a mixture of metals `ZnAlBiBTb` or `Ge17As18S26Se39` to collect on separate metal smileys
* compound `Na2O–P2O5` also connect smileys in parts
* remove the trailing ' -' and other characters and run the parser again, for example

In [60]:
df_names[(df_names['check_smiles'].isna()) | (df_names['check_smiles'] is False)][150:180]

Unnamed: 0,Name,normalised_name,check_smiles
378,OxBPA,,
379,Dy2O3,[O-2].[O-2].[O-2].[Dy+3].[Dy+3],
381,Zn(1−x)CaxO,"[['Cax', 1.0], ['O', 1.0], ['Zn', 1.0]]",
386,K2BiI5O15,,
387,ZnAlBiBTb1.0,"[['Al', 1.0], ['B', 1.0], ['Bi', 1.0], ['Tb', ...",
390,Sb2O3–B2O3,,
391,Aurum,[Au],
392,silicon oxide,O=[Si]=O,
394,thiols,S,
395,As2Se3,[As](=[Se])[Se][As]=[Se],


Further verification of the dataset is highly dependent on the task that we face within the framework of the projects. We did the most basic:

* removed duplicates
* removed lines with empty names of substances
* as far as possible within the set time put smiles in order

We still have such columns as `measurement_error`, `measurement_wavelength`, `measurement_method`, `raw_value` and `specifier`.
It looks like the refractive index was measured here, by certain methods with the indicated error, if possible. Since the task of working with this indicator is not clear, outside the tasks you can look at the spread of `raw_value` and look at outliers and delete them as invalid data.

## Descriptors

### RDkit

In RDkit, you can pull descriptors in several ways.
The first one is from `rdMolDescriptors`.

In [61]:
df_smiles = df_names[~df_names['normalised_name'].isna()]
df_smiles

Unnamed: 0,Name,normalised_name,check_smiles
0,silicate,[O-][Si]([O-])([O-])[O-],
1,BSG,[B-](CNS(=O)(=O)CC1=CC=C(C=C1)C(=O)O)(O)(O)O,True
2,silica,O=[Si]=O,True
3,DMF,CN(C)C=O,True
4,ErF3,F[Er](F)F,True
...,...,...,...
1923,Triticonazole,CC1(CCC(=CC2=CC=C(C=C2)Cl)C1(CN3C=NC=N3)O)C,True
1924,In0.156Ga0.843N0.062As0.937,"[['As', 0.937], ['Ga', 0.843], ['In', 0.156], ...",
1926,tert-Butylmethylether,COC(C)(C)C,True
1927,hydroxyl,[OH],True


In [63]:
descriptor_names = list(rdMolDescriptors.Properties.GetAvailableProperties())
get_descriptors = rdMolDescriptors.Properties(descriptor_names)
descriptors_dict = {'normalised_name': []}
descriptors_dict.update({name:[] for name in descriptor_names})
descriptors_df = pd.DataFrame(descriptors_dict)

def to_descriptors(smile):
    mol = Chem.MolFromSmiles(smile)
    descriptors = np.array([smile])
    
    if mol:
      descriptors = np.append(descriptors, np.array(get_descriptors.ComputeProperties(mol)))
    else:
      empty_arr = np.empty([len(descriptor_names),])
      empty_arr[:] = np.nan
      descriptors = np.append(descriptors, empty_arr)
    
    return pd.concat([descriptors_df, pd.DataFrame(dict(zip(['normalised_name'] + descriptor_names, descriptors)), index=[0])], ignore_index=True)

In [64]:
for name in df_smiles['normalised_name']:
  descriptors_df = to_descriptors(name)

[13:27:40] SMILES Parse Error: syntax error while parsing: [['Al',
[13:27:40] SMILES Parse Error: Failed parsing SMILES '[['Al',' for input: '[['Al','
[13:27:40] SMILES Parse Error: syntax error while parsing: [['Cn',
[13:27:40] SMILES Parse Error: Failed parsing SMILES '[['Cn',' for input: '[['Cn','
[13:27:40] SMILES Parse Error: syntax error while parsing: [['IL',
[13:27:40] SMILES Parse Error: Failed parsing SMILES '[['IL',' for input: '[['IL','
[13:27:40] SMILES Parse Error: syntax error while parsing: [['Al',
[13:27:40] SMILES Parse Error: Failed parsing SMILES '[['Al',' for input: '[['Al','
[13:27:40] SMILES Parse Error: syntax error while parsing: [['P',
[13:27:40] SMILES Parse Error: Failed parsing SMILES '[['P',' for input: '[['P','
[13:27:40] SMILES Parse Error: syntax error while parsing: [['B',
[13:27:40] SMILES Parse Error: Failed parsing SMILES '[['B',' for input: '[['B','
[13:27:40] SMILES Parse Error: syntax error while parsing: [['Eu',
[13:27:40] SMILES Parse Error: Fa

In [65]:
descriptors_df

Unnamed: 0,normalised_name,exactmw,amw,lipinskiHBA,lipinskiHBD,NumRotatableBonds,NumHBD,NumHBA,NumHeavyAtoms,NumAtoms,...,chi0n,chi1n,chi2n,chi3n,chi4n,hallKierAlpha,kappa1,kappa2,kappa3,Phi
0,[O-][Si]([O-])([O-])[O-],91.95877932964001,92.082,4.0,0.0,0.0,0.0,4.0,5.0,5.0,...,2.1329931618554525,0.8164965809277261,0.0,0.0,0.0,0.056883116883116876,5.0568831168831165,1.0333603085540413,5304.50608142384,1.0451164595968119
1,[B-](CNS(=O)(=O)CC1=CC=C(C=C1)C(=O)O)(O)(O)O,290.05112673591,290.082,8.0,5.0,6.0,5.0,6.0,19.0,32.0,...,9.72281245217651,5.215182502079802,2.1346807436727064,2.1346807436727064,1.252765169078102,-1.4550649350649354,15.60193157112326,5.673872297580559,5.809849272321278,4.659124596323357
2,O=[Si]=O,59.966755770000006,60.084,2.0,0.0,0.0,0.0,2.0,3.0,3.0,...,1.3164965809277263,0.4082482904638631,0.0,0.0,0.0,-0.18311688311688312,2.8168831168831163,1.8168831168831163,1.8168831168831188,1.7059824590993413
3,CN(C)C=O,73.052763844,73.095,2.0,0.0,1.0,0.0,1.0,5.0,12.0,...,3.4328121551534467,1.3883283411425928,0.210818510677892,0.210818510677892,0.0,-0.53,4.469999999999999,1.7581844380403455,3.469999999999999,1.5718168876080685
4,F[Er](F)F,222.92550276,224.25399999999996,0.0,0.0,0.0,0.0,0.0,4.0,4.0,...,1.4225685536224946,0.3273268353539886,0.0,0.0,0.0,1.0367532467532465,5.036753246753246,2.2844770829337953,11.720213711226094,2.876586841150045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1668,CC1(CCC(=CC2=CC=C(C=C2)Cl)C1(CN3C=NC=N3)O)C,317.12948994,317.82,4.0,1.0,3.0,1.0,4.0,22.0,42.0,...,12.829591082896085,7.420990005275443,4.738472733352211,4.738472733352211,3.1115254335239118,-1.6500000000000001,15.253562902571957,5.52204096031731,2.6858418961185153,3.828672688126315
1669,"[['As', 0.937], ['Ga', 0.843], ['In', 0.156], ...",,,,,,,,,,...,,,,,,,,,,
1670,COC(C)(C)C,88.088815004,88.14999999999999,1.0,0.0,0.0,0.0,1.0,6.0,18.0,...,4.908248290463863,2.1123724356957947,0.6123724356957946,0.6123724356957946,0.0,-0.04,5.959999999999999,1.6056599286563615,3.9600000000000004,1.5949555291319857
1671,[OH],17.002739652,17.007,1.0,1.0,0.0,1.0,0.0,1.0,2.0,...,0.4472135954999579,0.0,0.0,0.0,0.0,-0.04,0.9600000000000016,-27.040000000000028,-104.0400000000001,-25.958400000000072


In [48]:
# descriptors_df = pd.read_csv('/content/descriptors_df.csv')

In addition, RDkit has a `MoleculeDescriptors` module that gives out about 200 descriptors

In [66]:
des_list = [x[0] for x in Descriptors._descList]
descriptors_dict = {'normalised_name': []}
descriptors_dict.update({name:[] for name in des_list})
descriptors_df_2 = pd.DataFrame(descriptors_dict)

def to_descriptors(smile):
    mol = Chem.MolFromSmiles(smile)
    descriptors = np.array([smile])
    
    if mol:
      calculator = MoleculeDescriptors.MolecularDescriptorCalculator(des_list)
      descriptors = np.append(descriptors, list(calculator.CalcDescriptors(mol)))
    else:
      empty_arr = np.empty([len(des_list),])
      empty_arr[:] = np.nan
      descriptors = np.append(descriptors, empty_arr)
    
    return pd.concat([descriptors_df_2, pd.DataFrame(dict(zip(['normalised_name'] + des_list, descriptors)), index=[0])], ignore_index=True)

In [67]:
for name in df_smiles['normalised_name']:
  descriptors_df_2 = to_descriptors(name)

[13:28:02] SMILES Parse Error: syntax error while parsing: [['Al',
[13:28:02] SMILES Parse Error: Failed parsing SMILES '[['Al',' for input: '[['Al','
[13:28:02] SMILES Parse Error: syntax error while parsing: [['Cn',
[13:28:02] SMILES Parse Error: Failed parsing SMILES '[['Cn',' for input: '[['Cn','
[13:28:02] SMILES Parse Error: syntax error while parsing: [['IL',
[13:28:02] SMILES Parse Error: Failed parsing SMILES '[['IL',' for input: '[['IL','
[13:28:02] SMILES Parse Error: syntax error while parsing: [['Al',
[13:28:02] SMILES Parse Error: Failed parsing SMILES '[['Al',' for input: '[['Al','
[13:28:02] SMILES Parse Error: syntax error while parsing: [['P',
[13:28:02] SMILES Parse Error: Failed parsing SMILES '[['P',' for input: '[['P','
[13:28:02] SMILES Parse Error: syntax error while parsing: [['B',
[13:28:02] SMILES Parse Error: Failed parsing SMILES '[['B',' for input: '[['B','
[13:28:02] SMILES Parse Error: syntax error while parsing: [['Eu',
[13:28:02] SMILES Parse Error: Fa

In [68]:
descriptors_df_2

Unnamed: 0,normalised_name,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,[O-][Si]([O-])([O-])[O-],8.57638888888889,-5.611111111111111,8.57638888888889,5.611111111111111,0.27918875136013765,92.082,92.082,91.95877932964001,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,[B-](CNS(=O)(=O)CC1=CC=C(C=C1)C(=O)O)(O)(O)O,11.514661438649533,-3.8627251039304613,11.514661438649533,0.025492593642395978,0.39094946276795617,290.082,276.97799999999995,290.05112673591,106.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,O=[Si]=O,8.402777777777779,-1.4166666666666665,8.402777777777779,1.4166666666666665,0.3490914202027598,60.084,60.084,59.966755770000006,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CN(C)C=O,9.430555555555555,0.75,9.430555555555555,0.75,0.3902682398861675,73.095,66.03899999999999,73.052763844,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,F[Er](F)F,9.805555555555555,-4.638888888888889,9.805555555555555,4.638888888888889,0.5808883347829132,224.25399999999996,224.25399999999996,222.92550276,33.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1668,CC1(CCC(=CC2=CC=C(C=C2)Cl)C1(CN3C=NC=N3)O)C,11.383552514627251,-0.9302221513605446,11.383552514627251,0.21110242836846682,0.9412505692947135,317.82,297.66,317.12948994,116.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1669,"[['As', 0.937], ['Ga', 0.843], ['In', 0.156], ...",,,,,,,,,,...,,,,,,,,,,
1670,COC(C)(C)C,4.9375,0.04166666666666652,4.9375,0.04166666666666652,0.4349873803799564,88.14999999999999,76.05399999999999,88.088815004,38.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1671,[OH],0.0,0.0,0.0,0.0,0.376429063370404,17.007,15.999,17.002739652,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
descriptors_df = pd.merge(descriptors_df, descriptors_df_2, on='normalised_name')
descriptors_df = descriptors_df.drop_duplicates().reset_index(drop='True')
descriptors_df.to_csv('/content/descriptors_df.csv', index=False)

You can also use the PubChem API, but it only returns 39 descriptors

In [70]:
properties = ['MolecularFormula', 'MolecularWeight', 'CanonicalSMILES', 'IsomericSMILES', 'InChI', 'InChIKey', 'IUPACName', 'XLogP', 'ExactMass', 
              'MonoisotopicMass', 'TPSA', 'Complexity', 'Charge', 'HBondDonorCount', 'HBondAcceptorCount', 'RotatableBondCount', 'HeavyAtomCount', 
              'IsotopeAtomCount', 'AtomStereoCount', 'DefinedAtomStereoCount', 'UndefinedAtomStereoCount', 'BondStereoCount', 'DefinedBondStereoCount', 
              'UndefinedBondStereoCount', 'CovalentUnitCount', 'Volume3D', 'XStericQuadrupole3D', 'YStericQuadrupole3D', 'ZStericQuadrupole3D', 'FeatureCount3D', 
              'FeatureAcceptorCount3D', 'FeatureDonorCount3D', 'FeatureAnionCount3D', 'FeatureCationCount3D', 'FeatureRingCount3D', 'FeatureHydrophobeCount3D', 
              'ConformerModelRMSD3D', 'EffectiveRotorCount3D', 'ConformerCount3D']

descriptors_pub = pd.DataFrame()

for name in df_smiles['normalised_name']:
  try:
    p = pcp.get_properties(properties, name, 'smiles', as_dataframe=True)
    p['normalised_name'] = name
    descriptors_pub = pd.concat([descriptors_pub, p], ignore_index=True)
  except Exception:
    print(name)
  

[['Al', 1.0], ['N', 1.0]]
[['Cn', 1.0], ['H', 2.0]]
[['IL', 1.0], ['O', 1.0], ['ORM', 1.0], ['S', 1.0]]
[['Al', 1.0], ['B', 1.0], ['Bi', 1.0], ['Tb', 0.5], ['Zn', 1.0]]
[['P', 2.0], ['V', 1.0]]
[['B', 2.0], ['Eu', 2.0], ['F', 2.0], ['O', 8.0], ['Pb', 1.0], ['Te', 1.0]]
[['Eu', 1.0], ['Li', 1.0]]
[['Ge', 10.0], ['Se', 90.0]]
[['In', 1.0], ['P', 1.0]]
[['Ge', 3.0], ['O', 11.0], ['Pb', 5.0]]
[['In', 1.0], ['N', 1.0]]
[['Nd', 1.0], ['Pb', 4.0]]
[['Na', 2.0], ['O', 5.0], ['O-', 1.0], ['P', 2.0]]
[['Ga', 5.0], ['Ge', 25.0], ['S', 70.0]]
[['Nb', 2.0], ['O', 7.0], ['Si', 1.0]]
[['Ag', 1.0], ['Ga', 1.0], ['Ge', 1.0], ['Se', 4.0]]
[['B', 2.0], ['Nd', 2.0], ['O', 8.0], ['O.', 1.0], ['Te', 1.0], ['Zn', 1.0]]
[['Cd', 1.0], ['F', 1.0], ['Zn', 1.0]]
[['Bi', 2.0], ['Mo', 1.0], ['S', 5.0]]
[['Lu', 1.0], ['PcR', 16.0]]
[['IT', 1.0], ['O', 1.0]]
[['MgA', 1.0], ['S', 2020.0]]
[['C', 60.0], ['Po-', 1.0], ['Zn', 1.0]]
[['Cd', 0.33], ['S', 0.4], ['Se', 0.6], ['Zn', 0.77]]
[['Al', 3.0], ['B', 4.0], ['Erx', 1.

In [71]:
descriptors_df = pd.merge(descriptors_df, descriptors_pub, on='normalised_name', how='outer')
descriptors_df = descriptors_df.drop_duplicates().reset_index(drop=True)
descriptors_df.to_csv('/content/descriptors_df.csv', index=False)

In [13]:
descriptors_df = pd.read_csv('descriptors_df.csv')

Another way to get a lot of descriptors is through the `mordred` library, which gives about 1600 descriptors.

In [74]:
calc = Calculator(descriptors, ignore_3D=True)

def to_mol(smile):
  mol = Chem.MolFromSmiles(smile)
  if mol:
    return mol


mols_full = [to_mol(smi) for smi in df_smiles['normalised_name']]
mols = [i for i in mols_full if i is not None]
descriptors_mor = calc.pandas(mols)
descriptors_mor

[14:25:19] SMILES Parse Error: syntax error while parsing: [['Al',
[14:25:19] SMILES Parse Error: Failed parsing SMILES '[['Al',' for input: '[['Al','
[14:25:19] SMILES Parse Error: syntax error while parsing: [['Cn',
[14:25:19] SMILES Parse Error: Failed parsing SMILES '[['Cn',' for input: '[['Cn','
[14:25:19] SMILES Parse Error: syntax error while parsing: [['IL',
[14:25:19] SMILES Parse Error: Failed parsing SMILES '[['IL',' for input: '[['IL','
[14:25:19] SMILES Parse Error: syntax error while parsing: [['Al',
[14:25:19] SMILES Parse Error: Failed parsing SMILES '[['Al',' for input: '[['Al','
[14:25:19] SMILES Parse Error: syntax error while parsing: [['P',
[14:25:19] SMILES Parse Error: Failed parsing SMILES '[['P',' for input: '[['P','
[14:25:19] SMILES Parse Error: syntax error while parsing: [['B',
[14:25:19] SMILES Parse Error: Failed parsing SMILES '[['B',' for input: '[['B','
[14:25:19] SMILES Parse Error: syntax error while parsing: [['Eu',
[14:25:19] SMILES Parse Error: Fa

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 14%|█▍        | 187/1293 [00:29<02:22,  7.78it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 1293/1293 [13:32<00:00,  1.59it/s]


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,3.464102,3.464102,4,0,4.0,2.0,4.0,4.0,0.8,2.444466,...,7.625107,29.418928,91.958779,18.391756,16,0,20.0,16.0,4.0625,1.000000
1,14.407961,12.560168,2,0,21.47108,2.320638,4.641275,21.47108,1.130057,3.834487,...,9.651108,51.914752,290.051127,9.064098,796,23,94.0,101.0,9.208333,3.986111
2,1.414214,1.414214,0,0,2.828427,1.414214,2.828427,2.828427,0.942809,1.849457,...,4.174387,17.310771,59.966756,19.988919,4,0,6.0,4.0,2.25,1.000000
3,3.047207,3.305183,0,0,5.226252,1.847759,3.695518,5.226252,1.04525,2.408576,...,6.834109,27.254130,73.052764,6.087730,18,2,16.0,14.0,3.361111,1.333333
4,2.449490,2.449490,0,0,3.464102,1.732051,3.464102,3.464102,0.866025,2.178059,...,6.188264,24.179697,222.925503,55.731376,9,0,12.0,9.0,3.111111,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1288,2.828427,3.146264,0,0,5.464102,1.732051,3.464102,5.464102,1.09282,2.390167,...,6.192362,25.583106,72.057515,5.542886,20,2,14.0,12.0,2.75,1.500000
1289,17.400365,15.308066,0,0,27.767969,2.547168,4.96715,27.767969,1.26218,4.036327,...,10.180324,72.619295,317.129490,7.550702,1034,34,120.0,143.0,7.569444,4.645833
1290,4.012290,4.284816,0,0,5.818626,2.074313,4.148627,5.818626,0.969771,2.626363,...,7.989899,31.665095,88.088815,4.893823,28,3,24.0,22.0,4.3125,1.375000
1291,0.000000,0.000000,0,0,0.0,0.0,0.0,0.0,0.0,0.693147,...,0.000000,1.000000,17.002740,8.501370,0,0,0.0,0.0,divide by zero encountered in power (mZagreb1),0.000000


In [1]:
# cols_to_use = descriptors_mor.columns.difference(descriptors_df.columns)
# df_5 = pd.merge(descriptors_df, descriptors_mor[cols_to_use], on='normalised_name', how='outer')
# df_5 = df_5.drop_duplicates().reset_index(drop=True)
# df_5
# # df_5.to_csv('/content/df_3.csv', index=False)

In [75]:
df_n = df_smiles['normalised_name'].reset_index(drop=True)
df_n = df_n[~pd.Series(mols_full).isna()].reset_index(drop=True)
descriptors_mor['normalised_name'] = df_n

In [76]:
descriptors_mor

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,normalised_name
0,3.464102,3.464102,4,0,4.0,2.0,4.0,4.0,0.8,2.444466,...,29.418928,91.958779,18.391756,16,0,20.0,16.0,4.0625,1.000000,[O-][Si]([O-])([O-])[O-]
1,14.407961,12.560168,2,0,21.47108,2.320638,4.641275,21.47108,1.130057,3.834487,...,51.914752,290.051127,9.064098,796,23,94.0,101.0,9.208333,3.986111,[B-](CNS(=O)(=O)CC1=CC=C(C=C1)C(=O)O)(O)(O)O
2,1.414214,1.414214,0,0,2.828427,1.414214,2.828427,2.828427,0.942809,1.849457,...,17.310771,59.966756,19.988919,4,0,6.0,4.0,2.25,1.000000,O=[Si]=O
3,3.047207,3.305183,0,0,5.226252,1.847759,3.695518,5.226252,1.04525,2.408576,...,27.254130,73.052764,6.087730,18,2,16.0,14.0,3.361111,1.333333,CN(C)C=O
4,2.449490,2.449490,0,0,3.464102,1.732051,3.464102,3.464102,0.866025,2.178059,...,24.179697,222.925503,55.731376,9,0,12.0,9.0,3.111111,1.000000,F[Er](F)F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1288,2.828427,3.146264,0,0,5.464102,1.732051,3.464102,5.464102,1.09282,2.390167,...,25.583106,72.057515,5.542886,20,2,14.0,12.0,2.75,1.500000,CCCC=O
1289,17.400365,15.308066,0,0,27.767969,2.547168,4.96715,27.767969,1.26218,4.036327,...,72.619295,317.129490,7.550702,1034,34,120.0,143.0,7.569444,4.645833,CC1(CCC(=CC2=CC=C(C=C2)Cl)C1(CN3C=NC=N3)O)C
1290,4.012290,4.284816,0,0,5.818626,2.074313,4.148627,5.818626,0.969771,2.626363,...,31.665095,88.088815,4.893823,28,3,24.0,22.0,4.3125,1.375000,COC(C)(C)C
1291,0.000000,0.000000,0,0,0.0,0.0,0.0,0.0,0.0,0.693147,...,1.000000,17.002740,8.501370,0,0,0.0,0.0,divide by zero encountered in power (mZagreb1),0.000000,[OH]


In [77]:
descriptors_mor.to_csv('descriptors_mor.csv', index=False)