# Hometask №1
### Data processing & descriptors search

In [19]:
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

import cirpy
import pubchempy as pcp
import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors

from mp_api.client import MPRester

from PaDEL_pywrapper import PaDEL
from PaDEL_pywrapper import descriptors

### Dataset

In [70]:
# Load the raw dataset from CSV.
# A raw string is used so that every backslash in the Windows path is taken
# literally; the previous non-raw literal contained the invalid escape "\D".
# NOTE(review): this is an absolute, machine-specific path — consider moving
# it to a configurable location.
data = pd.read_csv(r"C:\Users\User\Downloads\1.csv", sep=',')
data

Unnamed: 0,DOI,Date,Journal,Title,Name,measurement_error,measurement_wavelength,measurement_method,normalised_name,raw_value,specifier
0,10.1016/j.physb.2010.04.052,4/29/2010,Physica B: Condensed Matter,EXCITONSPECTRAENERGYBANDSTRUCTURECUALS2CRYSTALS,CuGaSe2,0.0,,el_cde_tables,"[['Cu', 1.0], ['Ga', 1.0], ['Se', 2.0]]",1.8235,n
1,10.1016/j.jep.2012.08.025,8/29/2012,Journal of Ethnopharmacology,INVITROINVIVOANTIMICROBIALACTIVITYALGERIANHOGG...,WC,0.0,,el_cde_tables,,2.66,n
2,10.1016/j.physb.2011.08.013,8/18/2011,Physica B: Condensed Matter,OPTICALPROPERTIESGEASTETHINFILMS,Te–Te,0.0,,el_mylogic,,2.99,Refractive index parameters
3,10.1016/j.jnoncrysol.2011.04.001,5/3/2011,Journal of Non-Crystalline Solids,FOURIERTRANSFORMINFRAREDSPECTROSCOPYINVESTIGAT...,SiC,0.0,,el_mylogic,,2.55,RI (± 0.0001)
4,10.1039/C4PY00369A,8/12/2014,Polymer Chemistry,Camptothecin prodrug block copolymer micelles ...,CPT,0.0,,rsc_cde_text,,3,n
...,...,...,...,...,...,...,...,...,...,...,...
4995,10.1016/j.ejpb.2004.05.003,6/26/2004,European Journal of Pharmaceutics and Biopharm...,APPLYINGPATTERNRECOGNITIONMETHODSSTRUCTUREPROP...,Dihydropyridine,0.0,,el_mylogic,,1.679,Index of refraction
4996,10.1039/C4CS00424H,,,,TiO2,0.0,,snowball,"[['O', 2.0], ['Ti', 1.0]]",2.5,refractive index
4997,10.1016/j.numecd.2015.10.010,11/14/2015,"Nutrition, Metabolism and Cardiovascular Diseases",INCREASINGFRUITVEGETABLEINTAKENOEFFECTRETINALV...,cholesterol,0.0,,el_cde_tables,CC(C)CCC[C@@H](C)[C@H]1CC[C@H]2[C@@H]3CC=C4C[C...,4.28,n
4998,10.1016/j.solmat.2010.03.020,4/1/2010,Solar Energy Materials and Solar Cells,DISCUSSIONELECTRICALCHARACTERISTICSIIN013GA087...,SiO2,0.0,,el_mylogic,,1.467,Refractive index


In [71]:
# Summarise the freshly loaded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   DOI                     5000 non-null   object 
 1   Date                    4555 non-null   object 
 2   Journal                 4555 non-null   object 
 3   Title                   4555 non-null   object 
 4   Name                    4992 non-null   object 
 5   measurement_error       5000 non-null   float64
 6   measurement_wavelength  597 non-null    object 
 7   measurement_method      5000 non-null   object 
 8   normalised_name         2994 non-null   object 
 9   raw_value               5000 non-null   object 
 10  specifier               5000 non-null   object 
dtypes: float64(1), object(10)
memory usage: 429.8+ KB


### Handling duplicates

In [72]:
# Number of duplicated rows: rows that repeat an earlier row in every column
# (pandas' default for DataFrame.duplicated()).
data.duplicated().sum()

22

In [73]:
# Delete duplicates: keep one copy of each repeated row.
# `data` is rebound to the de-duplicated frame; the original index labels are kept.
data = data.drop_duplicates()
# Display (rows, columns) after de-duplication.
data.shape

(4978, 11)

### Handling missing values

In [74]:
# Share of missing values per column, expressed in percent of the row count.
data.isna().sum().div(len(data)).mul(100)

DOI                        0.000000
Date                       8.919245
Journal                    8.919245
Title                      8.919245
Name                       0.160707
measurement_error          0.000000
measurement_wavelength    88.027320
measurement_method         0.000000
normalised_name           40.116513
raw_value                  0.000000
specifier                  0.000000
dtype: float64

In [75]:
# fill in gaps in "Name" column
# For every row whose "Name" is not a string, ask the CIR web service
# (cirpy.resolve) for an IUPAC name, using "normalised_name" as the query.
for idx in data.index:
    if isinstance(data.loc[idx, 'Name'], str):
        continue
    norm_name = data.loc[idx, 'normalised_name']
    if not isinstance(norm_name, str):
        # Both "Name" and "normalised_name" are missing: nothing to resolve.
        continue
    try:
        new_name = cirpy.resolve(norm_name, 'iupac_name')
    except Exception as exc:
        # Best effort: a failed network call must not abort the whole loop.
        print(f'Could not resolve {norm_name!r}: {exc}')
        continue
    if isinstance(new_name, list):
        # NOTE(review): cirpy may return several values for one query;
        # the first one is used here — confirm this is the desired choice.
        new_name = new_name[0] if new_name else None
    if new_name is not None:
        data.at[idx, 'Name'] = new_name
# Check how many names are present now (non-null count, not the row count).
data['Name'].notna().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4978 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   DOI                     4978 non-null   object 
 1   Date                    4534 non-null   object 
 2   Journal                 4534 non-null   object 
 3   Title                   4534 non-null   object 
 4   Name                    4978 non-null   object 
 5   measurement_error       4978 non-null   float64
 6   measurement_wavelength  596 non-null    object 
 7   measurement_method      4978 non-null   object 
 8   normalised_name         2981 non-null   object 
 9   raw_value               4978 non-null   object 
 10  specifier               4978 non-null   object 
dtypes: float64(1), object(10)
memory usage: 595.7+ KB


In [77]:
# fill in gaps in "normalised_name" column + convert everything to SMILES format
def _lookup_smiles(name):
    """Return the PubChem isomeric SMILES for ``name``, or None if not found.

    The query is first tried as a molecular formula and, if that yields
    nothing, as a compound name. Any lookup error is treated as "not found"
    so that a single bad row does not abort the whole pass.
    """
    if not isinstance(name, str):
        return None
    for namespace in ('formula', 'name'):
        try:
            compounds = pcp.get_compounds(name, namespace)
        except Exception:
            # Best effort: bad request, not found, network failure, etc.
            continue
        if compounds:
            return compounds[0].isomeric_smiles
    return None


for idx in data.index:
    name = data.loc[idx, 'Name']
    # The result is computed fresh for every row, so a failed lookup stores
    # a missing value instead of the previous row's SMILES.
    smiles = _lookup_smiles(name)
    data.at[idx, 'normalised_name'] = smiles
data.info() # check how many SMILES we obtained

In [None]:
# check if all the SMILES are valid (invalid SMILES can not be handled by RDKit)
# Missing values are skipped: Chem.MolFromSmiles only accepts strings and
# raises on NaN. Any other non-string value is reported as invalid.
for smiles in data['normalised_name'].dropna().tolist():
    if not isinstance(smiles, str) or Chem.MolFromSmiles(smiles) is None:
        print(f'Invalid smiles: {smiles}')

In [10]:
# Display the current state of `data` (pandas renders a truncated preview).
data

Unnamed: 0,DOI,Date,Journal,Title,Name,measurement_error,measurement_wavelength,measurement_method,normalised_name,raw_value,specifier
0,10.1016/j.physb.2010.04.052,4/29/2010,Physica B: Condensed Matter,EXCITONSPECTRAENERGYBANDSTRUCTURECUALS2CRYSTALS,CuGaSe2,0.0,,el_cde_tables,,1.8235,n
1,10.1016/j.jep.2012.08.025,8/29/2012,Journal of Ethnopharmacology,INVITROINVIVOANTIMICROBIALACTIVITYALGERIANHOGG...,WC,0.0,,el_cde_tables,[CH2][W],2.66,n
2,10.1016/j.physb.2011.08.013,8/18/2011,Physica B: Condensed Matter,OPTICALPROPERTIESGEASTETHINFILMS,Te–Te,0.0,,el_mylogic,,2.99,Refractive index parameters
3,10.1016/j.jnoncrysol.2011.04.001,5/3/2011,Journal of Non-Crystalline Solids,FOURIERTRANSFORMINFRAREDSPECTROSCOPYINVESTIGAT...,SiC,0.0,,el_mylogic,C=[SiH2],2.55,RI (± 0.0001)
4,10.1039/C4PY00369A,8/12/2014,Polymer Chemistry,Camptothecin prodrug block copolymer micelles ...,CPT,0.0,,rsc_cde_text,[NH3-].[NH3-].[Pt++],3,n
...,...,...,...,...,...,...,...,...,...,...,...
4995,10.1016/j.ejpb.2004.05.003,6/26/2004,European Journal of Pharmaceutics and Biopharm...,APPLYINGPATTERNRECOGNITIONMETHODSSTRUCTUREPROP...,Dihydropyridine,0.0,,el_mylogic,C1NC=CC=C1,1.679,Index of refraction
4996,10.1039/C4CS00424H,,,,TiO2,0.0,,snowball,O=[Ti]=O,2.5,refractive index
4997,10.1016/j.numecd.2015.10.010,11/14/2015,"Nutrition, Metabolism and Cardiovascular Diseases",INCREASINGFRUITVEGETABLEINTAKENOEFFECTRETINALV...,cholesterol,0.0,,el_cde_tables,CC(C)CCC[C@@H](C)[C@H]1CC[C@H]2[C@@H]3CC=C4C[C...,4.28,n
4998,10.1016/j.solmat.2010.03.020,4/1/2010,Solar Energy Materials and Solar Cells,DISCUSSIONELECTRICALCHARACTERISTICSIIN013GA087...,SiO2,0.0,,el_mylogic,,1.467,Refractive index


### Fixing the DOI column

To extract a clean DOI from the `DOI` column, we apply a regular expression (regex).

In [45]:
# DOI pattern with two alternatives; the outer group (column 0 of the
# extraction result) holds whichever alternative matched.
regex = r'((10\.\d{4,9}/[A-Z0-9]+[A-Z])|(10\.\d{4,9}/[-._;()/:a-zA-Z0-9]+[0-9]))'
# Keep only the outer capture group as a one-column frame.
data_doi = data['DOI'].str.extract(regex)[[0]]
# Put the cleaned values first as "doi" and discard the raw "DOI" column.
data = pd.concat(
    [data_doi.rename(columns={0: 'doi'}), data.drop(columns="DOI")],
    axis=1,
)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4978 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   doi                     4978 non-null   object 
 1   Date                    4534 non-null   object 
 2   Journal                 4534 non-null   object 
 3   Title                   4534 non-null   object 
 4   Name                    4970 non-null   object 
 5   measurement_error       4978 non-null   float64
 6   measurement_wavelength  596 non-null    object 
 7   measurement_method      4978 non-null   object 
 8   normalised_name         2981 non-null   object 
 9   raw_value               4978 non-null   object 
 10  specifier               4978 non-null   object 
dtypes: float64(1), object(10)
memory usage: 466.7+ KB


In [51]:
# parse titles of articles from GoogleScholar by their doi
# Only rows without a string "Title" are queried. Every failure mode (missing
# doi, network error, non-200 response, page without a result heading) leaves
# the row untouched instead of raising.
for idx in data.index:
    title = data.loc[idx, 'Title']
    doi = data.loc[idx, 'doi']
    if isinstance(title, str) or not isinstance(doi, str):
        continue
    try:
        # `params` URL-encodes the values, so '/' in the doi becomes '%2F'
        # and ',' becomes '%2C', as the hand-built URL did before.
        response = requests.get(
            'https://scholar.google.ru/scholar',
            params={'hl': 'ru', 'as_sdt': '0,5', 'q': doi, 'btnG': ''},
            timeout=10,
        )
    except requests.RequestException as exc:
        print(f'Request failed for doi {doi}: {exc}')
        continue
    if response.status_code != 200:
        continue
    soup = BeautifulSoup(response.text, 'lxml')
    title_tag = soup.find('h3', class_='gs_rt')
    # find() returns None when the page holds no result heading.
    if title_tag is not None:
        data.loc[idx, 'Title'] = title_tag.text
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4978 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   doi                     4978 non-null   object 
 1   Date                    4534 non-null   object 
 2   Journal                 4534 non-null   object 
 3   Title                   4534 non-null   object 
 4   Name                    4970 non-null   object 
 5   measurement_error       4978 non-null   float64
 6   measurement_wavelength  596 non-null    object 
 7   measurement_method      4978 non-null   object 
 8   normalised_name         2981 non-null   object 
 9   raw_value               4978 non-null   object 
 10  specifier               4978 non-null   object 
dtypes: float64(1), object(10)
memory usage: 595.7+ KB


### Descriptors search

In [None]:
# RDKit descriptors (the original heading says 200; this list yields 103 values).
# Prototype: the values are computed for the first SMILES that RDKit can
# parse, the number of collected values is printed, and the loop stops.
#
# Disabled in the original list and still left out here:
#   Descriptors.MQN, Autocorr2D, BCUT2D, Phi, NumAmideBonds,
#   NumSpiroAtoms, NumBridgeheadAtoms,
#   Descriptors3D.PMI1/PMI2/PMI3, NPR1/NPR2, Eccentricity, Asphericity,
#   SpherocityIndex, RadiusOfGyration, InertialShapeFactor
for smiles in data['normalised_name'].tolist():
  descriptors_list = []
  if type(smiles) != str:
    continue
  mol = Chem.MolFromSmiles(smiles)
  if mol is None:
    continue

  # Callables taking the molecule, in the order their values are stored.
  calculators = [
      Descriptors.ExactMolWt,
      Descriptors.HeavyAtomMolWt,
      Descriptors.MaxPartialCharge,
      Descriptors.MinPartialCharge,
      Descriptors.NumRadicalElectrons,
      Descriptors.NumValenceElectrons,
      Descriptors.MolMR,
      Descriptors.MolLogP,
      Descriptors.NumHAcceptors,
      Descriptors.NumHDonors,
      Descriptors.HeavyAtomCount,
      Descriptors.RingCount,
      Descriptors.FractionCSP3,
      Descriptors.TPSA,
      Descriptors.LabuteASA,
  ]
  # Numbered surface-area / EState families: PEOE_VSA1..14, SMR_VSA1..10,
  # SlogP_VSA1..12, VSA_EState1..10.
  for family, last in (('PEOE_VSA', 14), ('SMR_VSA', 10),
                       ('SlogP_VSA', 12), ('VSA_EState', 10)):
    calculators += [getattr(Descriptors, f'{family}{number}')
                    for number in range(1, last + 1)]
  calculators += [
      rdMolDescriptors._CalcCrippenContribs,
      Descriptors.NumHeteroatoms,
      Descriptors.NumRotatableBonds,
      Descriptors.HallKierAlpha,
      Descriptors.Ipc,
      Descriptors.BertzCT,
      Descriptors.BalabanJ,
      Descriptors.Kappa1,
      Descriptors.Kappa2,
      Descriptors.Kappa3,
      Descriptors.Chi0,
      Descriptors.Chi1,
      Descriptors.Chi0n,
      Descriptors.Chi1n,
      Descriptors.Chi2n,
      Descriptors.Chi3n,
      Descriptors.Chi4n,
      Descriptors.Chi0v,
      Descriptors.Chi1v,
      Descriptors.Chi2v,
      Descriptors.Chi3v,
      Descriptors.Chi4v,
      Descriptors.NHOHCount,
      Descriptors.NOCount,
      Descriptors.NumAromaticRings,
      Descriptors.NumSaturatedRings,
      Descriptors.NumAliphaticRings,
      Descriptors.NumAromaticHeterocycles,
      Descriptors.NumAromaticCarbocycles,
      Descriptors.NumSaturatedHeterocycles,
      Descriptors.NumSaturatedCarbocycles,
      Descriptors.NumAliphaticHeterocycles,
      Descriptors.NumAliphaticCarbocycles,
      Descriptors.MaxAbsPartialCharge,
      Descriptors.MinAbsPartialCharge,
      Descriptors.MinAbsEStateIndex,
      Descriptors.MaxAbsEStateIndex,
      Descriptors.MinEStateIndex,
      Descriptors.MaxEStateIndex,
  ]

  descriptors_list = [calculate(mol) for calculate in calculators]
  # Simple size counts taken directly from the molecule object.
  descriptors_list.append(len(mol.GetAtoms()))
  descriptors_list.append(len(mol.GetBonds()))
  descriptors_list.append(len(Chem.GetSSSR(mol)))

  print(len(descriptors_list))
  break

In [None]:
# PubChem descriptors: 21 property names, requested one at a time.
# Prototype: only the first string SMILES in the dataset is queried; each
# property value (or None when unavailable) is printed, then the loop stops.
pubchem_properties = [
    'MonoisotopicMass',
    'Complexity',
    'IsotopeAtomCount',
    'AtomStereoCount',
    'DefinedAtomStereoCount',
    'UndefinedAtomStereoCount',  # comma was missing: name fused with the next one
    'BondStereoCount',
    'DefinedBondStereoCount',
    'UndefinedBondStereoCount',
    'CovalentUnitCount',
    'Volume3D',
    'XStericQuadrupole3D',
    'YStericQuadrupole3D',
    'ZStericQuadrupole3D',
    'FeatureCount3D',
    'FeatureRingCount3D',
    'FeatureHydrophobeCount3D',
    'ConformerModelRMSD3D',
    'EffectiveRotorCount3D',  # comma was missing: name fused with the next one
    'ConformerCount3D',
    'Charge',
]
# SMILES strings live in "normalised_name" (the column the other descriptor
# cells read); the frame shown in this notebook has no "smiles" column.
for smiles in data['normalised_name'].tolist():
  descriptors_list = []
  if isinstance(smiles, str):
    for prop_name in pubchem_properties:
      try:
        p = pcp.get_properties(prop_name, smiles, 'smiles')
        # .get() yields None when the record lacks this property.
        print(p[0].get(prop_name))
      except Exception:
        # Best effort: request errors or an empty result print None.
        print(None)
    break

In [None]:
# Materials Project descriptors: 53 summary fields requested for formula "CO".
# The API key is read from the MP_API_KEY environment variable instead of being
# stored in the notebook.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked/regenerated.
mp_api_key = os.environ.get("MP_API_KEY")
if not mp_api_key:
  raise RuntimeError("Set the MP_API_KEY environment variable to your Materials Project API key")

mp_fields = [
    'nsites', 'nelements', 'chemsys', 'volume', 'density', 'density_atomic',
    'symmetry', 'deprecated', 'deprecation_reasons',
    'uncorrected_energy_per_atom', 'energy_per_atom',
    'formation_energy_per_atom', 'energy_above_hull', 'is_stable',
    'equilibrium_reaction_energy_per_atom', 'xas', 'grain_boundaries',
    'band_gap', 'cbm', 'vbm', 'efermi', 'is_gap_direct', 'is_metal',
    'dos_energy_up', 'dos_energy_down', 'is_magnetic', 'ordering',
    'total_magnetization', 'total_magnetization_normalized_vol',
    'total_magnetization_normalized_formula_units', 'num_magnetic_sites',
    'num_unique_magnetic_sites', 'types_of_magnetic_species',
    'k_voigt', 'k_reuss', 'k_vrh', 'g_voigt', 'g_reuss', 'g_vrh',
    'universal_anisotropy', 'homogeneous_poisson',
    'e_total', 'e_ionic', 'e_electronic', 'n', 'e_ij_max',
    'weighted_surface_energy_EV_PER_ANG2', 'weighted_surface_energy',
    'weighted_work_function', 'surface_anisotropy', 'shape_factor',
    'has_reconstructed', 'theoretical',
]

with MPRester(mp_api_key) as mpr:
  docs = mpr.summary.search(formula="CO", fields=mp_fields)
  # An empty result list would make docs[0] raise IndexError.
  if docs:
    print(docs[0])
  else:
    print("No Materials Project entries found for formula 'CO'")

In [None]:
# PaDEL descriptors: run the calculator on every SMILES that RDKit can parse.
# The return value of calculate() is not stored.
padel = PaDEL(descriptors)
for smiles in data['normalised_name'].tolist():
  descriptors_list = []
  if type(smiles) != str:
    continue
  mol = Chem.MolFromSmiles(smiles)
  if mol is None:
    continue
  padel.calculate(mol)