# Hometask №1
### Data processing & descriptors search

In [18]:
import pandas as pd

import numpy as np

import re

import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Descriptors3D
from rdkit.Chem import rdDepictor
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*') # supress rdkit warnings

import cirpy

import pubchempy as pcp

from mp_api.client import MPRester

from bs4 import BeautifulSoup
import requests

from PaDEL_pywrapper import PaDEL
from PaDEL_pywrapper import descriptors
from PaDEL_pywrapper.descriptor import AtomCount, BondCount, Constitutional, EStateAtomType, ExtendedTopochemicalAtom, MDE, MLFER, RotatableBondsCount, CarbonTypes, HBondAcceptorCount, HBondDonorCount, PathCount

import string

### Dataset

In [3]:
# Load the raw dataset. A raw string is used so that "\D" in the Windows path
# is not parsed as an (invalid) escape sequence; the resulting path is unchanged.
data = pd.read_csv(r"C:\Users\User\Downloads\1.csv", sep=',')
data

Unnamed: 0,DOI,Date,Journal,Title,Name,measurement_error,measurement_wavelength,measurement_method,normalised_name,raw_value,specifier
0,10.1016/j.physb.2010.04.052,4/29/2010,Physica B: Condensed Matter,EXCITONSPECTRAENERGYBANDSTRUCTURECUALS2CRYSTALS,CuGaSe2,0.0,,el_cde_tables,"[['Cu', 1.0], ['Ga', 1.0], ['Se', 2.0]]",1.8235,n
1,10.1016/j.jep.2012.08.025,8/29/2012,Journal of Ethnopharmacology,INVITROINVIVOANTIMICROBIALACTIVITYALGERIANHOGG...,WC,0.0,,el_cde_tables,,2.66,n
2,10.1016/j.physb.2011.08.013,8/18/2011,Physica B: Condensed Matter,OPTICALPROPERTIESGEASTETHINFILMS,Te–Te,0.0,,el_mylogic,,2.99,Refractive index parameters
3,10.1016/j.jnoncrysol.2011.04.001,5/3/2011,Journal of Non-Crystalline Solids,FOURIERTRANSFORMINFRAREDSPECTROSCOPYINVESTIGAT...,SiC,0.0,,el_mylogic,,2.55,RI (± 0.0001)
4,10.1039/C4PY00369A,8/12/2014,Polymer Chemistry,Camptothecin prodrug block copolymer micelles ...,CPT,0.0,,rsc_cde_text,,3,n
...,...,...,...,...,...,...,...,...,...,...,...
4995,10.1016/j.ejpb.2004.05.003,6/26/2004,European Journal of Pharmaceutics and Biopharm...,APPLYINGPATTERNRECOGNITIONMETHODSSTRUCTUREPROP...,Dihydropyridine,0.0,,el_mylogic,,1.679,Index of refraction
4996,10.1039/C4CS00424H,,,,TiO2,0.0,,snowball,"[['O', 2.0], ['Ti', 1.0]]",2.5,refractive index
4997,10.1016/j.numecd.2015.10.010,11/14/2015,"Nutrition, Metabolism and Cardiovascular Diseases",INCREASINGFRUITVEGETABLEINTAKENOEFFECTRETINALV...,cholesterol,0.0,,el_cde_tables,CC(C)CCC[C@@H](C)[C@H]1CC[C@H]2[C@@H]3CC=C4C[C...,4.28,n
4998,10.1016/j.solmat.2010.03.020,4/1/2010,Solar Energy Materials and Solar Cells,DISCUSSIONELECTRICALCHARACTERISTICSIIN013GA087...,SiO2,0.0,,el_mylogic,,1.467,Refractive index


In [3]:
# column names, non-null counts and dtypes of the loaded dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   DOI                     5000 non-null   object 
 1   Date                    4555 non-null   object 
 2   Journal                 4555 non-null   object 
 3   Title                   4555 non-null   object 
 4   Name                    4992 non-null   object 
 5   measurement_error       5000 non-null   float64
 6   measurement_wavelength  597 non-null    object 
 7   measurement_method      5000 non-null   object 
 8   normalised_name         2994 non-null   object 
 9   raw_value               5000 non-null   object 
 10  specifier               5000 non-null   object 
dtypes: float64(1), object(10)
memory usage: 429.8+ KB


The following cleaning steps are needed:
- extract journal names and years from the DOI column
- find missing Titles
- find missing Names
- find missing normalised_names and convert them to a uniform format
- convert raw_value to a uniform format

### Handling duplicates

In [4]:
# number of duplicated rows (rows flagged by DataFrame.duplicated())
data.duplicated().sum()

22

In [5]:
# delete duplicates and show the resulting (rows, columns) shape
data = data.drop_duplicates()
data.shape

(4978, 11)

### Handling missing values

In [6]:
# share of missing values per column, in percent
data.isna().mean() * 100

DOI                        0.000000
Date                       8.919245
Journal                    8.919245
Title                      8.919245
Name                       0.160707
measurement_error          0.000000
measurement_wavelength    88.027320
measurement_method         0.000000
normalised_name           40.116513
raw_value                  0.000000
specifier                  0.000000
dtype: float64

In [7]:
# rows without a compound name; show what their "normalised_name" column holds
data_no_name = data.loc[data['Name'].isna()]
data_no_name['normalised_name']

9                CCCO
3238        CCOC(C)=O
3266        Cc1ccccc1
3468      CCCCC(CC)CO
3761    Nc1cccc(Cl)c1
4026         O=[Si]=O
4230          CC(C)=O
4233        Nc1ccccc1
Name: normalised_name, dtype: object

We see that all missing names have SMILES in "normalised_name" column. So, we can obtain chemical formulas by SMILES with the help of CIRpy library.

In [10]:
# fill in gaps in "Name" column
for row_label in data.index:
    # rows that already carry a string name are left untouched
    if type(data.loc[row_label, 'Name']) == str:
        continue
    source_smiles = data.loc[row_label, 'normalised_name']
    formula = cirpy.resolve(source_smiles, 'formula')
    data.at[row_label, 'Name'] = formula
    print(source_smiles, formula)

CCCO C3H8O
CCOC(C)=O C4H8O2
Cc1ccccc1 C7H8
CCCCC(CC)CO C8H18O
Nc1cccc(Cl)c1 C6H6ClN
O=[Si]=O O2Si
CC(C)=O C3H6O
Nc1ccccc1 C6H7N


Now that every row has a name, we can fill in the missing values in the "normalised_name" column using the PubChem API. Rows that already have a "normalised_name" are looked up as well, because we want all "normalised_name" values to be in SMILES format.

In [77]:
# fill in gaps in "normalised_name" column + convert everything to SMILES format
def name_to_smiles(name):
    """Resolve a compound identifier to an isomeric SMILES string via PubChem.

    The identifier is first looked up as a molecular formula and, if that
    lookup fails or returns no hits, as a compound name.

    Parameters
    ----------
    name : value taken from the "Name" column.

    Returns
    -------
    The ``isomeric_smiles`` of the first PubChem hit, or ``np.nan`` when
    neither lookup yields a compound.
    """
    for namespace in ('formula', 'name'):
        try:
            return pcp.get_compounds(name, namespace)[0].isomeric_smiles
        except Exception:
            # Request error or empty result list (IndexError): try the next
            # namespace. `Exception` (not a bare except) keeps the loop
            # interruptible with a kernel interrupt.
            continue
    return np.nan


for idx in data.index:
    data.at[idx, 'normalised_name'] = name_to_smiles(data.loc[idx, 'Name'])

In [15]:
# result dataset (the computation was done on a supercomputer)
# raw string: "\D" in the Windows path is otherwise an invalid escape sequence
data = pd.read_csv(r"C:\Users\User\Downloads\data.csv", sep=',')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4978 entries, 0 to 4977
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   DOI                     4978 non-null   object 
 1   Date                    4534 non-null   object 
 2   Journal                 4534 non-null   object 
 3   Title                   4534 non-null   object 
 4   Name                    4978 non-null   object 
 5   measurement_error       4978 non-null   float64
 6   measurement_wavelength  596 non-null    object 
 7   measurement_method      4978 non-null   object 
 8   normalised_name         3995 non-null   object 
 9   raw_value               4978 non-null   object 
 10  specifier               4978 non-null   object 
dtypes: float64(1), object(10)
memory usage: 427.9+ KB


We see that some of the SMILES were not found by PubChem. We can try to find them with CIRpy package.

In [None]:
# try to resolve the SMILES that are still missing with CIRpy, using the compound name
for idx in data.index:
    smiles = data.loc[idx, 'normalised_name']
    if type(smiles) == str:
        continue  # this row already has a SMILES string
    name = data.loc[idx, 'Name']
    try:
        new_smiles = cirpy.resolve(name, 'smiles')
    except Exception:
        # best effort: a failed lookup leaves the value missing;
        # `Exception` (not a bare except) keeps the loop interruptible
        new_smiles = None
    data.at[idx, 'normalised_name'] = new_smiles

In [2]:
# result dataset (the computation was done on a supercomputer)
# raw string: "\D" in the Windows path is otherwise an invalid escape sequence
data = pd.read_csv(r"C:\Users\User\Downloads\data_1.csv", sep=',')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4978 entries, 0 to 4977
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   DOI                     4978 non-null   object 
 1   Date                    4534 non-null   object 
 2   Journal                 4534 non-null   object 
 3   Title                   4534 non-null   object 
 4   Name                    4978 non-null   object 
 5   measurement_error       4978 non-null   float64
 6   measurement_wavelength  596 non-null    object 
 7   measurement_method      4978 non-null   object 
 8   normalised_name         4368 non-null   object 
 9   raw_value               4978 non-null   object 
 10  specifier               4978 non-null   object 
dtypes: float64(1), object(10)
memory usage: 427.9+ KB


There are still missing SMILES. For these we created a dictionary of names that can be either corrected or converted to SMILES separately. The other names are deleted because they cannot be converted into SMILES unambiguously (e.g. ...).

In [15]:
# Manual corrections for names that the automatic lookups could not resolve.
# Keys are the values as they appear in the "Name" column; values are the
# cleaned-up name or formula that is resolved to SMILES in the cells below.
# Values containing '–' are multi-component systems (one component per part).
d_correct = {'As40Se60':'As2Se3',
             'NMFA':'N-methyl-2-fluoroaniline',
             'BiZnBo–':'BiZnBO',
             'Si–O':'Si–O',
             'Li2O–GeO2':'Li2O–GeO2',
             'GeO2–PbO':'GeO2–PbO',
             "TiO2's":'TiO2',
             'ZnO–Ta2O5–B2O3':'ZnO–Ta2O5–B2O3',
             'CaO–SiO2':'CaO–SiO2',
             'GeO2–Bi2O3':'GeO2–Bi2O3',
             'Na2O–B2O3':'Na2O–B2O3',
             'K2O–TeO2':'K2O–TeO2',
             'CH3–CH2–O–CH2–CH3':'diethyl ether',
             'MgO–PbF2–SiO2':'MgO–PbF2–SiO2',
             'O–O':'O2',
             'F–B':'F–B',
             'PbO–Al2O3':'PbO–Al2O3',
             'ZnO–Al2O3':'ZnO–Al2O3',
             '(−)–Sparteine':'Sparteine',
             'PbF2–TeO2–B2O3–Eu2O3':'PbF2–TeO2–B2O3–Eu2O3',
             'CaC2':'Calcium Carbide',
             'SiO2 – SnO2':'SiO2–SnO2',
             'Ag–ZnO':'Ag–ZnO',
             'ZnO –':'ZnO',
             'La2O3–B2O3':'La2O3–B2O3',
             'PbO–Sb2O3–B2O3':'PbO–Sb2O3–B2O3',
             'BPA–':'BPA',
             'Na2O–SiO2':'Na2O–SiO2',
             'SiO2(OH)':'HO3Si',
             'TeO2–':'TeO2',
             'CH3(CH2)3OH':'butanol',
             'TiO2(l)':'TiO2',
             'Al(PO3)3':'AlP3O9',
             'AgI–B2O3:V2O5':'AgI–B2O3–V2O5',
             'CaF2–Y2O3–ZnO':'CaF2–Y2O3–ZnO',
             'Sb2O3–Na2O':'Sb2O3–Na2O',
             'Te–Te':'Te–Te',
             'TeO2–BaF2':'TeO2–BaF2',
             'SiO2–':'SiO2',
             'K2O–B2O3':'K2O–B2O3',
             'Ge–Ga–S':'Ge–Ga–S',
             'NO3−':'nitrate',
             'CuSO4·H2O':'copper sulfate monohydrate',
             'Li2O–Y2O3':'Li2O–Y2O3',
             'PbF2–TeO2–B2O3–Sm2O3':'PbF2–TeO2–B2O3–Sm2O3',
             'CuSO4·5H2O':'copper sulfate pentahydrate',
             'Na2CO3_aq_23':'Na2CO3',
             'GeO2–SiO2':'GeO2–SiO2',
             'GeS2–As2S3':'GeS2–As2S3',
             'TeO2–B2O3–ZnO':'TeO2–B2O3–ZnO',
             'PbO–Bi2O3':'PbO–Bi2O3',
             'ZnSb(wt':'ZnSb',
             'K2O–B2O3–Bi2O3':'K2O–B2O3–Bi2O3',
             'In2O3–Sc2O3':'In2O3–Sc2O3',
             'ZnO–B2O3':'ZnO–B2O3',
             'TiO2–TeO2':'TiO2–TeO2',
             'V2O5–GeO2':'V2O5–GeO2',
             'Bi2O3–SiO2':'Bi2O3–SiO2',
             'SiO2–Nb2O5':'SiO2–Nb2O5',
             'B2O3–SiO2':'B2O3–SiO2',
             'SiC(100)':'SiC',
             'SiO2–ZrO2':'SiO2–ZrO2',
             'MgO(1 0 0 )':'MgO',
             'BaF2–PrF3':'BaF2–PrF3',
             'PbO−PbF2−B2O3–Nd2O3':'PbO−PbF2−B2O3–Nd2O3',
             'GeS2–Ga2S3–CdS':'GeS2–Ga2S3–CdS',
             'ZnF2–WO3–TeO2':'ZnF2–WO3–TeO2',
             'Cd–Se':'Cd–Se',
             'SiO2·P2O5':'SiO2–P2O5',
             'PbO–TeO2':'PbO–TeO2',
             'TiO2–SiO2':'TiO2–SiO2',
             'NO2–BF':'NO2–BF',
             'Na2O–P2O5':'Na2O–P2O5'}

In [16]:
def complex_smiles(name):
    """Build a joint SMILES for a system of several components (like A–B–C).

    The name is split into components, each component is resolved through
    PubChem (first as a formula, then as a compound name) and the component
    SMILES are joined with '.', the SMILES separator for disconnected parts.

    Parameters
    ----------
    name : str
        System name whose components are separated by an en dash ('–') or a
        minus sign ('−'); both occur in the source names.

    Returns
    -------
    str
        The '.'-joined SMILES of all components.

    Raises
    ------
    ValueError
        If the name contains no component at all.
    Exception
        Whatever the PubChem name lookup raises (including ``IndexError`` for
        an empty result) when a component cannot be resolved.
    """
    smiles_list = []
    # Split on both dash variants; a name such as 'PbO−PbF2−B2O3–Nd2O3' mixes them.
    for part in re.split('[–−]', name):
        part = part.strip()  # tolerate 'A – B' style spacing
        if not part:
            continue  # leading/trailing separator produces an empty piece
        try:
            smiles = pcp.get_compounds(part, 'formula')[0].isomeric_smiles
        except Exception:
            smiles = pcp.get_compounds(part, 'name')[0].isomeric_smiles
        smiles_list.append(smiles)
    if not smiles_list:
        raise ValueError(f'no components found in {name!r}')
    return '.'.join(smiles_list)

In [17]:
# Replace every corrected name in d_correct by its SMILES (None if it cannot be resolved).
# NOTE: the dictionary is overwritten in place, so this cell is meant to be run once
# right after d_correct is defined.
for k, v in d_correct.items():
    try:
        if '–' in v:
            # multi-component system: resolve each component separately
            smiles = complex_smiles(v)
        else:
            try:
                smiles = pcp.get_compounds(v, 'formula')[0].isomeric_smiles
            except Exception:
                smiles = pcp.get_compounds(v, 'name')[0].isomeric_smiles
    except Exception:
        # lookup failed, or v is no longer a string (the cell was run a second time)
        smiles = None
    # only existing keys are reassigned, so the dict size is unchanged during iteration
    d_correct[k] = smiles
d_correct

{'As40Se60': '[As](=[Se])[Se][As]=[Se]',
 'NMFA': 'CNC1=CC=CC=C1F',
 'BiZnBo–': '[B]=O.[Zn].[Bi]',
 'Si–O': '[Si].[O-2]',
 'Li2O–GeO2': '[Li+].[Li+].[O-2].O=[Ge]=O',
 'GeO2–PbO': 'O=[Ge]=O.O=[Pb]',
 "TiO2's": 'O=[Ti]=O',
 'ZnO–Ta2O5–B2O3': '[O-2].[Zn+2].[O-2].[O-2].[O-2].[O-2].[O-2].[Ta+5].[Ta+5].B(=O)OB=O',
 'CaO–SiO2': 'O=[Ca].O=[Si]=O',
 'GeO2–Bi2O3': 'O=[Ge]=O.O=[Bi]O[Bi]=O',
 'Na2O–B2O3': '[O-2].[Na+].[Na+].B(=O)OB=O',
 'K2O–TeO2': '[O-2].[K+].[K+].O=[Te]=O',
 'CH3–CH2–O–CH2–CH3': 'CCOCC',
 'MgO–PbF2–SiO2': 'O=[Mg].F[Pb]F.O=[Si]=O',
 'O–O': 'O=O',
 'F–B': '[F-].[B]',
 'PbO–Al2O3': 'O=[Pb].[O-2].[O-2].[O-2].[Al+3].[Al+3]',
 'ZnO–Al2O3': '[O-2].[Zn+2].[O-2].[O-2].[O-2].[Al+3].[Al+3]',
 '(−)–Sparteine': 'C1CCN2C[C@@H]3C[C@H]([C@H]2C1)CN4[C@H]3CCCC4',
 'PbF2–TeO2–B2O3–Eu2O3': 'F[Pb]F.O=[Te]=O.B(=O)OB=O.[O-2].[O-2].[O-2].[Eu+3].[Eu+3]',
 'CaC2': None,
 'SiO2 – SnO2': 'O=[Si]=O.O=[Sn]=O',
 'Ag–ZnO': '[Ag].[O-2].[Zn+2]',
 'ZnO –': '[O-2].[Zn+2]',
 'La2O3–B2O3': '[O-2].[O-2].[O-2].[La+3].

In [18]:
# move SMILES from dictionary to our DataFrame
for idx in data.index:
    name = data.loc[idx, 'Name']
    if name in d_correct.keys():
        data.at[idx, 'normalised_name'] = d_correct[name]
data['normalised_name'].info()

<class 'pandas.core.series.Series'>
Int64Index: 4468 entries, 0 to 4977
Series name: normalised_name
Non-Null Count  Dtype 
--------------  ----- 
4463 non-null   object
dtypes: object(1)
memory usage: 198.9+ KB


In [19]:
# delete rows with missing "normalised_name"
# .copy() makes the filtered frame independent, so the label-based assignments
# made to it in later cells do not trigger SettingWithCopyWarning
data = data[data['normalised_name'].notna()].copy()

In [27]:
# check if all the SMILES are valid (invalid SMILES can not be handled by RDKit)
for idx, smiles, name in zip(data.index, data['normalised_name'], data['Name']):
    # MolFromSmiles gives None when the string cannot be parsed
    if Chem.MolFromSmiles(smiles) is None:
        print(f'Invalid smiles: {smiles} in row {idx} for name {name}')

Invalid smiles: C.C.O.CN[OH]C in row 613 for name CH3O(CH2CH2O)nCH3
Invalid smiles: [CH222]O in row 699 for name CH222-OH
Invalid smiles: O(|[Sn](CCCC)(CCCC)CCCC)|[Sn](CCCC)(CCCC)CCCC in row 836 for name BTO
Invalid smiles: B(P)[II] in row 1306 for name Pb[II]
Invalid smiles: O(|[Sn](CCCC)(CCCC)CCCC)|[Sn](CCCC)(CCCC)CCCC in row 2042 for name BTO


In [29]:
# correction of invalid SMILES: row label -> replacement SMILES
smiles_fixes = {
    4: 'N.N.[Pt+2]',
    1306: '[Pb+2]',
    836: 'CCCC[Sn](CCCC)(CCCC)O[Sn](CCCC)(CCCC)CCCC',
    2042: 'CCCC[Sn](CCCC)(CCCC)O[Sn](CCCC)(CCCC)CCCC',
}
for row_label, fixed_smiles in smiles_fixes.items():
    data.loc[row_label, 'normalised_name'] = fixed_smiles
# rows 613 and 699 are removed instead of corrected
data = data.drop(index=[613, 699])

### Fixing the DOI column

To separate the column with DOI we applied regular expressions (regex)

In [8]:
# The pattern has three capture groups; group 0 (the outer one) is the whole DOI.
regex = (r'((10\.\d{4,9}/[A-Z0-9]+[A-Z])|(10\.\d{4,9}/[-._;()/:a-zA-Z0-9]+[0-9]))')
# keep only the outer group as a one-column frame
data_doi = data['DOI'].str.extract(regex)[[0]]
# put the cleaned "doi" column first and drop the raw "DOI" column
data = pd.concat(
    [data_doi.rename(columns={0: 'doi'}), data.drop(columns="DOI")],
    axis=1,
)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   doi                     5000 non-null   object 
 1   Date                    4555 non-null   object 
 2   Journal                 4555 non-null   object 
 3   Title                   4555 non-null   object 
 4   Name                    4992 non-null   object 
 5   measurement_error       5000 non-null   float64
 6   measurement_wavelength  597 non-null    object 
 7   measurement_method      5000 non-null   object 
 8   normalised_name         2994 non-null   object 
 9   raw_value               5000 non-null   object 
 10  specifier               5000 non-null   object 
dtypes: float64(1), object(10)
memory usage: 429.8+ KB


In [None]:
# parse titles of articles from GoogleScholar by their doi
for idx in data.index:
    title = data.loc[idx, 'Title']
    doi = data.loc[idx, 'doi']
    if type(title)!=str:
        doi = doi.replace('/', '%2F')
        print(doi)
        url = f'https://scholar.google.ru/scholar?hl=ru&as_sdt=0%2C5&q={doi}&btnG='
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'lxml')
            title = soup.find('h3', class_='gs_rt')
            if title!=None:
                data.loc[idx, 'Title'] = title.text
data.info()

### Descriptors search

We use RDKit API, PubChem API, and a python wrapper for PaDEL-Descriptor software to find descriptors.

In [30]:
def getMolDescriptors(mol, missingVal=None):
    """Calculate the full list of RDKit descriptors for a molecule.

    Parameters
    ----------
    mol : molecule object passed to every descriptor function.
    missingVal : value stored when a descriptor cannot be calculated
        (default ``None``).

    Returns
    -------
    dict
        Maps each descriptor name in ``Descriptors._descList`` to its value
        (or to ``missingVal`` when the descriptor function raised).
    """
    import traceback  # only needed to report descriptor failures

    res = {}
    for nm, fn in Descriptors._descList:
        try:
            val = fn(mol)
        except Exception:
            # report the failure but keep going with the remaining descriptors
            traceback.print_exc()
            val = missingVal
        res[nm] = val
    return res

In [31]:
# create a separate DataFrame with RDKit descriptors
mols = []
for smiles in data['normalised_name'].tolist():
  mol = Chem.MolFromSmiles(smiles)
  mols.append(mol)
allDescrs = [getMolDescriptors(mol) for mol in mols]
rdkit_data = pd.DataFrame(allDescrs)
rdkit_data.head() # so we get 208 RDKit descriptors

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0.0,0.0,0.0,0.0,0.494829,291.189,291.189,291.689311,28,4,...,0,0,0,0,0,0,0,0,0,0
1,5.833333,0.833333,5.833333,0.833333,0.480843,195.851,195.851,195.950931,10,1,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.499104,255.2,255.2,259.812449,12,4,...,0,0,0,0,0,0,0,0,0,0
3,5.555556,2.222222,5.555556,2.222222,0.256934,40.097,40.097,39.976927,8,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.62764,229.14,223.092,229.016792,24,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# create a separate DataFrame with PubChem descriptors
properties = ['MonoisotopicMass',
              'Complexity',
              'IsotopeAtomCount',
              'AtomStereoCount',
              'DefinedAtomStereoCount',
              'UndefinedAtomStereoCount',
              'BondStereoCount',
              'DefinedBondStereoCount',
              'UndefinedBondStereoCount',
              'CovalentUnitCount',
              'Volume3D',
              'XStericQuadrupole3D',
              'YStericQuadrupole3D',
              'ZStericQuadrupole3D',
              'FeatureCount3D',
              'FeatureRingCount3D',
              'FeatureHydrophobeCount3D',
              'ConformerModelRMSD3D',
              'EffectiveRotorCount3D',
              'ConformerCount3D',
              'Charge']
pubchem_descriptors = []
for smiles in data['normalised_name'].tolist():
    try:
        # a single request fetches all properties for this SMILES
        # (instead of one request per property)
        record = pcp.get_properties(properties, smiles, 'smiles')[0]
    except Exception:
        # request failed or returned nothing: every property becomes None
        record = {}
    # properties missing from the answer are stored as None
    pubchem_descriptors.append([record.get(prop) for prop in properties])
pubchem_data = pd.DataFrame(columns=properties, data=pubchem_descriptors)
pubchem_data.head()

In [None]:
# create a separate DataFrame with Materials Project descriptors
import os  # stdlib; used only to read the API key from the environment

# Fields requested from the Materials Project summary endpoint, in output order.
mp_fields = ['nsites', 'nelements', 'chemsys', 'volume', 'density', 'density_atomic',
             'symmetry', 'deprecated', 'deprecation_reasons', 'uncorrected_energy_per_atom',
             'energy_per_atom', 'formation_energy_per_atom', 'energy_above_hull', 'is_stable',
             'equilibrium_reaction_energy_per_atom', 'xas', 'grain_boundaries', 'band_gap',
             'cbm', 'vbm', 'efermi', 'is_gap_direct', 'is_metal', 'dos_energy_up',
             'dos_energy_down', 'is_magnetic', 'ordering', 'total_magnetization',
             'total_magnetization_normalized_vol',
             'total_magnetization_normalized_formula_units', 'num_magnetic_sites',
             'num_unique_magnetic_sites', 'types_of_magnetic_species', 'k_voigt', 'k_reuss',
             'k_vrh', 'g_voigt', 'g_reuss', 'g_vrh', 'universal_anisotropy',
             'homogeneous_poisson', 'e_total', 'e_ionic', 'e_electronic', 'n', 'e_ij_max',
             'weighted_surface_energy_EV_PER_ANG2', 'weighted_surface_energy',
             'weighted_work_function', 'surface_anisotropy', 'shape_factor',
             'has_reconstructed', 'theoretical']
strip_punctuation = str.maketrans('', '', string.punctuation)

# The API key is read from the MP_API_KEY environment variable instead of being
# stored in the notebook. The key that used to be hard-coded here has been
# published with the notebook and should be revoked.
with MPRester(os.environ['MP_API_KEY']) as mpr:
    mp_descriptors = []
    for smiles in data['normalised_name'].tolist():
        # NOTE(review): the query is hard-coded to formula='BN' and does not use
        # `smiles`, so every row receives the same descriptors - TODO: query the
        # formula of the current compound instead.
        docs = mpr.summary.search(formula='BN', fields=mp_fields)
        doc = docs[0]
        # symmetry and xas are reduced to single words taken from their text form
        sym_word = str(doc.symmetry).split(' ')[1].translate(strip_punctuation)
        xas_tokens = str(doc.xas).split(' ')
        xas_edge = xas_tokens[1].translate(strip_punctuation)
        xas_element = xas_tokens[3]
        xas_type = xas_tokens[5].translate(strip_punctuation)
        desc_list = []
        for field in mp_fields:
            if field == 'symmetry':
                desc_list.append(sym_word)
            elif field == 'xas':
                desc_list.extend([xas_edge, xas_element, xas_type])
            else:
                desc_list.append(getattr(doc, field))
        mp_descriptors.append(desc_list)
mp_data = pd.DataFrame(data=mp_descriptors)
mp_data.head() # so we get Materials Project descriptors

In [None]:
# create a separate DataFrame with PaDEL descriptors
from rdkit.Chem import AllChem
# molecules with explicit hydrogens, one per SMILES
mols = [Chem.AddHs(Chem.MolFromSmiles(smiles)) for smiles in data['normalised_name'].tolist()]
for mol in mols:
    # EmbedMolecule adds the 3D conformer to `mol` in place and returns only the
    # conformer id, so its return value must not replace the molecule itself
    AllChem.EmbedMolecule(mol)
# NOTE(review): this name shadows the `descriptors` module imported from
# PaDEL_pywrapper at the top of the notebook; kept for backward compatibility
descriptors = [AtomCount, BondCount, Constitutional, EStateAtomType, ExtendedTopochemicalAtom, MDE, MLFER, RotatableBondsCount, CarbonTypes, HBondAcceptorCount, HBondDonorCount, PathCount]
padel = PaDEL(descriptors)
padel_data = padel.calculate(mols)

In [None]:
# concatenate initial dataset with descriptors datasets
frames = [data, rdkit_data, pubchem_data, padel_data]
# Rows were dropped from `data` without resetting its index, while rdkit_data and
# pubchem_data are built from plain lists (index 0..n-1). pd.concat(axis=1) aligns
# on index labels, so all frames are re-indexed positionally first.
# (padel_data is assumed to be a DataFrame with one row per molecule - TODO confirm)
aligned_frames = [frame.reset_index(drop=True) for frame in frames]
assert len({len(frame) for frame in aligned_frames}) == 1, "descriptor frames differ in length"
data_final = pd.concat(aligned_frames, axis=1)
data_final

# Hometask №2

### Data visualization & statistics