# Calculate RDKit molecular descriptors, MACCS keys and Morgan FPs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import typing
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
from typing import Union, List, Tuple
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors3D
from rdkit.Chem import GraphDescriptors
from tqdm import tqdm

# Read data

In [3]:
# Load the QM9 table (CSV) and the two `only_solids` tables (JSON).
# The JSON frames get a fresh 0..n-1 index right after loading.
qm9_database = pd.read_csv("qm9.csv")
only_solids = pd.read_json("only_solids.json").reset_index(drop=True)
only_solids_conf = pd.read_json("only_solids_conf.json").reset_index(drop=True)

# Turn every SMILES string into an RDKit molecule object, stored next to
# the original data in a new "RDKit_Mol_Class" column of each table.
qm9_database["RDKit_Mol_Class"] = qm9_database["smiles"].map(Chem.MolFromSmiles)
only_solids["RDKit_Mol_Class"] = only_solids["Chromophore"].map(Chem.MolFromSmiles)
only_solids_conf["RDKit_Mol_Class"] = only_solids_conf["Chromophore"].map(Chem.MolFromSmiles)

In [4]:
# Move the identifying columns and the target to the front of both tables;
# every remaining column keeps its original relative order behind them.
first_4_cols = ["Chromophore", "qm9_pattern_indexes", "RDKit_Mol_Class", "Emission max (nm)"]
only_solids = only_solids[
    first_4_cols + [col for col in only_solids.columns if col not in first_4_cols]
]
only_solids_conf = only_solids_conf[
    first_4_cols + [col for col in only_solids_conf.columns if col not in first_4_cols]
]

In [5]:
# Sanity check: (rows, columns) of `only_solids_conf` before any features are added.
only_solids_conf.shape

(558, 7)

In [6]:
# Sanity check: (rows, columns) of `only_solids` before any features are added.
only_solids.shape

(956, 18)

In [7]:
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

# Featurise `only_solids`: for every molecule compute all RDKit descriptors,
# the MACCS keys and a Morgan fingerprint, then attach them as new columns.
#
# Rows are collected in plain lists and each feature frame is built once:
# `DataFrame.append` no longer exists in pandas >= 2.0, and growing a frame
# row by row copies it on every iteration.

descriptors_names = [name for name, _ in Descriptors._descList]  # all RDKit descriptors
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors_names)

# Fail early with a readable message if any molecule is missing
# (Chem.MolFromSmiles yields None for SMILES it cannot parse).
unparsed = only_solids["RDKit_Mol_Class"].isna()
if unparsed.any():
    raise ValueError(
        f"{int(unparsed.sum())} rows of only_solids have no RDKit molecule; "
        "fix or drop them before computing features."
    )

descriptor_rows, maccs_rows, morgan_rows = [], [], []
for mol in tqdm(only_solids["RDKit_Mol_Class"]):
    descriptor_rows.append(calculator.CalcDescriptors(mol))

    # Placeholder array; ConvertToNumpyArray resizes it to the bit-vector length.
    maccs_vect = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(MACCSkeys.GenMACCSKeys(mol), maccs_vect)
    maccs_rows.append(maccs_vect)

    # Morgan fingerprint: radius 2, folded to 1024 bits.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    fp_vect = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, fp_vect)
    morgan_rows.append(fp_vect)

# Build each feature frame in one go, aligned to the rows of `only_solids`
# so the column-wise concat below cannot mis-align them.
descriptors = pd.DataFrame(descriptor_rows, columns=descriptors_names, index=only_solids.index)
maccs_keys = pd.DataFrame(maccs_rows, index=only_solids.index)
maccs_keys.columns = [f"MACCS_key{x}" for x in range(maccs_keys.shape[1])]
morgan_fp = pd.DataFrame(morgan_rows, index=only_solids.index)
morgan_fp.columns = [f"MorganFP_bit_{x}" for x in range(morgan_fp.shape[1])]

only_solids = pd.concat([only_solids, descriptors, maccs_keys, morgan_fp], axis=1)
cols = list(only_solids.columns)  # kept for any later cell that still reads `cols`


100%|██████████| 956/956 [01:02<00:00, 15.18it/s]


In [8]:
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

# Featurise `only_solids_conf`: for every molecule compute all RDKit
# descriptors, the MACCS keys and a Morgan fingerprint, then attach them as
# new columns.
#
# Rows are collected in plain lists and each feature frame is built once:
# `DataFrame.append` no longer exists in pandas >= 2.0, and growing a frame
# row by row copies it on every iteration.

descriptors_names = [name for name, _ in Descriptors._descList]  # all RDKit descriptors
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors_names)

# Fail early with a readable message if any molecule is missing
# (Chem.MolFromSmiles yields None for SMILES it cannot parse).
unparsed = only_solids_conf["RDKit_Mol_Class"].isna()
if unparsed.any():
    raise ValueError(
        f"{int(unparsed.sum())} rows of only_solids_conf have no RDKit molecule; "
        "fix or drop them before computing features."
    )

descriptor_rows, maccs_rows, morgan_rows = [], [], []
for mol in tqdm(only_solids_conf["RDKit_Mol_Class"]):
    descriptor_rows.append(calculator.CalcDescriptors(mol))

    # Placeholder array; ConvertToNumpyArray resizes it to the bit-vector length.
    maccs_vect = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(MACCSkeys.GenMACCSKeys(mol), maccs_vect)
    maccs_rows.append(maccs_vect)

    # Morgan fingerprint: radius 2, folded to 1024 bits.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    fp_vect = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, fp_vect)
    morgan_rows.append(fp_vect)

# Build each feature frame in one go, aligned to the rows of
# `only_solids_conf` so the column-wise concat below cannot mis-align them.
descriptors = pd.DataFrame(descriptor_rows, columns=descriptors_names, index=only_solids_conf.index)
maccs_keys = pd.DataFrame(maccs_rows, index=only_solids_conf.index)
maccs_keys.columns = [f"MACCS_key{x}" for x in range(maccs_keys.shape[1])]
morgan_fp = pd.DataFrame(morgan_rows, index=only_solids_conf.index)
morgan_fp.columns = [f"MorganFP_bit_{x}" for x in range(morgan_fp.shape[1])]

only_solids_conf = pd.concat([only_solids_conf, descriptors, maccs_keys, morgan_fp], axis=1)
cols = list(only_solids_conf.columns)  # kept for any later cell that still reads `cols`


100%|██████████| 558/558 [00:36<00:00, 15.11it/s]


In [9]:
# Preview the first rows of `only_solids_conf` with the descriptor, MACCS and Morgan columns attached.
only_solids_conf.head()

Unnamed: 0,Chromophore,qm9_pattern_indexes,RDKit_Mol_Class,Emission max (nm),N_count,O_count,F_count,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,...,MorganFP_bit_1014,MorganFP_bit_1015,MorganFP_bit_1016,MorganFP_bit_1017,MorganFP_bit_1018,MorganFP_bit_1019,MorganFP_bit_1020,MorganFP_bit_1021,MorganFP_bit_1022,MorganFP_bit_1023
0,O=c1oc2ccccc2cc1-c1cn2ccccc2n1,"[0, 1, 2, 5, 6, 50, 201, 213, 214, 924, 4994, ...",<rdkit.Chem.rdchem.Mol object at 0x000001D852F...,520.0,2,2,0,12.094502,-0.36842,12.094502,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CCN(CC)c1ccc2cc(-c3cn4ccccc4n3)c(=O)oc2c1,"[0, 1, 2, 5, 6, 50, 201, 213, 214, 924, 939, 4...",<rdkit.Chem.rdchem.Mol object at 0x000001D852F...,593.0,3,2,0,12.501134,-0.36796,12.501134,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,CCN(CC)c1ccc2cc(-c3cn4cc(C)ccc4n3)c(=O)oc2c1,"[0, 1, 2, 5, 6, 50, 201, 213, 214, 924, 931, 9...",<rdkit.Chem.rdchem.Mol object at 0x000001D850D...,557.0,3,2,0,12.568071,-0.366366,12.568071,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,CCN(CC)c1ccc2cc(-c3cn4cc5ccccc5cc4n3)c(=O)oc2c1,"[0, 1, 2, 5, 6, 50, 201, 213, 214, 924, 939, 4...",<rdkit.Chem.rdchem.Mol object at 0x000001D850D...,590.0,3,2,0,12.735007,-0.373002,12.735007,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,COc1ccc(N(c2ccccc2)c2ccccc2)cc1,"[0, 1, 2, 7, 14, 213, 939, 948, 4581, 5388, 53...",<rdkit.Chem.rdchem.Mol object at 0x000001D850D...,373.0,1,1,0,5.241821,0.860948,5.241821,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Preview the first rows of `only_solids` with the descriptor, MACCS and Morgan columns attached.
only_solids.head()

Unnamed: 0,Chromophore,qm9_pattern_indexes,RDKit_Mol_Class,Emission max (nm),N_count,O_count,S_count,F_count,Cl_count,Br_count,...,MorganFP_bit_1014,MorganFP_bit_1015,MorganFP_bit_1016,MorganFP_bit_1017,MorganFP_bit_1018,MorganFP_bit_1019,MorganFP_bit_1020,MorganFP_bit_1021,MorganFP_bit_1022,MorganFP_bit_1023
0,O=c1oc2ccccc2cc1-c1cn2ccccc2n1,"[0, 1, 2, 5, 6, 50, 201, 213, 214, 924, 4994, ...",<rdkit.Chem.rdchem.Mol object at 0x000001D852F...,520.0,2,2,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CCN(CC)c1ccc2cc(-c3cn4ccccc4n3)c(=O)oc2c1,"[0, 1, 2, 5, 6, 50, 201, 213, 214, 924, 939, 4...",<rdkit.Chem.rdchem.Mol object at 0x000001D852F...,593.0,3,2,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,CCN(CC)c1ccc2cc(-c3cn4cc(C)ccc4n3)c(=O)oc2c1,"[0, 1, 2, 5, 6, 50, 201, 213, 214, 924, 931, 9...",<rdkit.Chem.rdchem.Mol object at 0x000001D852F...,557.0,3,2,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,CCN(CC)c1ccc2cc(-c3cn4cc5ccccc5cc4n3)c(=O)oc2c1,"[0, 1, 2, 5, 6, 50, 201, 213, 214, 924, 939, 4...",<rdkit.Chem.rdchem.Mol object at 0x000001D852F...,590.0,3,2,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,COc1ccc(/C=C(\C#N)c2nc(-c3ccc(OC)cc3)cs2)cc1,"[0, 1, 2, 4, 6, 7, 9, 12, 14, 30, 213, 929, 94...",<rdkit.Chem.rdchem.Mol object at 0x000001D852F...,526.0,2,2,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Write both feature tables to JSON. The "RDKit_Mol_Class" column is left out
# of the export (presumably because the Mol objects cannot be written to JSON).
only_solids.drop("RDKit_Mol_Class", axis="columns").to_json("only_solids_features.json")
only_solids_conf.drop("RDKit_Mol_Class", axis="columns").to_json("only_solids_conf_features.json")