In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import typing
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
from typing import Union, List, Tuple
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors3D
from rdkit.Chem import GraphDescriptors
from tqdm import tqdm

### Helper functions

In [22]:
def count_atoms(smils: "Chem.Mol", atomic_value: int) -> int:
    """Count the atoms of one element in a molecule.

    Args:
        smils: Molecule to inspect. Despite the parameter name this is a
            molecule object exposing ``GetAtoms()``, not a SMILES string.
        atomic_value: Atomic number of the element to count (e.g. 7 for N).

    Returns:
        Number of atoms whose ``GetAtomicNum()`` equals ``atomic_value``.
    """
    return sum(1 for atom in smils.GetAtoms() if atom.GetAtomicNum() == atomic_value)

def find_pattern_indexes(molecule: "Chem.Mol", patterns_dataframe: pd.DataFrame) -> List[int]:
    """Return the index labels of every pattern that occurs in ``molecule``.

    Args:
        molecule: Molecule to search in.
        patterns_dataframe: Frame whose ``"RDKit_Mol_Class"`` column holds the
            query molecules.

    Returns:
        Index labels of ``patterns_dataframe``, in frame order, for the rows
        whose pattern is a substructure of ``molecule``.
    """
    patterns = patterns_dataframe["RDKit_Mol_Class"]
    # Walk the single column instead of ``iterrows()``, which would build a
    # full Series for every row on every call.
    # ``HasSubstructMatch`` answers the yes/no question without materialising
    # the matched atom indices.
    # Missing patterns (``None``, e.g. from a SMILES string RDKit could not
    # parse) are skipped rather than handed to RDKit.
    return [
        index
        for index, pattern in patterns.items()
        if pattern is not None and molecule.HasSubstructMatch(pattern)
    ]

## Read databases

In [23]:
# Load the chromophore table, discard the "Tag" column, keep the first four
# remaining columns and then remove the absorption maximum.
chromophore_database = (
    pd.read_csv("DB for chromophore_Sci_Data_rev02.csv")
    .drop(columns="Tag")
    .iloc[:, 0:4]
    .drop(columns="Absorption max (nm)")
)

# Keep the rows in which the chromophore and the solvent hold the same value,
# and attach the parsed molecule for each of them.
is_own_solvent = chromophore_database["Chromophore"] == chromophore_database["Solvent"]
only_solids = chromophore_database[is_own_solvent].copy()
only_solids["RDKit_Mol_Class"] = only_solids["Chromophore"].apply(Chem.MolFromSmiles)

qm9_database = pd.read_csv("qm9.csv")

### Find patterns from QM9 database
Note: this cell is slow, because every chromophore is tested against every QM9 molecule.

In [24]:
# Parse every QM9 SMILES string once so the molecules can be reused as patterns.
qm9_database["RDKit_Mol_Class"] = qm9_database["smiles"].apply(Chem.MolFromSmiles)

# For each chromophore, store the QM9 index labels returned by find_pattern_indexes.
solid_molecules = only_solids["RDKit_Mol_Class"]
only_solids["qm9_pattern_indexes"] = solid_molecules.apply(
    find_pattern_indexes,
    patterns_dataframe=qm9_database,
)

### Count the atoms of each element: N, O, S, F, Cl, Br, I, Se, Te, Si, P, B, Sn, Ge

In [25]:
# Atomic number of every element that gets its own "<symbol>_count" column.
# Keeping symbol and number side by side makes each pair easy to verify;
# insertion order determines the order of the new columns.
heteroatom_numbers = {
    "N": 7, "O": 8, "S": 16, "F": 9, "Cl": 17, "Br": 35, "I": 53,
    "Se": 34, "Te": 52, "Si": 14, "P": 15, "B": 5, "Sn": 50, "Ge": 32,
}
# ``items()`` has a length, so tqdm can show a total instead of a bare counter.
for atom, value in tqdm(heteroatom_numbers.items()):
    only_solids[f"{atom}_count"] = only_solids["RDKit_Mol_Class"].apply(count_atoms, atomic_value=value)

14it [00:01,  7.90it/s]


In [26]:
# Preview the first rows, including the new "<symbol>_count" columns.
only_solids.head()

Unnamed: 0,Chromophore,Solvent,Emission max (nm),RDKit_Mol_Class,qm9_pattern_indexes,N_count,O_count,S_count,F_count,Cl_count,Br_count,I_count,Se_count,Te_count,Si_count,P_count,B_count,Sn_count,Ge_count
1322,O=c1oc2ccccc2cc1-c1cn2ccccc2n1,O=c1oc2ccccc2cc1-c1cn2ccccc2n1,520.0,<rdkit.Chem.rdchem.Mol object at 0x000001CF14F...,"[0, 1, 2, 5, 6, 50, 201, 213, 214, 924, 4994, ...",2,2,0,0,0,0,0,0,0,0,0,0,0,0
1323,CCN(CC)c1ccc2cc(-c3cn4ccccc4n3)c(=O)oc2c1,CCN(CC)c1ccc2cc(-c3cn4ccccc4n3)c(=O)oc2c1,593.0,<rdkit.Chem.rdchem.Mol object at 0x000001CF14F...,"[0, 1, 2, 5, 6, 50, 201, 213, 214, 924, 939, 4...",3,2,0,0,0,0,0,0,0,0,0,0,0,0
1324,CCN(CC)c1ccc2cc(-c3cn4cc(C)ccc4n3)c(=O)oc2c1,CCN(CC)c1ccc2cc(-c3cn4cc(C)ccc4n3)c(=O)oc2c1,557.0,<rdkit.Chem.rdchem.Mol object at 0x000001CF14F...,"[0, 1, 2, 5, 6, 50, 201, 213, 214, 924, 931, 9...",3,2,0,0,0,0,0,0,0,0,0,0,0,0
1325,CCN(CC)c1ccc2cc(-c3cn4cc5ccccc5cc4n3)c(=O)oc2c1,CCN(CC)c1ccc2cc(-c3cn4cc5ccccc5cc4n3)c(=O)oc2c1,590.0,<rdkit.Chem.rdchem.Mol object at 0x000001CF14F...,"[0, 1, 2, 5, 6, 50, 201, 213, 214, 924, 939, 4...",3,2,0,0,0,0,0,0,0,0,0,0,0,0
1548,COc1ccc(/C=C(\C#N)c2nc(-c3ccc(OC)cc3)cs2)cc1,COc1ccc(/C=C(\C#N)c2nc(-c3ccc(OC)cc3)cs2)cc1,526.0,<rdkit.Chem.rdchem.Mol object at 0x000001CF14F...,"[0, 1, 2, 4, 6, 7, 9, 12, 14, 30, 213, 929, 94...",2,2,1,0,0,0,0,0,0,0,0,0,0,0


In [27]:
# Column overview: dtype and non-null count of every column.
only_solids.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 956 entries, 1322 to 20213
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Chromophore          956 non-null    object 
 1   Solvent              956 non-null    object 
 2   Emission max (nm)    897 non-null    float64
 3   RDKit_Mol_Class      956 non-null    object 
 4   qm9_pattern_indexes  956 non-null    object 
 5   N_count              956 non-null    int64  
 6   O_count              956 non-null    int64  
 7   S_count              956 non-null    int64  
 8   F_count              956 non-null    int64  
 9   Cl_count             956 non-null    int64  
 10  Br_count             956 non-null    int64  
 11  I_count              956 non-null    int64  
 12  Se_count             956 non-null    int64  
 13  Te_count             956 non-null    int64  
 14  Si_count             956 non-null    int64  
 15  P_count              956 non-null  

### Create a subset of chromophores consisting only of C, O, N and F atoms

In [28]:
# Count columns of the elements that disqualify a chromophore from the subset.
excluded_count_columns = [
    f"{atom}_count"
    for atom in ("S", "Cl", "Br", "I", "Se", "Te", "Si", "P", "B", "Sn", "Ge")
]

# A single row-wise mask replaces the repeated in-place drops by index label,
# so rows are selected by position even if index labels were ever duplicated.
has_excluded_atom = only_solids[excluded_count_columns].gt(0).any(axis=1)

# NOTE(review): only the elements listed above are screened; an element that
# has no "<symbol>_count" column would not be filtered out here.
only_solids_conf = only_solids.loc[~has_excluded_atom].drop(columns=excluded_count_columns)

100%|██████████| 11/11 [00:00<00:00, 54.85it/s]


In [29]:
# Preview the filtered subset; only the N, O and F count columns remain.
only_solids_conf.head()

Unnamed: 0,Chromophore,Solvent,Emission max (nm),RDKit_Mol_Class,qm9_pattern_indexes,N_count,O_count,F_count
1322,O=c1oc2ccccc2cc1-c1cn2ccccc2n1,O=c1oc2ccccc2cc1-c1cn2ccccc2n1,520.0,<rdkit.Chem.rdchem.Mol object at 0x000001CF14F...,"[0, 1, 2, 5, 6, 50, 201, 213, 214, 924, 4994, ...",2,2,0
1323,CCN(CC)c1ccc2cc(-c3cn4ccccc4n3)c(=O)oc2c1,CCN(CC)c1ccc2cc(-c3cn4ccccc4n3)c(=O)oc2c1,593.0,<rdkit.Chem.rdchem.Mol object at 0x000001CF14F...,"[0, 1, 2, 5, 6, 50, 201, 213, 214, 924, 939, 4...",3,2,0
1324,CCN(CC)c1ccc2cc(-c3cn4cc(C)ccc4n3)c(=O)oc2c1,CCN(CC)c1ccc2cc(-c3cn4cc(C)ccc4n3)c(=O)oc2c1,557.0,<rdkit.Chem.rdchem.Mol object at 0x000001CF14F...,"[0, 1, 2, 5, 6, 50, 201, 213, 214, 924, 931, 9...",3,2,0
1325,CCN(CC)c1ccc2cc(-c3cn4cc5ccccc5cc4n3)c(=O)oc2c1,CCN(CC)c1ccc2cc(-c3cn4cc5ccccc5cc4n3)c(=O)oc2c1,590.0,<rdkit.Chem.rdchem.Mol object at 0x000001CF14F...,"[0, 1, 2, 5, 6, 50, 201, 213, 214, 924, 939, 4...",3,2,0
2412,COc1ccc(N(c2ccccc2)c2ccccc2)cc1,COc1ccc(N(c2ccccc2)c2ccccc2)cc1,373.0,<rdkit.Chem.rdchem.Mol object at 0x000001CF14F...,"[0, 1, 2, 7, 14, 213, 939, 948, 4581, 5388, 53...",1,1,0


In [30]:
# Column overview of the subset: dtype and non-null count of every column.
only_solids_conf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 558 entries, 1322 to 20213
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Chromophore          558 non-null    object 
 1   Solvent              558 non-null    object 
 2   Emission max (nm)    523 non-null    float64
 3   RDKit_Mol_Class      558 non-null    object 
 4   qm9_pattern_indexes  558 non-null    object 
 5   N_count              558 non-null    int64  
 6   O_count              558 non-null    int64  
 7   F_count              558 non-null    int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 59.2+ KB


## Save databases

In [31]:
# Columns left out of the export: the parsed molecule objects and the solvent.
export_excluded = ["RDKit_Mol_Class", "Solvent"]
for frame, destination in (
    (only_solids, "only_solids.json"),
    (only_solids_conf, "only_solids_conf.json"),
):
    frame.drop(columns=export_excluded).to_json(destination)