In [None]:
!pip install rdkit-pypi



In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
from rdkit import Chem
from rdkit.Chem import Descriptors, SaltRemover, MolFromSmiles, Draw, GetFormalCharge, MolToSmiles
from rdkit.Chem.MolStandardize import rdMolStandardize
from statistics import mean

### Dataset 1 (https://github.com/theochem/B3DB)

In [None]:
# Load the B3DB table (tab-separated file) and preview its first rows.
data_1 = pd.read_csv('B3DB.tsv', sep='\t')
data_1.head()

Unnamed: 0,NO.,compound_name,IUPAC_name,SMILES,CID,logBB,BBB+/BBB-,Inchi,threshold,reference,group,comments
0,1,sulphasalazine,2-hydroxy-5-[[4-(pyridin-2-ylsulfamoyl)phenyl]...,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,5339.0,-2.69,BBB-,InChI=1S/C18H14N4O5S/c23-16-9-6-13(11-15(16)18...,,R2|R2|R25|R46|,A,
1,2,moxalactam,7-[[2-carboxy-2-(4-hydroxyphenyl)acetyl]amino]...,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,3889.0,-2.52,BBB-,InChI=1S/C20H20N6O9S/c1-25-19(22-23-24-25)36-8...,,R25|,A,
2,3,clioquinol,5-chloro-7-iodoquinolin-8-ol,Oc1c(I)cc(Cl)c2cccnc12,2788.0,-2.4,BBB-,InChI=1S/C9H5ClINO/c10-6-4-7(11)9(13)8-5(6)2-1...,,R18|R26|R27|,A,
3,4,bbcpd11 (cimetidine analog) (y-g13),2-[2-[(3-bromopyridin-2-yl)methylsulfanyl]ethy...,CCNC(=NCCSCc1ncccc1Br)NC#N,14022517.0,-2.15,BBB-,InChI=1S/C12H16BrN5S/c1-2-15-12(18-9-14)17-6-7...,,R2|R2|R8|R40|R2|R2|R2|R2|R18|R21|R25|R25|R26|R...,A,
4,5,schembl614298,"(2s,3s,4s,5r)-6-[[(4r,4ar,7s,7ar,12bs)-7-hydro...",CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,18595497.0,-2.15,BBB-,InChI=1S/C23H27NO9/c1-24-7-6-23-10-3-4-12(25)2...,,R25|,A,


In [None]:
# Show column dtypes and non-null counts of the freshly loaded table.
data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7807 entries, 0 to 7806
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   NO.            7807 non-null   int64  
 1   compound_name  6698 non-null   object 
 2   IUPAC_name     6170 non-null   object 
 3   SMILES         7807 non-null   object 
 4   CID            6170 non-null   float64
 5   logBB          1058 non-null   float64
 6   BBB+/BBB-      7807 non-null   object 
 7   Inchi          7807 non-null   object 
 8   threshold      3621 non-null   float64
 9   reference      7807 non-null   object 
 10  group          7807 non-null   object 
 11  comments       18 non-null     object 
dtypes: float64(3), int64(1), object(8)
memory usage: 732.0+ KB


In [None]:
# Keep only the columns used in the rest of the notebook.
# .copy() makes the result an independent frame, so later column assignments
# do not raise pandas' SettingWithCopyWarning.
data_1 = data_1[['SMILES', 'BBB+/BBB-', 'logBB']].copy()


In [None]:
# Confirm that only the three selected columns remain.
data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7807 entries, 0 to 7806
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SMILES     7807 non-null   object 
 1   BBB+/BBB-  7807 non-null   object 
 2   logBB      1058 non-null   float64
dtypes: float64(1), object(2)
memory usage: 183.1+ KB


In [None]:
# Replace missing logBB values with the sentinel 1e9 (a later cell selects rows on this value).
# assign() returns a new frame instead of writing into a column selection,
# which avoids pandas' SettingWithCopyWarning.
data_1 = data_1.assign(logBB=data_1['logBB'].fillna(1e9))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1['logBB'] = data_1['logBB'].fillna(1e9)


In [None]:
# Check that logBB no longer contains missing values.
data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7807 entries, 0 to 7806
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SMILES     7807 non-null   object 
 1   BBB+/BBB-  7807 non-null   object 
 2   logBB      7807 non-null   float64
dtypes: float64(1), object(2)
memory usage: 183.1+ KB


In [None]:
# Keep the rows carrying the 1e9 sentinel in logBB, then discard the logBB column.
has_sentinel = data_1['logBB'] == 1e9
data_1 = data_1[has_sentinel].drop(columns='logBB')

In [None]:
# Row count and columns after filtering on the logBB sentinel.
data_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6749 entries, 1058 to 7806
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   SMILES     6749 non-null   object
 1   BBB+/BBB-  6749 non-null   object
dtypes: object(2)
memory usage: 158.2+ KB


In [None]:
def _canonical_smiles(smi):
    """Parse `smi` with RDKit, write it back as SMILES and canonicalise the result."""
    parsed = Chem.MolFromSmiles(smi)
    return Chem.CanonSmiles(Chem.MolToSmiles(parsed))


# Rewrite every SMILES string in its RDKit canonical form.
data_1['SMILES'] = data_1['SMILES'].map(_canonical_smiles)

In [None]:
# Renumber the rows 0..n-1, discarding the old index labels.
data_1 = data_1.reset_index(drop=True)

In [None]:
# Preview the canonicalised SMILES with the fresh index.
data_1.head()

Unnamed: 0,SMILES,BBB+/BBB-
0,BrC(Br)Br,BBB+
1,C#CC(C)(O)CC,BBB+
2,C#CC(O)(/C=C/Cl)CC,BBB+
3,C#CC(OC(N)=O)c1ccccc1,BBB+
4,C#CC1(OCC(O)CN2CCN(c3ccc(F)cc3)CC2)CCCCC1,BBB+


In [None]:
# Encode the class label as an integer: 'BBB+' -> 1, anything else -> 0.
is_bbb_plus = data_1['BBB+/BBB-'] == 'BBB+'
data_1['BBB+/BBB-'] = is_bbb_plus.astype('int64')
data_1.head()

Unnamed: 0,SMILES,BBB+/BBB-
0,BrC(Br)Br,1
1,C#CC(C)(O)CC,1
2,C#CC(O)(/C=C/Cl)CC,1
3,C#CC(OC(N)=O)c1ccccc1,1
4,C#CC1(OCC(O)CN2CCN(c3ccc(F)cc3)CC2)CCCCC1,1


In [None]:
# Rename the label column to 'Activity'; the 'SMILES' column keeps its name.
data_1 = data_1.rename(columns={'BBB+/BBB-': 'Activity'})
data_1.head()

Unnamed: 0,SMILES,Activity
0,BrC(Br)Br,1
1,C#CC(C)(O)CC,1
2,C#CC(O)(/C=C/Cl)CC,1
3,C#CC(OC(N)=O)c1ccccc1,1
4,C#CC1(OCC(O)CN2CCN(c3ccc(F)cc3)CC2)CCCCC1,1


In [None]:
# Class balance: number of rows per Activity label.
data_1['Activity'].value_counts()

Activity
1    4026
0    2723
Name: count, dtype: int64

In [None]:
# drop_duplicates() returns a new frame and leaves the original untouched,
# so the result must be assigned back for the duplicate rows to be removed.
data_1 = data_1.drop_duplicates()
data_1

Unnamed: 0,SMILES,Activity
0,BrC(Br)Br,1
1,C#CC(C)(O)CC,1
2,C#CC(O)(/C=C/Cl)CC,1
3,C#CC(OC(N)=O)c1ccccc1,1
4,C#CC1(OCC(O)CN2CCN(c3ccc(F)cc3)CC2)CCCCC1,1
...,...,...
6744,c1ccc(CN(CC2=NCCN2)c2ccccc2)cc1,0
6745,CCOCCn1c(N2CCCN(C)CC2)nc2ccccc21,1
6746,CN1CCC(=C2c3ccccc3CC(=O)c3sccc32)CC1,1
6747,Cc1[nH]c(=O)c(C#N)cc1-c1ccncc1,0


In [None]:
# Row count and dtypes after the duplicate-removal step.
data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6749 entries, 0 to 6748
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   SMILES    6749 non-null   object
 1   Activity  6749 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 105.6+ KB


In [None]:
# Drop entries whose SMILES string contains neither 'C' nor 'c'.
# A single vectorised mask replaces the row-by-row drop, which copied the whole
# frame for every removed row. The original index labels are preserved.
# NOTE: this is a plain substring test (it also matches e.g. 'Cl'); an atom-level
# carbon check is done further down, on the parsed molecules.
has_c_char = data_1['SMILES'].str.contains('[Cc]')
data_1 = data_1[has_c_char]
data_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6748 entries, 0 to 6748
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   SMILES    6748 non-null   object
 1   Activity  6748 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 158.2+ KB


In [None]:
# Work on an independent copy: a plain `df = data_1` is only an alias, so the
# columns added to `df` below would silently appear in `data_1` as well.
df = data_1.copy()

### **SMILES Preprocessing**

In [None]:
def _is_multi_fragment(smiles):
    """True when the SMILES string contains a '.' fragment separator."""
    return '.' in smiles


def _longest_fragment(smiles):
    """Return the longest '.'-separated piece of the SMILES string (first one on ties)."""
    return max(smiles.split('.'), key=len)


# Flag multi-fragment entries, keep their longest fragment, and parse it with RDKit.
df['Agglomeration'] = df['SMILES'].map(_is_multi_fragment)
df['SMILES_clear'] = df['SMILES'].map(_longest_fragment)
df['ROMol'] = df['SMILES_clear'].map(Chem.MolFromSmiles)
print(df.shape)

(6748, 5)


In [None]:
# Discard rows for which no molecule object is available (ROMol is missing).
# .copy() detaches the result so the column assignments in later cells are not
# made on a filtered slice (which triggers pandas' SettingWithCopyWarning).
df = df[df["ROMol"].notna()].copy()
print(df.shape)

(6748, 5)


In [None]:
# RDKit standardizer object; its uncharge() method is applied to every molecule below.
un = rdMolStandardize.Uncharger()

In [None]:
def _uncharged_smiles(mol):
    """Return the kekulized SMILES of `mol` after running the Uncharger.

    Returns None (and reports the error) when RDKit fails on this molecule,
    so that a single bad structure no longer aborts the whole cell silently.
    """
    try:
        return Chem.MolToSmiles(un.uncharge(mol), kekuleSmiles=True)
    except Exception as err:  # report instead of swallowing the failure
        print('uncharge failed:', err)
        return None


def _formal_charge(smiles):
    """Return the net formal charge of `smiles`, or None if it is missing or unparsable."""
    if smiles is None:
        return None
    mol = Chem.MolFromSmiles(smiles)
    return None if mol is None else GetFormalCharge(mol)


# Detach from any filtered slice before adding columns.
df = df.copy()
# Charge before neutralisation; ROMol was parsed from SMILES_clear, so this equals
# the charge of the SMILES_clear structure.
df['FORMAL_CHARGE_ch'] = df['ROMol'].apply(lambda mol: GetFormalCharge(mol))
# Neutralised structure as kekulized SMILES; failed rows get a missing value,
# which a subsequent dropna() can remove.
df['SMILES_uncharge'] = df['ROMol'].apply(_uncharged_smiles)
# Residual charge after neutralisation.
df['FORMAL_CHARGE_unch'] = df['SMILES_uncharge'].apply(_formal_charge)


[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running Uncharger
[14:41:34] Running 

In [None]:
# Verify that the three new columns were created and contain no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6748 entries, 0 to 6748
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   SMILES              6748 non-null   object
 1   Activity            6748 non-null   int64 
 2   Agglomeration       6748 non-null   bool  
 3   SMILES_clear        6748 non-null   object
 4   ROMol               6748 non-null   object
 5   FORMAL_CHARGE_ch    6748 non-null   int64 
 6   SMILES_uncharge     6748 non-null   object
 7   FORMAL_CHARGE_unch  6748 non-null   int64 
dtypes: bool(1), int64(3), object(4)
memory usage: 428.3+ KB


In [None]:
# Remove the intermediate columns that are not needed any more.
df = df.drop(columns=['SMILES_clear', 'ROMol', 'FORMAL_CHARGE_ch'])


In [None]:
# Discard every row that has a missing value in any column, then summarise.
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6748 entries, 0 to 6748
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   SMILES              6748 non-null   object
 1   Activity            6748 non-null   int64 
 2   Agglomeration       6748 non-null   bool  
 3   SMILES_uncharge     6748 non-null   object
 4   FORMAL_CHARGE_unch  6748 non-null   int64 
dtypes: bool(1), int64(2), object(2)
memory usage: 270.2+ KB


In [None]:
# Renumber the rows 0..n-1 and preview the result.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,SMILES,Activity,Agglomeration,SMILES_uncharge,FORMAL_CHARGE_unch
0,BrC(Br)Br,1,False,BrC(Br)Br,0
1,C#CC(C)(O)CC,1,False,C#CC(C)(O)CC,0
2,C#CC(O)(/C=C/Cl)CC,1,False,C#CC(O)(/C=C/Cl)CC,0
3,C#CC(OC(N)=O)c1ccccc1,1,False,C#CC(OC(N)=O)C1=CC=CC=C1,0
4,C#CC1(OCC(O)CN2CCN(c3ccc(F)cc3)CC2)CCCCC1,1,False,C#CC1(OCC(O)CN2CCN(C3=CC=C(F)C=C3)CC2)CCCCC1,0


### **Remove inorganic compounds**

In [None]:
# Row labels of df as a plain Python list (same order as the rows).
index_sm = df.index.to_list()

In [None]:
# Peek at the first five row labels.
print(index_sm[:5])

[0, 1, 2, 3, 4]


In [None]:
# Uncharged SMILES strings as a NumPy array, aligned with index_sm.
sm = df['SMILES_uncharge'].to_numpy()
sm

array(['BrC(Br)Br', 'C#CC(C)(O)CC', 'C#CC(O)(/C=C/Cl)CC', ...,
       'CN1CCC(=C2C3=CC=CC=C3CC(=O)C3=C2C=CS3)CC1',
       'CC1=C(C2=CC=NC=C2)C=C(C#N)C(=O)N1', 'NC1=CC(C2=CC=NC=C2)=CNC1=O'],
      dtype=object)

In [None]:
# Look for rows whose uncharged SMILES exactly equals this charged chlorine/oxygen structure.
target_smiles = '[O-][Cl+3]([O-])([O-])O'
cl = df.loc[df['SMILES_uncharge'].eq(target_smiles)]
cl

Unnamed: 0,SMILES,Activity,Agglomeration,SMILES_uncharge,FORMAL_CHARGE_unch


In [None]:
# Element symbols whose presence marks an entry for removal.
# Every symbol is its own list item: the previous version was missing two commas,
# which fused neighbours into 'Ar,Ba' and 'KrLa' so Ar, Ba, Kr and La never matched.
list_inorg_el = ['Al', 'Au', 'Ar', 'Ba', 'Be', 'Bi', 'Ca', 'Cd', 'Co', 'Cr', 'Cu', 'Fe', 'Gd', 'Ge', 'Hf',
                 'Hg', 'In', 'K', 'Kr', 'La', 'Mg', 'Mn', 'Na', 'Ni', 'Pb', 'Pt', 'Sb', 'Sn', 'Sr', 'Te',
                 'V', 'Zn', 'Li', 'Xe', 'Rn', 'Ne']
inorg_el_set = set(list_inorg_el)

index_drop = []   # row labels of df scheduled for removal
uniq_el = set()   # every element symbol seen in successfully parsed molecules

for sm_i in range(len(sm)):
    mol_sm = MolFromSmiles(sm[sm_i])
    if mol_sm is None:
        # RDKit could not parse the SMILES: drop the row (explicit check instead of
        # relying on a bare `except` to catch the resulting AttributeError).
        index_drop.append(index_sm[sm_i])
        continue

    list_at = [str(atom.GetSymbol()) for atom in mol_sm.GetAtoms()]
    intersec = list(set(list_at) & inorg_el_set)

    if 'C' not in list_at:
        # No carbon atom at all: treat as inorganic.
        index_drop.append(index_sm[sm_i])
        print('smiles', sm[sm_i])
    elif len(intersec) > 0:
        # Contains at least one listed element.
        index_drop.append(index_sm[sm_i])
        print('intersection', intersec)

    uniq_el = uniq_el | set(list_at)



In [None]:
# All element symbols found across the parsed molecules.
uniq_el

{'B', 'Br', 'C', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S', 'Si'}

In [None]:
# Row labels collected for removal by the element filter.
index_drop

[]

In [None]:
# Number of rows collected for removal.
len(index_drop)

0

In [None]:
# Remove the rows collected in index_drop.
df = df.drop(index=index_drop)

In [None]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,Activity,FORMAL_CHARGE_unch
count,6748.0,6748.0
mean,0.596473,0.018672
std,0.490641,0.158569
min,0.0,0.0
25%,0.0,0.0
50%,1.0,0.0
75%,1.0,0.0
max,1.0,3.0


In [None]:
# Number of rows that share their uncharged SMILES with at least one other row.
# duplicated() returns one boolean per row, so len() of it is just the row count;
# summing the mask counts the rows that are actually duplicated.
df.duplicated(subset=['SMILES_uncharge'], keep=False).sum()

6748

In [None]:
# Collect every row whose uncharged SMILES occurs more than once, group equal
# structures next to each other, and export them for inspection.
dupl_mask = df.duplicated(subset=['SMILES_uncharge'], keep=False)
df_dupl = df.loc[dupl_mask].sort_values(by='SMILES_uncharge')
df_dupl.to_csv('BBB_dupl.csv', index=False)
df_dupl

Unnamed: 0,SMILES,Activity,Agglomeration,SMILES_uncharge,FORMAL_CHARGE_unch
302,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccccc3)C(=...,0,False,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)C3=CC=CC=C3)...,0
303,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccccc3)C(=...,0,True,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)C3=CC=CC=C3)...,0
874,CNO.Cc1oc(C(C)(C)N(C(C)(C)N)C(C)(C)N)c(C)c1C,1,True,CC1=C(C)C(C)=C(C(C)(C)N(C(C)(C)N)C(C)(C)N)O1,0
2574,Cc1oc(C(C)(C)N(C(C)(C)N)C(C)(C)N)c(C)c1C,1,False,CC1=C(C)C(C)=C(C(C)(C)N(C(C)(C)N)C(C)(C)N)O1,0
4480,ClCC1CO1.NCCNCCNCCNCCN,0,True,NCCNCCNCCNCCN,0
4843,NCCNCCNCCNCCN,0,False,NCCNCCNCCNCCN,0


In [None]:
# Display the current working table.
df

Unnamed: 0,SMILES,Activity,Agglomeration,SMILES_uncharge,FORMAL_CHARGE_unch
0,BrC(Br)Br,1,False,BrC(Br)Br,0
1,C#CC(C)(O)CC,1,False,C#CC(C)(O)CC,0
2,C#CC(O)(/C=C/Cl)CC,1,False,C#CC(O)(/C=C/Cl)CC,0
3,C#CC(OC(N)=O)c1ccccc1,1,False,C#CC(OC(N)=O)C1=CC=CC=C1,0
4,C#CC1(OCC(O)CN2CCN(c3ccc(F)cc3)CC2)CCCCC1,1,False,C#CC1(OCC(O)CN2CCN(C3=CC=C(F)C=C3)CC2)CCCCC1,0
...,...,...,...,...,...
6743,c1ccc(CN(CC2=NCCN2)c2ccccc2)cc1,0,False,C1=CC=C(CN(CC2=NCCN2)C2=CC=CC=C2)C=C1,0
6744,CCOCCn1c(N2CCCN(C)CC2)nc2ccccc21,1,False,CCOCCN1C(N2CCCN(C)CC2)=NC2=CC=CC=C21,0
6745,CN1CCC(=C2c3ccccc3CC(=O)c3sccc32)CC1,1,False,CN1CCC(=C2C3=CC=CC=C3CC(=O)C3=C2C=CS3)CC1,0
6746,Cc1[nH]c(=O)c(C#N)cc1-c1ccncc1,0,False,CC1=C(C2=CC=NC=C2)C=C(C#N)C(=O)N1,0


In [None]:
# Renumber the rows 0..n-1 before export.
df = df.reset_index(drop=True)

In [None]:
# Write the table to CSV without the index column.
df.to_csv('classification_dataset_without_descriptors.csv',index=False)