In [327]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
from rdkit import Chem
from rdkit.Chem import Descriptors
from statistics import mean

### Dataset 1 (https://github.com/theochem/B3DB)

In [328]:
data_1 = pd.read_csv('B3DB_classification.tsv', sep='\t')
data_1.head()

Unnamed: 0,NO.,compound_name,IUPAC_name,SMILES,CID,logBB,BBB+/BBB-,Inchi,threshold,reference,group,comments
0,1,sulphasalazine,2-hydroxy-5-[[4-(pyridin-2-ylsulfamoyl)phenyl]...,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,5339.0,-2.69,BBB-,InChI=1S/C18H14N4O5S/c23-16-9-6-13(11-15(16)18...,,R2|R2|R25|R46|,A,
1,2,moxalactam,7-[[2-carboxy-2-(4-hydroxyphenyl)acetyl]amino]...,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,3889.0,-2.52,BBB-,InChI=1S/C20H20N6O9S/c1-25-19(22-23-24-25)36-8...,,R25|,A,
2,3,clioquinol,5-chloro-7-iodoquinolin-8-ol,Oc1c(I)cc(Cl)c2cccnc12,2788.0,-2.4,BBB-,InChI=1S/C9H5ClINO/c10-6-4-7(11)9(13)8-5(6)2-1...,,R18|R26|R27|,A,
3,4,bbcpd11 (cimetidine analog) (y-g13),2-[2-[(3-bromopyridin-2-yl)methylsulfanyl]ethy...,CCNC(=NCCSCc1ncccc1Br)NC#N,14022517.0,-2.15,BBB-,InChI=1S/C12H16BrN5S/c1-2-15-12(18-9-14)17-6-7...,,R2|R2|R8|R40|R2|R2|R2|R2|R18|R21|R25|R25|R26|R...,A,
4,5,schembl614298,"(2s,3s,4s,5r)-6-[[(4r,4ar,7s,7ar,12bs)-7-hydro...",CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,18595497.0,-2.15,BBB-,InChI=1S/C23H27NO9/c1-24-7-6-23-10-3-4-12(25)2...,,R25|,A,


In [329]:
data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7807 entries, 0 to 7806
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   NO.            7807 non-null   int64  
 1   compound_name  6698 non-null   object 
 2   IUPAC_name     6170 non-null   object 
 3   SMILES         7807 non-null   object 
 4   CID            6170 non-null   float64
 5   logBB          1058 non-null   float64
 6   BBB+/BBB-      7807 non-null   object 
 7   Inchi          7807 non-null   object 
 8   threshold      3621 non-null   float64
 9   reference      7807 non-null   object 
 10  group          7807 non-null   object 
 11  comments       18 non-null     object 
dtypes: float64(3), int64(1), object(8)
memory usage: 732.0+ KB


In [330]:
data_1 = data_1[['SMILES', 'logBB']]


In [331]:
data_1['SMILES'] = data_1['SMILES'].apply(lambda smi: Chem.CanonSmiles(Chem.MolToSmiles(Chem.MolFromSmiles(smi))))

In [332]:
data_1.head()

Unnamed: 0,SMILES,logBB
0,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,-2.69
1,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,-2.52
2,Oc1c(I)cc(Cl)c2cccnc12,-2.4
3,CCNC(=NCCSCc1ncccc1Br)NC#N,-2.15
4,CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,-2.15


In [333]:
data_1['logBB'] = data_1['logBB'].fillna(-1e9)
data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7807 entries, 0 to 7806
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   SMILES  7807 non-null   object 
 1   logBB   7807 non-null   float64
dtypes: float64(1), object(1)
memory usage: 122.1+ KB


In [334]:
data_1 = data_1[data_1['logBB'] > -1e9]
data_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1058 entries, 0 to 1057
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   SMILES  1058 non-null   object 
 1   logBB   1058 non-null   float64
dtypes: float64(1), object(1)
memory usage: 24.8+ KB


In [335]:
len(data_1['SMILES'].unique())

1058

In [336]:
data_1 = data_1.rename(columns={'SMILES':'SMILES', 'logBB':'Activity'})
data_1.head()

Unnamed: 0,SMILES,Activity
0,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,-2.69
1,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,-2.52
2,Oc1c(I)cc(Cl)c2cccnc12,-2.4
3,CCNC(=NCCSCc1ncccc1Br)NC#N,-2.15
4,CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,-2.15


In [337]:
# data_1 = data_1[(data_1['Activity'] > -2) & (data_1['Activity'] < 1.5)]
# data_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1041 entries, 11 to 1051
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SMILES    1041 non-null   object 
 1   Activity  1041 non-null   float64
dtypes: float64(1), object(1)
memory usage: 24.4+ KB


In [338]:
# data_1

Unnamed: 0,SMILES,Activity
11,CCC(CC)O[C@@H]1C=C(C(=O)O)C[C@H](N)[C@@H]1NCOC,-1.96
12,Cc1ccc(Cc2cnc(NCCCCc3ncc(Br)cc3C)[nH]c2=O)cn1,-1.90
13,COC1(NC(=O)C2SC(=C(C(N)=O)C(=O)O)S2)C(=O)N2C(C...,-1.89
14,CO[C@@]1(NC(=O)C2SC(=C(C(N)=O)C(=O)O)S2)C(=O)N...,-1.89
15,c1ccc(NC2=NCCN2)cc1,-1.89
...,...,...
1047,CC(NC(C)(C)C)C(=O)c1cccc(Cl)c1,1.40
1048,CC1CCN(CCCN2c3ccccc3Sc3ccc(C(F)(F)F)cc32)CC1,1.44
1049,CN(C)CCCN1c2ccccc2Sc2ccc(C(F)(F)F)cc21,1.44
1050,CN1CCN(CCCN2c3ccccc3Sc3cccc(C(F)(F)F)c32)CC1,1.44


In [339]:
data_1.drop_duplicates()

Unnamed: 0,SMILES,Activity
11,CCC(CC)O[C@@H]1C=C(C(=O)O)C[C@H](N)[C@@H]1NCOC,-1.96
12,Cc1ccc(Cc2cnc(NCCCCc3ncc(Br)cc3C)[nH]c2=O)cn1,-1.90
13,COC1(NC(=O)C2SC(=C(C(N)=O)C(=O)O)S2)C(=O)N2C(C...,-1.89
14,CO[C@@]1(NC(=O)C2SC(=C(C(N)=O)C(=O)O)S2)C(=O)N...,-1.89
15,c1ccc(NC2=NCCN2)cc1,-1.89
...,...,...
1047,CC(NC(C)(C)C)C(=O)c1cccc(Cl)c1,1.40
1048,CC1CCN(CCCN2c3ccccc3Sc3ccc(C(F)(F)F)cc32)CC1,1.44
1049,CN(C)CCCN1c2ccccc2Sc2ccc(C(F)(F)F)cc21,1.44
1050,CN1CCN(CCCN2c3ccccc3Sc3cccc(C(F)(F)F)c32)CC1,1.44


In [340]:
# data_1['Source'] = [['B3DB']] * len(data_1)
# data_1.head()

In [341]:
# data_1.info()

In [342]:
data_1.describe()

Unnamed: 0,Activity
count,1041.0
mean,-0.065024
std,0.713733
min,-1.96
25%,-0.52
50%,-0.01
75%,0.42
max,1.48


In [343]:
# data_1_lower_bound = round(-0.077873 - 0.751623 * 3, 2)
# data_1_upper_bound = round(-0.077873 + 0.751623 * 3, 2)

# print(data_1_lower_bound)
# print(data_1_upper_bound)

In [344]:
# #удаляем выбросы
# for i, row in data_1.iterrows():
#     if row.Activity < data_1_lower_bound or row.Activity > data_1_upper_bound:
#         data_1 = data_1.drop(labels=[i], axis=0)
# data_1.info()

In [345]:
# for i, row in data_1.iterrows():
#     if 'C' not in row.SMILES and 'c' not in row.SMILES:
#         data_1 = data_1.drop(labels=[i], axis=0)
# data_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1030 entries, 11 to 1051
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SMILES    1030 non-null   object 
 1   Activity  1030 non-null   float64
dtypes: float64(1), object(1)
memory usage: 24.1+ KB


In [346]:
# data_2 = pd.read_csv('LogBBdataset529.txt', sep='\t')
# data_2.head()

Unnamed: 0,N,Structure,LogBB,Name,tissue,Ref
0,1,CCCCCC(C)C,0.86,2-Methyl heptane,Blood,1
1,2,CCCCCCC(C)C,0.98,2-Methyl octane,Blood,1
2,3,CCCCCCCC(C)C,1.05,2-Methyl nonane,Blood,1
3,4,CCC(C)CC,1.01,3-Methylpentane,Blood,2
4,5,CCCC(C)CC,0.9,3-Methylhexane,Blood,2


In [347]:
# data_2['SMILES'] = data_2['Structure'].apply(lambda smi: Chem.CanonSmiles(Chem.MolToSmiles(Chem.MolFromSmiles(smi))))
# data_2.head()

Unnamed: 0,N,Structure,LogBB,Name,tissue,Ref,SMILES
0,1,CCCCCC(C)C,0.86,2-Methyl heptane,Blood,1,CCCCCC(C)C
1,2,CCCCCCC(C)C,0.98,2-Methyl octane,Blood,1,CCCCCCC(C)C
2,3,CCCCCCCC(C)C,1.05,2-Methyl nonane,Blood,1,CCCCCCCC(C)C
3,4,CCC(C)CC,1.01,3-Methylpentane,Blood,2,CCC(C)CC
4,5,CCCC(C)CC,0.9,3-Methylhexane,Blood,2,CCCC(C)CC


In [348]:
# data_2 = data_2[['SMILES', 'LogBB']]

In [349]:
# data_2 = data_2.rename(columns={'LogBB':'Activity'})
# data_2.head()

Unnamed: 0,SMILES,Activity
0,CCCCCC(C)C,0.86
1,CCCCCCC(C)C,0.98
2,CCCCCCCC(C)C,1.05
3,CCC(C)CC,1.01
4,CCCC(C)CC,0.9


In [350]:
data_1_smiles = data_1['SMILES'].tolist()
data_2_smiles = data_2['SMILES'].tolist()

In [351]:
smiles_both = []

for smi in data_2_smiles:
    if smi in data_1_smiles:
        smiles_both.append(smi)
len(smiles_both)

477

In [352]:
only_data_2_smiles = list(set(data_2_smiles) - set(smiles_both))
len(only_data_2_smiles)

52

In [353]:
for i, row in data_2.iterrows():
    if row.SMILES in smiles_both:
        data_2 = data_2.drop(labels=[i], axis=0)
len(data_2)

52

In [354]:
data_2

Unnamed: 0,SMILES,Activity
20,FC(F)(F)CCl,0.12
127,CN1C(=O)CCC1c1cccnc1,-0.38
161,Cc1ccc(Cc2c[nH]c(NCCSCc3ccc(CN(C)C)o3)nc2=O)cn1,-1.06
213,NC(N)=Nc1nc(-c2ccccc2)cs1,-0.18
217,CN/C(=N/C#N)Nc1cccc(-c2csc(N=C(N)N)n2)c1,-1.54
238,S=C(NC1CCCCC1)N1CCC(c2c[nH]cn2)CC1,-0.16
241,CN/C(=N/C#N)NCCSCc1csc(N=C(N)N)n1,-0.82
247,CN(C)CCc1ccccn1,-0.06
248,NCCc1nccs1,-0.42
267,COc1cc(C2c3cc4c(cc3C(OC3OC5COC(C)OC5C(O)C3O)C3...,-2.0


In [355]:
data_2 = data_2[(data_2['Activity'] > -2) & (data_2['Activity'] < 1.5)]
data_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44 entries, 20 to 510
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SMILES    44 non-null     object 
 1   Activity  44 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.0+ KB


In [356]:
for i, row in data_2.iterrows():
    if 'C' not in row.SMILES and 'c' not in row.SMILES:
        data_2 = data_2.drop(labels=[i], axis=0)
data_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44 entries, 20 to 510
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SMILES    44 non-null     object 
 1   Activity  44 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.0+ KB


In [357]:
final_data = pd.concat([data_1, data_2], ignore_index=True)
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SMILES    1074 non-null   object 
 1   Activity  1074 non-null   float64
dtypes: float64(1), object(1)
memory usage: 16.9+ KB


In [358]:
final_data.to_csv('refined_dataset.csv',index=False)

### Dataset 2 (https://github.com/znavoyan/vae-embeddings/, article: https://doi.org/10.1186/s13321-022-00648-x)

In [359]:
# data_2 = pd.read_csv('final_logBB_2967.csv')
# data_2.head()

In [360]:
# data_2.info()

In [361]:
# data_2 = data_2[['canon_smiles', 'new_logBB']]

In [362]:
# data_2 = data_2.rename(columns={'canon_smiles':'SMILES', 'new_logBB':'Activity'})
# data_2.head()

In [363]:
# data_2['Activity'] = data_2['Activity'].fillna(-1e9)
# data_2.info()

In [364]:
# data_2 = data_2[data_2['Activity'] > -1e9]
# data_2.info()

In [365]:
# data_2['Activity'] = data_2['Activity'].apply(lambda x: round(x, 2))

In [366]:
# data_2['Source'] = [['Tevosyan']] * len(data_2)
# data_2.info()

In [367]:
# data_2['SMILES'].nunique()

In [368]:
# data_1_smiles = data_1['SMILES'].tolist()
# data_2_smiles = data_2['SMILES'].tolist()

In [369]:
# smiles_both = []

# for smi in data_2_smiles:
#     if smi in data_1_smiles:
#         smiles_both.append(smi)
# len(smiles_both)

In [370]:
# only_data_2_smiles = list(set(data_2_smiles) - set(smiles_both))
# len(only_data_2_smiles)

In [371]:
# #усреднение 
# logBB_diff = []
# for i, row in data_1.iterrows():
#     if row.SMILES in smiles_both:
#         logBB_1 = data_1[data_1['SMILES'] == row.SMILES].iloc[0]['Activity']
#         logBB_2 = data_2[data_2['SMILES'] == row.SMILES].iloc[0]['Activity']
#         diff = round(abs(logBB_1 - logBB_2), 2)
#         logBB_diff.append(diff)
#         if diff < 0.1:
#             data_1.at[i, 'Activity'] = round(mean([logBB_1, logBB_2]), 2)
#             data_1.at[i, 'Source'] = row.Source + data_2.iloc[0]['Source']
#         else:
#             data_1 = data_1.drop(labels=[i], axis=0)
        
# data_1.head()

In [372]:
# print(len(logBB_diff))

In [373]:
# count = 0
# for diff in logBB_diff:
#     if round(diff, 2) >= 0.1:
#         count += 1
# print(count)

In [374]:
#удаление лишнего из нового датасета
# for i, row in data_2.iterrows():
#     if row.SMILES in smiles_both:
#         data_2 = data_2.drop(labels=[i], axis=0)
# len(data_2)

### Merge

In [375]:
# final_data = pd.concat([data_1, data_2], ignore_index=True)
# final_data.info()

In [376]:
# final_data.describe()

In [377]:
# final_data['Activity'].hist()

In [378]:
# sns.boxplot(x=final_data['Activity'])

In [379]:
# lower_bound = round(-0.200615 - 1.720540 * 3, 2)
# upper_bound = round(-0.200615 + 1.720540 * 3, 2)
# print(lower_bound)
# print(upper_bound)

In [380]:
#удаляем выбросы
# for i, row in final_data.iterrows():
#     if row.Activity < lower_bound or row.Activity > upper_bound:
#         final_data = final_data.drop(labels=[i], axis=0)
# final_data.info()

In [381]:
# final_data.describe()

In [382]:
# sns.boxplot(x=final_data['Activity'])

In [383]:
# final_data.to_csv('new_final_dataset.csv',index=False)