In [1]:
!pip install rdkit



In [2]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem, Draw, Descriptors
from rdkit.Chem.Draw import IPythonConsole
from sklearn.preprocessing import FunctionTransformer

In [3]:
data = pd.read_excel('/content/19_35000.xlsx')
data.head()

Unnamed: 0.1,Unnamed: 0,Title,IC50,SMILES
0,,CHEMBL2206459,1.5e-05,[H]\N=C(N)\N[C@@H](C1)[C@@H](NC(=O)C)[C@@H](C=...
1,,CHEMBL3818159,1.6e-05,O=C(O)C1=C[C@H](N)[C@@H](NC(=O)C)[C@@H](C1)COC...
2,,CHEMBL1956716,3.3e-05,C=CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O...
3,,CHEMBL1956715,3.2e-05,CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O)O...
4,,CHEMBL4444029,4.1e-05,O=C(O)c1c(O)c(=O)cc([nH]1)-c(c2C)ccc(c2)-c3noc...


In [4]:
data.columns

Index(['Unnamed: 0', 'Title', 'IC50', 'SMILES'], dtype='object')

In [5]:
data['Unnamed: 0'].value_counts()

Series([], Name: count, dtype: int64)

In [6]:
data = data[['Title', 'IC50', 'SMILES']]

In [7]:
data = data.drop_duplicates()

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36368 entries, 0 to 36376
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Title   36368 non-null  object 
 1   IC50    36368 non-null  float64
 2   SMILES  36368 non-null  object 
dtypes: float64(1), object(2)
memory usage: 1.1+ MB


In [9]:
data[data.duplicated(subset='Title', keep=False)]

Unnamed: 0,Title,IC50,SMILES
4241,1007-Ya-213,0.7,OCC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2
12805,1007-Ya-213,2.7,OCC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2
24851,1072-Ch-4,5.0,C1C[C@H](C2)C(C)(C)C(=O)[C@@]12C
25148,1016-As-104,5.1,CC1(C)[C@H](CC2)CC(\[C@@]12C)=N\CC[N+](CC)(CC)...
28977,1062-TX-2330,6.7,CS(=O)c(n1)nn(c12)c(=O)c([N+]([O-])=O)nn2[Na]
36093,1016-As-104,113.0,CC1(C)[C@H](CC2)CC(\[C@@]12C)=N\CC[N+](CC)(CC)...
36240,1062-TX-2330,300.0,CS(=O)c(n1)nn(c12)c(=O)c([N+]([O-])=O)nn2[Na]
36335,1072-Ch-4,335.0,C1C[C@H](C2)C(C)(C)C(=O)[C@@]12C


In [10]:
duplicated_titles = data['Title'].duplicated(keep=False)
data = data[~duplicated_titles]
data[data.duplicated(subset='Title', keep=False)]

Unnamed: 0,Title,IC50,SMILES


In [11]:
data[data.duplicated(subset='SMILES', keep=False)].sort_values(by='SMILES')

Unnamed: 0,Title,IC50,SMILES
36165,1880-XE-17-3,170.000,C1C=C(C)[C@@H](O)[C@@H]([C@@H]12)O[C@@H](C[C@@...
34712,1427-XE-17-3,12.000,C1C=C(C)[C@@H](O)[C@@H]([C@@H]12)O[C@@H](C[C@@...
36242,1417-MO-158b-4,300.000,C1C=C(C)[C@@H](O)[C@@H]([C@@H]12)O[C@@H](C[C@]...
36095,1647-MO-158b-4,113.000,C1C=C(C)[C@@H](O)[C@@H]([C@@H]12)O[C@@H](C[C@]...
35374,1418-MO-164-3,30.000,C1C=C(C)[C@@H](O)[C@@H]([C@@H]12)O[C@@H](C[C@]...
...,...,...,...
32586,1879-XE-47-2,8.700,s1c(Br)ccc1[C@@H](C[C@@]2(C)O)O[C@@H]([C@H]23)...
3627,CHEMBL1388738,0.582,s1cccc1C(=O)Nc(s2)ncc2[N+]([O-])=O
3927,CHEMBL1082354,0.641,s1cccc1C(=O)Nc(s2)ncc2[N+]([O-])=O
36049,1899-XE-25-2,100.000,s1cccc1[C@@H](C[C@@]2(C)O)O[C@@H]([C@H]23)C[C@...


In [12]:
duplicates = data.duplicated(subset=['SMILES', 'IC50'], keep=False)
data = data[~duplicates]
data[data.duplicated(subset='SMILES', keep=False)].sort_values(by='SMILES')

Unnamed: 0,Title,IC50,SMILES
34712,1427-XE-17-3,12.000,C1C=C(C)[C@@H](O)[C@@H]([C@@H]12)O[C@@H](C[C@@...
36165,1880-XE-17-3,170.000,C1C=C(C)[C@@H](O)[C@@H]([C@@H]12)O[C@@H](C[C@@...
36095,1647-MO-158b-4,113.000,C1C=C(C)[C@@H](O)[C@@H]([C@@H]12)O[C@@H](C[C@]...
36242,1417-MO-158b-4,300.000,C1C=C(C)[C@@H](O)[C@@H]([C@@H]12)O[C@@H](C[C@]...
9456,1645-MO-164-3a,1.800,C1C=C(C)[C@@H](O)[C@@H]([C@@H]12)O[C@@H](C[C@]...
...,...,...,...
36046,1878-XE-23-2,100.000,o1cccc1[C@@H](C[C@@]2(C)O)O[C@@H]([C@H]23)C[C@...
32586,1879-XE-47-2,8.700,s1c(Br)ccc1[C@@H](C[C@@]2(C)O)O[C@@H]([C@H]23)...
32291,1439-XE-47-2,8.500,s1c(Br)ccc1[C@@H](C[C@@]2(C)O)O[C@@H]([C@H]23)...
3627,CHEMBL1388738,0.582,s1cccc1C(=O)Nc(s2)ncc2[N+]([O-])=O


In [13]:
duplicates_smiles = data.duplicated(subset='SMILES', keep=False)

diff = 0.2
mean_ic50 = data.groupby('SMILES')['IC50'].transform('mean')
within_diff = abs(data['IC50'] - mean_ic50)/data['IC50'] > diff

greater_than_diff = ~within_diff & duplicates_smiles

data[greater_than_diff]

result = data[greater_than_diff].copy()
result = result.loc[result.groupby('SMILES')['IC50'].idxmax()]

result

Unnamed: 0,Title,IC50,SMILES
17979,2900-I8-105-1,3.6,C1CCCCC1[C@@H](C[C@@]2(C)O)O[C@@H]([C@H]23)C[C...
35231,1393-Ya-267,24.0,C1COCC[N+]1(C)CCC\N=C(\[C@]23C)C[C@H](C2(C)C)CC3
31131,CHEMBL1774319,7.8,C1NC[C@]2([C@@]134)[C@@H]5[C@@H]([C@H]4C=C5)[C...
11950,1490-Ks-10,2.5,CC(C)c(cc1)cc(c1[C@@]23C)CC[C@@H]2[C@](C)(CN)CCC3
34819,1518-DS-485,13.0,CC1(C)C(C)=CC[C@H]1Cc(n2)[nH]c(c23)cccc3
33961,1008-Ya-187,9.9,CCN(CC)CC\N=C(\[C@@]12C)C[C@H](C1(C)C)CC2
35013,1081-1011-As-83,16.3,CC[N+](C)(CC)CC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2
34661,1391-As-106,11.7,CN(C)CC\N=C(\[C@@]12C)C[C@H](C1(C)C)CC2
36322,CHEMBL4288514,300.3,COC(=O)CN1CCOCC1
12368,1387-Ya-282-p-3,2.6,COc(cc1)c(OC)cc1CC\N=C(\[C@]23C)C[C@H](C2(C)C)CC3


In [14]:
result.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24 entries, 17979 to 3927
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Title   24 non-null     object 
 1   IC50    24 non-null     float64
 2   SMILES  24 non-null     object 
dtypes: float64(1), object(2)
memory usage: 768.0+ bytes


In [15]:
data_temp = data[~duplicates_smiles]

data_temp = pd.concat([data_temp, result]).sort_index()

data_temp[data_temp.duplicated(subset='SMILES', keep=False)].sort_values(by='SMILES')

Unnamed: 0,Title,IC50,SMILES


In [16]:
data = data_temp
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36188 entries, 0 to 36376
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Title   36188 non-null  object 
 1   IC50    36188 non-null  float64
 2   SMILES  36188 non-null  object 
dtypes: float64(1), object(2)
memory usage: 1.1+ MB


удалены все идентичные строки кроме по одной
удалены все строки с повторяющимися заголовками, т.к. их разница значений большая
для строк с повторяющимися формулами оставлены только по одной с их средними значениями внутри каждой группы, из которых оставлены только те, у которых небольшие различия значений
небольшие = разница до 20% от средней внутри группы

In [17]:
def mol_dsc_calc(mols):
    return pd.DataFrame({k: f(Chem.MolFromSmiles(m)) for k, f in descriptors.items()} for m in mols)

# как можно больше дескрипторов
descriptors = {
    "HeavyAtomCount": Descriptors.HeavyAtomCount,
    "NHOHCount": Descriptors.NHOHCount,
    "NOCount": Descriptors.NOCount,
    "NumHAcceptors": Descriptors.NumHAcceptors,
    "NumHDonors": Descriptors.NumHDonors,
    "NumHeteroatoms": Descriptors.NumHeteroatoms,
    "NumRotatableBonds": Descriptors.NumRotatableBonds,
    "NumValenceElectrons": Descriptors.NumValenceElectrons,
    "NumAromaticRings": Descriptors.NumAromaticRings,
    "NumAliphaticHeterocycles": Descriptors.NumAliphaticHeterocycles,
    "RingCount": Descriptors.RingCount,
    "MW": Descriptors.MolWt,
    "LogP": Descriptors.MolLogP,
    "MR": Descriptors.MolMR,
    "TPSA": Descriptors.TPSA,
    "FractionCSP3": Descriptors.FractionCSP3,
    "HeavyAtomCount": Descriptors.HeavyAtomCount,
    "NumSaturatedRings": Descriptors.NumSaturatedRings,
    "NumAliphaticRings": Descriptors.NumAliphaticRings
}

# sklearn трансформер для использования в конвейерном моделировании
descriptors_transformer = FunctionTransformer(mol_dsc_calc)
X = descriptors_transformer.transform(data['SMILES'])
X.head()

[11:57:47] Conflicting single bond directions around double bond at index 55.
[11:57:47]   BondStereo set to STEREONONE and single bond directions set to NONE.
[11:57:47] Conflicting single bond directions around double bond at index 55.
[11:57:47]   BondStereo set to STEREONONE and single bond directions set to NONE.
[11:57:47] Conflicting single bond directions around double bond at index 55.
[11:57:47]   BondStereo set to STEREONONE and single bond directions set to NONE.
[11:57:47] Conflicting single bond directions around double bond at index 55.
[11:57:47]   BondStereo set to STEREONONE and single bond directions set to NONE.
[11:57:47] Conflicting single bond directions around double bond at index 55.
[11:57:47]   BondStereo set to STEREONONE and single bond directions set to NONE.
[11:57:47] Conflicting single bond directions around double bond at index 55.
[11:57:47]   BondStereo set to STEREONONE and single bond directions set to NONE.
[11:57:47] Conflicting single bond direc

Unnamed: 0,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA,FractionCSP3,NumSaturatedRings,NumAliphaticRings
0,24,7,9,4,6,10,7,138,0,0,1,362.367,0.38187,90.4296,157.76,0.714286,0,1
1,21,4,6,4,3,6,7,120,0,0,1,298.383,1.0545,79.7279,101.65,0.733333,0,1
2,21,3,6,5,2,6,7,118,0,0,1,295.359,-0.3617,77.0531,104.48,0.6,0,1
3,20,3,6,5,2,6,6,114,0,0,1,283.348,-0.5278,72.5301,104.48,0.714286,0,1
4,24,4,9,6,4,9,3,122,3,0,3,329.268,1.09742,81.9775,149.28,0.066667,0,0


In [18]:
data_dsc = data.join(X)

In [19]:
correlation_table = data_dsc.drop(columns=['Title', 'SMILES']).corr()
correlation_table['IC50']

IC50                        1.000000
HeavyAtomCount              0.125927
NHOHCount                   0.100765
NOCount                     0.100610
NumHAcceptors               0.080983
NumHDonors                  0.107617
NumHeteroatoms              0.088251
NumRotatableBonds           0.054318
NumValenceElectrons         0.137309
NumAromaticRings           -0.059093
NumAliphaticHeterocycles    0.120229
RingCount                   0.077204
MW                          0.121040
LogP                       -0.028014
MR                          0.107514
TPSA                        0.106200
FractionCSP3                0.127635
NumSaturatedRings           0.125665
NumAliphaticRings           0.153131
Name: IC50, dtype: float64

In [20]:
data_dsc = data_dsc[['IC50', 'HeavyAtomCount', 'NumValenceElectrons', 'NumAliphaticHeterocycles', 'MW', 'FractionCSP3', 'NumSaturatedRings', 'NumAliphaticRings', 'SMILES']].head()
data_dsc

Unnamed: 0,IC50,HeavyAtomCount,NumValenceElectrons,NumAliphaticHeterocycles,MW,FractionCSP3,NumSaturatedRings,NumAliphaticRings,SMILES
0,1.5e-05,24.0,138.0,0.0,362.367,0.714286,0.0,1.0,[H]\N=C(N)\N[C@@H](C1)[C@@H](NC(=O)C)[C@@H](C=...
1,1.6e-05,21.0,120.0,0.0,298.383,0.733333,0.0,1.0,O=C(O)C1=C[C@H](N)[C@@H](NC(=O)C)[C@@H](C1)COC...
2,3.3e-05,21.0,118.0,0.0,295.359,0.6,0.0,1.0,C=CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O...
3,3.2e-05,20.0,114.0,0.0,283.348,0.714286,0.0,1.0,CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O)O...
4,4.1e-05,24.0,122.0,0.0,329.268,0.066667,0.0,0.0,O=C(O)c1c(O)c(=O)cc([nH]1)-c(c2C)ccc(c2)-c3noc...


In [21]:
columns = ['IC50', 'HeavyAtomCount', 'NumValenceElectrons', 'NumAliphaticHeterocycles', 'MW', 'FractionCSP3', 'NumSaturatedRings', 'NumAliphaticRings']
for column in columns:
    unique_values = data_dsc[column].unique()
    print(f"{column}: {unique_values}")

IC50: [1.45e-05 1.59e-05 3.34e-05 3.22e-05 4.15e-05]
HeavyAtomCount: [24. 21. 20.]
NumValenceElectrons: [138. 120. 118. 114. 122.]
NumAliphaticHeterocycles: [0.]
MW: [362.367 298.383 295.359 283.348 329.268]
FractionCSP3: [0.71428571 0.73333333 0.6        0.06666667]
NumSaturatedRings: [0.]
NumAliphaticRings: [1. 0.]


In [23]:
data_dsc = data_dsc[['IC50', 'HeavyAtomCount', 'NumValenceElectrons', 'MW', 'FractionCSP3', 'NumAliphaticRings', 'SMILES']]

очень неудачные дескрипторы, т.к. очень мало уникальных значений

In [24]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, AllChem


def rdkit_fp(smiles_column: pd.Series, radius=3, nBits=2048, useChirality=False):
    # morganFP_rdkit
    def desc_gen(mol):
        mol = Chem.MolFromSmiles(mol)
        bit_vec = np.zeros((1,), np.int16)
        DataStructs.ConvertToNumpyArray(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits, useChirality=useChirality), bit_vec)
        return bit_vec

    return pd.DataFrame.from_records(smiles_column.apply(func=desc_gen), columns=[f'bit_id_{i}' for i in range(nBits)])


def rdkit_2d(smiles_column: pd.Series):
    # 2d_rdkit
    descriptors = {i[0]: i[1] for i in Descriptors._descList}
    return pd.DataFrame({k: f(Chem.MolFromSmiles(m)) for k, f in descriptors.items()} for m in smiles_column)

In [25]:
Y = rdkit_fp(data['SMILES'])
Y.head()

[12:04:33] Conflicting single bond directions around double bond at index 55.
[12:04:33]   BondStereo set to STEREONONE and single bond directions set to NONE.
[12:04:51] Conflicting single bond directions around double bond at index 7.
[12:04:51]   BondStereo set to STEREONONE and single bond directions set to NONE.


Unnamed: 0,bit_id_0,bit_id_1,bit_id_2,bit_id_3,bit_id_4,bit_id_5,bit_id_6,bit_id_7,bit_id_8,bit_id_9,...,bit_id_2038,bit_id_2039,bit_id_2040,bit_id_2041,bit_id_2042,bit_id_2043,bit_id_2044,bit_id_2045,bit_id_2046,bit_id_2047
0,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
data_fp = data.join(Y)
data_fp.head()

Unnamed: 0,Title,IC50,SMILES,bit_id_0,bit_id_1,bit_id_2,bit_id_3,bit_id_4,bit_id_5,bit_id_6,...,bit_id_2038,bit_id_2039,bit_id_2040,bit_id_2041,bit_id_2042,bit_id_2043,bit_id_2044,bit_id_2045,bit_id_2046,bit_id_2047
0,CHEMBL2206459,1.5e-05,[H]\N=C(N)\N[C@@H](C1)[C@@H](NC(=O)C)[C@@H](C=...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CHEMBL3818159,1.6e-05,O=C(O)C1=C[C@H](N)[C@@H](NC(=O)C)[C@@H](C1)COC...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CHEMBL1956716,3.3e-05,C=CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CHEMBL1956715,3.2e-05,CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O)O...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CHEMBL4444029,4.1e-05,O=C(O)c1c(O)c(=O)cc([nH]1)-c(c2C)ccc(c2)-c3noc...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
columns = data_fp.drop(columns=['Title', 'SMILES']).columns
for column in columns:
    unique_values = data_fp[column].unique()
    print(f"{column}: {unique_values}")

IC50: [1.45000e-05 1.59000e-05 3.34000e-05 ... 2.60097e+03 2.79300e+03
 3.18360e+03]
bit_id_0: [ 0.  1. nan]
bit_id_1: [ 1.  0. nan]
bit_id_2: [ 0.  1. nan]
bit_id_3: [ 0.  1. nan]
bit_id_4: [ 0.  1. nan]
bit_id_5: [ 0.  1. nan]
bit_id_6: [ 0.  1. nan]
bit_id_7: [ 0.  1. nan]
bit_id_8: [ 0.  1. nan]
bit_id_9: [ 0.  1. nan]
bit_id_10: [ 0.  1. nan]
bit_id_11: [ 0.  1. nan]
bit_id_12: [ 0.  1. nan]
bit_id_13: [ 0.  1. nan]
bit_id_14: [ 0.  1. nan]
bit_id_15: [ 0.  1. nan]
bit_id_16: [ 0.  1. nan]
bit_id_17: [ 0.  1. nan]
bit_id_18: [ 0.  1. nan]
bit_id_19: [ 0.  1. nan]
bit_id_20: [ 0.  1. nan]
bit_id_21: [ 0.  1. nan]
bit_id_22: [ 0.  1. nan]
bit_id_23: [ 0.  1. nan]
bit_id_24: [ 0.  1. nan]
bit_id_25: [ 0.  1. nan]
bit_id_26: [ 0.  1. nan]
bit_id_27: [ 0.  1. nan]
bit_id_28: [ 0.  1. nan]
bit_id_29: [ 0.  1. nan]
bit_id_30: [ 0.  1. nan]
bit_id_31: [ 0.  1. nan]
bit_id_32: [ 0.  1. nan]
bit_id_33: [ 0.  1. nan]
bit_id_34: [ 0.  1. nan]
bit_id_35: [ 0.  1. nan]
bit_id_36: [ 0.  1. nan]


In [70]:
data_fp[data_fp.isnull().any(axis=1)]

Unnamed: 0,Title,IC50,SMILES,bit_id_0,bit_id_1,bit_id_2,bit_id_3,bit_id_4,bit_id_5,bit_id_6,...,bit_id_2038,bit_id_2039,bit_id_2040,bit_id_2041,bit_id_2042,bit_id_2043,bit_id_2044,bit_id_2045,bit_id_2046,bit_id_2047
36188,CHEMBL4796323,193.048,CC(C)c1ccc(cc1)NC(=C2C(=O)OCC)C(=O)N(c(cc3)ccc...,,,,,,,,...,,,,,,,,,,
36189,CHEMBL4537459,198.900,C[C@@]12C(C)(C)[C@H](CC2)CC\1=N/NC(=O)[C@@H]([...,,,,,,,,...,,,,,,,,,,
36190,5889-LFC-KVS-536-1,199.000,c1ccccc1-c(c(c2=O)C(=O)OCC)oc(c23)c(F)c(c(F)c3...,,,,,,,,...,,,,,,,,,,
36191,CHEMBL5176004,199.100,O=C1C[C@@H](CC2)C(C)(C)[C@]12c(on3)nc3C,,,,,,,,...,,,,,,,,,,
36192,CHEMBL2041483,199.429,OC[C@@H](O)[C@@H](O)[C@H](O1)[C@H](NC(=O)C)[C@...,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36372,CHEMBL3699085,2366.790,CC(=O)N[C@H]1CC[C@@H]([C@@H]12)[C@@H](O[Si](C)...,,,,,,,,...,,,,,,,,,,
36373,CHEMBL109004,2372.100,CC(=O)Nc(cc1)c(OC(=O)C)cc1C(=O)O,,,,,,,,...,,,,,,,,,,
36374,CHEMBL2259758,2600.970,c1cccc(c12)cccc2CNC(=O)CCCCCCO[C@]3(C(=O)O)C[C...,,,,,,,,...,,,,,,,,,,
36375,CHEMBL109781,2793.000,NCCCC(=O)Nc(c(cc1)NC(=O)C)cc1C(=O)O,,,,,,,,...,,,,,,,,,,


In [71]:
data_fp.dropna(inplace=True)

In [72]:
data_fp.columns[data_fp.nunique() == 1]

Index([], dtype='object')

In [73]:
correlation_table = data_fp.drop(columns=['Title', 'SMILES']).corr()
correlation_table['IC50']

IC50           1.000000
bit_id_0       0.025474
bit_id_1       0.044873
bit_id_2      -0.001625
bit_id_3       0.012701
                 ...   
bit_id_2043    0.044307
bit_id_2044    0.061584
bit_id_2045   -0.002050
bit_id_2046    0.000366
bit_id_2047    0.005329
Name: IC50, Length: 2049, dtype: float64

In [74]:
abs(correlation_table['IC50']).sort_values()

bit_id_1306    0.000025
bit_id_681     0.000030
bit_id_1943    0.000036
bit_id_1076    0.000045
bit_id_1004    0.000050
                 ...   
bit_id_528     0.121294
bit_id_1680    0.123124
bit_id_1019    0.126485
bit_id_1060    0.135825
IC50           1.000000
Name: IC50, Length: 2049, dtype: float64

In [82]:
correlation_table[abs(correlation_table['IC50']) > 0.065].index

Index(['IC50', 'bit_id_78', 'bit_id_123', 'bit_id_222', 'bit_id_227',
       'bit_id_245', 'bit_id_260', 'bit_id_418', 'bit_id_482', 'bit_id_519',
       'bit_id_528', 'bit_id_598', 'bit_id_602', 'bit_id_605', 'bit_id_679',
       'bit_id_750', 'bit_id_764', 'bit_id_880', 'bit_id_883', 'bit_id_926',
       'bit_id_1019', 'bit_id_1060', 'bit_id_1124', 'bit_id_1154',
       'bit_id_1162', 'bit_id_1257', 'bit_id_1274', 'bit_id_1279',
       'bit_id_1303', 'bit_id_1434', 'bit_id_1474', 'bit_id_1680',
       'bit_id_1750', 'bit_id_1803', 'bit_id_1831', 'bit_id_1883',
       'bit_id_1921', 'bit_id_2035'],
      dtype='object')

In [83]:
rows_above_threshold = correlation_table[abs(correlation_table['IC50']) > 0.065].index
columns_names = list(rows_above_threshold) + ['SMILES']
data_fp = data_fp[columns_names]
data_fp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36021 entries, 0 to 36187
Data columns (total 39 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   IC50         36021 non-null  float64
 1   bit_id_78    36021 non-null  float64
 2   bit_id_123   36021 non-null  float64
 3   bit_id_222   36021 non-null  float64
 4   bit_id_227   36021 non-null  float64
 5   bit_id_245   36021 non-null  float64
 6   bit_id_260   36021 non-null  float64
 7   bit_id_418   36021 non-null  float64
 8   bit_id_482   36021 non-null  float64
 9   bit_id_519   36021 non-null  float64
 10  bit_id_528   36021 non-null  float64
 11  bit_id_598   36021 non-null  float64
 12  bit_id_602   36021 non-null  float64
 13  bit_id_605   36021 non-null  float64
 14  bit_id_679   36021 non-null  float64
 15  bit_id_750   36021 non-null  float64
 16  bit_id_764   36021 non-null  float64
 17  bit_id_880   36021 non-null  float64
 18  bit_id_883   36021 non-null  float64
 19  bit_id_92

In [84]:
data_fp.to_excel('data_fp.xlsx')

In [37]:
Z = rdkit_2d(data['SMILES'])
Z.head()

[12:35:23] Conflicting single bond directions around double bond at index 55.
[12:35:23]   BondStereo set to STEREONONE and single bond directions set to NONE.
[12:35:23] Conflicting single bond directions around double bond at index 55.
[12:35:23]   BondStereo set to STEREONONE and single bond directions set to NONE.
[12:35:23] Conflicting single bond directions around double bond at index 55.
[12:35:23]   BondStereo set to STEREONONE and single bond directions set to NONE.
[12:35:23] Conflicting single bond directions around double bond at index 55.
[12:35:23]   BondStereo set to STEREONONE and single bond directions set to NONE.
[12:35:23] Conflicting single bond directions around double bond at index 55.
[12:35:23]   BondStereo set to STEREONONE and single bond directions set to NONE.
[12:35:23] Conflicting single bond directions around double bond at index 55.
[12:35:23]   BondStereo set to STEREONONE and single bond directions set to NONE.
[12:35:23] Conflicting single bond direc

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,11.743425,11.743425,0.061289,-4.487931,0.216971,25.916667,362.367,335.151,362.171907,138,...,0,0,0,0,0,0,0,0,0,0
1,11.324462,11.324462,0.124444,-0.95902,0.651957,25.571429,298.383,272.175,298.189257,120,...,0,0,0,0,0,0,0,0,0,0
2,11.521766,11.521766,0.02529,-1.252691,0.624022,25.333333,295.359,272.175,295.166331,118,...,0,0,0,0,0,0,0,0,0,0
3,11.271766,11.271766,0.000752,-1.23623,0.681758,26.25,283.348,260.164,283.166331,114,...,0,0,0,0,0,0,0,0,0,0
4,11.745432,11.745432,0.243422,-1.443615,0.561612,10.708333,329.268,318.18,329.064785,122,...,0,0,0,0,0,0,0,0,0,0


In [48]:
data_2d = data.join(Z)
data_2d.head()

Unnamed: 0,Title,IC50,SMILES,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CHEMBL2206459,1.5e-05,[H]\N=C(N)\N[C@@H](C1)[C@@H](NC(=O)C)[C@@H](C=...,11.743425,11.743425,0.061289,-4.487931,0.216971,25.916667,362.367,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CHEMBL3818159,1.6e-05,O=C(O)C1=C[C@H](N)[C@@H](NC(=O)C)[C@@H](C1)COC...,11.324462,11.324462,0.124444,-0.95902,0.651957,25.571429,298.383,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CHEMBL1956716,3.3e-05,C=CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O...,11.521766,11.521766,0.02529,-1.252691,0.624022,25.333333,295.359,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CHEMBL1956715,3.2e-05,CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O)O...,11.271766,11.271766,0.000752,-1.23623,0.681758,26.25,283.348,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CHEMBL4444029,4.1e-05,O=C(O)c1c(O)c(=O)cc([nH]1)-c(c2C)ccc(c2)-c3noc...,11.745432,11.745432,0.243422,-1.443615,0.561612,10.708333,329.268,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
columns = data_2d.drop(columns=['Title', 'SMILES']).columns
for column in columns:
    unique_values = data_2d[column].unique()
    print(f"{column}: {unique_values}")

IC50: [1.45000e-05 1.59000e-05 3.34000e-05 ... 2.60097e+03 2.79300e+03
 3.18360e+03]
MaxAbsEStateIndex: [11.7434253  11.32446192 11.52176603 ... 13.53209928 11.66370937
         nan]
MaxEStateIndex: [11.7434253  11.32446192 11.52176603 ... 13.53209928 11.66370937
         nan]
MinAbsEStateIndex: [0.06128858 0.12444444 0.02528959 ... 0.10804096 0.02150639        nan]
MinEStateIndex: [-4.4879308  -0.95902032 -1.25269059 ... -4.3271315  -1.11404429
         nan]
qed: [0.21697133 0.65195651 0.62402199 ... 0.04694963 0.61904855        nan]
SPS: [25.91666667 25.57142857 25.33333333 ... 35.56521739 20.13114754
         nan]
MolWt: [362.367 298.383 295.359 ... 866.987 279.296     nan]
HeavyAtomMolWt: [335.151 272.175 260.164 ... 576.471 812.555     nan]
ExactMolWt: [362.17190659 298.18925731 295.1663308  ... 866.34080879 279.12190602
          nan]
NumValenceElectrons: [138. 120. 118. 114. 122. 136. 132. 148. 142. 130. 166. 160. 124. 174.
 178. 158. 144. 102. 134. 164. 126. 170.  92. 108. 258.

In [52]:
data_2d[data_2d.isnull().any(axis=1)]

Unnamed: 0,Title,IC50,SMILES,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
31,CHEMBL2315323,0.000439,OC[C@@H](O)[C@@H](O)[C@H](O1)[C@H](NC(=O)C)[C@...,10.037257,10.037257,0.173899,-1.148887,0.469185,30.200000,345.217,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47,CHEMBL471524,0.007750,OC[C@@H]1[C@@H](O)[C@@H](O)[C@@H](O1)c2[nH]nc(...,9.972250,9.972250,0.373033,-1.158878,0.433315,31.157895,331.190,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
397,CHEMBL1256019,0.044000,O=C(O)C1=CC=C[C@H](C1)N,12.959132,12.959132,0.000000,-1.859716,0.262698,16.500000,477.275,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2555,CHEMBL3196537,0.382000,CC(C)CNC(=O)CC(=O)N\N=C\c1ccc(Br)cc1,12.138731,12.138731,0.078403,0.078403,0.622431,10.750000,274.181,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35894,CHEMBL3792635,71.900000,c1ccccc1-c(cc2)nc(c2C#N)SCC(=O)Nc(cc3C)ccc3,8.207077,8.207077,0.285738,0.285738,0.408594,38.375000,220.320,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36372,CHEMBL3699085,2366.790000,CC(=O)N[C@H]1CC[C@@H]([C@@H]12)[C@@H](O[Si](C)...,,,,,,,,...,,,,,,,,,,
36373,CHEMBL109004,2372.100000,CC(=O)Nc(cc1)c(OC(=O)C)cc1C(=O)O,,,,,,,,...,,,,,,,,,,
36374,CHEMBL2259758,2600.970000,c1cccc(c12)cccc2CNC(=O)CCCCCCO[C@]3(C(=O)O)C[C...,,,,,,,,...,,,,,,,,,,
36375,CHEMBL109781,2793.000000,NCCCC(=O)Nc(c(cc1)NC(=O)C)cc1C(=O)O,,,,,,,,...,,,,,,,,,,


In [56]:
data_2d.dropna(inplace=True)

In [58]:
data_2d.columns[data_2d.nunique() == 1]

Index(['NumRadicalElectrons', 'SMR_VSA8', 'SlogP_VSA9', 'fr_diazo',
       'fr_isocyan', 'fr_isothiocyan', 'fr_prisulfonamd'],
      dtype='object')

In [59]:
data_2d = data_2d.drop(columns=data_2d.columns[data_2d.nunique() == 1])

In [60]:
columns = data_2d.drop(columns=['Title', 'SMILES']).columns
for column in columns:
    unique_values = data_2d[column].unique()
    print(f"{column}: {unique_values}")

IC50: [1.45000e-05 1.59000e-05 3.34000e-05 ... 1.88300e+02 1.88799e+02
 1.91190e+02]
MaxAbsEStateIndex: [11.7434253  11.32446192 11.52176603 ... 10.90204664 13.53209928
 11.66370937]
MaxEStateIndex: [11.7434253  11.32446192 11.52176603 ... 10.90204664 13.53209928
 11.66370937]
MinAbsEStateIndex: [0.06128858 0.12444444 0.02528959 ... 0.00703704 0.10804096 0.02150639]
MinEStateIndex: [-4.4879308  -0.95902032 -1.25269059 ... -1.14557681 -4.3271315
 -1.11404429]
qed: [0.21697133 0.65195651 0.62402199 ... 0.60956983 0.04694963 0.61904855]
SPS: [25.91666667 25.57142857 25.33333333 ... 23.24137931 35.56521739
 20.13114754]
MolWt: [362.367 298.383 295.359 ... 237.211 866.987 279.296]
HeavyAtomMolWt: [335.151 272.175 260.164 ... 200.109 576.471 812.555]
ExactMolWt: [362.17190659 298.18925731 295.1663308  ... 622.31866172 866.34080879
 279.12190602]
NumValenceElectrons: [138. 120. 118. 114. 122. 136. 132. 148. 142. 130. 166. 160. 124. 174.
 178. 158. 144. 102. 134. 164. 126. 170.  92. 258. 330. 

In [61]:
correlation_table = data_2d.drop(columns=['Title', 'SMILES']).corr()
correlation_table['IC50']

IC50                 1.000000
MaxAbsEStateIndex    0.054745
MaxEStateIndex       0.054745
MinAbsEStateIndex   -0.044859
MinEStateIndex      -0.055290
                       ...   
fr_thiazole         -0.016763
fr_thiocyan         -0.002380
fr_thiophene        -0.005169
fr_unbrch_alkane     0.008402
fr_urea             -0.002574
Name: IC50, Length: 204, dtype: float64

In [62]:
abs(correlation_table['IC50']).sort_values()

fr_priamide           0.000935
fr_lactam             0.000947
fr_azide              0.001020
SMR_VSA9              0.001183
fr_dihydropyridine    0.001207
                        ...   
fr_Al_OH_noTert       0.163090
fr_Al_OH              0.180396
BCUT2D_CHGHI          0.192025
SPS                   0.207953
IC50                  1.000000
Name: IC50, Length: 204, dtype: float64

In [65]:
correlation_table[abs(correlation_table['IC50']) > 0.1].index

Index(['IC50', 'qed', 'SPS', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt',
       'NumValenceElectrons', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI',
       'BCUT2D_LOGPLOW', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v',
       'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'Kappa1',
       'LabuteASA', 'PEOE_VSA1', 'SMR_VSA1', 'SMR_VSA4', 'SMR_VSA5',
       'SlogP_VSA2', 'SlogP_VSA5', 'TPSA', 'EState_VSA1', 'EState_VSA10',
       'VSA_EState3', 'FractionCSP3', 'HeavyAtomCount', 'NHOHCount', 'NOCount',
       'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumHDonors',
       'NumSaturatedRings', 'MolMR', 'fr_Al_OH', 'fr_Al_OH_noTert'],
      dtype='object')

In [66]:
rows_above_threshold = correlation_table[abs(correlation_table['IC50']) > 0.1].index
columns_names = list(rows_above_threshold) + ['SMILES']
data_2d = data_2d[columns_names]
data_2d.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36015 entries, 0 to 36187
Data columns (total 47 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   IC50                      36015 non-null  float64
 1   qed                       36015 non-null  float64
 2   SPS                       36015 non-null  float64
 3   MolWt                     36015 non-null  float64
 4   HeavyAtomMolWt            36015 non-null  float64
 5   ExactMolWt                36015 non-null  float64
 6   NumValenceElectrons       36015 non-null  float64
 7   BCUT2D_CHGHI              36015 non-null  float64
 8   BCUT2D_CHGLO              36015 non-null  float64
 9   BCUT2D_LOGPHI             36015 non-null  float64
 10  BCUT2D_LOGPLOW            36015 non-null  float64
 11  Chi0                      36015 non-null  float64
 12  Chi0n                     36015 non-null  float64
 13  Chi0v                     36015 non-null  float64
 14  Chi1       

для каждого дескриптора удалены строки с пустыми значениями, столбцы с единственными уникальными значениями, посчитана корреляция и оставленны тольколь около 40 самых коррелирующих столбцов

In [67]:
data_2d.to_excel('data_2d.xlsx')