In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [6]:
# Labelled molecules: 80 % are used for training and 20 % are held out for
# validation; the seed is fixed so the split is reproducible.
df_data = pd.read_csv('./mol_train.csv')
df_train, df_val = train_test_split(
    df_data,
    test_size=0.2,
    random_state=42,
)

# The test molecules are read from their own CSV file.
df_test = pd.read_csv('./mol_test.csv')

# Row counts of the three frames, followed by the test frame itself.
print(*(len(frame) for frame in (df_train, df_val, df_test)))
print(df_test)

560 140 367
                                                SMILES  target_cns
0    CC(CCC(=O)O)C1CCC2C3C(CC(=O)C12C)C4(C)CCC(=O)C...           0
1         CC(=O)c1ccc2c(c1)Sc3ccccc3N2CCCN4CCN(CC4)CCO           0
2    CCCN(CCC)C(=O)C(CCC(=O)OCCCN1CCN(CCOC(=O)Cc2c(...           0
3    CC(C)CCCC(C)CCCC(C)CCCC1(C)CCc2c(C)c(O)c(C)c(C...           0
4                         CCCN(CCC)CCc1cccc2c1CC(=O)N2           0
..                                                 ...         ...
362  CC(C)(C)C(=O)OCOP(=O)(COCCn1cnc2c(N)ncnc12)OCO...           0
363  COC1=CC=C2C(=CC1=O)C(CCc3cc(OC)c(OC)c(OC)c23)N...           0
364                CC(CN1c2ccccc2Sc3c1cc(cc3)OC)CN(C)C           0
365  [H][C@]12SCC(C)=C(N1C(=O)[C@H]2NC(=O)[C@H](N)C...           0
366                    Cc1onc(NS(=O)(=O)c2ccc(N)cc2)c1           0

[367 rows x 2 columns]


In [10]:
def _morgan_fp_matrix(smiles_seq, radius=2, n_bits=1024):
    """Turn an iterable of SMILES strings into a Morgan-fingerprint matrix.

    Parameters
    ----------
    smiles_seq : iterable of str
        SMILES strings, one per molecule.
    radius : int, default 2
        Radius passed to ``GetMorganFingerprintAsBitVect``.
    n_bits : int, default 1024
        Length of each bit vector (``nBits``).

    Returns
    -------
    numpy.ndarray
        One row per molecule and one column per fingerprint bit.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    fingerprints = []
    for smi in smiles_seq:
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending string instead of inside the fingerprint call.
        if mol is None:
            raise ValueError(f"RDKit could not parse SMILES: {smi!r}")
        fingerprints.append(GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits))
    return np.array(fingerprints)


# Featurise both splits with the same settings (radius 2, 1024 bits).
X_fp_train = _morgan_fp_matrix(df_train['SMILES'])
X_fp_val = _morgan_fp_matrix(df_val['SMILES'])
# Train and validation rows stacked, in that order.
X_fp = np.concatenate((X_fp_train, X_fp_val), axis=0)
print(X_fp_train.shape, X_fp_val.shape)

(560, 1024) (140, 1024)


In [11]:
# Fraction of entries in the stacked fingerprint matrix that are zero.
np.mean(X_fp == 0)

0.9609919084821429

In [14]:
# Fit a random-forest regressor on the training fingerprints and predict the
# target for the validation molecules.
# NOTE(review): the estimator is bound to the name `ridge` although it is a
# RandomForestRegressor; the name is kept so that any later cell referring to
# `ridge` keeps working.
ridge = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=1)
ridge.fit(X_fp_train, df_train['target_cns'])
y_pred = ridge.predict(X_fp_val)

# Show only the validation predictions. X_fp is the stacked train+validation
# feature matrix, so it does not line up row-for-row with y_pred and is not
# printed alongside it.
print(y_pred)


[0.06290328 0.99665251 0.10835096 0.93038157 0.11702461 0.16196984
 0.94965432 0.06153611 0.85678072 0.69818447 0.07040305 0.29462783
 0.12527821 0.89170594 0.22588741 0.73       0.64344218 0.09050956
 0.07830873 0.0739066  0.18899242 0.20388454 0.080295   0.0975402
 0.08075076 0.10988778 0.26692391 0.11234487 0.37283398 0.31674992
 0.93699174 0.09980694 0.08084851 0.06328632 0.22988489 0.84779621
 0.1176768  0.14489258 0.06282411 0.12415469 0.11047901 0.12070345
 0.12961555 0.11333502 0.10239737 0.25213906 0.17072109 0.89369288
 0.07789537 0.97665251 0.95590625 0.60743473 0.3913016  0.93601641
 0.98351106 0.36936936 0.20671792 0.5224796  0.45899284 0.77963975
 0.26833025 0.18888406 0.20687319 0.18472102 0.06333795 0.11238515
 0.9772862  0.34878604 0.23366678 0.20974002 0.74824166 0.81887124
 0.12505971 0.07515587 0.13873146 0.92345374 0.99665251 0.09247534
 0.16711265 0.14894792 0.07553401 0.89264837 0.19062051 0.41525619
 0.1008509  0.13752176 0.97665251 0.80616271 0.29292753 0.16860