In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVR 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, precision_recall_curve, auc

# Fractions of the data used for training in the experiments below
# (each value is passed as `rate` to evaluate_regression, which holds out
# the remaining 1 - rate for testing): 1/80, 2/80, 4/80, ..., 64/80.
rates = np.power(2, np.arange(7)) / 80
print(rates)

[0.0125 0.025  0.05   0.1    0.2    0.4    0.8   ]


In [None]:
# Mount Google Drive at /content/drive; the vocabulary, checkpoint and CSV
# paths used by later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def get_inputs(sm):
    """Turn one whitespace-separated token string into fixed-length id lists.

    Args:
        sm: String of tokens separated by whitespace.

    Returns:
        A pair ``(ids, seg)`` of Python lists, each of length 220:
        ``ids`` holds ``sos_index``, the vocabulary id of every token
        (``unk_index`` for tokens missing from ``vocab.stoi``), ``eos_index``
        and then ``pad_index`` padding; ``seg`` holds 1 for every non-padding
        position and ``pad_index`` for the padding.

    Inputs with more than 218 tokens are reported on stdout and shortened to
    their first 109 and last 109 tokens so that, with the two boundary
    markers, they fit the fixed length.
    """
    seq_len = 220
    tokens = sm.split()
    n_tokens = len(tokens)
    if n_tokens > 218:
        print('SMILES is too long ({:d})'.format(n_tokens))
        # Keep the head and the tail of the sequence, drop the middle.
        tokens = tokens[:109] + tokens[-109:]

    ids = [sos_index]
    ids += [vocab.stoi.get(token, unk_index) for token in tokens]
    ids.append(eos_index)

    seg = [1] * len(ids)
    n_pad = seq_len - len(ids)
    ids += [pad_index] * n_pad
    seg += [pad_index] * n_pad
    return ids, seg

def get_array(smiles):
    """Encode a collection of token strings into two tensors.

    Each element of ``smiles`` is passed through ``get_inputs``; the
    resulting id lists and segment lists are stacked row by row.

    Returns:
        ``(ids_tensor, seg_tensor)``, both built with ``torch.tensor`` and
        containing one row per input element, in input order.
    """
    encoded = [get_inputs(sm) for sm in smiles]
    x_id = [ids for ids, _ in encoded]
    x_seg = [seg for _, seg in encoded]
    return torch.tensor(x_id), torch.tensor(x_seg)

### ECFP4

In [None]:
pip install rdkit


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem

def bit2np(bitvector):
    """Convert a bit vector into a NumPy array of integers.

    ``bitvector`` must expose ``ToBitString()`` returning a string of digit
    characters; each character becomes one integer element of the result.
    """
    return np.array([int(bit) for bit in bitvector.ToBitString()])

def extract_morgan(smiles, targets, radius=2, n_bits=1024):
    """Compute Morgan bit-vector fingerprints for the parseable SMILES.

    Args:
        smiles: Iterable of SMILES strings.
        targets: Iterable of target values, paired with ``smiles`` by position.
        radius: Morgan fingerprint radius (default 2, i.e. similar to ECFP4).
        n_bits: Length of the folded bit vector (default 1024).

    Returns:
        ``(kept_smiles, X, y)`` where ``kept_smiles`` is the list of SMILES
        that RDKit could parse, ``X`` is a NumPy array with one fingerprint
        row per kept molecule and ``y`` is a NumPy array of the matching
        targets.

    SMILES that ``Chem.MolFromSmiles`` cannot parse are printed and skipped
    together with their target, so the three outputs stay aligned.
    """
    kept_smiles, fingerprints, kept_targets = [], [], []
    for sm, target in zip(smiles, targets):
        mol = Chem.MolFromSmiles(sm)
        if mol is None:
            # Report the offending SMILES and drop the pair.
            print(sm)
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        kept_smiles.append(sm)
        fingerprints.append(bit2np(fp))
        kept_targets.append(target)
    return kept_smiles, np.array(fingerprints), np.array(kept_targets)

### ST, RNN, BERT

In [None]:
cd /content/drive/MyDrive/smiles-transformer-master 2/smiles_transformer

/content/drive/MyDrive/smiles-transformer-master 2/smiles_transformer


In [None]:
import torch
from pretrain_trfm import TrfmSeq2seq
from pretrain_rnn import RNNSeq2Seq
#import BERT
from build_vocab import WordVocab
from utils import split

# Ids of the special tokens used by get_inputs when building model inputs.
# NOTE(review): presumably these must match the special-token order baked
# into vocab.pkl — confirm against build_vocab.
pad_index = 0
unk_index = 1
eos_index = 2
sos_index = 3
mask_index = 4

vocab = WordVocab.load_vocab('/content/drive/MyDrive/smiles-transformer-master 2/smiles_transformer/vocab.pkl')

# Rebuild the pretrained transformer and restore its weights.
# NOTE(review): the positional arguments (vocab size, 256, vocab size, 4)
# must match the configuration the checkpoint was trained with — confirm
# against pretrain_trfm.
trfm = TrfmSeq2seq(len(vocab), 256, len(vocab), 4)
# map_location='cpu' lets the checkpoint load on a runtime without a GPU even
# if it was saved from CUDA tensors; the model is never moved off the CPU in
# this notebook.
trfm.load_state_dict(torch.load('/content/drive/MyDrive/smiles-transformer-master 2/smiles_transformer/trfm_12_23000.pkl', map_location='cpu'))
# Inference mode: the model is only used as a fixed feature extractor below.
trfm.eval()
print('Total parameters:', sum(p.numel() for p in trfm.parameters()))



Total parameters: 4245037


# Evaluation

In [None]:
import tensorflow as tf
from tensorflow import keras
from sklearn.neural_network import MLPRegressor

def evaluate_regression(X, y, rate, n_repeats, model='ridge', random_state=None):
    """Repeatedly fit a regressor on a random subset and score it on the rest.

    For each repeat the data is split with ``train_test_split`` using
    ``test_size = 1 - rate`` (so ``rate`` is the training fraction), a fresh
    regressor is fitted on the training part, and R^2 and RMSE are computed
    on the held-out part.

    Args:
        X: Feature matrix accepted by scikit-learn estimators.
        y: Target values aligned with the rows of ``X``.
        rate: Fraction of the samples used for training.
        n_repeats: Number of independent random splits to evaluate.
        model: One of ``'ridge'``, ``'rf'``, ``'svr'`` or ``'dnn'``.
        random_state: Optional integer seed. When given, repeat ``i`` uses
            ``random_state + i`` for the split and for estimators that accept
            a seed, making the result reproducible. ``None`` (default) keeps
            the original unseeded behavior.

    Returns:
        Dict with keys ``'r2 mean'``, ``'r2 std'``, ``'rmse mean'`` and
        ``'rmse std'`` aggregated over the repeats.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    # Lambdas defer the estimator construction (and the lookup of the
    # estimator classes) until the model name has been validated.
    factories = {
        'ridge': lambda seed: Ridge(random_state=seed),
        'rf': lambda seed: RandomForestRegressor(random_state=seed),
        'svr': lambda seed: SVR(),  # SVR has no random_state parameter
        'dnn': lambda seed: MLPRegressor(hidden_layer_sizes=(128, 64, 64),
                                         activation='relu', solver='adam',
                                         max_iter=7000, random_state=seed),
    }
    if model not in factories:
        valid = ', '.join('"{}"'.format(name) for name in factories)
        raise ValueError('Model "{}" is invalid. Specify one of: {}.'.format(model, valid))

    r2, rmse = np.empty(n_repeats), np.empty(n_repeats)
    for i in range(n_repeats):
        # A different seed per repeat keeps the splits distinct yet reproducible.
        seed = None if random_state is None else random_state + i
        reg = factories[model](seed)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=1 - rate, random_state=seed)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
        r2[i] = r2_score(y_test, y_pred)
        rmse[i] = mean_squared_error(y_test, y_pred) ** 0.5

    return {
        'r2 mean': np.mean(r2),
        'r2 std': np.std(r2),
        'rmse mean': np.mean(rmse),
        'rmse std': np.std(rmse),
    }

## Lipo

In [None]:
# Load the table of molecules; later cells read its 'smiles' column as the
# model input and its 'activity' column as the regression target.
df = pd.read_csv('/content/drive/MyDrive/smiles-transformer-master 2/experiments/all_structures.csv')
print(df.shape)
df.head()

(931, 2)


Unnamed: 0,smiles,activity
0,Cl.N=C(NCCCNCCCCCCCNCCCNC(=N)NC(=N)NCCC(c1cccc...,4.89
1,CC(C)C[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](C...,4.81
2,C#CCNCCCC[C@H](NC(=O)[C@@H](NC(=O)[C@H](CCCNC(...,4.98
3,NNCCc1ccccc1,4.86
4,N[C@@H]1C[C@H]1c1ccccc1,4.56


### ST

In [None]:
# Tokenize every SMILES with the project's `split` helper, turn the token
# strings into padded id tensors, and encode them with the pretrained
# transformer to obtain the feature matrix X used by the experiments below.
x_split = [split(sm) for sm in df['smiles'].values]
# xseg is returned by get_array but not used in this cell.
xid, xseg = get_array(x_split)
# get_array stacks one row per molecule; torch.t transposes that 2-D tensor
# before encoding.
# NOTE(review): presumably trfm.encode expects (seq_len, batch) — confirm
# against pretrain_trfm.
X = trfm.encode(torch.t(xid))
print(X.shape)

SMILES is too long (382)
SMILES is too long (390)
SMILES is too long (378)
SMILES is too long (385)
SMILES is too long (393)
SMILES is too long (383)
SMILES is too long (385)
SMILES is too long (385)
SMILES is too long (379)
SMILES is too long (389)
SMILES is too long (390)
SMILES is too long (394)
SMILES is too long (400)
SMILES is too long (381)
SMILES is too long (389)
SMILES is too long (383)
SMILES is too long (384)
SMILES is too long (389)
SMILES is too long (384)
SMILES is too long (320)
SMILES is too long (246)
SMILES is too long (378)
SMILES is too long (385)
SMILES is too long (392)
SMILES is too long (372)
SMILES is too long (377)
SMILES is too long (409)
SMILES is too long (445)
SMILES is too long (388)
SMILES is too long (383)
SMILES is too long (377)
SMILES is too long (407)
SMILES is too long (370)
SMILES is too long (373)
SMILES is too long (244)
SMILES is too long (245)
SMILES is too long (595)
SMILES is too long (385)
SMILES is too long (390)
SMILES is too long (573)


In [None]:
# ST features + MLP regressor ('dnn'): for every training fraction in
# `rates`, average over 5 random splits, print the full metric dict, and
# finally print the mean of the per-fraction RMSE means.
scores = []
for rate in rates:
    score_dic = evaluate_regression(X, df['activity'].values, rate, 5, model='dnn')
    print(rate, score_dic)
    scores.append(score_dic['rmse mean'])
print(np.mean(scores))

0.0125 {'r2 mean': -0.28692216104921464, 'r2 std': 0.2961968407748827, 'rmse mean': 1.2411557224446328, 'rmse std': 0.14237146877317358}
0.025 {'r2 mean': 0.0008869112732035278, 'r2 std': 0.07679918382265963, 'rmse mean': 1.0993362126175097, 'rmse std': 0.04127629250396949}
0.05 {'r2 mean': -0.09082265006158645, 'r2 std': 0.06850599183416058, 'rmse mean': 1.14774353528348, 'rmse std': 0.03265514141595499}
0.1 {'r2 mean': 0.15047939724843046, 'r2 std': 0.050494625023855745, 'rmse mean': 1.0122907079505388, 'rmse std': 0.03361111949041205}
0.2 {'r2 mean': 0.2984949006227612, 'r2 std': 0.07169983663845937, 'rmse mean': 0.9194584900641676, 'rmse std': 0.04638156234621075}
0.4 {'r2 mean': 0.4989876986651402, 'r2 std': 0.034562269780273776, 'rmse mean': 0.7779930295482297, 'rmse std': 0.027454094433433468}
0.8 {'r2 mean': 0.5919753376336457, 'r2 std': 0.05030060733151411, 'rmse mean': 0.7082906530480938, 'rmse std': 0.030821258477312694}
0.9866097644223789


In [None]:
# ST features + the default model of evaluate_regression ('ridge'): 20 random
# splits per training fraction; prints per-fraction metrics and then the mean
# of the per-fraction RMSE means.
scores = []
for rate in rates:
    score_dic = evaluate_regression(X, df['activity'].values, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['rmse mean'])
print(np.mean(scores))

0.0125 {'r2 mean': -1.8402477250976381, 'r2 std': 2.0465026903221957, 'rmse mean': 1.1222135958245039, 'rmse std': 0.11999111513162691}
0.025 {'r2 mean': -0.4513586262418084, 'r2 std': 0.3892575588266544, 'rmse mean': 1.2041805247231647, 'rmse std': 0.1175231306776083}
0.05 {'r2 mean': -0.16238384070955414, 'r2 std': 0.27880727249102427, 'rmse mean': 1.1693983739947873, 'rmse std': 0.0978309182095059}
0.1 {'r2 mean': 0.03776326295591344, 'r2 std': 0.14621918452651658, 'rmse mean': 1.000758473641287, 'rmse std': 0.050276024235619374}
0.2 {'r2 mean': 0.1890934124544769, 'r2 std': 0.11342018926065335, 'rmse mean': 0.8980676148584873, 'rmse std': 0.03978866939789854}
0.4 {'r2 mean': 0.3224726069993986, 'r2 std': 0.05519794216824727, 'rmse mean': 0.8233744906399927, 'rmse std': 0.0276216490105769}
0.8 {'r2 mean': 0.3824061754891057, 'r2 std': 0.08720717012110286, 'rmse mean': 0.7440444032169008, 'rmse std': 0.04615323054723108}
0.9945767824141604


In [None]:
# ST features + support vector regression ('svr'): 20 random splits per
# training fraction; prints per-fraction metrics and then the mean of the
# per-fraction RMSE means.
scores = []
for rate in rates:
    score_dic = evaluate_regression(X, df['activity'].values, rate, 20, model='svr')
    print(rate, score_dic)
    scores.append(score_dic['rmse mean'])
print(np.mean(scores))

0.0125 {'r2 mean': -26.20938735825556, 'r2 std': 18.94625175510239, 'rmse mean': 1.0960162993919762, 'rmse std': 0.08684031473729982}
0.025 {'r2 mean': -8.104204289561478, 'r2 std': 5.615780969572795, 'rmse mean': 1.0385705813350694, 'rmse std': 0.07758705548994023}
0.05 {'r2 mean': -3.2245751479269367, 'r2 std': 1.7778394115650298, 'rmse mean': 0.9614562517564508, 'rmse std': 0.04853167224689596}
0.1 {'r2 mean': -1.3978682258327844, 'r2 std': 0.5123744801159066, 'rmse mean': 0.9178634840633642, 'rmse std': 0.023244136620381075}
0.2 {'r2 mean': -0.6642538646556142, 'r2 std': 0.25903870319284716, 'rmse mean': 0.866834816971793, 'rmse std': 0.01345790077599399}
0.4 {'r2 mean': -0.2946361594057978, 'r2 std': 0.0964672562223178, 'rmse mean': 0.8256311099477973, 'rmse std': 0.019584563796711668}
0.8 {'r2 mean': -0.08725773276342416, 'r2 std': 0.284580186980647, 'rmse mean': 0.7854133677373, 'rmse std': 0.05924832259331485}
0.9273979873148216


In [None]:
# ST features + random forest ('rf'): 20 random splits per training fraction.
# NOTE: the saved output of this cell ends in a KeyboardInterrupt after the
# 0.1 fraction, so the final mean was never printed in that run.
scores = []
for rate in rates:
    score_dic = evaluate_regression(X, df['activity'].values, rate, 20, model='rf')
    print(rate, score_dic)
    scores.append(score_dic['rmse mean'])
print(np.mean(scores))

0.0125 {'r2 mean': -12.67867333830144, 'r2 std': 5.841628450460784, 'rmse mean': 1.0518998428321837, 'rmse std': 0.056320217199405345}
0.025 {'r2 mean': -5.918409598803441, 'r2 std': 4.7606614734351975, 'rmse mean': 1.0223135199775486, 'rmse std': 0.06307986118472118}
0.05 {'r2 mean': -2.5713872087344307, 'r2 std': 1.320339891850383, 'rmse mean': 0.9552092650351378, 'rmse std': 0.03961727063844697}
0.1 {'r2 mean': -1.0108887207959583, 'r2 std': 0.34033217903561014, 'rmse mean': 0.8794538284888658, 'rmse std': 0.02035092140373427}


KeyboardInterrupt: ignored

### ECFP

In [None]:
# Load a precomputed table and split it by column position: the first 203
# columns become the feature matrix, column 203 becomes the target.
# NOTE: this rebinds the global X that previously held the ST features.
# NOTE(review): the meaning of the 203 feature columns is not visible here —
# confirm they are the intended descriptors for this section.
dataset = pd.read_csv('/content/drive/MyDrive/smiles-transformer-master 2/dataset.csv')
X = dataset.iloc[:,:203]
y = dataset.iloc[:,203]

In [None]:
# Sanity check: number of target values loaded from dataset.csv.
y.shape

(930,)

In [None]:
# Single fixed split (seed 42): test_size=0.8 means only 20% of the rows are
# used for training and the remaining 80% for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)
reg = SVR()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_pred, y_test)**0.5
# Only the RMSE is displayed; r2 is computed but not shown by this cell.
rmse

0.784212305094353

In [None]:
import xgboost as xgb
# Fit an XGBoost regressor with default settings on the same
# X_train/X_test split created in the SVR cell. This overwrites the global
# reg, y_pred, r2 and rmse from that cell; nothing is displayed here.
reg = xgb.XGBRegressor()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_pred, y_test)**0.5




In [None]:
# Display the current value of r2. Both the SVR and the XGBoost cells assign
# it, so the number shown belongs to whichever of them ran most recently.
r2

0.41678945087543673