# 第4回IT創薬コンテスト
## Sirtuin 1 (SIRT1) に対する高活性化合物をEnamine社の化合物データベース(2,543,736件)から探索する

## SVRでモデリングを行う
### Tanimoto kernelを使用(2化合物間で値が一致する記述子の数 k と記述子の総数 d から k / (2d − k) として計算)
### C, epsilonはグリッドサーチで最適化

In [1]:
import numpy as np
import pandas as pd

from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

Descriptor table

In [2]:
# Load the tab-separated descriptor table for the sirtuin IC50 data set.
# The separator is passed by keyword: pandas 2.0 made every read_csv argument
# after the path keyword-only, so a positional '\t' raises a TypeError there.
df_bioactivity_rdkit_MGFP_SIRT_IC50 = pd.read_csv(
    '../../data/dataset/rdkit/descriptor_table_sirtuin_IC50_rdkit_MGFP4.txt',
    sep='\t',
)

# Bare expression so the notebook renders a preview of the loaded table.
df_bioactivity_rdkit_MGFP_SIRT_IC50

Unnamed: 0,CMPD_CHEMBLID,STANDARD_TYPE,STANDARD_VALUE,PREF_NAME,mgfp1,mgfp2,mgfp3,mgfp4,mgfp5,mgfp6,...,mgfp1015,mgfp1016,mgfp1017,mgfp1018,mgfp1019,mgfp1020,mgfp1021,mgfp1022,mgfp1023,mgfp1024
0,CHEMBL1255034,pIC50,2.752763,NAD-dependent deacetylase sirtuin 1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,CHEMBL3311074,pIC50,4.114074,NAD-dependent deacetylase sirtuin 1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL3311082,pIC50,4.542118,NAD-dependent deacetylase sirtuin 1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL3805929,pIC50,6.316053,NAD-dependent deacetylase sirtuin 1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL3805929,pIC50,7.804100,NAD-dependent deacetylase sirtuin 2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,CHEMBL3805929,pIC50,5.913640,NAD-dependent deacetylase sirtuin 3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,CHEMBL3805107,pIC50,5.518557,NAD-dependent deacetylase sirtuin 1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,CHEMBL3805107,pIC50,6.982967,NAD-dependent deacetylase sirtuin 2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,CHEMBL3805107,pIC50,5.252588,NAD-dependent deacetylase sirtuin 3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,CHEMBL2332040,pIC50,7.508638,NAD-dependent deacetylase sirtuin 1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Target(NAD-dependent deacetylase sirtuin 1)に対する活性データの抽出

In [3]:
# Boolean row mask selecting the measurements made against SIRT1. It is
# computed once and shared so the target and the features stay row-aligned.
sirt1_rows = (
    df_bioactivity_rdkit_MGFP_SIRT_IC50['PREF_NAME'] == 'NAD-dependent deacetylase sirtuin 1'
)

# Regression target: STANDARD_VALUE of the SIRT1 rows as a NumPy array.
# `.loc` / `.values` are used because `.ix` and `.as_matrix()` were removed
# in pandas 1.0; both replacements also work on older pandas releases.
y = df_bioactivity_rdkit_MGFP_SIRT_IC50.loc[sirt1_rows, 'STANDARD_VALUE'].values

# Feature table: the same rows with the identifier / label columns dropped,
# kept as a DataFrame so column names remain available.
df_X = df_bioactivity_rdkit_MGFP_SIRT_IC50.loc[sirt1_rows, :].drop(
    ['CMPD_CHEMBLID', 'STANDARD_TYPE', 'STANDARD_VALUE', 'PREF_NAME'], axis=1)

Training data、test dataにデータを分割

In [4]:
# Hold out 10% of the samples as a test set; random_state is fixed so the
# split is reproducible. `.values` replaces `DataFrame.as_matrix()`, which
# was removed in pandas 1.0.
train_X, test_X, train_y, test_y = train_test_split(df_X.values, y, test_size=0.1, random_state=0)

## Kernel functionにTanimoto kernelを用いた場合

Training dataを用いてSVRモデルを構築

In [5]:
def tanimoto_kernel(X, Y, chunk_size=256):
    """Return the Tanimoto kernel matrix between the rows of X and Y.

    For every pair of rows the number of positions holding equal values,
    ``k``, is counted (positions where both rows are 0 count as matches),
    and the similarity is ``k / (2 * d - k)`` with ``d`` the number of
    columns.

    Parameters
    ----------
    X : array-like of shape (n, d)
        Descriptor rows indexing the rows of the result.
    Y : array-like of shape (m, d)
        Descriptor rows indexing the columns of the result.
    chunk_size : int, default 256
        Number of rows of X compared against Y at once. It only bounds the
        size of the temporary comparison array; the result does not depend
        on it.

    Returns
    -------
    numpy.ndarray of shape (n, m)
        Floating-point kernel values.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    X = np.asarray(X)
    Y = np.asarray(Y)
    n_features = X.shape[1]

    # Comparing all rows at once would materialise an (n, m, d) boolean
    # array; working through X in slices keeps the temporary at
    # (chunk_size, m, d) while producing the same match counts.
    matches = np.empty((X.shape[0], Y.shape[0]), dtype=np.int64)
    for start in range(0, X.shape[0], chunk_size):
        stop = start + chunk_size
        matches[start:stop] = (
            X[start:stop, np.newaxis, :] == Y[np.newaxis, :, :]
        ).sum(axis=2)

    return matches / (n_features * 2 - matches)

# Hyper-parameter candidates on a base-2 grid:
# C from 2^1 to 2^9 and epsilon from 2^-10 to 2^-2.
parameters = {
    'C': [2**exponent for exponent in range(1, 10)],
    'epsilon': [2**exponent for exponent in range(-10, -1)],
}

# The SVR is configured for a precomputed kernel, so it is fitted on the
# train-vs-train kernel matrix rather than on the raw descriptors.
train_gram = tanimoto_kernel(train_X, train_X)
svr = SVR(kernel='precomputed')
clf = GridSearchCV(estimator=svr, param_grid=parameters)
clf.fit(train_gram, np.squeeze(train_y))

GridSearchCV(cv=None, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='precomputed', max_iter=-1, shrinking=True, tol=0.001,
  verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [2, 4, 8, 16, 32, 64, 128, 256, 512], 'epsilon': [0.0009765625, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

予測精度を評価

In [6]:
# Predict the training samples with the best estimator found by the grid
# search; the kernel is train rows against train columns.
train_kernel = tanimoto_kernel(train_X, train_X)
pred_y = clf.best_estimator_.predict(train_kernel)

# Goodness-of-fit on the training data: R2, RMSE (square root of MSE), MAE.
train_target = np.squeeze(train_y)
train_r2 = r2_score(train_target, pred_y)
train_rmse = mean_squared_error(train_target, pred_y)**0.5
train_mae = mean_absolute_error(train_target, pred_y)

print('Train R2   : %.3f' % train_r2)
print('     RMSE  : %.3f' % train_rmse)
print('      MAE  : %.3f' % train_mae)

Train R2   : 0.909
     RMSE  : 0.317
      MAE  : 0.184


In [7]:
# Predict the held-out samples: the kernel has test samples as rows and
# training samples as columns, matching the matrix the model was fitted on.
test_kernel = tanimoto_kernel(test_X, train_X)
pred_y = clf.best_estimator_.predict(test_kernel)

# Predictive performance on the test data: R2, RMSE (square root of MSE), MAE.
test_target = np.squeeze(test_y)
test_r2 = r2_score(test_target, pred_y)
test_rmse = mean_squared_error(test_target, pred_y)**0.5
test_mae = mean_absolute_error(test_target, pred_y)

print('Test R2   : %.3f' % test_r2)
print('    RMSE  : %.3f' % test_rmse)
print('     MAE  : %.3f' % test_mae)

Test R2   : 0.712
    RMSE  : 0.489
     MAE  : 0.355
