# 第4回IT創薬コンテスト
## Sirtuin 1 (SIRT1) に対する高活性化合物をEnamine社の化合物データベース(2,543,736件)から探索する

## RDKit で計算した記述子から、解析用tableを作成する

### python 3.5.3, rdkit 2017.03.1

In [1]:
import numpy as np
import pandas as pd

from rdkit import Chem

Bioactivity table および RDKit Morgan Fingerprint 記述子ファイルのパス

In [2]:
# Input file locations, relative to this notebook's working directory.
DATASET_DIR = '../../data/dataset'

# Bioactivity table (read as tab-separated text further down).
txt_bioactivity_SIRT_IC50 = DATASET_DIR + '/bioactivity_table_sirtuin_IC50.txt'

# RDKit Morgan fingerprint table (read as tab-separated text further down).
txt_rdkit_MGFP4_SIRT_IC50 = DATASET_DIR + '/rdkit/rdkit_MGFP_1024_m4_sirtuin_IC50.txt'

活性データの読み込み

In [3]:
# Load the tab-separated bioactivity table into a DataFrame.
df_bioactivity_SIRT_IC50 = pd.read_csv(
    filepath_or_buffer=txt_bioactivity_SIRT_IC50,
    sep='\t',
)

# Preview the first rows as the cell output.
df_bioactivity_SIRT_IC50.head()

Unnamed: 0,CMPD_CHEMBLID,MOLREGNO,PARENT_CMPD_CHEMBLID,PARENT_MOLREGNO,MOL_PREF_NAME,COMPOUND_KEY,MOLWEIGHT,ALOGP,PSA,NUM_RO5_VIOLATIONS,...,DOC_CHEMBLID,PUBMED_ID,JOURNAL,YEAR,VOLUME,ISSUE,FIRST_PAGE,CELL_ID,CELL_CHEMBL_ID,CELL_NAME
0,CHEMBL1255034,704504,CHEMBL1255034,704504,,74,465.34,-0.73,142.69,0.0,...,CHEMBL1250463,20630764.0,Bioorg. Med. Chem.,2010.0,18.0,15.0,5616.0,,,
1,CHEMBL3311074,1780180,CHEMBL3311074,1780180,,35,312.35,2.79,93.69,0.0,...,CHEMBL3351989,24880902.0,Bioorg. Med. Chem. Lett.,2014.0,24.0,14.0,3050.0,,,
2,CHEMBL3311082,1780188,CHEMBL3311082,1780188,,48,326.37,3.58,93.69,0.0,...,CHEMBL3351989,24880902.0,Bioorg. Med. Chem. Lett.,2014.0,24.0,14.0,3050.0,,,
3,CHEMBL3805929,2085823,CHEMBL3805929,2085823,,29,387.39,1.68,122.99,0.0,...,CHEMBL3804824,26982234.0,J. Med. Chem.,2016.0,59.0,7.0,2928.0,,,
4,CHEMBL3805107,2086096,CHEMBL3805107,2086096,,81,390.44,2.19,106.33,0.0,...,CHEMBL3804824,26982234.0,J. Med. Chem.,2016.0,59.0,7.0,2928.0,,,


## RDKit Morgan Fingerprint

記述子データの読み込み

In [4]:
# Load the tab-separated Morgan fingerprint table; the literal token 'na'
# is additionally parsed as a missing value (NaN).
df_rdkit_MGFP4_SIRT_IC50 = pd.read_csv(
    filepath_or_buffer=txt_rdkit_MGFP4_SIRT_IC50,
    sep='\t',
    na_values='na',
)

# Preview the first rows as the cell output.
df_rdkit_MGFP4_SIRT_IC50.head()

Unnamed: 0,CHEMBLID,mgfp1,mgfp2,mgfp3,mgfp4,mgfp5,mgfp6,mgfp7,mgfp8,mgfp9,...,mgfp1015,mgfp1016,mgfp1017,mgfp1018,mgfp1019,mgfp1020,mgfp1021,mgfp1022,mgfp1023,mgfp1024
0,CHEMBL386248,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL3220463,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL3236708,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,CHEMBL1797749,0,0,1,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,CHEMBL215427,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


活性値と記述子の紐付け

In [5]:
# Attach the Morgan fingerprint bits to each activity record by matching the
# bioactivity table's CMPD_CHEMBLID against the descriptor table's CHEMBLID.
# pd.merge defaults to an inner join, so rows without a partner on the other
# side are dropped.
# `.loc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in
# pandas 1.0; the column selection here is purely label-based, so the result
# is the same.
df_bioactivity_rdkit_MGFP4_SIRT_IC50 = pd.merge(
    df_bioactivity_SIRT_IC50.loc[:, ['CMPD_CHEMBLID', 'STANDARD_TYPE', 'STANDARD_VALUE', 'PREF_NAME']],
    df_rdkit_MGFP4_SIRT_IC50, left_on='CMPD_CHEMBLID', right_on='CHEMBLID'
).drop(['CHEMBLID'], axis=1)  # CHEMBLID is redundant with CMPD_CHEMBLID after the join

# NOTE(review): the preview shows the same compound with PREF_NAME values for
# both sirtuin 1 and sirtuin 2, so other isoforms are kept in this table --
# confirm whether filtering to SIRT1 happens downstream.
df_bioactivity_rdkit_MGFP4_SIRT_IC50.head()

Unnamed: 0,CMPD_CHEMBLID,STANDARD_TYPE,STANDARD_VALUE,PREF_NAME,mgfp1,mgfp2,mgfp3,mgfp4,mgfp5,mgfp6,...,mgfp1015,mgfp1016,mgfp1017,mgfp1018,mgfp1019,mgfp1020,mgfp1021,mgfp1022,mgfp1023,mgfp1024
0,CHEMBL1255034,pIC50,2.752763,NAD-dependent deacetylase sirtuin 1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,CHEMBL3311074,pIC50,4.114074,NAD-dependent deacetylase sirtuin 1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL3311082,pIC50,4.542118,NAD-dependent deacetylase sirtuin 1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL3805929,pIC50,6.316053,NAD-dependent deacetylase sirtuin 1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL3805929,pIC50,7.8041,NAD-dependent deacetylase sirtuin 2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


DataFrameの出力

In [6]:
# Write the merged activity + fingerprint table as tab-separated text,
# omitting the DataFrame index column.
df_bioactivity_rdkit_MGFP4_SIRT_IC50.to_csv(
    path_or_buf='../../data/dataset/rdkit/descriptor_table_sirtuin_IC50_rdkit_MGFP4.txt',
    index=False,
    sep='\t',
)