In [1]:
import pandas as pd

In [2]:
import deepchem as dc

# Cargo dataset

In [3]:
# The first CSV column has no header and would otherwise load as a spurious
# "Unnamed: 0" column (presumably a pandas index saved by an earlier step);
# read it as the index so the frame only holds the real data columns.
df = pd.read_csv('data/acetylcholinesterase_02_bioactivity_data_preprocessed.csv', index_col=0)

In [4]:
# Preview the loaded bioactivity table; pandas elides the middle rows of a long frame.
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,standard_value_norm,pIC50
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,750.0,750.0,6.124939
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,100.0,100.0,7.000000
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,50000.0,50000.0,4.301030
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,300.0,300.0,6.522879
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,800.0,800.0,6.096910
...,...,...,...,...,...
5098,CHEMBL4645476,CN(C)C(=O)Oc1ccc(C(O)CNC2CCCCC2)cc1.Cl,266000.0,266000.0,3.575118
5099,CHEMBL4645659,COc1ccc(CCC(=O)Nc2nc(-c3cc4ccccc4oc3=O)cs2)cc1OC,740.0,740.0,6.130768
5100,CHEMBL513063,COc1ccc(-c2csc(NC(=O)CCN3CCCC3)n2)cc1,510.0,510.0,6.292430
5101,CHEMBL4640608,COc1cc(C2C3=C(CCCC3=O)NC3=C2C(=O)CCC3)ccc1OCc1...,125000.0,125000.0,3.903090


# Pongo el dataset en el formato que deepchem necesita
Quizás acá se podía descargar directamente en ese formato.

In [5]:
# Wrap the raw columns in a DeepChem NumpyDataset: the SMILES strings serve as
# both the features (X) and the sample ids, and pIC50 is the regression target.
smiles = df['canonical_smiles'].values
pic50 = df['pIC50'].values
dataset = dc.data.NumpyDataset(X=smiles, y=pic50, ids=smiles)

# Divido en train y test
Estrictamente, faltaría también un conjunto de validación.

In [6]:
# Splitter object used below to divide the dataset into train and test subsets.
splitter = dc.splits.RandomSplitter()

In [7]:
# Pass a fixed seed so the same molecules land in train and test on every run;
# without it the random split (and therefore the reported scores) changes each
# time the notebook is re-executed.
train, test = splitter.train_test_split(dataset, seed=42)

# Genero el diccionario de SMILES que necesita el modelo para construir el embedding

In [8]:
# Build the character dictionary from the full dataset (train + test). Both
# returned values are passed to the TextCNNModel constructor further down; the
# second one is presumably the padded sequence length - TODO confirm.
# NOTE(review): `lenght` is a misspelling of `length`; it is kept because the
# model-construction cell references this exact name.
smiles_dict, lenght = dc.models.TextCNNModel.build_char_dict(dataset)

In [9]:
# Show the token -> integer index mapping that was just built.
smiles_dict

{'#': 1,
 '(': 2,
 ')': 3,
 '+': 4,
 '-': 5,
 '/': 6,
 '1': 7,
 '2': 8,
 '3': 9,
 '4': 10,
 '5': 11,
 '6': 12,
 '7': 13,
 '8': 14,
 '=': 15,
 'C': 16,
 'F': 17,
 'H': 18,
 'I': 19,
 'N': 20,
 'O': 21,
 'P': 22,
 'S': 23,
 '[': 24,
 '\\': 25,
 ']': 26,
 '_': 27,
 'c': 28,
 'Cl': 29,
 'Br': 30,
 'n': 31,
 'o': 32,
 's': 33,
 '@': 34,
 '.': 35,
 'a': 36,
 'B': 37,
 'e': 38,
 'i': 39}

# Instancio el modelo

In [10]:
# TextCNN in regression mode: 128-dimensional embedding, convolution kernel
# sizes 3/4/5 with 128 filters each, dropout disabled, Adam optimizer with its
# default settings, and batches of 128 samples.
model = dc.models.TextCNNModel(
    1,
    smiles_dict,
    lenght,
    n_embedding=128,
    kernel_sizes=[3, 4, 5],
    num_filters=[128, 128, 128],
    dropout=0.0,
    mode='regression',
    batch_size=128,
    log_frequency=5,
    optimizer=dc.models.optimizers.Adam(),
)

In [11]:
# Print the layer-by-layer summary of the Keras model wrapped by the DeepChem model.
model.model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 248)]        0                                            
__________________________________________________________________________________________________
dtnn_embedding (DTNNEmbedding)  (None, 248, 128)     5120        input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 246, 128)     49280       dtnn_embedding[0][0]             
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 245, 128)     65664       dtnn_embedding[0][0]             
______________________________________________________________________________________________

In [12]:
# Needed to see validation results while training is running: the callback
# scores `test` with r2_score every 20 training steps.
# NOTE(review): the same `test` split is also used for the final evaluation;
# a separate validation split would keep the test set untouched during training.
vc = dc.models.ValidationCallback(test, 20, dc.metrics.r2_score)

# Entreno

In [13]:
# Train for 100 epochs on the training split, reporting progress through the
# validation callback; the value returned by fit is shown as the cell output.
model.fit(train, nb_epoch=100, callbacks=[vc])

Step 20 validation: metric-1=-0.228998
Step 40 validation: metric-1=0.219221
Step 60 validation: metric-1=0.279928
Step 80 validation: metric-1=0.319328
Step 100 validation: metric-1=0.35241
Step 120 validation: metric-1=0.390695
Step 140 validation: metric-1=0.427813
Step 160 validation: metric-1=0.450775
Step 180 validation: metric-1=0.50461
Step 200 validation: metric-1=0.511492
Step 220 validation: metric-1=0.543713
Step 240 validation: metric-1=0.570686
Step 260 validation: metric-1=0.603076
Step 280 validation: metric-1=0.605956
Step 300 validation: metric-1=0.614017
Step 320 validation: metric-1=0.636365
Step 340 validation: metric-1=0.655272
Step 360 validation: metric-1=0.671507
Step 380 validation: metric-1=0.667063
Step 400 validation: metric-1=0.664802
Step 420 validation: metric-1=0.681529
Step 440 validation: metric-1=0.689974
Step 460 validation: metric-1=0.692201
Step 480 validation: metric-1=0.68003
Step 500 validation: metric-1=0.687827
Step 520 validation: metric-1=0

0.19754068851470946

# Evalúo

In [14]:
# R^2 on the training split (how well the model fits the data it was trained on).
model.evaluate(train, dc.metrics.r2_score)

{'metric-1': 0.9370228493631341}

In [15]:
# R^2 on the held-out test split; compare against the training score to gauge overfitting.
model.evaluate(test, dc.metrics.r2_score)

{'metric-1': 0.6967739933221686}