In [1]:
import pandas as pd

In [2]:
import deepchem as dc

In [3]:
# One-time setup: uncomment the line below to install DeepChem if it is not already available.
# ! pip install deepchem --user

# Cargo el dataset

In [4]:
# Load the preprocessed acetylcholinesterase bioactivity table
# (one row per compound, with its SMILES string and pIC50 value).
BIOACTIVITY_CSV = 'data/acetylcholinesterase_02_bioactivity_data_preprocessed.csv'
df = pd.read_csv(BIOACTIVITY_CSV)

In [5]:
# Display the loaded table to check its columns and size.
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,standard_value_norm,pIC50
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,750.0,750.0,6.124939
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,100.0,100.0,7.000000
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,50000.0,50000.0,4.301030
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,300.0,300.0,6.522879
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,800.0,800.0,6.096910
...,...,...,...,...,...
5659,CHEMBL4779440,Brc1ccc([N+]2=Cc3ccccc3CC2)cc1.[Br-],10500.0,10500.0,4.978811
5660,CHEMBL417799,C[n+]1cc2c3c(ccc2c2ccc4cc5c(cc4c21)OCO5)OCO3,1220.0,1220.0,5.913640
5661,CHEMBL13045,COc1ccc2c(c[n+](C)c3c4cc5c(cc4ccc23)OCO5)c1OC,1450.0,1450.0,5.838632
5662,CHEMBL3085398,C=CC(C)(C)c1cc([C@@H]2CC(=O)c3c(O)cc(O)c(CC=C(...,28980.0,28980.0,4.537902


# Pongo el dataset en el formato que deepchem necesita
Quizás acá se podía descargar directamente en ese formato.

In [6]:
# Wrap the table in a DeepChem dataset: the SMILES strings serve both as the
# inputs (X) and as the sample ids, and pIC50 is the regression target (y).
smiles = df['canonical_smiles'].values
dataset = dc.data.NumpyDataset(X=smiles, y=df['pIC50'].values, ids=smiles)

# Divido en train y test
Estrictamente, faltaría un conjunto de validación.

In [7]:
# Random splitter; used below by `train_test_split` to partition the dataset.
splitter = dc.splits.RandomSplitter()

In [8]:
# Hold out part of the data for testing. A fixed seed makes the random split
# (and therefore every metric reported below) reproducible across kernel
# restarts; without it each run trains and evaluates on different molecules.
train, test = splitter.train_test_split(dataset, seed=42)

# Genero el diccionario de SMILES que necesita el modelo para construir el embedding

In [9]:
# Build the character -> integer-index vocabulary from the dataset's SMILES,
# together with a sequence length; both are passed to TextCNNModel below.
# NOTE(review): `lenght` is a misspelling of "length"; the name is kept because
# the model-construction cell refers to it.
# NOTE(review): presumably the length is derived from the longest SMILES in the
# dataset -- confirm against the DeepChem documentation.
smiles_dict, lenght = dc.models.TextCNNModel.build_char_dict(dataset)

In [10]:
# Inspect the generated character vocabulary.
smiles_dict

{'#': 1,
 '(': 2,
 ')': 3,
 '+': 4,
 '-': 5,
 '/': 6,
 '1': 7,
 '2': 8,
 '3': 9,
 '4': 10,
 '5': 11,
 '6': 12,
 '7': 13,
 '8': 14,
 '=': 15,
 'C': 16,
 'F': 17,
 'H': 18,
 'I': 19,
 'N': 20,
 'O': 21,
 'P': 22,
 'S': 23,
 '[': 24,
 '\\': 25,
 ']': 26,
 '_': 27,
 'c': 28,
 'Cl': 29,
 'Br': 30,
 'n': 31,
 'o': 32,
 's': 33,
 '@': 34,
 '.': 35,
 'a': 36,
 'B': 37,
 'e': 38,
 'i': 39}

# Instancio el modelo

In [11]:
# Single-task text-CNN regressor over SMILES strings: 128-dimensional character
# embedding, convolutions with kernel sizes 3, 4 and 5 (128 filters each) and
# dropout disabled.
model = dc.models.TextCNNModel(
    n_tasks=1,
    char_dict=smiles_dict,
    seq_length=lenght,
    n_embedding=128,
    kernel_sizes=[3, 4, 5],
    num_filters=[128, 128, 128],
    dropout=0.,
    mode='regression',
    batch_size=128,
    log_frequency=5,
    optimizer=dc.models.optimizers.Adam(),
)

In [12]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the underlying model.
model.model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 248)]        0           []                               
                                                                                                  
 dtnn_embedding (DTNNEmbedding)  (None, 248, 128)    5120        ['input_1[0][0]']                
                                                                                                  
 conv1d (Conv1D)                (None, 246, 128)     49280       ['dtnn_embedding[0][0]']         
                                                                                                  
 conv1d_1 (Conv1D)              (None, 245, 128)     65664       ['dtnn_embedding[0][0]']         
                                                                                              

In [13]:
# Needed to see results while the model trains: every 20 steps the model is
# scored with R^2 on `test` and the value is logged.
vc = dc.models.ValidationCallback(dataset=test, interval=20, metrics=dc.metrics.r2_score)

# Entreno

In [14]:
# Train for 100 epochs; the validation callback logs its metric during training.
model.fit(train, nb_epoch=100, callbacks=[vc])

Step 20 validation: metric-1=-0.323348
Step 40 validation: metric-1=0.184278
Step 60 validation: metric-1=0.263897
Step 80 validation: metric-1=0.289333
Step 100 validation: metric-1=0.335718
Step 120 validation: metric-1=0.361595
Step 140 validation: metric-1=0.393589
Step 160 validation: metric-1=0.421935
Step 180 validation: metric-1=0.449273
Step 200 validation: metric-1=0.483604
Step 220 validation: metric-1=0.508941
Step 240 validation: metric-1=0.45803
Step 260 validation: metric-1=0.512572
Step 280 validation: metric-1=0.572717
Step 300 validation: metric-1=0.585479
Step 320 validation: metric-1=0.57499
Step 340 validation: metric-1=0.59193
Step 360 validation: metric-1=0.627074
Step 380 validation: metric-1=0.63761
Step 400 validation: metric-1=0.636363
Step 420 validation: metric-1=0.663334
Step 440 validation: metric-1=0.605761
Step 460 validation: metric-1=0.668174
Step 480 validation: metric-1=0.674544
Step 500 validation: metric-1=0.685864
Step 520 validation: metric-1=0.

0.24329516887664795

# Evalúo

In [15]:
# R^2 on the training split (a measure of fit, not of generalization).
model.evaluate(train, metrics=dc.metrics.r2_score)

{'metric-1': 0.9251056024692074}

In [16]:
# R^2 on the held-out split. The same split was used by the validation callback
# during training, so this is not an independent estimate.
model.evaluate(test, metrics=dc.metrics.r2_score)

{'metric-1': 0.7252653390538207}