# ¿Que más se podría hacer?

- Test con data augmentation y la LSTM
- Usar los tokenizadores usados en los transformers para ver como funcionan
- TSNE de embeddings y analizar si hay interpretación
- Probar con otras encimas o proteinas
- Usar los embeddings entrenados para analizar resultados de proteinas o encimas con menos data
- Entrenar una red neuronal con los features (fingerprints por ejemplo) y comparar los resultados con los embeddings


# Tome cualquiera de estas propuestas o alguna suya y desarrolle

## PROPUESTA DE TSNE

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import deepchem as dc

%matplotlib inline
%load_ext autoreload
%autoreload 2

### Fuente de los fingersprint

#### RXNFP - huellas dactilares de reaccion quimica
#### Esta biblioteca genera huellas de reaccion quimica a partir de la reaccion SMILES

In [32]:
df = pd.read_csv('data/acetylcholinesterase_02_bioactivity_data_preprocessed.csv')

In [33]:
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,standard_value_norm,pIC50
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,750.0,750.0,6.124939
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,100.0,100.0,7.000000
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,50000.0,50000.0,4.301030
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,300.0,300.0,6.522879
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,800.0,800.0,6.096910
...,...,...,...,...,...
5659,CHEMBL4779440,Brc1ccc([N+]2=Cc3ccccc3CC2)cc1.[Br-],10500.0,10500.0,4.978811
5660,CHEMBL417799,C[n+]1cc2c3c(ccc2c2ccc4cc5c(cc4c21)OCO5)OCO3,1220.0,1220.0,5.913640
5661,CHEMBL13045,COc1ccc2c(c[n+](C)c3c4cc5c(cc4ccc23)OCO5)c1OC,1450.0,1450.0,5.838632
5662,CHEMBL3085398,C=CC(C)(C)c1cc([C@@H]2CC(=O)c3c(O)cc(O)c(CC=C(...,28980.0,28980.0,4.537902


In [34]:
## dataset en el formato que deepchem necesita
dataset = dc.data.NumpyDataset(X=df['canonical_smiles'].values, y=df['pIC50'].values, ids=df['canonical_smiles'].values)

In [35]:
# Divido en train y test
splitter = dc.splits.RandomSplitter()

In [36]:
train, test = splitter.train_test_split(dataset)

In [37]:
# Genero el diccionario de smiles que necesita el modelo para contruir el embedding

smiles_dict, lenght = dc.models.TextCNNModel.build_char_dict(dataset)
smiles_dict

{'#': 1,
 '(': 2,
 ')': 3,
 '+': 4,
 '-': 5,
 '/': 6,
 '1': 7,
 '2': 8,
 '3': 9,
 '4': 10,
 '5': 11,
 '6': 12,
 '7': 13,
 '8': 14,
 '=': 15,
 'C': 16,
 'F': 17,
 'H': 18,
 'I': 19,
 'N': 20,
 'O': 21,
 'P': 22,
 'S': 23,
 '[': 24,
 '\\': 25,
 ']': 26,
 '_': 27,
 'c': 28,
 'Cl': 29,
 'Br': 30,
 'n': 31,
 'o': 32,
 's': 33,
 '@': 34,
 '.': 35,
 'a': 36,
 'B': 37,
 'e': 38,
 'i': 39}

In [38]:
# Instancio el modelo

model = dc.models.TextCNNModel(
    1, smiles_dict, lenght, mode='regression', n_embedding=64, kernel_sizes=[3, 4, 5], num_filters=[128, 128, 128], dropout=0., 
    log_frequency=5, batch_size=128, optimizer=dc.models.optimizers.Adam(learning_rate = 0.001)
)

In [39]:
model.model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 248)]        0           []                               
                                                                                                  
 dtnn_embedding (DTNNEmbedding)  (None, 248, 64)     2560        ['input_1[0][0]']                
                                                                                                  
 conv1d (Conv1D)                (None, 246, 128)     24704       ['dtnn_embedding[0][0]']         
                                                                                                  
 conv1d_1 (Conv1D)              (None, 245, 128)     32896       ['dtnn_embedding[0][0]']         
                                                                                              

In [40]:
# from tensorflow import keras
# from keras.callbacks import ReduceLROnPlateau

In [41]:
# Es necesario para ver resultados mientras entrana
vc = dc.models.ValidationCallback(test, 20, dc.metrics.r2_score)
# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
#                               patience=5, min_lr=0.001)

In [42]:
model.fit(train, 100, callbacks=[vc])

Step 20 validation: metric-1=-0.338553
Step 40 validation: metric-1=0.181188
Step 60 validation: metric-1=0.227852
Step 80 validation: metric-1=0.274388
Step 100 validation: metric-1=0.26033
Step 120 validation: metric-1=0.321654
Step 140 validation: metric-1=0.342436
Step 160 validation: metric-1=0.376926
Step 180 validation: metric-1=0.391561
Step 200 validation: metric-1=0.41627
Step 220 validation: metric-1=0.453782
Step 240 validation: metric-1=0.462522
Step 260 validation: metric-1=0.481525
Step 280 validation: metric-1=0.509208
Step 300 validation: metric-1=0.48231
Step 320 validation: metric-1=0.539548
Step 340 validation: metric-1=0.547029
Step 360 validation: metric-1=0.537304
Step 380 validation: metric-1=0.583382
Step 400 validation: metric-1=0.585573
Step 420 validation: metric-1=0.509397
Step 440 validation: metric-1=0.584046
Step 460 validation: metric-1=0.615412
Step 480 validation: metric-1=0.599575
Step 500 validation: metric-1=0.604413
Step 520 validation: metric-1=0

0.1615348219871521

In [43]:
# Evaluó
model.evaluate(train, dc.metrics.r2_score)

{'metric-1': 0.9257251429333502}

In [44]:
model.evaluate(test, dc.metrics.r2_score)

{'metric-1': 0.6824379391862421}

### Embedding_128_dropout_train = 0.9298857325977588
### Embedding_128_dropout_test = 0.6216029859047306
-----------------------------------------------------
### Embedding_50_dropout_0.3_train = 0.8946426394134877
### Embedding_50_dropout_0.3_test = 0.6877007791554051
-----------------------------------------------------
### Embedding_20_dropout_0.5_train = 0.8727642490048564
### Embedding_20_dropout_0.5_test = 0.6753481678191662
-----------------------------------------------------
### Embedding_30_dropout_0.5_train = 0.8730533918094836
### Embedding_30_dropout_0.5_test = 0.6366937879196792
-----------------------------------------------------
### Embedding_50_dropout_0.5_train = 0.8763333518543397
### Embedding_50_dropout_0.5_test = 0.6371364404412685
-----------------------------------------------------
### Embedding_50_dropout_0.5_lr_0.0005_train = 0.8148850371685529
### Embedding_50_dropout_0.5_lr_0.0005_test = 0.60923386242113
-----------------------------------------------------
### Embedding_50_dropout_0_lr_0.01_train = 0.8798302487697949
### Embedding_50_dropout_0_lr_0.01_test = 0.5186884366362529
-----------------------------------------------------
### Embedding_64_dropout_0_lr_0.01_train = 0.9257251429333502
### Embedding_64_dropout_0_lr_0.01_test = 0.6824379391862421