In [None]:
!pip install wandb
!pip install import-ipynb

In [None]:
# Imports
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import wandb
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy import stats

In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

# Switch the working directory to the project folder; the imports below are
# resolved relative to it.
%cd '/content/gdrive/MyDrive/deepsf/code_JS'
# Imported only for its side effect.
# NOTE(review): presumably this registers a hook that lets the .ipynb files in
# the working directory be imported as modules - confirm.
import import_ipynb

# Plotting / evaluation helpers defined in do_plots_model_results.
from do_plots_model_results import val_loss_vs_epoch, ReturnPlotSolution, plot_pred_vs_real, corr_vs_biotype

# Model classes and training helpers defined in class_models_pytorch.
from class_models_pytorch import evaluate, do_training, fit, DeepSF, DeepSF_2hidden 

In [None]:
path = '/content/gdrive/MyDrive/deepsf/code_JS/folder_rawdata_processing/'

# Load the pickled dictionary of tables written by the raw-data processing step.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open(path + '3-pipeline_files.pkl', 'rb') as fid:
    result = pickle.load(fid)

# Unpack the tables (layout notes carried over from the original comments):
#   TCGA_tpm_gn_RBPs           - patients x gene expression of the SFs
#   TCGA_tpm_gn                - patients x gene expression of all genes
#   TCGA_tpm_without_uniqueiso - patients x isoform expression
#   getBM                      - annotation rows with Transcript_ID, Gene_ID,
#                                Transcript_name, Gene_name, Biotype
TCGA_tpm_gn_RBPs, TCGA_tpm_gn, TCGA_tpm_without_uniqueiso, getBM = (
    result[table_name]
    for table_name in ('TCGA_tpm_gn_RBPs', 'TCGA_tpm_gn', 'TCGA_tpm_without_uniqueiso', 'getBM')
)

# log2(1 + x) transform of the isoform matrix.
TCGA_tpm_without_uniqueiso_log2p = np.log2(TCGA_tpm_without_uniqueiso + 1)

# Keep only the annotation rows whose transcript is a column of the isoform matrix.
getBM = getBM.loc[getBM.Transcript_ID.isin(TCGA_tpm_without_uniqueiso_log2p.columns).values, :]

In [None]:
# Hyperparameters shared by the cells below.
batch_size = 32        # mini-batch size of the training loader (validation uses twice this)
learning_rate = 0.1    # learning rate handed to fit()
if_toy, if_wandb = True, False  # NOTE(review): neither flag is read by the cells visible here
test_size = 0.2        # fraction of patients held out by train_test_split
num_genes = 100        # number of leading annotation rows used to pick the toy genes

In [None]:
# Toy subset: gene names found in the first `num_genes` annotation rows.
# NOTE(review): this slices rows, not distinct genes, so names can repeat.
# NOTE(review): getBM and the isoform matrix are overwritten in place, so
# re-running this cell may shrink them further.
toy_genes = list(getBM['Gene_name'].iloc[:num_genes])  # extra genes such as 'TP53' could be appended here
getBM = getBM.loc[getBM.Gene_name.isin(toy_genes).values, :]  # reduced annotation table
getBM

# Restrict the isoform matrix to the transcripts of the selected genes.
toy_Transcript_ID = getBM.Transcript_ID.tolist()
TCGA_tpm_without_uniqueiso_log2p = TCGA_tpm_without_uniqueiso_log2p[toy_Transcript_ID]

  # Input 2: patients x expression of the parent gene of every isoform, taken
  # from the full gene matrix. One column per isoform, labelled with its gene
  # name, so a gene with several isoforms appears several times.
# A single label-based selection replaces the former zero-filled frame that was
# filled by a Python loop running once per isoform.
# NOTE(review): assumes the gene columns of TCGA_tpm_gn are unique - confirm.
TCGA_tpm_gn_expr_each_iso = TCGA_tpm_gn.loc[:, list(getBM.Gene_name)]

# Same consistency requirement the original frame construction enforced:
# one gene-expression column per isoform column.
assert TCGA_tpm_gn_expr_each_iso.shape[1] == TCGA_tpm_without_uniqueiso_log2p.shape[1], \
    "annotation rows and isoform columns are out of sync"

TCGA_tpm_gn_expr_each_iso.head() # patients x expression of the gene of each isoform

  # Train/validation split of the SF expression matrix (fixed random_state).
df_train, df_validation = train_test_split(TCGA_tpm_gn_RBPs, test_size=test_size, random_state=0)

# Labels and per-isoform gene expression follow the same patients, so both are
# selected with the index of the corresponding SF split.
train_labels, valid_labels = (
    TCGA_tpm_without_uniqueiso_log2p.loc[part.index] for part in (df_train, df_validation)
)
train_gn, valid_gn = (
    TCGA_tpm_gn_expr_each_iso.loc[part.index] for part in (df_train, df_validation)
)


def _standardize(scaler, frame):
  """Apply an already fitted scaler and keep the frame's index and column labels."""
  return pd.DataFrame(scaler.transform(frame), index=frame.index, columns=frame.columns)


# SF input: the scaler is fitted on the training split only and then applied to both splits.
scaler_sfs = StandardScaler().fit(df_train)
scaledTrain_df = _standardize(scaler_sfs, df_train)
scaledValidation_df = _standardize(scaler_sfs, df_validation)

# Gene-expression input: same procedure with its own scaler.
scale_gn = StandardScaler().fit(train_gn)
scaled_train_gn = _standardize(scale_gn, train_gn)
scaled_valid_gn = _standardize(scale_gn, valid_gn)

# Wrap the three aligned tables (SF input, isoform labels, gene-expression
# input) as float32 tensors, one TensorDataset per split.
def _as_float_tensor(frame):
  """Return the values of a DataFrame as a float32 tensor."""
  return torch.tensor(frame.values, dtype=torch.float32)


train_ds = TensorDataset(
    *(_as_float_tensor(frame) for frame in (scaledTrain_df, train_labels, scaled_train_gn))
)
val_ds = TensorDataset(
    *(_as_float_tensor(frame) for frame in (scaledValidation_df, valid_labels, scaled_valid_gn))
)

# Training batches are shuffled; validation uses batches twice as large.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size * 2)

# Optimization of parameters with wandb

In [None]:
# Device selection is currently disabled:
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
wandb.login()

# Settings passed to the wandb run.
config = {
    'epochs': 1000,
    'learning_rate': 1e-1,  # candidates: [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]
    'optimizer': 'sgd90',   # candidates: ['sgd90', 'sgd70', 'sgd50', 'adam', 'adagrad', 'adadelta', 'adamW', 'adamax', 'RMSProp']
    'num_genes': num_genes,
}

# TODO: iterate over several configs in a loop, one run per config.
  
def build_optimizer(network, optimizer, learning_rate): # TODO: read up on the optimizers.
  """Create the torch optimizer selected by name for the parameters of `network`.

  Args:
    network: module whose ``parameters()`` are optimised.
    optimizer: one of 'sgd90', 'sgd70', 'sgd50' (SGD with momentum 0.9 / 0.7 /
      0.5), 'adam', 'adagrad', 'adadelta', 'adamW', 'adamax', 'RMSProp'.
      An already constructed ``torch.optim.Optimizer`` is returned unchanged.
    learning_rate: value passed as ``lr`` to the optimizer.

  Returns:
    The constructed ``torch.optim.Optimizer``.

  Raises:
    ValueError: if `optimizer` is not one of the supported names.
  """
  # Pass-through keeps the old behaviour for callers that hand in a ready optimizer.
  if isinstance(optimizer, torch.optim.Optimizer):
    return optimizer

  # name -> (optimizer class, extra keyword arguments)
  recipes = {
      'sgd90': (torch.optim.SGD, {'momentum': 0.9}),
      'sgd70': (torch.optim.SGD, {'momentum': 0.7}),
      'sgd50': (torch.optim.SGD, {'momentum': 0.5}),
      'adam': (torch.optim.Adam, {}),
      'adagrad': (torch.optim.Adagrad, {}),
      'adadelta': (torch.optim.Adadelta, {}),
      'adamW': (torch.optim.AdamW, {}),
      'adamax': (torch.optim.Adamax, {}),
      # The torch class is spelled RMSprop; `torch.optim.RMSProp` does not exist.
      'RMSProp': (torch.optim.RMSprop, {}),
  }
  if optimizer not in recipes:
    # Fail here instead of returning the name and crashing later in the training loop.
    raise ValueError(
        "Unknown optimizer %r; expected one of %s" % (optimizer, sorted(recipes)))

  optimizer_cls, extra_kwargs = recipes[optimizer]
  return optimizer_cls(network.parameters(), lr=learning_rate, **extra_kwargs)

def wandb_train(model, train_loader, val_loader, hyperparameters, name_project, onnx_example_inputs=None):
  """Train `model` inside a Weights & Biases run, logging both losses every epoch.

  Args:
    model: network to train; it must provide ``epoch_end(epoch, train_loss, val_loss)``.
    train_loader: loader passed to ``do_training``.
    val_loader: loader passed to ``evaluate``.
    hyperparameters: run configuration; ``epochs``, ``optimizer`` and
      ``learning_rate`` are read from it through ``wandb.config``.
    name_project: wandb project name.
    onnx_example_inputs: optional example input (tensor or tuple of tensors)
      for ``torch.onnx.export``. When given, the trained model is exported to
      "model.onnx"; when omitted, the state dict is written to "model.pt",
      because an ONNX export cannot be traced without example inputs.

  The saved file is uploaded with ``wandb.save`` while the run is still active.
  """
  # Tell wandb to get started
  with wandb.init(project=name_project, entity="deepsf", config=hyperparameters):
    config = wandb.config
    wandb.watch(model, criterion=F.mse_loss, log="all", log_freq=10)
    optimizer = build_optimizer(model, config.optimizer, config.learning_rate)

    for epoch in range(config.epochs):
      # Training phase
      loss_train_epoch_end = do_training(model, train_loader, optimizer)
      # Validation phase
      loss_val_epoch_end = evaluate(model, val_loader)
      model.epoch_end(epoch, loss_train_epoch_end, loss_val_epoch_end)

      wandb.log({"loss": loss_train_epoch_end, "loss_val": loss_val_epoch_end, "epoch": epoch})

    # Persist the trained model before the `with` block closes the run:
    # wandb.save needs an active run.
    if onnx_example_inputs is not None:
      # torch.onnx.export takes (model, example inputs, output file).
      saved_file = "model.onnx"
      torch.onnx.export(model, onnx_example_inputs, saved_file)
    else:
      saved_file = "model.pt"
      torch.save(model.state_dict(), saved_file)
    wandb.save(saved_file)

# Linear model with gene expression

In [None]:
# Linear model: SF expression as input, one output per isoform.
model_linear = DeepSF(n_inputs=TCGA_tpm_gn_RBPs.shape[1], n_outputs=TCGA_tpm_without_uniqueiso_log2p.shape[1])
history = fit(1000, learning_rate, model_linear, train_loader, val_loader, opt_func=torch.optim.SGD)

# Plot results
val_loss_vs_epoch(history)
solution = plot_pred_vs_real(scaledTrain_df, train_labels, scaled_train_gn, model_linear) # training
print('cor_train:', solution.cor_total)
print('cor_trans_train:', solution.cor_trans)

solution2 = plot_pred_vs_real(scaledValidation_df, valid_labels, scaled_valid_gn, model_linear) # validation
print('cor_val:', solution2.cor_total)
print('cor_trans_val:', solution2.cor_trans)

# Correlation per biotype: each set of correlations is paired with the labels
# of the split it was computed on (the validation call used train_labels before).
corr_vs_biotype(getBM, train_labels, solution.cor_trans)
corr_vs_biotype(getBM, valid_labels, solution2.cor_trans)

In [None]:
# wandb-tracked training of the linear model. A fresh DeepSF is created, so the
# name `model_linear` is rebound to a new instance.
model_linear = DeepSF(
    n_inputs=TCGA_tpm_gn_RBPs.shape[1],
    n_outputs=TCGA_tpm_without_uniqueiso_log2p.shape[1],
)
wandb_train(model_linear, train_loader, val_loader, hyperparameters=config, name_project="pytorch-demo")

# Two-hidden-layer model with gene expression

In [None]:
# Two-hidden-layer model: same inputs and outputs as the linear model.
model_2hidden = DeepSF_2hidden(n_inputs=TCGA_tpm_gn_RBPs.shape[1], n_outputs=TCGA_tpm_without_uniqueiso_log2p.shape[1])
history = fit(1000, learning_rate, model_2hidden, train_loader, val_loader, opt_func=torch.optim.SGD)

# Plot the results
val_loss_vs_epoch(history)

solution_2hidden = plot_pred_vs_real(scaledTrain_df, train_labels, scaled_train_gn, model_2hidden) # training
print('cor_train:', solution_2hidden.cor_total)
print('cor_trans_train:', solution_2hidden.cor_trans)

solution_val_2hidden = plot_pred_vs_real(scaledValidation_df, valid_labels, scaled_valid_gn, model_2hidden) # validation
print('cor_val:', solution_val_2hidden.cor_total)
print('cor_trans_val:', solution_val_2hidden.cor_trans)

# Correlation per biotype: each set of correlations is paired with the labels
# of the split it was computed on (the validation call used train_labels before).
corr_vs_biotype(getBM, train_labels, solution_2hidden.cor_trans)
corr_vs_biotype(getBM, valid_labels, solution_val_2hidden.cor_trans)

In [None]:
# Boxplot comparing the per-transcript validation correlations of both models.
labelled_runs = (
    ('Model_Linear', solution2.cor_trans),
    ('Model_2hidden', solution_val_2hidden.cor_trans),
)

# Long format: one row per correlation value, tagged with the model it came from.
corr_values = np.concatenate([cors for _, cors in labelled_runs], axis=0)
model_name = np.concatenate([[label] * len(cors) for label, cors in labelled_runs], axis=0)

data = {'model': model_name, 'corr': corr_values}
data_df = pd.DataFrame(data)

ax = sns.boxplot(data=data_df, x="model", y="corr", palette="Set3")

In [None]:
# wandb-tracked training of the two-hidden-layer model.
# This cell previously instantiated DeepSF, i.e. the linear model, under the
# name model_2hidden; it now builds the DeepSF_2hidden architecture.
model_2hidden = DeepSF_2hidden(n_inputs=TCGA_tpm_gn_RBPs.shape[1], n_outputs=TCGA_tpm_without_uniqueiso_log2p.shape[1])
wandb_train(model_2hidden, train_loader, val_loader, config, name_project = "pytorch-demo")