In [None]:
!poetry add ray #1.1.0

In [None]:
!poetry add ray[tune] 
#Using version ^1.1.0 for ray

In [4]:
#libraries
import pickle
from collections import Counter
import sys
import csv

# Libraries
import time
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torchtext
import numpy as np
# Preliminaries
from sklearn.model_selection import train_test_split
from torchtext.data import Field, TabularDataset, BucketIterator

# Models

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F

# Training

import torch.optim as optim

# Evaluation
import sklearn
from sklearn.metrics import f1_score, precision_score, recall_score,fbeta_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error,mean_absolute_error

import seaborn as sns


In [7]:
import os
import numpy as np

import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.schedulers import AsyncHyperBandScheduler


from functools import partial
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.style as style
style.use("ggplot")

In [13]:
# Data folder handed to `train_model_tuning` as its `data_dir` argument by the tuning cells below.
destination_folder='/data/'

In [14]:
# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


  return torch._C._cuda_getDeviceCount() > 0


In [8]:
########## DEFINE MODEL ######################## 
class LSTM1(nn.Module):
    """Bidirectional LSTM sequence classifier with six output logits.

    Pipeline: embedding -> dropout -> packed bidirectional LSTM -> concatenation of the
    forward state at each sequence's last valid step with the backward state at step 0
    -> dropout -> linear layer.

    Args:
        num_embeddings: Vocabulary size of the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Hidden size of ONE LSTM direction (the classifier sees 2 * hidden_dim).
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied after the embedding and before the classifier.
        padding_idx: Integer index of the padding token (or None). Numeric strings such
            as the former default '0' are still accepted and converted to int.
    """

    def __init__(self, num_embeddings=10, embedding_dim=110, hidden_dim=32,  n_layers=1, dropout=0.2, padding_idx=0):
        super(LSTM1, self).__init__()

        # nn.Embedding requires an integer index; the previous string default '0'
        # made construction with default arguments fail.
        if padding_idx is not None:
            padding_idx = int(padding_idx)

        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
        # Width of a single LSTM direction. It must follow hidden_dim: a hard-coded 32
        # splits the forward/backward halves at the wrong column for any other size.
        self.dimension = hidden_dim
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=n_layers,
                            batch_first=True,
                            bidirectional=True)
        self.drop = nn.Dropout(p=dropout)
        # The bidirectional LSTM emits 2 * hidden_dim features per step; 6 output classes.
        self.linear = nn.Linear(2 * hidden_dim, 6)

    def forward(self, text, text_len):
        """Return class logits of shape (batch, 6).

        Args:
            text: Batch-first tensor of token indices, padded to a common length.
            text_len: 1-D tensor with the true (unpadded) length of each sequence.
        """
        text_emb = self.drop(self.embedding(text))

        # Pack so the LSTM skips padded positions; lengths must live on the CPU.
        packed_input = pack_padded_sequence(text_emb, text_len.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output_padded, _ = pad_packed_sequence(packed_output, batch_first=True)

        # Forward direction: state at the last valid step of every sequence.
        # Backward direction: state at step 0 (it has then read the whole sequence).
        batch_idx = torch.arange(output_padded.size(0), device=output_padded.device)
        last_step = (text_len - 1).long().to(output_padded.device)
        out_forward = output_padded[batch_idx, last_step, :self.dimension]
        out_reverse = output_padded[:, 0, self.dimension:]
        out_reduced = torch.cat((out_forward, out_reverse), 1)

        # Raw logits are returned (no softmax); nn.CrossEntropyLoss expects logits.
        return self.linear(self.drop(out_reduced))


In [9]:
def validate(model, criterion, valid_loader, device):
    """Evaluate `model` on `valid_loader` without tracking gradients.

    Args:
        model: Module called as `model(text, text_len)` and returning class logits.
        criterion: Loss callable applied to `(logits, labels)`. If None, a fresh
            `nn.CrossEntropyLoss()` is used.
        valid_loader: Iterable yielding `((label, (text, text_len)), _)` items.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        (summed_loss, accuracy): the per-batch losses summed over all batches (NOT
        averaged) and the fraction of correctly classified examples, both as Python
        floats. Accuracy is 0.0 when the loader yields no examples.
    """
    valid_running_loss = 0.0
    correct = 0
    total = 0
    loss_fn = criterion if criterion is not None else nn.CrossEntropyLoss()

    print('validate model')
    model.eval()

    with torch.no_grad():
        # The loop used to run only when CUDA was available, so on a CPU-only machine
        # nothing was evaluated and the final division raised ZeroDivisionError.
        for (label, (text, text_len)), _ in valid_loader:
            label = label.to(device).long()  # CrossEntropyLoss needs integer class ids
            text = text.to(device)
            text_len = text_len.to(device)

            output = model(text, text_len)
            valid_running_loss += loss_fn(output, label).item()

            pred = output.argmax(dim=1)
            correct += (pred == label).sum().item()
            total += label.shape[0]

    accuracy = correct / total if total else 0.0
    return valid_running_loss, accuracy

In [10]:
# tune run in parallel so I need to load dataset and prepare the fields inside the same function
def train_model_tuning(config, checkpoint_dir=None, data_dir=None):
    """Ray Tune trainable: load the CSV data, train an LSTM1, checkpoint and report each epoch.

    Everything (fields, datasets, iterators, model) is built inside this function
    because Tune runs trials in parallel and each one needs its own objects.

    Args:
        config: Hyperparameters with keys "embedding_dim", "hidden_dim", "n_layers",
            "dropout", "lr" and "num_epochs".
        checkpoint_dir: Directory containing a file named "checkpoint" holding a
            `(model_state, optimizer_state)` tuple to resume from, or None.
        data_dir: Folder containing train2.csv, valid2.csv and test2.csv. When None,
            the previously hard-coded '/content/drive/MyDrive/gene_calling/' is used.

    Side effects:
        After every epoch writes a checkpoint via `tune.checkpoint_dir` and calls
        `tune.report` with `loss` (mean validation loss per batch), `accuracy`
        (validation accuracy), `train_loss` and `train_accuracy`.
    """
    print('train_model')

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # `data_dir` used to be ignored in favour of a hard-coded Colab path.
    data_root = data_dir if data_dir is not None else '/content/drive/MyDrive/gene_calling/'
    best_valid_loss = float("Inf")

    # Fields for encoding: whitespace-tokenised sequences (with lengths) and numeric labels.
    tokenize = lambda x: x.split(' ')
    text_field = Field(sequential=True, tokenize=tokenize, lower=False, include_lengths=True,
                       batch_first=True, pad_token='O')
    label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
    fields = [('label', label_field), ('sequence_splitted', text_field)]

    # Sequence columns can exceed the csv module's default field size limit.
    csv.field_size_limit(sys.maxsize)

    train, valid, _ = TabularDataset.splits(path=data_root, train='train2.csv',
                                            validation='valid2.csv', test='test2.csv',
                                            format='CSV', fields=fields, skip_header=True)
    # Vocabulary is built from the training split only.
    text_field.build_vocab(train)

    # Iterators
    train_loader = BucketIterator(train, batch_size=32, sort_key=lambda x: len(x.sequence_splitted),
                                  device=device, sort=True, sort_within_batch=True)
    valid_loader = BucketIterator(valid, batch_size=32, sort_key=lambda x: len(x.sequence_splitted),
                                  device=device, sort=True, sort_within_batch=True)

    model = LSTM1(num_embeddings=len(text_field.vocab),
                  embedding_dim=config["embedding_dim"],
                  hidden_dim=config["hidden_dim"],
                  n_layers=config["n_layers"],
                  dropout=config["dropout"],
                  padding_idx=text_field.vocab.stoi[text_field.pad_token]).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config["lr"])

    if checkpoint_dir:
        print('sono nel checkpoint')
        # map_location lets a checkpoint written on a GPU be restored on a CPU-only worker.
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"), map_location=device)
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    for epoch in range(config["num_epochs"]):
        print('EPOCH', epoch)

        # TRAINING
        running_loss = 0.0
        correct = 0
        total = 0
        print('start training')
        model.train()

        # The loop body used to be guarded by `torch.cuda.is_available()`, which
        # silently skipped training on CPU and then divided by zero.
        for (label, (text, text_len)), _ in train_loader:
            label = label.to(device).long()  # CrossEntropyLoss needs integer class ids
            text = text.to(device)
            text_len = text_len.to(device)

            output = model(text, text_len)
            loss = criterion(output, label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            correct += (output.argmax(dim=1) == label).sum().item()
            total += label.shape[0]

        average_train_loss = running_loss / max(len(train_loader), 1)
        train_acc = correct / total if total else 0.0

        # VALIDATION (validate returns the loss summed over batches)
        val_loss, val_acc = validate(model, criterion, valid_loader, device)
        average_valid_loss = val_loss / max(len(valid_loader), 1)

        if best_valid_loss > average_valid_loss:
            best_valid_loss = average_valid_loss
            print(f'Best validation loss!! {best_valid_loss}')

        # A checkpoint is written every epoch, not only on improvement. A distinct name
        # avoids shadowing the `checkpoint_dir` argument.
        with tune.checkpoint_dir(epoch) as epoch_checkpoint_dir:
            path = os.path.join(epoch_checkpoint_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)
        print('TUNE REPORT')
        # Report plain Python floats so the metrics serialise cleanly.
        tune.report(loss=average_valid_loss, accuracy=float(val_acc),
                    train_loss=average_train_loss, train_accuracy=train_acc)

In [None]:
# Exhaustive grid search with Ray Tune: 3 embedding sizes x 2 depths x 3 learning rates
# x 2 dropout rates = 36 trials (each trial is one instance of a Trainable). Tune runs
# trials in parallel and automatically determines concurrency.
# NOTE: `data_dir` is kept for backward compatibility but is not used by tune.run below,
# which passes `destination_folder` to the trainable instead.
data_dir = os.path.abspath("./content/drive/MyDrive/gene_calling")

config = {
        "embedding_dim" : tune.grid_search([100,120,110]),#150
        "hidden_dim": tune.grid_search([32]),#32
        "n_layers": tune.grid_search([1,2]),
        "num_epochs": tune.grid_search([15]),
        #"lr": tune.loguniform(1e-3, 1e-2),  choice  0.5e-3,1e-3, 0.5e-2, 1e-2
        "lr": tune.grid_search([1e-3, 0.5e-2, 0.5e-3]),
        "dropout" : tune.grid_search([0.3, 0.4])
    }
# ASHA stops poorly performing trials early, judged on the reported validation "loss".
scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=15,
        grace_period=1,
        reduction_factor=4)
reporter = CLIReporter(
        metric_columns=["loss", "accuracy", "training_iteration"])
result = tune.run(
        partial(train_model_tuning, data_dir=destination_folder),
        # Only reserve a GPU when one exists: requesting a GPU on a CPU-only machine
        # prevents Tune from ever scheduling the trials.
        resources_per_trial={"cpu": 2, "gpu": 1 if torch.cuda.is_available() else 0},
        config=config,
        num_samples=1,  # number of times the whole grid is repeated
        scheduler=scheduler,
        progress_reporter=reporter)

# tune.choice picks one of the listed values at random for each sample.
# tune.grid_search tries every combination of the listed values.

In [None]:
# Shut down the current Ray session, e.g. when the connection to Ray was lost.
# This call only shuts down; starting a new session requires ray.init (commented out below).
ray.shutdown()
#ray.init(log_to_driver=False, dashboard_port=8265)

In [None]:
# Select the trial with the lowest last-reported validation loss and summarise it.
best_trial = result.get_best_trial("loss", "min", "last")
print(f"Best trial config: {best_trial.config}")
print(f"Best trial final validation loss: {best_trial.last_result['loss']}")
print(f"Best trial final validation accuracy: {best_trial.last_result['accuracy']}")

In [16]:
# Rebuild the fields and datasets in the notebook process (the tuning trials build their
# own copies) so the vocabulary and the test split are available for evaluation.
tokenize = lambda sequence: sequence.split(' ')
text_field = Field(
    sequential=True,
    tokenize=tokenize,
    lower=False,
    include_lengths=True,
    batch_first=True,
    pad_token='O',
)
label_field = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
    dtype=torch.float,
)
fields = [('label', label_field), ('sequence_splitted', text_field)]

# Raise the csv field size limit before parsing the sequence columns.
csv.field_size_limit(sys.maxsize)

# Load the three CSV splits, skipping the header row of each file.
train, valid, test = TabularDataset.splits(
    path='./data/',
    train='train2.csv',
    validation='valid2.csv',
    test='test2.csv',
    format='CSV',
    fields=fields,
    skip_header=True,
)

# The vocabulary comes from the training split only.
text_field.build_vocab(train)

# One example per batch, in dataset order, single pass.
test_iter = BucketIterator(
    test,
    batch_size=1,
    device=device,
    sort=False,
    sort_within_batch=False,
    repeat=False,
)




In [None]:
# Rebuild the network with the best trial's hyperparameters, then restore its weights.
best_trained_model = LSTM1(
    num_embeddings=len(text_field.vocab),
    embedding_dim=best_trial.config["embedding_dim"],
    hidden_dim=best_trial.config["hidden_dim"],
    n_layers=best_trial.config["n_layers"],
    dropout=best_trial.config["dropout"],
    padding_idx=text_field.vocab.stoi[text_field.pad_token],
).to(device)

best_checkpoint_dir = best_trial.checkpoint.value
# map_location lets a checkpoint written by a GPU trial be loaded on a CPU-only machine.
# The optimizer state is stored alongside the weights but is not needed for inference.
model_state, optimizer_state = torch.load(
    os.path.join(best_checkpoint_dir, "checkpoint"), map_location=device)
best_trained_model.load_state_dict(model_state)

### Tensorboard

In [17]:
%load_ext tensorboard

In [None]:
tensorboard --logdir ~/ray_results/

In [None]:
# Random/grid hyperparameter search with Ray Tune followed by a test-set evaluation of
# the best trial. Tune runs trials in parallel and automatically determines concurrency.
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    """Search hyperparameters, reload the best checkpoint and print its test accuracy.

    Relies on notebook-level names defined in earlier cells: `destination_folder`,
    `device`, `text_field` (with its vocabulary built) and `test_iter`.

    Args:
        num_samples: How many times the search space is sampled. Because "num_epochs"
            is a grid search over two values, 2 * num_samples trials are run.
        max_num_epochs: Upper bound on epochs for the (currently disabled) ASHA scheduler.
        gpus_per_trial: GPUs reserved per trial; ignored (0 is requested) when no CUDA
            device is available.
    """
    config = {
        "embedding_dim" : tune.choice([100]),#150
        "hidden_dim": tune.choice([  16]),#32
        "n_layers": tune.choice([1,2]),
        "num_epochs": tune.grid_search([ 5,10]),
        "lr": tune.choice([1e-3, 1e-1]),
        "dropout" : tune.choice([0.3, 0.5])
    }
    # Kept for convenience but not passed to tune.run (see the commented argument below).
    # A loss must be minimised, and grace_period must not exceed max_t or the
    # constructor's own sanity check fails for small max_num_epochs.
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["embedding_dim", "hidden_dim", "n_layers", "num_epochs", "lr", "dropout"],
        metric_columns=["loss", "accuracy", "training_iteration"])
    result = tune.run(
        partial(train_model_tuning, data_dir=destination_folder),
        # Requesting GPUs on a CPU-only machine prevents trials from being scheduled.
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial if torch.cuda.is_available() else 0},
        config=config,
        num_samples=num_samples,
        #scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    # Rebuild the same architecture the trial trained. The earlier code called an
    # undefined `model(...)` with a misspelled config key and training-only arguments.
    best_trained_model = LSTM1(
        num_embeddings=len(text_field.vocab),
        embedding_dim=best_trial.config["embedding_dim"],
        hidden_dim=best_trial.config["hidden_dim"],
        n_layers=best_trial.config["n_layers"],
        dropout=best_trial.config["dropout"],
        padding_idx=text_field.vocab.stoi[text_field.pad_token]).to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    # map_location lets a checkpoint written by a GPU trial be loaded on a CPU-only machine.
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"), map_location=device)
    best_trained_model.load_state_dict(model_state)

    # Reuse validate() on the test iterator; only the accuracy is needed here.
    _, test_acc = validate(best_trained_model, nn.CrossEntropyLoss(), test_iter, device)
    print("Best trial test set accuracy: {}".format(test_acc))


# Entry point: run the search when this code is executed as the main module.
if __name__ == "__main__":
    # You can change the number of GPUs per trial here:
    main(num_samples=10, max_num_epochs=10, gpus_per_trial=1)