# Outline
This notebook contains the scripts for training and evaluating the CNN model.

# Training
First, we import the required libraries.

In [1]:
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
from datasets import AASequenceDataset
from models import AASequenceModel
from sklearn.metrics import roc_curve, precision_recall_curve, average_precision_score, auc
from training_and_evaluation import evaluate_model_CNN, train_model_CNN

## Loading the dataset and splitting it

First, let us define a collate function for the dataloaders.

In [2]:
def my_collate(x):
    '''
    Transposes a batch: regroups a sequence of per-sample tuples into
    one tuple per field.

    Parameters
    ----------
    x : array<tuple>
        Batch of samples, each sample being a tuple of fields

    Returns
    -------
    list<tuple>
        One tuple per field, holding that field's value for every
        sample in the batch (an empty list for an empty batch)
    '''
    # zip(*x) pairs up the i-th field of every sample
    fields = zip(*x)
    return [*fields]

In [3]:
# Load the training data with onehot_input and multihot_output enabled
sequence_dataset = AASequenceDataset(
    './data/merged_data_train.tsv',
    onehot_input=True,
    multihot_output=True,
)
slen = len(sequence_dataset)  # Total number of samples in the dataset

# Hold out 20% of the samples for validation and train on the remainder.
# The generator is seeded so the split is the same on every run.
val_len = int(0.2 * slen)
train_len = slen - val_len
val_set, train_set = random_split(
    sequence_dataset,
    [val_len, train_len],
    generator=torch.Generator().manual_seed(42),
)

# Dataloaders for the training and validation subsets
dataloader = DataLoader(
    train_set,
    batch_size=300,
    shuffle=True,
    num_workers=0,
    collate_fn=my_collate,
)
val_dataloader = DataLoader(
    val_set,
    batch_size=10,
    shuffle=True,
    num_workers=0,
    collate_fn=my_collate,
)

## The training
First, we set up what the training needs: the device, the TensorBoard writer, the model, the loss function, and the optimizer.

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# TensorBoard writer
writer = SummaryWriter()

# Instantiate the model and move it onto the selected device
model = AASequenceModel().to(device)

# Binary cross-entropy on logits, with positive targets weighted by 51.86
# NOTE(review): presumably chosen to offset class imbalance -- confirm
pos_weight = torch.tensor(51.86)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight).to(device)

# Adam optimizer over all model parameters
opt = optim.Adam(model.parameters(), lr=0.001)

# Number of epochs to train for
epoch_num = 5

We train the model and, at regular intervals (every fixed number of steps), compute its performance on the validation set.

In [None]:
# Train the model with the objects prepared above, all passed by keyword
train_model_CNN(
    model=model,
    dataloader=dataloader,
    val_dataloader=val_dataloader,
    epoch_num=epoch_num,
    opt=opt,
    loss_fn=loss_fn,
    device=device,
    file_path='./saved_models/CNN_onehot.nerf',
    my_collate=my_collate,
    writer=writer,
)

# Model evaluation

In [7]:
# Test data, loaded with the same flags as the training data
test_set = AASequenceDataset(
    './data/merged_data_test.tsv',
    onehot_input=True,
    multihot_output=True,
)

# Dataloader over the test set, collated with my_collate
# NOTE(review): shuffle=True is usually unnecessary for evaluation --
# confirm whether evaluate_model_CNN relies on it
test_dataloader = DataLoader(
    test_set,
    batch_size=300,
    shuffle=True,
    num_workers=0,
    collate_fn=my_collate,
)


In [None]:
# Evaluate the model on the test dataloader, using the selected device
evaluate_model_CNN(model, test_dataloader, device)