**Ensemble Learning**
- **CNN**.
- **RNN** implemented using an *LSTM*.

It can also use a *random seed* to shuffle the data and build *training*, *validation* and *test* sets that differ from the ones used by ***ViraMiner***.

In [None]:
"""
Main libraries
"""

import torch as th
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
import tensorflow as tf

import random
import gc

In [None]:
"""
For SMOTE
"""
from imblearn.over_sampling import SMOTE

In [None]:
"""
For the class weight
"""
from sklearn.utils.class_weight import compute_class_weight

In [None]:
"""
For the ensamble learning
"""
from sklearn.svm import SVC

In [None]:
"""
Only for the metrics analysis
"""
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive so the dataset CSV files stored there
# can be read by the cells below (only works inside Google Colab).
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


The **One-hot Encoding** transforms a *DNA sequence* into a vector of zeros and ones. It is especially useful for the **CNN**.

In [None]:
"""
Functions for the onehot encoding
"""

def onehot_encoder(dataset):
    """
    One-hot encode every DNA string of `dataset`.

    Each element is passed through `dna_onehot_encoder` and the encoded
    sequences are stacked into a single NumPy array.
    """
    return np.array(list(map(dna_onehot_encoder, dataset)))


def dna_onehot_encoder(dna_sequence):
    """
    One-hot encode a single DNA string.

    Parameters:
        dna_sequence: iterable of nucleotide characters ('A', 'C', 'G', 'T');
            lowercase letters are accepted and treated like their uppercase form.

    Returns:
        A list with one 4-element list per nucleotide, in the column order
        A, C, G, T. Every row is an independent list, so editing one row never
        changes another.

    Raises:
        KeyError: if a character is not one of A, C, G, T (case-insensitive).
    """
    onehot_dict = {
        'A': (1, 0, 0, 0),
        'C': (0, 1, 0, 0),
        'G': (0, 0, 1, 0),
        'T': (0, 0, 0, 1),
    }
    # list(...) creates a fresh row for every position: returning the dictionary
    # value itself would make all occurrences of a nucleotide share one list.
    return [list(onehot_dict[nuc.upper()]) for nuc in dna_sequence]

In [None]:
def sequence_to_numeric(sequence):
    """
    Map a DNA string to a list of integer codes (A=0, C=1, G=2, T=3).

    Lowercase letters are accepted and treated like their uppercase form.
    Raises KeyError for any character other than A, C, G, T.
    """
    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    return [mapping[nuc.upper()] for nuc in sequence]

The **CNN** model: taken directly from the file *CNN.ipynb* available in the GitHub repository.

In [None]:
"""
!- CNN Model
"""

class ConvNet(nn.Module):
    """
    1-D convolutional classifier producing 2 logits per sample.

    Five stages of Conv1d(kernel_size=2, padding=1) -> BatchNorm1d -> PReLU ->
    pooling(2) -> dropout are followed by two linear layers. The first
    convolution has 300 input channels, so `forward` expects a tensor shaped
    (batch, 300, length); linear1 takes 32 features, which is what the stack
    yields for length 4 (a 300-nucleotide one-hot sequence).
    """

    @staticmethod
    def _stage(in_channels, out_channels, pool_cls, dropout_cls, p):
        """Build one Conv -> BatchNorm -> PReLU -> pool -> dropout stage."""
        return nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size=2, padding=1),
            nn.BatchNorm1d(out_channels),
            nn.PReLU(),
            pool_cls(2),
            dropout_cls(p),
        )

    def __init__(self):
        super().__init__()

        # The first two stages use average pooling, the remaining ones max pooling.
        self.layer1 = self._stage(300, 200, nn.AvgPool1d, nn.Dropout1d, 0.45)

        # NOTE(review): this dropout is created but never applied in forward().
        self.drop_out = nn.Dropout()

        self.layer2 = self._stage(200, 100, nn.AvgPool1d, nn.Dropout1d, 0.45)

        # Element-wise dropout applied after the second stage.
        self.drop_out_1 = nn.Dropout()

        self.layer3 = self._stage(100, 75, nn.MaxPool1d, nn.Dropout1d, 0.3)
        # Stage 4 is the only one using element-wise Dropout instead of Dropout1d.
        self.layer4 = self._stage(75, 50, nn.MaxPool1d, nn.Dropout, 0.3)
        self.layer5 = self._stage(50, 32, nn.MaxPool1d, nn.Dropout1d, 0.3)

        # Element-wise dropout applied right before the classifier head.
        self.drop_out_2 = nn.Dropout()

        # Classifier head: 32 -> 128 -> 2, with no activation in between.
        self.linear1 = nn.Linear(32, 128)
        self.linear2 = nn.Linear(128, 2)

    def forward(self, x):
        """Return the raw class logits, shaped (batch, 2), for the batch `x`."""
        features = self.layer2(self.layer1(x))
        features = self.drop_out_1(features)
        for stage in (self.layer3, self.layer4, self.layer5):
            features = stage(features)
        features = self.drop_out_2(features)
        # Collapse channel and length dimensions for the linear layers.
        flat = features.view(features.size(0), -1)
        return self.linear2(self.linear1(flat))

The **RNN** model: taken directly from the file *LSTM.ipynb* available in the GitHub repository. We chose the *LSTM* over the *GRU* because it obtained a **higher** precision on the dataset and is also **less** computationally demanding.

In [None]:
class LSTM_Net(nn.Module):
    """
    Convolutional front-end followed by a stacked bidirectional LSTM.

    `forward` expects a tensor shaped (batch, sequence_length, 4) and returns
    the raw logits of the two classes, shaped (batch, 2).
    """

    def __init__(self):
        super().__init__()
        # Three width-3 convolutions widen the 4 input channels to 128 feature maps;
        # the same max-pooling layer halves the length after each of them.
        self.conv1 = nn.Conv1d(in_channels=4, out_channels=32, kernel_size=3)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3)
        self.conv3 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3)
        self.pool = nn.MaxPool1d(kernel_size=2)

        # 6 stacked bidirectional layers: every time step outputs 2 * 128 features.
        self.lstm = nn.LSTM(input_size=128, hidden_size=128, num_layers=6, batch_first=True, dropout=0.5, bidirectional=True)

        # Classifier head: 256 -> 256 -> 2, with no activation in between.
        self.fc1 = nn.Linear(256, 256)
        self.fc2 = nn.Linear(256, 2)

    def forward(self, x):
        # Conv1d wants channels first: (batch, length, 4) -> (batch, 4, length).
        feats = x.permute(0, 2, 1)
        for conv in (self.conv1, self.conv2, self.conv3):
            feats = self.pool(F.relu(conv(feats)))

        # Back to (batch, length, channels), as required by batch_first=True.
        seq_out, _ = self.lstm(feats.permute(0, 2, 1))

        # Keep only the output of the last time step.
        # NOTE(review): for a bidirectional LSTM the backward half of this vector
        # has only seen the final position - confirm this is intended.
        last_step = seq_out[:, -1, :]
        last_step = last_step.contiguous().view(last_step.size(0), -1)
        return self.fc2(self.fc1(last_step))

**MAIN**

Training phase

Extraction of the data from the *.csv* files and **oversampling** of the data from class 1.

In [None]:
"""
!- MAIN
"""

# Set the device to be used (GPU when CUDA is available, CPU otherwise)
device = th.device("cuda" if th.cuda.is_available() else "cpu")
print("Device: ", device)

# Read the input from the csv files.
# The three triple-quoted strings below are inert: they only keep alternative
# path configurations at hand (local machine, Google Drive, plain Colab).
"""
If runs in local machine:

data_dir = os.path.abspath(os.path.join(os.getcwd(), 'data'))
train_data_path = os.path.join(data_dir, 'fullset_train.csv')
val_data_path = os.path.join(data_dir, 'fullset_validation.csv')
test_data_path = os.path.join(data_dir, 'fullset_test.csv')
"""

"""
If runs in Google Drive

rel_path_train = '/content/drive/MyDrive/Colab Notebooks/fullset_train.csv'
rel_path_val = '/content/drive/MyDrive/Colab Notebooks/fullset_validation.csv'
rel_path_test = '/content/drive/MyDrive/Colab Notebooks/fullset_test.csv'
"""

"""
If runs on Colab

rel_path_train = '/content/fullset_train.csv'
rel_path_val = '/content/fullset_validation.csv'
rel_path_test = '/content/fullset_test.csv'
"""

# Paste here your path to the datasets (these are the paths actually used below)
rel_path_train = '/content/drive/MyDrive/Colab Notebooks/fullset_train.csv'
rel_path_val = '/content/drive/MyDrive/Colab Notebooks/fullset_validation.csv'
rel_path_test = '/content/drive/MyDrive/Colab Notebooks/fullset_test.csv'

# Training Set

# Read the input from the csv file
train_csv = pd.read_csv(rel_path_train, sep=",")
# Drop the rows containing NaN values
train_csv = train_csv.dropna()
# Describe the data (summary statistics of the numeric columns)
print(train_csv.describe())

# Get the data from the csv file: column 1 is read as the DNA sequence and
# column 2 as the class label
train_data = train_csv.values
# m = number of input samples
m = train_data.shape[0]

print("Start SMOTE")

# Dataframe and upsample
data = {'sequence' : train_data[:m,1],
        'label' : train_data[:m,2].astype(np.int32) }

df = pd.DataFrame(data)

X_train = df['sequence'].values
Y_train = df['label'].values

# SMOTE needs numeric features: map every nucleotide to an integer code
numeric_train = [sequence_to_numeric(seq) for seq in X_train]
numeric_train = np.array(numeric_train)

# Reshape the data to (n_samples, n_features)
n_samples, _ = numeric_train.shape
numeric_train_sequences = numeric_train.reshape((n_samples, -1))

# Apply the SMOTE algorithm to balance the dataset.
# A fixed random_state makes the synthetic samples, and therefore every result
# computed downstream, reproducible from run to run.
# NOTE(review): SMOTE interpolates between the integer codes of two sequences,
# so a synthetic position may hold a nucleotide present in neither parent;
# confirm this is acceptable for categorical DNA data.
smote = SMOTE(random_state=42)
X_train, Y_train = smote.fit_resample(numeric_train_sequences, Y_train)

# Convert the integer codes back to nucleotide strings
X_train = [''.join(['A', 'C', 'G', 'T'][nuc] for nuc in seq) for seq in X_train]

print("End of SMOTE")

# The string below is an inert draft of a plain random-oversampling alternative.
"""
Draft for oversample

count_class_0, count_class_1 = df['label'].value_counts()

df_class_0 = df[df['label'] == 0]
df_class_1 = df[df['label'] == 1]

# Oversample data from class 1 x10 -> ratio between class 1 and class 0 = 1/5
n_samples = count_class_0 / 5

print("Samples class 0: ", count_class_0)
print("Samples class 1: ", count_class_1)
print("Oversamples: ", int(n_samples))

df_class_1_oversampled = df_class_1.sample(int(n_samples), replace=True, random_state=42)
df_balanced = pd.concat([df_class_0, df_class_1_oversampled], axis=0)
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Extract data
X_train = df_balanced['sequence'].values
Y_train = df_balanced['label'].values
"""

# OneHot encoding for the training data
print("Start onehot encoding for the training data")
X_train = onehot_encoder(X_train)

# Convert the data to a tensor and move it to the selected device
X_train = th.from_numpy(X_train).to(device)
Y_train = th.tensor(Y_train).to(device)

print("X_train shape: ", X_train.shape)
print("Y_train shape: ", Y_train.shape)

# Free memory
del train_csv, train_data, m
gc.collect()
th.cuda.empty_cache()

# Validation Set (loaded as-is: no oversampling is applied to it)

# Read the input from the csv file
val_csv = pd.read_csv(rel_path_val, sep=",")
# Drop the rows containing NaN values
val_csv = val_csv.dropna()
# Describe the data (summary statistics of the numeric columns)
print(val_csv.describe())

val_data = val_csv.values
# m = number of input samples
m = val_data.shape[0]

# Column 1 is read as the DNA sequence, column 2 as the class label
X_val = val_data[:m,1]
Y_val = val_data[:m,2].astype(np.int32)

# OneHot encoding for the validation data
print("Start onehot encoding for the validation data")
X_val = onehot_encoder(X_val)

# Convert to tensors and move them to the selected device
X_val = th.from_numpy(X_val).to(device)
Y_val = th.tensor(Y_val).to(device)

print("X_val shape", X_val.shape)
print("Y_val shape", Y_val.shape)

# Free memory: drop the intermediate dataframe and array
del val_csv, val_data, m
gc.collect()
th.cuda.empty_cache()

# Test Set (loaded as-is: no oversampling is applied to it)

# Read the input from the csv file
test_csv = pd.read_csv(rel_path_test, sep=",")
# Drop the rows containing NaN values
test_csv = test_csv.dropna()
# Describe the data (summary statistics of the numeric columns)
print(test_csv.describe())

test_data = test_csv.values
# m = number of input samples
m = test_data.shape[0]

# Column 1 is read as the DNA sequence, column 2 as the class label
X_test = test_data[:m,1]
Y_test = test_data[:m,2].astype(np.int32)

# OneHot encoding for the test data
print("Start onehot encoding for the test data")
X_test = onehot_encoder(X_test)

# Convert to tensors and move them to the selected device
X_test = th.from_numpy(X_test).to(device)
Y_test = th.tensor(Y_test).to(device)

print("X_test shape", X_test.shape)
print("Y_test shape", Y_test.shape)

# Free memory: drop the intermediate dataframe and array
del test_csv, test_data, m
gc.collect()
th.cuda.empty_cache()

Device:  cuda
                   0
count  211238.000000
mean        0.021142
std         0.143858
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Start SMOTE
End of SMOTE
Start onehot encoding for the training data
X_train shape:  torch.Size([413544, 300, 4])
Y_train shape:  torch.Size([413544])
                  0
count  26404.000000
mean       0.020224
std        0.140769
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
Start onehot encoding for the validation data
X_val shape torch.Size([26404, 300, 4])
Y_val shape torch.Size([26404])
                  0
count  26404.000000
mean       0.020868
std        0.142945
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
Start onehot encoding for the test data
X_test shape torch.Size([26404, 300, 4])
Y_test shape torch.Size([26404])


**Shuffle of the data** *(optional)* using a random seed and split data:
- **Train Set:** around the 80% of the total samples.
- **Validation Set:** around 10% of the total samples.
- **Test Set:** around 10% of the total samples.

In [None]:
"""
I merge the three tensor array to a big one and then, after a shuffle, I split the data into:
  - Training: 248126 data
  - Validation: 31034 data
  - Test: 31033 data

  Random seed: 42 (the most used)
"""

# This cell executes nothing: the whole shuffle/split procedure below is kept
# inside a string literal and is therefore disabled.
# NOTE(review): before re-enabling it, the following points need fixing:
#   - X_data and Y_data are shuffled with two independent th.randperm() calls,
#     which breaks the pairing between every sample and its label;
#   - compute_class_weight() receives y=[0,1] instead of the real labels;
#   - the training slice [148126:248126] overlaps the validation slice
#     [211238:279160] and holds 100000 samples, not the 248126 stated above.
"""
If we use SMOTE we do not use random seed!!


# Merge the arrays

print("Shape of X_train: ", X_train.shape)
print("Shape of X_val: ", X_val.shape)
print("Shape of X_test: ", X_test.shape)

X_data = th.concat((X_train, X_val), axis = 0).to(device)
X_data = th.concat((X_data, X_test), axis = 0).to(device)

print("Shape of data: ", X_data.shape)

# Free memory
del X_train, X_val, X_test
gc.collect()
th.cuda.empty_cache()

print("Shape of Y_train: ", Y_train.shape)
print("Shape of Y_val: ", Y_val.shape)
print("Shape of Y_test: ", Y_test.shape)

Y_data = th.concat((Y_train, Y_val), axis = 0).to(device)
Y_data = th.concat((Y_data, Y_test), axis = 0).to(device)

print("Shape of data: ", Y_data.shape)

# Class weight
classes_unique = np.unique(Y_data.cpu())
class_weights = compute_class_weight(class_weight='balanced', classes=classes_unique, y=[0,1])
class_weights = th.tensor(class_weights, dtype=th.float32)

# Free memory
del Y_train, Y_val, Y_test
gc.collect()
th.cuda.empty_cache()

# Random Seed
random_seed = 42
th.manual_seed(random_seed)

X_index_shuffle = th.randperm(X_data.size(0))
X_data_shuffled = X_data[X_index_shuffle]

Y_index_shuffle = th.randperm(Y_data.size(0))
Y_data_shuffled = Y_data[Y_index_shuffle]

print("X_data_shuffled shape: ", X_data_shuffled.shape)
print("Y_data_shuffled shape: ", Y_data_shuffled.shape)

# Split data into Training, Validation and Test
X_train = X_data_shuffled[148126:248126].cpu()
Y_train = Y_data_shuffled[148126:248126].cpu()
print("X_train shape: ", X_train.shape)
print("Y_train shape: ", Y_train.shape)

X_val = X_data_shuffled[211238:279160].cpu()
Y_val = Y_data_shuffled[211238:279160].cpu()
print("X_val shape: ", X_val.shape)
print("Y_val shape: ", Y_val.shape)

X_test = X_data_shuffled[279160:].cpu()
Y_test = Y_data_shuffled[279160:].cpu()
print("X_test shape: ", X_test.shape)
print("Y_test shape: ", Y_test.shape)
"""

'\nIf we use SMOTE we do not use random seed!!\n\n\n# Merge the arrays\n\nprint("Shape of X_train: ", X_train.shape)\nprint("Shape of X_val: ", X_val.shape)\nprint("Shape of X_test: ", X_test.shape)\n\nX_data = th.concat((X_train, X_val), axis = 0).to(device)\nX_data = th.concat((X_data, X_test), axis = 0).to(device)\n\nprint("Shape of data: ", X_data.shape)\n\n# Free memory\ndel X_train, X_val, X_test\ngc.collect()\nth.cuda.empty_cache()\n\nprint("Shape of Y_train: ", Y_train.shape)\nprint("Shape of Y_val: ", Y_val.shape)\nprint("Shape of Y_test: ", Y_test.shape)\n\nY_data = th.concat((Y_train, Y_val), axis = 0).to(device)\nY_data = th.concat((Y_data, Y_test), axis = 0).to(device)\n\nprint("Shape of data: ", Y_data.shape)\n\n# Class weight\nclasses_unique = np.unique(Y_data.cpu())\nclass_weights = compute_class_weight(class_weight=\'balanced\', classes=classes_unique, y=[0,1])\nclass_weights = th.tensor(class_weights, dtype=th.float32)\n\n# Free memory\ndel Y_train, Y_val, Y_test\ngc.

**Weight class**

In [None]:
def get_class_weights(dataset):
    """
    Compute the "balanced" weight of every class from a collection of labels.

    The weight of class c is total_samples / (n_classes * count_c), where
    n_classes is the highest label value + 1.

    Parameters:
        dataset: 1-D list, NumPy array or torch tensor of non-negative
            integer labels (a tensor may live on any device).

    Returns:
        A float32 torch tensor (on the CPU) holding one weight per class index.
        A class index that never occurs gets an infinite weight.
    """
    if isinstance(dataset, th.Tensor):
        # One bulk conversion instead of iterating the tensor element by element,
        # which creates one Python object per label and is very slow.
        dataset = dataset.detach().cpu().numpy()
    labels = np.asarray(dataset).ravel()
    class_counts = np.bincount(labels)
    total_samples = len(labels)
    class_weights = total_samples / (len(class_counts) * class_counts)
    return th.tensor(class_weights, dtype=th.float)

# Calculate the class weights from the training labels (Y_train) and move them
# to the selected device, where the loss functions below use them
class_weights = get_class_weights(Y_train.cpu()).to(device)

print("Class weight: ", class_weights)

Class weight:  tensor([1., 1.], device='cuda:0')


In [None]:
"""
Class to create a Data Loader with X label and Y label together
"""
class CreateDataset(Dataset):
    """
    Dataset pairing every sample of `data` with the label at the same index.

    Indexing returns the tuple (data[idx], labels[idx]); the length is the
    length of `data`.
    """

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Create the Dataset
train_dataset = CreateDataset(X_train, Y_train)
val_dataset = CreateDataset(X_val, Y_val)
test_dataset = CreateDataset(X_test, Y_test)

# Batch size
batch_dim = 128

# Create the Data Loader.
# Only the training loader is shuffled. The evaluation loaders keep the dataset
# order so that anything computed from test_loader stays aligned with Y_test
# (later cells compare the predictions against Y_test directly); shuffling
# brings no benefit when no gradient step is taken.
train_loader = DataLoader(train_dataset, batch_size=batch_dim, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_dim, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_dim, shuffle=False)

# Free memory: only the names are dropped here, the tensors stay alive through
# the datasets that reference them
del X_train, X_val, Y_val
gc.collect()
th.cuda.empty_cache()

*Train* of the **CNN** using the *Validation* set to check the performance.

In [None]:
import copy

print("Start training the CNN")

model_CNN = ConvNet().to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = th.optim.AdamW(model_CNN.parameters(), lr=0.001)
# Lower the learning rate when the validation loss stops improving
scheduler = th.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3)

# Early stopping parameters
patience = 5  # epochs without improvement tolerated before stopping
best_val_loss = float('inf')
best_state_CNN = None  # weights of the epoch with the lowest validation loss
counter = 0

# Training the model
num_epochs = 100  # upper bound: early stopping is expected to trigger before

for epoch in range(num_epochs):
    model_CNN.train()
    running_loss = 0.0
    for X_batch, Y_batch in train_loader:
        X_batch = X_batch.to(device).float()
        Y_batch = Y_batch.to(device).long()

        # Forward pass
        outputs = model_CNN(X_batch)
        loss = criterion(outputs, Y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Validation of the model
    model_CNN.eval()
    val_loss = 0.0
    with th.no_grad():
        for X_batch, Y_batch in val_loader:
            X_batch = X_batch.to(device).float()
            Y_batch = Y_batch.to(device).long()

            outputs = model_CNN(X_batch)
            val_loss += criterion(outputs, Y_batch).item()

    # Mean loss per batch
    running_loss = running_loss / len(train_loader)
    val_loss = val_loss / len(val_loader)

    scheduler.step(val_loss)

    # Check for early stopping
    if val_loss <= best_val_loss:
        best_val_loss = val_loss
        counter = 0
        # state_dict() returns references to the live parameters, hence the deep
        # copy: without it the snapshot would keep changing while training goes on.
        best_state_CNN = copy.deepcopy(model_CNN.state_dict())
    else:
        counter += 1

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {running_loss:.4f}, Val Loss: {val_loss:.4f}, Patience: {patience-counter}')

    if counter >= patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

# Early stopping ends `patience` epochs after the best one: go back to the best
# weights instead of keeping those of the last (worse) epoch.
if best_state_CNN is not None:
    model_CNN.load_state_dict(best_state_CNN)

print("End training the CNN")

# Free memory (once, at the end: emptying the CUDA cache after every batch only
# slows the training down)
del running_loss, val_loss
gc.collect()
th.cuda.empty_cache()

Start training the CNN


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch [1/100], Train Loss: 0.1268, Val Loss: 0.1121, Patience: 5
Epoch [2/100], Train Loss: 0.0925, Val Loss: 0.1121, Patience: 4
Epoch [3/100], Train Loss: 0.0763, Val Loss: 0.1171, Patience: 3
Epoch [4/100], Train Loss: 0.0678, Val Loss: 0.1077, Patience: 5
Epoch [5/100], Train Loss: 0.0631, Val Loss: 0.1074, Patience: 5
Epoch [6/100], Train Loss: 0.0603, Val Loss: 0.1061, Patience: 5
Epoch [7/100], Train Loss: 0.0584, Val Loss: 0.1115, Patience: 4
Epoch [8/100], Train Loss: 0.0569, Val Loss: 0.1112, Patience: 3
Epoch [9/100], Train Loss: 0.0556, Val Loss: 0.1133, Patience: 2
Epoch [10/100], Train Loss: 0.0545, Val Loss: 0.1094, Patience: 1
Epoch [11/100], Train Loss: 0.0501, Val Loss: 0.1114, Patience: 0
Early stopping at epoch 11
End training the CNN


**(OPTIONAL):** *Test* results for the **CNN**.

In [None]:
print("Start testing the CNN")

# Model ready for the evaluation (dropout and batch-norm switch to inference mode)
model_CNN.eval()

# Lists collecting the predicted and the true label of every test sample
all_preds = []
all_labels = []

# Testing the model
with th.no_grad():
    for X_batch, Y_batch in test_loader:
        X_batch = X_batch.to(device).float()
        Y_batch = Y_batch.to(device).long()

        outputs = model_CNN(X_batch)
        # The predicted class is the index of the highest logit
        _, predicted = th.max(outputs, 1)

        # Predictions and labels are appended batch by batch, so they stay paired
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(Y_batch.cpu().numpy())

        # Free memory
        del X_batch, Y_batch
        th.cuda.empty_cache()

# Names expected by the scikit-learn metrics below
y_true_CNN = all_labels
y_pred_CNN = all_preds

# Metrics
# NOTE(review): 'weighted' averages the per-class scores by class support; with
# a heavily imbalanced test set the majority class probably dominates them -
# consider also reporting the scores of class 1 alone.
accuracy_CNN = accuracy_score(y_true_CNN, y_pred_CNN)
precision_CNN = precision_score(y_true_CNN, y_pred_CNN, average='weighted', zero_division=0)
recall_CNN = recall_score(y_true_CNN, y_pred_CNN, average='weighted')
f1_CNN = f1_score(y_true_CNN, y_pred_CNN, average='weighted')

print(f'Accuracy: {accuracy_CNN:.6f}')
print(f'Precision: {precision_CNN:.6f}')
print(f'Recall: {recall_CNN:.6f}')
print(f'F1-score: {f1_CNN:.6f}')

print("End testing the CNN")

# Free memory
del all_labels, all_preds, y_true_CNN, y_pred_CNN, accuracy_CNN, precision_CNN, recall_CNN, f1_CNN
gc.collect()
th.cuda.empty_cache()

Start testing the CNN
Accuracy: 0.977200
Precision: 0.962646
Recall: 0.977200
F1-score: 0.968700
End testing the CNN


*Train* of the **RNN** using the *Validation* set to check the performance.

In [None]:
import copy

print("Start training the LSTM")

# Model, loss function and optimizer
model_LSTM = LSTM_Net().to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = th.optim.Adam(model_LSTM.parameters(), lr=0.001)

# Early stopping parameters
patience = 5  # epochs without improvement tolerated before stopping
best_val_loss = float('inf')
best_state_LSTM = None  # weights of the epoch with the lowest validation loss
counter = 0

# Training the model
num_epochs = 100  # upper bound: early stopping is expected to trigger before

for epoch in range(num_epochs):
    model_LSTM.train()
    running_loss = 0.0
    for X_batch, Y_batch in train_loader:
        X_batch = X_batch.to(device).float()
        Y_batch = Y_batch.to(device).long()

        # Forward pass
        outputs = model_LSTM(X_batch)
        loss = criterion(outputs, Y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Validation
    model_LSTM.eval()
    val_loss = 0.0
    with th.no_grad():
        for X_batch, Y_batch in val_loader:
            X_batch = X_batch.to(device).float()
            Y_batch = Y_batch.to(device).long()

            outputs = model_LSTM(X_batch)
            val_loss += criterion(outputs, Y_batch).item()

    # Mean loss per batch
    running_loss = running_loss / len(train_loader)
    val_loss = val_loss / len(val_loader)

    # Check for early stopping
    if val_loss <= best_val_loss:
        best_val_loss = val_loss
        counter = 0
        # state_dict() returns references to the live parameters, hence the deep
        # copy: without it the snapshot would keep changing while training goes on.
        best_state_LSTM = copy.deepcopy(model_LSTM.state_dict())
    else:
        counter += 1

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {running_loss:.4f}, Val Loss: {val_loss:.4f}, Patience: {patience-counter}')

    if counter >= patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

# Early stopping ends `patience` epochs after the best one: go back to the best
# weights instead of keeping those of the last (worse) epoch.
if best_state_LSTM is not None:
    model_LSTM.load_state_dict(best_state_LSTM)

print("End training the LSTM")

# Free memory (once, at the end: emptying the CUDA cache after every batch only
# slows the training down)
del running_loss, val_loss
gc.collect()
th.cuda.empty_cache()

Start training the LSTM
Epoch [1/100], Train Loss: 0.2236, Val Loss: 0.1288, Patience: 5
Epoch [2/100], Train Loss: 0.1167, Val Loss: 0.1556, Patience: 4
Epoch [3/100], Train Loss: 0.1070, Val Loss: 0.1266, Patience: 5
Epoch [4/100], Train Loss: 0.0949, Val Loss: 0.1039, Patience: 5
Epoch [5/100], Train Loss: 0.0797, Val Loss: 0.1483, Patience: 4
Epoch [6/100], Train Loss: 0.0699, Val Loss: 0.1062, Patience: 3
Epoch [7/100], Train Loss: 0.0634, Val Loss: 0.0998, Patience: 5
Epoch [8/100], Train Loss: 0.0593, Val Loss: 0.1042, Patience: 4
Epoch [9/100], Train Loss: 0.0562, Val Loss: 0.0976, Patience: 5
Epoch [10/100], Train Loss: 0.0534, Val Loss: 0.1039, Patience: 4
Epoch [11/100], Train Loss: 0.0534, Val Loss: 0.0941, Patience: 5
Epoch [12/100], Train Loss: 0.0486, Val Loss: 0.0953, Patience: 4
Epoch [13/100], Train Loss: 0.0472, Val Loss: 0.0917, Patience: 5
Epoch [14/100], Train Loss: 0.0460, Val Loss: 0.0912, Patience: 5
Epoch [15/100], Train Loss: 0.0439, Val Loss: 0.1026, Patienc

**(OPTIONAL):** *Test* results for the **RNN**.

In [None]:
print("Start testing the LSTM")

# Model ready for the evaluation (dropout switches to inference mode)
model_LSTM.eval()

# Lists collecting the predicted and the true label of every test sample
all_preds = []
all_labels = []

# Testing the model
with th.no_grad():
    for X_batch, Y_batch in test_loader:
        X_batch = X_batch.to(device).float()
        Y_batch = Y_batch.to(device).long()

        outputs = model_LSTM(X_batch)
        # The predicted class is the index of the highest logit
        _, predicted = th.max(outputs, 1)

        # Predictions and labels are appended batch by batch, so they stay paired
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(Y_batch.cpu().numpy())

# Names expected by the scikit-learn metrics below
y_true_LSTM = all_labels
y_pred_LSTM = all_preds

# Metrics
# NOTE(review): 'weighted' averages the per-class scores by class support; with
# a heavily imbalanced test set the majority class probably dominates them -
# consider also reporting the scores of class 1 alone.
accuracy_LSTM = accuracy_score(y_true_LSTM, y_pred_LSTM)
precision_LSTM = precision_score(y_true_LSTM, y_pred_LSTM, average='weighted', zero_division=0)
recall_LSTM = recall_score(y_true_LSTM, y_pred_LSTM, average='weighted')
f1_LSTM = f1_score(y_true_LSTM, y_pred_LSTM, average='weighted')

print(f'Accuracy: {accuracy_LSTM:.6f}')
print(f'Precision: {precision_LSTM:.6f}')
print(f'Recall: {recall_LSTM:.6f}')
print(f'F1-score: {f1_LSTM:.6f}')

print("End testing the LSTM")

# Free memory
del all_labels, all_preds, y_true_LSTM, y_pred_LSTM, accuracy_LSTM, precision_LSTM, recall_LSTM, f1_LSTM
gc.collect()
th.cuda.empty_cache()

Start testing the LSTM
Accuracy: 0.973981
Precision: 0.965949
Recall: 0.973981
F1-score: 0.969472
End testing the LSTM


Implementation of the **Ensemble Learning** using an **SVM** (stacking ensemble learning).

In [None]:
"""
An SVM takes in input the results from the CNN and RNN and then it preforms the final classification.

I have to use skitlearn to do the ensable learing because the library is only avaiable on it!
"""

"""
Function that, given a model and a data loader, it returns all the predictions (outputs)
"""
def get_predictions(model_pred, data_loader):
    """
    Run `model_pred` in evaluation mode over every batch of `data_loader`.

    The labels yielded by the loader are ignored. The raw model outputs of all
    the batches are concatenated, in the order the loader yields them, into a
    single NumPy array.
    """
    model_pred.eval()
    with th.no_grad():
        batch_outputs = [
            model_pred(features.float().to(device)).cpu().numpy()
            for features, _ in data_loader
        ]
    return np.concatenate(batch_outputs)

# The outputs of the base models are the input features of the meta-model, so
# row i of both prediction matrices must refer to sample i of the dataset.
# A shuffled loader yields a different order at every pass: the predictions are
# therefore collected with dedicated, non-shuffled loaders.
# NOTE: train_dataset / test_dataset must still be the one-hot datasets here;
# re-run the DataLoader cell before re-running this one.
ordered_train_loader = DataLoader(train_dataset, batch_size=batch_dim, shuffle=False)
ordered_test_loader = DataLoader(test_dataset, batch_size=batch_dim, shuffle=False)

# Prepare the prediction for the training model
CNN_train_preds = get_predictions(model_CNN, ordered_train_loader)
LSTM_train_preds = get_predictions(model_LSTM, ordered_train_loader)

# Prepare the prediction for the testing model
CNN_test_preds = get_predictions(model_CNN, ordered_test_loader)
LSTM_test_preds = get_predictions(model_LSTM, ordered_test_loader)

# Concatenate the two inputs: 2 CNN logits + 2 LSTM logits per sample
X_train_meta = np.hstack((CNN_train_preds, LSTM_train_preds))
X_test_meta = np.hstack((CNN_test_preds, LSTM_test_preds))

print("Shape of X_train_meta: ", X_train_meta.shape)
print("Shape of Y_train: ", Y_train.shape)
print("Shape of X_test_meta: ", X_test_meta.shape)
print("Shape of Y_test: ", Y_test.shape)

# Define the train loader of the meta-model
train_dataset = CreateDataset(X_train_meta, Y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_dim, shuffle=True)

# Define the test loader of the meta-model. It must serve the stacked
# meta-features (the raw one-hot X_test is 3-D and is rejected by the
# scikit-learn models) and keep the order of Y_test, against which the
# predictions are scored.
test_dataset = CreateDataset(X_test_meta, Y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_dim, shuffle=False)

# Free memory
del ordered_train_loader, ordered_test_loader
gc.collect()
th.cuda.empty_cache()

Shape of X_train_meta:  (413544, 4)
Shape of Y_train:  torch.Size([413544])


In [None]:
# We use a polynomial SVM for the final classification
model_SVM = SVC(kernel = 'poly', probability = True)

# Train the model with a single fit() on the whole meta-training set.
# scikit-learn estimators restart from scratch at every fit() call: fitting
# inside a loop over mini-batches leaves a model trained on the last batch only.
# NOTE(review): the training time of SVC grows at least quadratically with the
# number of samples; if this is too slow, fit on a stratified subsample or
# switch to a linear model.
model_SVM.fit(X_train_meta, Y_train.cpu().numpy().ravel())

**Test** of the performance.

In [None]:
print("Start testing the final model - SVM")

y_true = []
y_pred = []
y_pred_prob = []

# Testing the meta-model: SVM
# test_loader must yield the 2-D stacked meta-features (one row per sample):
# scikit-learn rejects the raw 3-D one-hot tensors.
for inputs, labels in test_loader:
    inputs_np = inputs.cpu().numpy() if isinstance(inputs, th.Tensor) else np.asarray(inputs)
    labels_np = labels.cpu().numpy() if isinstance(labels, th.Tensor) else np.asarray(labels)

    # The true labels are collected from the same batch as the predictions, so
    # the three lists stay aligned whatever the order of the loader is.
    y_true.extend(labels_np.tolist())
    y_pred.extend(model_SVM.predict(inputs_np).tolist())
    # Column 1 holds the probability of class 1
    y_pred_prob.extend(model_SVM.predict_proba(inputs_np)[:, 1].tolist())

# Convert the lists to numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)
y_pred_prob = np.array(y_pred_prob)

# Metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')
auroc = roc_auc_score(y_true, y_pred_prob)

print(f'Accuracy: {accuracy:.6f}')
print(f'Precision: {precision:.6f}')
print(f'Recall: {recall:.6f}')
print(f'F1-score: {f1:.6f}')
print(f'AUROC: {auroc:.6f}')

print("End testing the final model - SVM")

# Free memory
del y_pred, y_pred_prob, y_true, accuracy, precision, recall, f1, auroc
gc.collect()
th.cuda.empty_cache()

Start testing the final model - SVM


ValueError: Found array with dim 3. SVC expected <= 2.

In [None]:
import xgboost as xgb

In [None]:
model_XGB = xgb.XGBClassifier()

# Train the model with a single fit() on the whole meta-training set.
# Every fit() call restarts the training from scratch: fitting inside a loop
# over mini-batches leaves a model trained on the last batch only.
model_XGB.fit(X_train_meta, Y_train.cpu().numpy().ravel())

In [None]:
print("Start testing the final model - XGB")

y_true = []
y_pred = []
y_pred_prob = []

# Testing the meta-model: XGB
# (no eval() call: that is a PyTorch method, XGBClassifier does not provide it)
# test_loader must yield the 2-D stacked meta-features (one row per sample).
for inputs, labels in test_loader:
    inputs_np = inputs.cpu().numpy() if isinstance(inputs, th.Tensor) else np.asarray(inputs)
    labels_np = labels.cpu().numpy() if isinstance(labels, th.Tensor) else np.asarray(labels)

    # The true labels are collected from the same batch as the predictions, so
    # the three lists stay aligned whatever the order of the loader is.
    y_true.extend(labels_np.tolist())
    y_pred.extend(model_XGB.predict(inputs_np).tolist())
    # Column 1 holds the probability of class 1
    y_pred_prob.extend(model_XGB.predict_proba(inputs_np)[:, 1].tolist())

# Convert the lists to numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)
y_pred_prob = np.array(y_pred_prob)

# Metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')
auroc = roc_auc_score(y_true, y_pred_prob)

print(f'Accuracy: {accuracy:.6f}')
print(f'Precision: {precision:.6f}')
print(f'Recall: {recall:.6f}')
print(f'F1-score: {f1:.6f}')
print(f'AUROC: {auroc:.6f}')

print("End testing the final model - XGB")

# Free memory
del y_pred, y_pred_prob, y_true, accuracy, precision, recall, f1, auroc
gc.collect()
th.cuda.empty_cache()

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
model_RF = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model with a single fit() on the whole meta-training set.
# Every fit() call rebuilds the forest from scratch: fitting inside a loop over
# mini-batches leaves a model trained on the last batch only.
model_RF.fit(X_train_meta, Y_train.cpu().numpy().ravel())

In [None]:
print("Start testing the final model - RF")

y_true = []
y_pred = []
y_pred_prob = []

# Testing the meta-model: RF
# (no eval() call: that is a PyTorch method, scikit-learn estimators do not
# provide it)
# test_loader must yield the 2-D stacked meta-features (one row per sample).
for inputs, labels in test_loader:
    inputs_np = inputs.cpu().numpy() if isinstance(inputs, th.Tensor) else np.asarray(inputs)
    labels_np = labels.cpu().numpy() if isinstance(labels, th.Tensor) else np.asarray(labels)

    # The true labels are collected from the same batch as the predictions, so
    # the three lists stay aligned whatever the order of the loader is.
    y_true.extend(labels_np.tolist())
    y_pred.extend(model_RF.predict(inputs_np).tolist())
    # Column 1 holds the probability of class 1
    y_pred_prob.extend(model_RF.predict_proba(inputs_np)[:, 1].tolist())

# Convert the lists to numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)
y_pred_prob = np.array(y_pred_prob)

# Metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')
auroc = roc_auc_score(y_true, y_pred_prob)

print(f'Accuracy: {accuracy:.6f}')
print(f'Precision: {precision:.6f}')
print(f'Recall: {recall:.6f}')
print(f'F1-score: {f1:.6f}')
print(f'AUROC: {auroc:.6f}')

print("End testing the final model - RF")

# Free memory
del y_pred, y_pred_prob, y_true, accuracy, precision, recall, f1, auroc
gc.collect()
th.cuda.empty_cache()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
model_GBC = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Train the model with a single fit() on the whole meta-training set.
# Every fit() call restarts the boosting from scratch: fitting inside a loop
# over mini-batches leaves a model trained on the last batch only.
model_GBC.fit(X_train_meta, Y_train.cpu().numpy().ravel())

In [None]:
print("Start testing the final model - GBC")

y_true = []
y_pred = []
y_pred_prob = []

# Testing the meta-model: GBC
# (no eval() call: that is a PyTorch method, scikit-learn estimators do not
# provide it)
# test_loader must yield the 2-D stacked meta-features (one row per sample).
for inputs, labels in test_loader:
    inputs_np = inputs.cpu().numpy() if isinstance(inputs, th.Tensor) else np.asarray(inputs)
    labels_np = labels.cpu().numpy() if isinstance(labels, th.Tensor) else np.asarray(labels)

    # The true labels are collected from the same batch as the predictions, so
    # the three lists stay aligned whatever the order of the loader is.
    y_true.extend(labels_np.tolist())
    y_pred.extend(model_GBC.predict(inputs_np).tolist())
    # Column 1 holds the probability of class 1
    y_pred_prob.extend(model_GBC.predict_proba(inputs_np)[:, 1].tolist())

# Convert the lists to numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)
y_pred_prob = np.array(y_pred_prob)

# Metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')
auroc = roc_auc_score(y_true, y_pred_prob)

print(f'Accuracy: {accuracy:.6f}')
print(f'Precision: {precision:.6f}')
print(f'Recall: {recall:.6f}')
print(f'F1-score: {f1:.6f}')
print(f'AUROC: {auroc:.6f}')

print("End testing the final model - GBC")

# Free memory
del y_pred, y_pred_prob, y_true, accuracy, precision, recall, f1, auroc
gc.collect()
th.cuda.empty_cache()