# Loading and splitting Dataset
The goal of this notebook is to continue the development of the DeepTropism model using a Pytorch.<br>
The dataset was already created on the previous notebook and all the HIV-1 env V3 loop sequences where aligned.


In [1]:
# Libraries used on the analysis
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import os
import random

First we load the Dataframe with all the sequences published on referenced articles.<br>


In [2]:
df = pd.read_csv('/home/gabriel/Documents/Repos/DeepTropism/datasets/processed_tsv/deeptropism_profiled_dataset.tsv', 
                 sep='\t')
df.head()

Unnamed: 0,seq_name,dataset,sequence,seq_len,sequence_aligned,label,label_numeric
0,X138.EU074781.BG.CXCR4,cm,CRPNNTRKSGPQYTIIGDIAC,21,C-RPNN--TRKS------GPQ-----------YTIIGDIA---C,CXCR4,1
1,115.KF770295.C.CCR5,cm,CIRPNNNTRKSVRIGPGQTFYATGEIIGDIRKAHC,35,CIRPNNNTRKSVRI----GPGQTFYAT-----GEIIGDIRKAHC,CCR5,0
2,NIRT379.KF766537.C.CCR5,cm,CIRPNNNTRKSIRIGPGQTFYATGEIIGDIRQAHC,35,CIRPNNNTRKSIRI----GPGQTFYAT-----GEIIGDIRQAHC,CCR5,0
3,703010167.JQ779893.C.CCR5,cm,CIRPGNNTRKSIRIGPGQTYFSTGEIIGNIRQAHC,35,CIRPGNNTRKSIRI----GPGQTYFST-----GEIIGNIRQAHC,CCR5,0
4,19956.GU455522.B.CCR5,cm,CIRPNNNTRKSIPMGPGKAFYTTGDIIGDIRQAHC,35,CIRPNNNTRKSIPM----GPGKAFYTT-----GDIIGDIRQAHC,CCR5,0


In [3]:
df.shape

(9550, 7)

Check the size of the column of the alig

In [4]:
set(df['sequence_aligned'].apply(len))

{44}

In [5]:
df = df[df.label != 'validation']
df.shape

(9550, 7)

In [6]:
df.label.value_counts()

CCR5     7705
CXCR4     937
R5X4      908
Name: label, dtype: int64

In [7]:
df.label_numeric.value_counts()

0    7705
1    1845
Name: label_numeric, dtype: int64

# Filtering the dataset by unique sequences
To improve the quality of our training and avoid bias we are going to create a dataset with only unique sequences from the original Dataframe.
This is going to be the dataset for the development of our model based on Deep Neural Network.

In [8]:
df_unique = df.drop_duplicates(subset=['sequence_aligned'], keep='first')

In [9]:
df_unique.shape

(3608, 7)

In [10]:
df_unique.head()

Unnamed: 0,seq_name,dataset,sequence,seq_len,sequence_aligned,label,label_numeric
0,X138.EU074781.BG.CXCR4,cm,CRPNNTRKSGPQYTIIGDIAC,21,C-RPNN--TRKS------GPQ-----------YTIIGDIA---C,CXCR4,1
1,115.KF770295.C.CCR5,cm,CIRPNNNTRKSVRIGPGQTFYATGEIIGDIRKAHC,35,CIRPNNNTRKSVRI----GPGQTFYAT-----GEIIGDIRKAHC,CCR5,0
2,NIRT379.KF766537.C.CCR5,cm,CIRPNNNTRKSIRIGPGQTFYATGEIIGDIRQAHC,35,CIRPNNNTRKSIRI----GPGQTFYAT-----GEIIGDIRQAHC,CCR5,0
3,703010167.JQ779893.C.CCR5,cm,CIRPGNNTRKSIRIGPGQTYFSTGEIIGNIRQAHC,35,CIRPGNNTRKSIRI----GPGQTYFST-----GEIIGNIRQAHC,CCR5,0
4,19956.GU455522.B.CCR5,cm,CIRPNNNTRKSIPMGPGKAFYTTGDIIGDIRQAHC,35,CIRPNNNTRKSIPM----GPGKAFYTT-----GDIIGDIRQAHC,CCR5,0


In [11]:
df_unique.label_numeric.value_counts()

0    2779
1     829
Name: label_numeric, dtype: int64

In [12]:
df_unique.label.value_counts()

CCR5     2779
R5X4      485
CXCR4     344
Name: label, dtype: int64

In [13]:
df_unique.shape

(3608, 7)

## Creating Dataframes for comparing performance against other methods
In order to evaluate our model against the others already published, we are going to create separate Dataframes for each method filterinf by the 'dataset' column

In [14]:
df.dataset.value_counts()

newdb         2998
cm            2679
hivcopred     2335
geno2pheno    1188
webpssm        350
Name: dataset, dtype: int64

In [15]:
df_newdb = df[df.dataset == 'newdb']
df_cm = df[df.dataset == 'cm']
df_hivcopred = df[df.dataset == 'hivcopred']
df_geno2pheno = df[df.dataset == 'geno2pheno']
df_webpssm = df[df.dataset == 'webpssm']

We are going to create indices and set it to variables to make our cross validation reproducible. Our dataset is going to consist on:<br>
* Training = 80 %
* Validation = 10 %
* Test = 10 %

In [16]:
# Create a list of indices and shuffle it using seed
random.seed(42)
size = df_unique.shape[0]
list_indices = list(range(size))
random.shuffle(list_indices)

In [17]:
# Now create the list of indices for trainning, validation and test
test_indices = list_indices[:int(size/10)]
train_val_indices = list_indices[int(size/10):]

In [18]:
len(test_indices)

360

In [19]:
len(train_val_indices)

3248

In [20]:
assert len(train_val_indices) + len(test_indices) == len(list_indices), "Splitting indices with error"

## Creating the Dataloaders for training

In [21]:
def get_array_from_sequence(protein_sequence):
    """
    Function to convert a protein sequence into a tensor.
    Each amino acid is represented by an numpy array of zeros of size 26,
    and the dict_aa_pos defines the position to be converted to 1.
    
    The function iterates over the protein sequences and stacks the arrays.
    At the end the arrays are linearized and converted to a tensor of size
    n x 26, with n the size of the protein.
    
    If the character is not present on the dict_aa_pos (eg. '-') the respective
    array is formed by zeros, and represents a missing value.
    """
    dict_aa_pos = {
    'A':1, 'R':2, 'N':3, 'D':4, 'C':5, 'Q':6, 'E':7, 'G':8,
    'H':9, 'I':10, 'L':11, 'K':12, 'M':13, 'F':14, 'P':15, 
    'O':16, 'S':17, 'U':18, 'T':19, 'W':20, 'Y':21, 'V':22, 
    'B':23, 'Z':24, 'J':25, 'X':0}
    
    f_array = np.zeros(26)
    for aa in protein_sequence:
        arr = np.zeros(26)
        if dict_aa_pos.get(aa):
            arr[dict_aa_pos.get(aa)] = 1
        f_array = np.vstack((f_array, arr))
    f_array = np.delete(f_array, 0,0)
    
    #return torch.from_numpy((f_array.flatten()).astype(float))
    return f_array.flatten().astype(float)

In [24]:
dict_aa_pos = {
    'A':1, 'R':2, 'N':3, 'D':4, 'C':5, 'Q':6, 'E':7, 'G':8,
    'H':9, 'I':10, 'L':11, 'K':12, 'M':13, 'F':14, 'P':15, 
    'O':16, 'S':17, 'U':18, 'T':19, 'W':20, 'Y':21, 'V':22, 
    'B':23, 'Z':24, 'J':25, 'X':0}

In [25]:
protein_sequence = 'CTRPSNNTRKSVRI----GPGQAFFAT-----GEIIGDIRQAHC'
f_array = np.zeros(26)
for aa in protein_sequence:
    arr = np.zeros(26)
    if dict_aa_pos.get(aa):
        arr[dict_aa_pos.get(aa)] = 1
    f_array = np.vstack((f_array, arr))
f_array = np.delete(f_array, 0,0)

In [27]:
f_array[0]

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [68]:
len(dict_aa_pos.keys())

26

In [69]:
# Create list to append data from the df
list_data = []
list_labels = []

# Convert the sequences and labels to arrays to use as data on pytorch
for index, row in df.iterrows():
    list_data.append(get_array_from_sequence(str(row.sequence_aligned)))
    list_labels.append(int(row.label_numeric))

In [70]:
len(list_data) == len(list_labels)

True

In [71]:
# For Test set
test_data = []
test_label = []
for j in test_indices:
    test_data.append(list_data[j])
    test_label.append(np.array(list_labels[j]))

test_tensor_x = torch.stack([torch.from_numpy(i) for i in test_data]) # transform to torch tensors
test_tensor_y = torch.stack([torch.from_numpy(i) for i in test_label])

test_dataset = torch.utils.data.TensorDataset(test_tensor_x,test_tensor_y) # create your test dataset
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=64) # create your dataloader


In [73]:
# Now we define the
len(train_val_indices)

3248

In [74]:
crossval_val_indices = np.array_split(train_val_indices,5)

In [75]:
len(crossval_val_indices[0])

650

In [77]:
# For training and validation set
# Define the cross validation indices for training and validation sets
crossval_val_indices = np.array_split(train_val_indices,5)

# Iterate over crossval_val_indices defining the Dataloaders
for n in range(len(crossval_val_indices)):
    print(f'Cross Validation: {n + 1}')
    training_data = []
    training_label = []
    validation_data = []
    validation_label = []

    validation_indices = list(crossval_val_indices[n])
    training_indices = list(set(train_val_indices) - set(validation_indices))

    for j in validation_indices:
        validation_data.append(list_data[j])
        validation_label.append(np.array(list_labels[j]))

    validation_tensor_x = torch.stack([torch.from_numpy(i) for i in validation_data]) # transform to torch tensors
    validation_tensor_y = torch.stack([torch.from_numpy(i) for i in validation_label])

    validation_dataset = torch.utils.data.TensorDataset(validation_tensor_x,validation_tensor_y) # create your test dataset
    validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=64) # create your dataloader
    
    for k in training_indices:
        training_data.append(list_data[k])
        training_label.append(np.array(list_labels[k]))

    training_tensor_x = torch.stack([torch.from_numpy(i) for i in training_data]) # transform to torch tensors
    training_tensor_y = torch.stack([torch.from_numpy(i) for i in training_label])

    training_dataset = torch.utils.data.TensorDataset(training_tensor_x,training_tensor_y) # create your test dataset
    training_dataloader = torch.utils.data.DataLoader(training_dataset, batch_size=64) # create your dataloader

Cross Validation: 1
Cross Validation: 2
Cross Validation: 3
Cross Validation: 4
Cross Validation: 5


## Define the Deep Neural Network Architecture


In [80]:
class DeepTropism_1(nn.Module):
    def __init__(self):
        super(DeepTropism_1, self).__init__()
        self.linear1 = nn.Linear(1144,250)
        self.linear2 = nn.Linear(250,100)
        self.linear3 = nn.Linear(100,2)
    
    def forward(self,X):
        X = F.relu(self.linear1(X))
        X = F.relu(self.linear2(X))
        X = self.linear3(X)
        return F.log_softmax(X, dim=1)
 
model = DeepTropism_1().float()
#model = model.float()
model

DeepTropism_1(
  (linear1): Linear(in_features=1144, out_features=250, bias=True)
  (linear2): Linear(in_features=250, out_features=100, bias=True)
  (linear3): Linear(in_features=100, out_features=2, bias=True)
)

### For showing the metrics of the model

In [81]:
def show_metrics(y_true, y_score):
    # True positive
    tp = np.sum(y_true * y_score)
    # False positive
    fp = np.sum((y_true == 0) * y_score)
    # True negative
    tn = np.sum((y_true==0) * (y_score==0))
    # False negative
    fn = np.sum(y_true * (y_score==0))

    # True positive rate (sensitivity or recall)
    tpr = tp / (tp + fn)
    # False positive rate (fall-out)
    fpr = fp / (fp + tn)
    # Precision
    precision = tp / (tp + fp)
    # True negatvie tate (specificity)
    tnr = 1 - fpr
    # F1 score
    f1 = 2*tp / (2*tp + fp + fn)
    # ROC-AUC for binary classification
    auc = (tpr+tnr) / 2
    # MCC
    mcc = (tp * tn - fp * fn) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    print("True positive: ", tp)
    print("False positive: ", fp)
    print("True negative: ", tn)
    print("False negative: ", fn)

    print("True positive rate (recall): ", tpr)
    print("False positive rate: ", fpr)
    print("Precision: ", precision)
    print("True negative rate (Specificity): ", tnr)
    print("F1: ", f1)
    print("ROC-AUC: ", auc)
    print("MCC: ", mcc)

In [82]:
# Define the Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [83]:
# For Trainning and validation set DeepTropism_1
# Define the cross validation indices for trainning and validation sets
crossval_val_indices = np.array_split(train_val_indices,5)

# Iterate over crossval_val_indices defining the Dataloaders
for n in range(len(crossval_val_indices)):
    print(f'Cross Validation: {n + 1}')
    trainning_data = []
    trainning_label = []
    validation_data = []
    validation_label = []

    validation_indices = list(crossval_val_indices[n])
    trainning_indices = list(set(train_val_indices) - set(validation_indices))

    for j in validation_indices:
        validation_data.append(list_data[j])
        validation_label.append(np.array(list_labels[j]))

    validation_tensor_x = torch.stack([torch.from_numpy(i) for i in validation_data]) # transform to torch tensors
    validation_tensor_y = torch.stack([torch.from_numpy(i) for i in validation_label])

    validation_dataset = torch.utils.data.TensorDataset(validation_tensor_x,validation_tensor_y) # create your test dataset
    validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=64) # create your dataloader
    
    for k in trainning_indices:
        trainning_data.append(list_data[k])
        trainning_label.append(np.array(list_labels[k]))

    trainning_tensor_x = torch.stack([torch.from_numpy(i) for i in trainning_data]) # transform to torch tensors
    trainning_tensor_y = torch.stack([torch.from_numpy(i) for i in trainning_label])

    trainning_dataset = torch.utils.data.TensorDataset(trainning_tensor_x,trainning_tensor_y) # create your test dataset
    trainning_dataloader = torch.utils.data.DataLoader(trainning_dataset, batch_size=64) # create your dataloader
    
    # Instantiante new model
    #model = DeepTropism_1().float()
        
    # Define Cross Validation Trainning Loop
    for epoch in range(400):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, data in enumerate(trainning_dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs.float())
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            #print(running_loss)
            #if i % 3239 == 0:    # print every 3239 mini-batches
            if epoch % 20 == 0 and i % 3248 == 0:
                print('[%d, %3d] loss: %.9f' %
                      (epoch + 1, i + 1, running_loss / 50))
                correct = 0
                total = 0
                error = 0
                labels_array = np.empty([0])
                predict_array = np.empty([0])

                with torch.no_grad():
                    for data in validation_dataloader:
                        images, labels = data
                        outputs = model(images.float())
                        _, predicted = torch.max(outputs.data, 1)

                        labels_array = np.concatenate([labels_array, labels])
                        predict_array = np.concatenate([predict_array, predicted])

                        total += labels.size(0)
                        correct += (predicted == labels).sum().item()
                        error += (predicted != labels).sum().item()

                print(f'Neural Network accuracy for validation set {n + 1}: {round(100.0 * correct/total, 2)}%')
                
                # Evaluate model against test set
                correct = 0
                total = 0
                error = 0
                labels_array = np.empty([0])
                predict_array = np.empty([0])

                with torch.no_grad():
                    for data in test_dataloader:
                        images, labels = data
                        outputs = model(images.float())
                        _, predicted = torch.max(outputs.data, 1)

                        labels_array = np.concatenate([labels_array, labels])
                        predict_array = np.concatenate([predict_array, predicted])

                        total += labels.size(0)
                        correct += (predicted == labels).sum().item()
                        error += (predicted != labels).sum().item()

                print(f'Neural Network accuracy on test set: {round(100.0 * correct/total, 2)}%')

                
            running_loss = 0.0
            
            
    torch.save(model.state_dict(), f'model_cv{n+1}.ptb')
    print('Finished Training')
    

Cross Validation: 1
[1,   1] loss: 0.014330550
Neural Network accuracy for validation set 1: 12.46%
Neural Network accuracy on test set: 14.44%
[21,   1] loss: 0.001926502
Neural Network accuracy for validation set 1: 87.54%
Neural Network accuracy on test set: 85.56%
[41,   1] loss: 0.001328466
Neural Network accuracy for validation set 1: 94.46%
Neural Network accuracy on test set: 90.56%
[61,   1] loss: 0.000966116
Neural Network accuracy for validation set 1: 95.54%
Neural Network accuracy on test set: 93.06%
[81,   1] loss: 0.000792519
Neural Network accuracy for validation set 1: 96.0%
Neural Network accuracy on test set: 94.72%
[101,   1] loss: 0.000720771
Neural Network accuracy for validation set 1: 96.31%
Neural Network accuracy on test set: 95.28%
[121,   1] loss: 0.000680599
Neural Network accuracy for validation set 1: 96.15%
Neural Network accuracy on test set: 95.0%
[141,   1] loss: 0.000659886
Neural Network accuracy for validation set 1: 96.15%
Neural Network accuracy 

In [84]:
correct = 0
total = 0
error = 0
labels_array = np.empty([0])
predict_array = np.empty([0])

with torch.no_grad():
    for data in test_dataloader:
        images, labels = data
        outputs = model(images.float())
        _, predicted = torch.max(outputs.data, 1)
        
        labels_array = np.concatenate([labels_array, labels])
        predict_array = np.concatenate([predict_array, predicted])
        
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        error += (predicted != labels).sum().item()

print(f'Neural Network accuracy for test set: {round(100.0 * correct/total, 2)}%')

Neural Network accuracy for test set: 96.94%


In [86]:
show_metrics(labels_array, predict_array)

True positive:  43.0
False positive:  2.0
True negative:  306
False negative:  9.0
True positive rate (recall):  0.8269230769230769
False positive rate:  0.006493506493506494
Precision:  0.9555555555555556
True negative rate (Specificity):  0.9935064935064936
F1:  0.8865979381443299
ROC-AUC:  0.9102147852147853
MCC:  0.872080953293109


## Performance of DNN model against published datasets

## For Newdb dataset

In [90]:
# Create list to append data from the df
list_data_newdb = []
list_labels_newdb = []

# Convert the sequences and labels to arrays to use as data on pytorch
for index, row in df_newdb.iterrows():
    list_data_newdb.append(get_array_from_sequence(str(row.sequence_aligned)))
    list_labels_newdb.append(np.array(int(row.label_numeric)))

newdb_tensor_x = torch.stack([torch.from_numpy(i) for i in list_data_newdb]) # transform to torch tensors
newdb_tensor_y = torch.stack([torch.from_numpy(i) for i in list_labels_newdb])

newdb_dataset = torch.utils.data.TensorDataset(newdb_tensor_x,newdb_tensor_y) # create your test dataset
newdb_dataloader = torch.utils.data.DataLoader(newdb_dataset, batch_size=64) # create your dataloader

In [91]:
correct = 0
total = 0
error = 0
labels_array = np.empty([0])
predict_array = np.empty([0])

with torch.no_grad():
    for data in newdb_dataloader:
        images, labels = data
        outputs = model(images.float())
        _, predicted = torch.max(outputs.data, 1)

        labels_array = np.concatenate([labels_array, labels])
        predict_array = np.concatenate([predict_array, predicted])

        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        error += (predicted != labels).sum().item()

print(f'Neural Network accuracy on test set: {round(100.0 * correct/total, 2)}%')
show_metrics(labels_array, predict_array)
print('Finished Training')

Neural Network accuracy on test set: 92.23%
True positive:  436.0
False positive:  6.0
True negative:  2329
False negative:  227.0
True positive rate (recall):  0.6576168929110106
False positive rate:  0.002569593147751606
Precision:  0.9864253393665159
True negative rate (Specificity):  0.9974304068522484
F1:  0.7891402714932126
ROC-AUC:  0.8275236498816294
MCC:  0.7667985941443346
Finished Training


## For CM dataset

In [92]:
# Create list to append data from the df
list_data_cm = []
list_labels_cm = []

# Convert the sequences and labels to arrays to use as data on pytorch
for index, row in df_cm.iterrows():
    list_data_cm.append(get_array_from_sequence(str(row.sequence_aligned)))
    list_labels_cm.append(np.array(int(row.label_numeric)))

cm_tensor_x = torch.stack([torch.from_numpy(i) for i in list_data_cm]) # transform to torch tensors
cm_tensor_y = torch.stack([torch.from_numpy(i) for i in list_labels_cm])

cm_dataset = torch.utils.data.TensorDataset(cm_tensor_x,cm_tensor_y) # create your test dataset
cm_dataloader = torch.utils.data.DataLoader(cm_dataset, batch_size=64) # create your dataloader

In [93]:
correct = 0
total = 0
error = 0
labels_array = np.empty([0])
predict_array = np.empty([0])

with torch.no_grad():
    for data in cm_dataloader:
        images, labels = data
        outputs = model(images.float())
        _, predicted = torch.max(outputs.data, 1)

        labels_array = np.concatenate([labels_array, labels])
        predict_array = np.concatenate([predict_array, predicted])

        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        error += (predicted != labels).sum().item()

print(f'Neural Network accuracy on test set: {round(100.0 * correct/total, 2)}%')
show_metrics(labels_array, predict_array)

Neural Network accuracy on test set: 99.81%
True positive:  322.0
False positive:  2.0
True negative:  2352
False negative:  3.0
True positive rate (recall):  0.9907692307692307
False positive rate:  0.0008496176720475786
Precision:  0.9938271604938271
True negative rate (Specificity):  0.9991503823279524
F1:  0.9922958397534669
ROC-AUC:  0.9949598065485916
MCC:  0.9912355694326168


## For hivcopred dataset

In [94]:
# Create list to append data from the df
list_data_hivcopred = []
list_labels_hivcopred = []

# Convert the sequences and labels to arrays to use as data on pytorch
for index, row in df_hivcopred.iterrows():
    list_data_hivcopred.append(get_array_from_sequence(str(row.sequence_aligned)))
    list_labels_hivcopred.append(np.array(int(row.label_numeric)))

hivcopred_tensor_x = torch.stack([torch.from_numpy(i) for i in list_data_hivcopred]) # transform to torch tensors
hivcopred_tensor_y = torch.stack([torch.from_numpy(i) for i in list_labels_hivcopred])

hivcopred_dataset = torch.utils.data.TensorDataset(hivcopred_tensor_x,hivcopred_tensor_y) # create your test dataset
hivcopred_dataloader = torch.utils.data.DataLoader(hivcopred_dataset, batch_size=64) # create your dataloader

In [95]:
correct = 0
total = 0
error = 0
labels_array = np.empty([0])
predict_array = np.empty([0])

with torch.no_grad():
    for data in hivcopred_dataloader:
        images, labels = data
        outputs = model(images.float())
        _, predicted = torch.max(outputs.data, 1)

        labels_array = np.concatenate([labels_array, labels])
        predict_array = np.concatenate([predict_array, predicted])

        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        error += (predicted != labels).sum().item()

print(f'Neural Network accuracy on hivcopred set: {round(100.0 * correct/total, 2)}%')
show_metrics(labels_array, predict_array)

Neural Network accuracy on hivcopred set: 91.43%
True positive:  373.0
False positive:  6.0
True negative:  1762
False negative:  194.0
True positive rate (recall):  0.6578483245149912
False positive rate:  0.003393665158371041
Precision:  0.9841688654353562
True negative rate (Specificity):  0.996606334841629
F1:  0.7885835095137421
ROC-AUC:  0.8272273296783101
MCC:  0.7610412438737292


## For geno2pheno dataset

In [96]:
# Create list to append data from the df
list_data_geno2pheno = []
list_labels_geno2pheno = []

# Convert the sequences and labels to arrays to use as data on pytorch
for index, row in df_geno2pheno.iterrows():
    list_data_geno2pheno.append(get_array_from_sequence(str(row.sequence_aligned)))
    list_labels_geno2pheno.append(np.array(int(row.label_numeric)))

geno2pheno_tensor_x = torch.stack([torch.from_numpy(i) for i in list_data_geno2pheno]) # transform to torch tensors
geno2pheno_tensor_y = torch.stack([torch.from_numpy(i) for i in list_labels_geno2pheno])

geno2pheno_dataset = torch.utils.data.TensorDataset(geno2pheno_tensor_x,geno2pheno_tensor_y) # create your test dataset
geno2pheno_dataloader = torch.utils.data.DataLoader(geno2pheno_dataset, batch_size=64) # create your dataloader

In [97]:
correct = 0
total = 0
error = 0
labels_array = np.empty([0])
predict_array = np.empty([0])

with torch.no_grad():
    for data in geno2pheno_dataloader:
        images, labels = data
        outputs = model(images.float())
        _, predicted = torch.max(outputs.data, 1)

        labels_array = np.concatenate([labels_array, labels])
        predict_array = np.concatenate([predict_array, predicted])

        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        error += (predicted != labels).sum().item()

print(f'Neural Network accuracy on hivcopred set: {round(100.0 * correct/total, 2)}%')
show_metrics(labels_array, predict_array)

Neural Network accuracy on hivcopred set: 96.89%
True positive:  182.0
False positive:  4.0
True negative:  969
False negative:  33.0
True positive rate (recall):  0.8465116279069768
False positive rate:  0.0041109969167523125
Precision:  0.978494623655914
True negative rate (Specificity):  0.9958890030832477
F1:  0.9077306733167082
ROC-AUC:  0.9212003154951123
MCC:  0.8924913193036215


## For webpssm

In [98]:
# Create list to append data from the df
list_data_webpssm = []
list_labels_webpssm = []

# Convert the sequences and labels to arrays to use as data on pytorch
for index, row in df_webpssm.iterrows():
    list_data_webpssm.append(get_array_from_sequence(str(row.sequence_aligned)))
    list_labels_webpssm.append(np.array(int(row.label_numeric)))

webpssm_tensor_x = torch.stack([torch.from_numpy(i) for i in list_data_webpssm]) # transform to torch tensors
webpssm_tensor_y = torch.stack([torch.from_numpy(i) for i in list_labels_webpssm])

webpssm_dataset = torch.utils.data.TensorDataset(webpssm_tensor_x,webpssm_tensor_y) # create your test dataset
webpssm_dataloader = torch.utils.data.DataLoader(webpssm_dataset, batch_size=64) # create your dataloader

In [99]:
correct = 0
total = 0
error = 0
labels_array = np.empty([0])
predict_array = np.empty([0])

with torch.no_grad():
    for data in webpssm_dataloader:
        images, labels = data
        outputs = model(images.float())
        _, predicted = torch.max(outputs.data, 1)

        labels_array = np.concatenate([labels_array, labels])
        predict_array = np.concatenate([predict_array, predicted])

        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        error += (predicted != labels).sum().item()

print(f'Neural Network accuracy on webpssm set: {round(100.0 * correct/total, 2)}%')
show_metrics(labels_array, predict_array)

Neural Network accuracy on webpssm set: 92.86%
True positive:  54.0
False positive:  4.0
True negative:  271
False negative:  21.0
True positive rate (recall):  0.72
False positive rate:  0.014545454545454545
Precision:  0.9310344827586207
True negative rate (Specificity):  0.9854545454545455
F1:  0.8120300751879699
ROC-AUC:  0.8527272727272728
MCC:  0.7785035408587704
