# Loading and splitting Dataset
The goal of this notebook is to continue the development of the DeepTropism model using a Pytorch.<br>
The dataset was already created on the previous notebook and all the HIV-1 env V3 loop sequences where aligned.


In [8]:
# Libraries used on the analysis
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import os
import random

First we load the Dataframe with all the sequences published on referenced articles.<br>
The D


In [9]:
df = pd.read_csv('/home/gabriel/Documents/Repos/DeepTropism/datasets/dataset_profile_final.tsv', 
                 sep='\t', names=['seq_name', 'dataset', 'label', 'sequence_aligned'])
df.head()

Unnamed: 0,seq_name,dataset,label,sequence_aligned
0,B.FR.83.HXB2_LAI_IIIB_BRU.K03455,LANL,-,CTRPNNN-TRKRI-RIQRGPGRAFVTI-----GK-IGNMRQAHC
1,A1.CD.02.LA01AlPr.KU168256,LANL,-,CIRPNNN-TRKGI-GI--GPGQTFYAA-----DAIIGNIRHAYC
2,A1.CM.08.886_24.KP718928,LANL,-,CSRPNNN-TRRSI-RI--GPGQSFYAT-----GEIIGDIREARC
3,A1.ES.15.100_117.KY496622,LANL,-,CTRPGNN-TRTSI-RI--GPGQAFYAT-----GDIIGDIRKAYC
4,A1.KE.11.DEMA111KE002.KF716474,LANL,-,CTRPNNN-TRKSV-RI--GPGQAFFAT-----GEVIGKIRKAYC


In [10]:
df.shape

(9748, 4)

In [11]:
### Check the size of the column of the alig

In [12]:
set(df['sequence_aligned'].apply(len))

{44}

### Show the original datasets present on Dataframe

In [13]:
df.dataset.value_counts()

newdb         2998
cm            2679
hivcopred     2335
geno2pheno    1188
webpssm        350
LANL           198
Name: dataset, dtype: int64

In [14]:
df = df[df.label != 'validation']
df.shape

(9748, 4)

In [15]:
df.label.value_counts()

CCR5     7705
CXCR4     937
R5X4      908
-         198
Name: label, dtype: int64

In [16]:
df

Unnamed: 0,seq_name,dataset,label,sequence_aligned
0,B.FR.83.HXB2_LAI_IIIB_BRU.K03455,LANL,-,CTRPNNN-TRKRI-RIQRGPGRAFVTI-----GK-IGNMRQAHC
1,A1.CD.02.LA01AlPr.KU168256,LANL,-,CIRPNNN-TRKGI-GI--GPGQTFYAA-----DAIIGNIRHAYC
2,A1.CM.08.886_24.KP718928,LANL,-,CSRPNNN-TRRSI-RI--GPGQSFYAT-----GEIIGDIREARC
3,A1.ES.15.100_117.KY496622,LANL,-,CTRPGNN-TRTSI-RI--GPGQAFYAT-----GDIIGDIRKAYC
4,A1.KE.11.DEMA111KE002.KF716474,LANL,-,CTRPNNN-TRKSV-RI--GPGQAFFAT-----GEVIGKIRKAYC
...,...,...,...,...
9743,DUR.AM262127.O.CCR5,cm,CCR5,CVRPGDNSVKEMRA----GPMAWYSME--LERNGSRTNSRTAFC
9744,DUR.X84327.O.CCR5,cm,CCR5,CVRPGNNSVQEIKI----GPMAWYSMQ--IEREGKGANSRTAFC
9745,CCR5_AM262114_21502_FR_1995_O,geno2pheno,CCR5,CVRPGSNSVQEIKI----GPMAWYSMQ--LEQDGKRANARTAFC
9746,CXCR4/GPR15_NDK_13796_CD_1983_D,geno2pheno,CXCR4,CTRPYKYTRQRTSI----GLRQSLYTI--TGKKKKTGYIGQAHC


## Remove samples from LANL of the Dataset

In [17]:
df_labeled = df[df.dataset != "LANL"]

In [18]:
# Function to call labels
def tropism_label(row):
    """
    Define numeric label, 'CCR5' as 0 
    and 'CXCR4' or 'R5X4' as 1
    """
    # For CCR5
    if str(row.label).strip() == 'CCR5':
        return 0
    # For CXCR4
    elif str(row.label).strip() == 'CXCR4':
        return 1
    # For R5X4
    elif str(row.label).strip() == 'R5X4':
        return 1

In [19]:
df_labeled['label_numeric'] = df_labeled.apply(tropism_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
df_labeled

Unnamed: 0,seq_name,dataset,label,sequence_aligned,label_numeric
198,X138.EU074781.BG.CXCR4,cm,CXCR4,C-RPNN--TRKS------GPQ-----------YTIIGDIA---C,1
199,CCR5_AG1030_-_FR_-_02_AG,geno2pheno,CCR5,CSRPNNN-TRKSRI----GPGQTFYAT-----------DIGDQC,0
200,CCR5_AG1005_-_FR_-_02_AG,geno2pheno,CCR5,CTRPNNN-TRKSIH----PGRAFYATV-----------GPQAHC,0
201,-.FJ652339.02_AG.CCR5,cm,CCR5,CTRPNNNTRS--------VRIGPGQAF-------YAGDIGIQAC,0
202,-.FJ375998.C.CCR5,cm,CCR5,CRPNNTRKMR--------IGPGQTYAT-------GDIIGIRAHC,0
...,...,...,...,...,...
9743,DUR.AM262127.O.CCR5,cm,CCR5,CVRPGDNSVKEMRA----GPMAWYSME--LERNGSRTNSRTAFC,0
9744,DUR.X84327.O.CCR5,cm,CCR5,CVRPGNNSVQEIKI----GPMAWYSMQ--IEREGKGANSRTAFC,0
9745,CCR5_AM262114_21502_FR_1995_O,geno2pheno,CCR5,CVRPGSNSVQEIKI----GPMAWYSMQ--LEQDGKRANARTAFC,0
9746,CXCR4/GPR15_NDK_13796_CD_1983_D,geno2pheno,CXCR4,CTRPYKYTRQRTSI----GLRQSLYTI--TGKKKKTGYIGQAHC,1


# Filtering the dataset by unique sequences
To improve the quality of our trainning and avoid bias we are going to create a dataset with only unique sequences from the original Dataframe.
This is going to be the dataset for the development of our model based on Deep Neural Network.

In [193]:
df_unique = df_labeled.drop_duplicates(subset=['sequence_aligned'], keep='first')

In [194]:
df_unique.head()

Unnamed: 0,seq_name,dataset,label,sequence_aligned,label_numeric
198,X138.EU074781.BG.CXCR4,cm,CXCR4,C-RPNN--TRKS------GPQ-----------YTIIGDIA---C,1
199,CCR5_AG1030_-_FR_-_02_AG,geno2pheno,CCR5,CSRPNNN-TRKSRI----GPGQTFYAT-----------DIGDQC,0
200,CCR5_AG1005_-_FR_-_02_AG,geno2pheno,CCR5,CTRPNNN-TRKSIH----PGRAFYATV-----------GPQAHC,0
201,-.FJ652339.02_AG.CCR5,cm,CCR5,CTRPNNNTRS--------VRIGPGQAF-------YAGDIGIQAC,0
202,-.FJ375998.C.CCR5,cm,CCR5,CRPNNTRKMR--------IGPGQTYAT-------GDIIGIRAHC,0


In [195]:
df_unique.label_numeric.value_counts()

0    2783
1     825
Name: label_numeric, dtype: int64

In [196]:
df_unique.label.value_counts()

CCR5     2783
R5X4      485
CXCR4     340
Name: label, dtype: int64

In [197]:
df_unique.shape

(3608, 5)

## Creating Dataframes for comparing performance against other methods
In order to evaluate our model against the others already published, we are going to create separate Dataframes for each method filterinf by the 'dataset' column

In [198]:
df_labeled.dataset.value_counts()

newdb         2998
cm            2679
hivcopred     2335
geno2pheno    1188
webpssm        350
Name: dataset, dtype: int64

In [199]:
df_newdb = df_labeled[df_labeled.dataset == 'newdb']
df_cm = df_labeled[df_labeled.dataset == 'cm']
df_hivcopred = df_labeled[df_labeled.dataset == 'hivcopred']
df_geno2pheno = df_labeled[df_labeled.dataset == 'geno2pheno']
df_webpssm = df_labeled[df_labeled.dataset == 'webpssm']

In [200]:
print(df_newdb.shape)
print(df_cm.shape)
print(df_hivcopred.shape)
print(df_geno2pheno.shape)
print(df_webpssm.shape)

(2998, 5)
(2679, 5)
(2335, 5)
(1188, 5)
(350, 5)


# Spliting the Dataset for Cross Validation
We are going to create indices and set it to variables to make our cross validation reproducible. Our dataset is going to consist on:<br>
* Trainning = 80 %
* Validation = 10 %
* Test = 10 %

In [201]:
# Create a list of indices and shuffle it using seed
random.seed(42)
size = df_unique.shape[0]
list_indices = list(range(size))
random.shuffle(list_indices)

In [202]:
# Now create the list of indices for trainning, validation and test
test_indices = list_indices[:int(size/10)]
train_val_indices = list_indices[int(size/10):]

In [203]:
len(test_indices)

360

In [204]:
len(train_val_indices)

3248

In [205]:
assert len(train_val_indices) + len(test_indices) == len(list_indices), "Splitting indices with error"

## Creating the Dataloaders for trainning

In [206]:
def get_array_from_sequence(protein_sequence):
    """
    Function to convert a protein sequence into a tensor.
    Each amino acid is represented by an numpy array of zeros of size 26,
    and the dict_aa_pos defines the position to be converted to 1.
    
    The function iterates over the protein sequences and stacks the arrays.
    At the end the arrays are linearized and converted to a tensor of size
    n x 26, with n the size of the protein.
    
    If the character is not present on the dict_aa_pos (eg. '-') the respective
    array is formed by zeros, and represents a missing value.
    """
    dict_aa_pos = {
    'A':1, 'R':2, 'N':3, 'D':4, 'C':5, 'Q':6, 'E':7, 'G':8,
    'H':9, 'I':10, 'L':11, 'K':12, 'M':13, 'F':14, 'P':15, 
    'O':16, 'S':17, 'U':18, 'T':19, 'W':20, 'Y':21, 'V':22, 
    'B':23, 'Z':24, 'J':25, 'X':0}
    
    f_array = np.zeros(26)
    for aa in protein_sequence:
        arr = np.zeros(26)
        if dict_aa_pos.get(aa):
            arr[dict_aa_pos.get(aa)] = 1
        f_array = np.vstack((f_array, arr))
    f_array = np.delete(f_array, 0,0)
    #return f_array.astype(float)
    return f_array.flatten().astype(float)

In [207]:
dict_aa_pos = {
    'A':1, 'R':2, 'N':3, 'D':4, 'C':5, 'Q':6, 'E':7, 'G':8,
    'H':9, 'I':10, 'L':11, 'K':12, 'M':13, 'F':14, 'P':15, 
    'O':16, 'S':17, 'U':18, 'T':19, 'W':20, 'Y':21, 'V':22, 
    'B':23, 'Z':24, 'J':25, 'X':0}

In [208]:
len(dict_aa_pos.keys())

26

In [209]:
# Create list to append data from the df
list_data = []
list_labels = []

# Convert the sequences and labels to arrays to use as data on pytorch
for index, row in df_unique.iterrows():
    list_labels.append(int(row.label_numeric))
    list_data.append(get_array_from_sequence(row.sequence_aligned))

In [210]:
len(list_data) == len(list_labels)

True

In [211]:
np.array(list_data).shape

(3608, 1144)

In [212]:
# For Test set
test_data = []
test_label = []
for j in test_indices:
    test_data.append(list_data[j])
    test_label.append(np.array(list_labels[j]))

test_tensor_x = torch.stack([torch.from_numpy(i) for i in test_data]) # transform to torch tensors
test_tensor_y = torch.stack([torch.from_numpy(i) for i in test_label])

test_dataset = torch.utils.data.TensorDataset(test_tensor_x,test_tensor_y) # create your test dataset
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=64) # create your dataloader


In [213]:
# Now we define the
len(train_val_indices)

3248

In [171]:
crossval_val_indices = np.array_split(train_val_indices,5)

In [172]:
len(crossval_val_indices[0])

650

In [173]:
# For Trainning and validation set
# Define the cross validation indices for trainning and validation sets
crossval_val_indices = np.array_split(train_val_indices,5)

# Iterate over crossval_val_indices defining the Dataloaders
for n in range(len(crossval_val_indices)):
    print(f'Cross Validation: {n + 1}')
    trainning_data = []
    trainning_label = []
    validation_data = []
    validation_label = []

    validation_indices = list(crossval_val_indices[n])
    trainning_indices = list(set(train_val_indices) - set(validation_indices))

    for j in validation_indices:
        validation_data.append(list_data[j])
        validation_label.append(np.array(list_labels[j]))

    validation_tensor_x = torch.stack([torch.from_numpy(i) for i in validation_data]) # transform to torch tensors
    validation_tensor_y = torch.stack([torch.from_numpy(i) for i in validation_label])

    validation_dataset = torch.utils.data.TensorDataset(validation_tensor_x,validation_tensor_y) # create your test dataset
    validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=64) # create your dataloader
    
    for k in trainning_indices:
        trainning_data.append(list_data[k])
        trainning_label.append(np.array(list_labels[k]))

    trainning_tensor_x = torch.stack([torch.from_numpy(i) for i in trainning_data]) # transform to torch tensors
    trainning_tensor_y = torch.stack([torch.from_numpy(i) for i in trainning_label])

    trainning_dataset = torch.utils.data.TensorDataset(trainning_tensor_x,trainning_tensor_y) # create your test dataset
    trainning_dataloader = torch.utils.data.DataLoader(trainning_dataset, batch_size=64) # create your dataloader

Cross Validation: 1
Cross Validation: 2
Cross Validation: 3
Cross Validation: 4
Cross Validation: 5


## Define the Deep Neural Network Architecture


In [174]:
class DeepTropism_1(nn.Module):
    def __init__(self):
        super(DeepTropism_1, self).__init__()
        self.linear1 = nn.Linear(1144,250)
        self.linear2 = nn.Linear(250,100)
        self.linear3 = nn.Linear(100,2)
    
    def forward(self,X):
        X = F.relu(self.linear1(X))
        X = F.relu(self.linear2(X))
        X = self.linear3(X)
        return F.log_softmax(X, dim=1)
 

model = DeepTropism_1().float()
model

DeepTropism_1(
  (linear1): Linear(in_features=1144, out_features=250, bias=True)
  (linear2): Linear(in_features=250, out_features=100, bias=True)
  (linear3): Linear(in_features=100, out_features=2, bias=True)
)

In [218]:
class DeepTropism_2(nn.Module):
    def __init__(self):
        super(DeepTropism_2, self).__init__()
        self.linear1 = nn.Linear(1144,600)
        self.linear2 = nn.Linear(600,300)
        self.linear3 = nn.Linear(300,50)
        self.linear4 = nn.Linear(50,2)
    
    def forward(self,X):
        X = F.relu(self.linear1(X))
        X = F.relu(self.linear2(X))
        X = F.relu(self.linear3(X))
        X = self.linear4(X)
        return F.log_softmax(X, dim=1)
 

model = DeepTropism_2().float()
model

DeepTropism_2(
  (linear1): Linear(in_features=1144, out_features=600, bias=True)
  (linear2): Linear(in_features=600, out_features=300, bias=True)
  (linear3): Linear(in_features=300, out_features=50, bias=True)
  (linear4): Linear(in_features=50, out_features=2, bias=True)
)

### For showing the metrics of the model

In [219]:
def show_metrics(y_true, y_score):
    # True positive
    tp = np.sum(y_true * y_score)
    # False positive
    fp = np.sum((y_true == 0) * y_score)
    # True negative
    tn = np.sum((y_true==0) * (y_score==0))
    # False negative
    fn = np.sum(y_true * (y_score==0))

    # True positive rate (sensitivity or recall)
    tpr = tp / (tp + fn)
    # False positive rate (fall-out)
    fpr = fp / (fp + tn)
    # Precision
    precision = tp / (tp + fp)
    # True negatvie tate (specificity)
    tnr = 1 - fpr
    # F1 score
    f1 = 2*tp / (2*tp + fp + fn)
    # ROC-AUC for binary classification
    auc = (tpr+tnr) / 2
    # MCC
    mcc = (tp * tn - fp * fn) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    print("True positive: ", tp)
    print("False positive: ", fp)
    print("True negative: ", tn)
    print("False negative: ", fn)

    print("True positive rate (recall): ", tpr)
    print("False positive rate: ", fpr)
    print("Precision: ", precision)
    print("True negative rate (Specificity): ", tnr)
    print("F1: ", f1)
    print("ROC-AUC: ", auc)
    print("MCC: ", mcc)

In [220]:
# Define the Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [221]:
# For Trainning and validation set DeepTropism_1
# Define the cross validation indices for trainning and validation sets
crossval_val_indices = np.array_split(train_val_indices,5)

# Iterate over crossval_val_indices defining the Dataloaders
for n in range(len(crossval_val_indices)):
    print(f'Cross Validation: {n + 1}')
    trainning_data = []
    trainning_label = []
    validation_data = []
    validation_label = []

    validation_indices = list(crossval_val_indices[n])
    trainning_indices = list(set(train_val_indices) - set(validation_indices))

    for j in validation_indices:
        validation_data.append(list_data[j])
        validation_label.append(np.array(list_labels[j]))

    validation_tensor_x = torch.stack([torch.from_numpy(i) for i in validation_data]) # transform to torch tensors
    validation_tensor_y = torch.stack([torch.from_numpy(i) for i in validation_label])

    validation_dataset = torch.utils.data.TensorDataset(validation_tensor_x,validation_tensor_y) # create your test dataset
    validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=64) # create your dataloader
    
    for k in trainning_indices:
        trainning_data.append(list_data[k])
        trainning_label.append(np.array(list_labels[k]))

    trainning_tensor_x = torch.stack([torch.from_numpy(i) for i in trainning_data]) # transform to torch tensors
    trainning_tensor_y = torch.stack([torch.from_numpy(i) for i in trainning_label])

    trainning_dataset = torch.utils.data.TensorDataset(trainning_tensor_x,trainning_tensor_y) # create your test dataset
    trainning_dataloader = torch.utils.data.DataLoader(trainning_dataset, batch_size=64) # create your dataloader
    
    # Instantiante new model
    #model = DeepTropism_1().float()
        
    # Define Cross Validation Trainning Loop
    for epoch in range(200):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, data in enumerate(trainning_dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs.float())
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            #print(running_loss)
            #if i % 3239 == 0:    # print every 3239 mini-batches
            if epoch % 20 == 0 and i % 3248 == 0:
                print('[%d, %3d] loss: %.9f' %
                      (epoch + 1, i + 1, running_loss / 50))
                correct = 0
                total = 0
                error = 0
                labels_array = np.empty([0])
                predict_array = np.empty([0])

                with torch.no_grad():
                    for data in validation_dataloader:
                        images, labels = data
                        outputs = model(images.float())
                        _, predicted = torch.max(outputs.data, 1)

                        labels_array = np.concatenate([labels_array, labels])
                        predict_array = np.concatenate([predict_array, predicted])

                        total += labels.size(0)
                        correct += (predicted == labels).sum().item()
                        error += (predicted != labels).sum().item()

                print(f'Neural Network accuracy for validation set {n + 1}: {round(100.0 * correct/total, 2)}%')
                
                # Evaluate model against test set
                correct = 0
                total = 0
                error = 0
                labels_array = np.empty([0])
                predict_array = np.empty([0])

                with torch.no_grad():
                    for data in test_dataloader:
                        images, labels = data
                        outputs = model(images.float())
                        _, predicted = torch.max(outputs.data, 1)

                        labels_array = np.concatenate([labels_array, labels])
                        predict_array = np.concatenate([predict_array, predicted])

                        total += labels.size(0)
                        correct += (predicted == labels).sum().item()
                        error += (predicted != labels).sum().item()

                print(f'Neural Network accuracy on test set: {round(100.0 * correct/total, 2)}%')

                
            running_loss = 0.0
            
            
    torch.save(model.state_dict(), f'model_cv{n+1}.ptb')
    print('Finished Training')
    

Cross Validation: 1
[1,   1] loss: 0.013544965
Neural Network accuracy for validation set 1: 78.0%
Neural Network accuracy on test set: 77.22%
[21,   1] loss: 0.013421898
Neural Network accuracy for validation set 1: 78.0%
Neural Network accuracy on test set: 77.22%
[41,   1] loss: 0.012217962
Neural Network accuracy for validation set 1: 78.0%
Neural Network accuracy on test set: 77.22%
[61,   1] loss: 0.011039900
Neural Network accuracy for validation set 1: 87.85%
Neural Network accuracy on test set: 85.83%
[81,   1] loss: 0.008590623
Neural Network accuracy for validation set 1: 89.23%
Neural Network accuracy on test set: 85.83%
[101,   1] loss: 0.006424741
Neural Network accuracy for validation set 1: 90.0%
Neural Network accuracy on test set: 87.5%
[121,   1] loss: 0.005122365
Neural Network accuracy for validation set 1: 90.62%
Neural Network accuracy on test set: 86.94%
[141,   1] loss: 0.004299819
Neural Network accuracy for validation set 1: 90.46%
Neural Network accuracy on 

In [222]:
correct = 0
total = 0
error = 0
labels_array = np.empty([0])
predict_array = np.empty([0])

with torch.no_grad():
    for data in test_dataloader:
        images, labels = data
        outputs = model(images.float())
        _, predicted = torch.max(outputs.data, 1)
        
        labels_array = np.concatenate([labels_array, labels])
        predict_array = np.concatenate([predict_array, predicted])
        
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        error += (predicted != labels).sum().item()

print(f'Neural Network accuracy for test set: {round(100.0 * correct/total, 2)}%')

Neural Network accuracy for test set: 87.22%


In [223]:
show_metrics(labels_array, predict_array)

True positive:  61.0
False positive:  25.0
True negative:  253
False negative:  21.0
True positive rate (recall):  0.7439024390243902
False positive rate:  0.08992805755395683
Precision:  0.7093023255813954
True negative rate (Specificity):  0.9100719424460432
F1:  0.7261904761904762
ROC-AUC:  0.8269871907352166
MCC:  0.6432289060049595


## Performance of DNN model against published datasets

In [224]:
df_newdb
df_cm
df_hivcopred
df_geno2pheno
df_webpssm

Unnamed: 0,seq_name,dataset,label,sequence_aligned,label_numeric
225,C.ZW.01.TC30__phen_SI,webpssm,CXCR4,CTRPGNNTIGP-------GRTFYATDR-------IIGDIRQAHC,1
263,DU179MAY00_ZA_C_SI/R5X4_u19_COETZER_(IN_PREP),webpssm,CXCR4,CTRPGNKTIRSIR-----LGPGQAFYT-------NKGDIRQASC,1
264,DU179D_ZA_C_SI/CXCR4_u19_COETZER_(IN_PREP),webpssm,CXCR4,CTRPGNKTIRSIR-----IGPGRTFYT-------NKGDIRQAYC,1
302,C.ZW.01.TC11__phen_NSI,webpssm,CCR5,CTRPNNNTRKSIW-----LGPGQAFYA-------NIIGDIRQAC,0
466,C.MW.93.960__phen_NSI,webpssm,CCR5,CTRPNNTRKSIRI-----GPGQTFYAT------NEIIGNREAHC,0
...,...,...,...,...,...
9525,C.ZA.98.TV018__phen_NSI,webpssm,CCR5,CTRPNNNTRRSMRI----RPGQTFYAT-----GEIIGDIRQAYC,0
9526,C.FR.92.FRMP130__phen_NSI,webpssm,CCR5,CTRPNNNTRRSVRI----GPGQTFYAT-----GAIIGDIRQAHC,0
9527,C.ZW.01.TC33__phen_NSI,webpssm,CCR5,CTRPNNNTRTSVRI----GPGQAFYAT-----GDIIGDIRQAHC,0
9528,C.FR.93.FRMP37__phen_NSI,webpssm,CCR5,CTRPSNNTRKSIRI----GPGQAFYAT-----NGIIGDIRAAHC,0


## For Newdb dataset

In [225]:
# Create list to append data from the df
list_data_newdb = []
list_labels_newdb = []

# Convert the sequences and labels to arrays to use as data on pytorch
for index, row in df_newdb.iterrows():
    list_data_newdb.append(get_array_from_sequence(str(row.sequence_aligned)))
    list_labels_newdb.append(np.array(int(row.label_numeric)))

newdb_tensor_x = torch.stack([torch.from_numpy(i) for i in list_data_newdb]) # transform to torch tensors
newdb_tensor_y = torch.stack([torch.from_numpy(i) for i in list_labels_newdb])

newdb_dataset = torch.utils.data.TensorDataset(newdb_tensor_x,newdb_tensor_y) # create your test dataset
newdb_dataloader = torch.utils.data.DataLoader(newdb_dataset, batch_size=64) # create your dataloader

In [226]:
correct = 0
total = 0
error = 0
labels_array = np.empty([0])
predict_array = np.empty([0])

with torch.no_grad():
    for data in newdb_dataloader:
        images, labels = data
        outputs = model(images.float())
        _, predicted = torch.max(outputs.data, 1)

        labels_array = np.concatenate([labels_array, labels])
        predict_array = np.concatenate([predict_array, predicted])

        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        error += (predicted != labels).sum().item()

print(f'Neural Network accuracy on test set: {round(100.0 * correct/total, 2)}%')
show_metrics(labels_array, predict_array)

Neural Network accuracy on test set: 98.7%
True positive:  640.0
False positive:  16.0
True negative:  2319
False negative:  23.0
True positive rate (recall):  0.9653092006033183
False positive rate:  0.006852248394004282
Precision:  0.975609756097561
True negative rate (Specificity):  0.9931477516059957
F1:  0.9704321455648218
ROC-AUC:  0.979228476104657
MCC:  0.962116036493835


## For CM dataset

In [227]:
# Create list to append data from the df
list_data_cm = []
list_labels_cm = []

# Convert the sequences and labels to arrays to use as data on pytorch
for index, row in df_cm.iterrows():
    list_data_cm.append(get_array_from_sequence(str(row.sequence_aligned)))
    list_labels_cm.append(np.array(int(row.label_numeric)))

cm_tensor_x = torch.stack([torch.from_numpy(i) for i in list_data_cm]) # transform to torch tensors
cm_tensor_y = torch.stack([torch.from_numpy(i) for i in list_labels_cm])

cm_dataset = torch.utils.data.TensorDataset(cm_tensor_x,cm_tensor_y) # create your test dataset
cm_dataloader = torch.utils.data.DataLoader(cm_dataset, batch_size=64) # create your dataloader

In [228]:
cm_tensor_x.shape

torch.Size([2679, 1144])

In [229]:
correct = 0
total = 0
error = 0
labels_array = np.empty([0])
predict_array = np.empty([0])

with torch.no_grad():
    for data in cm_dataloader:
        images, labels = data
        outputs = model(images.float())
        _, predicted = torch.max(outputs.data, 1)

        labels_array = np.concatenate([labels_array, labels])
        predict_array = np.concatenate([predict_array, predicted])

        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        error += (predicted != labels).sum().item()

print(f'Neural Network accuracy on test set: {round(100.0 * correct/total, 2)}%')
show_metrics(labels_array, predict_array)

Neural Network accuracy on test set: 99.1%
True positive:  322.0
False positive:  21.0
True negative:  2333
False negative:  3.0
True positive rate (recall):  0.9907692307692307
False positive rate:  0.008920985556499575
Precision:  0.9387755102040817
True negative rate (Specificity):  0.9910790144435004
F1:  0.9640718562874252
ROC-AUC:  0.9909241226063656
MCC:  0.9594134416903609


## For hivcopred dataset

In [230]:
# Create list to append data from the df
list_data_hivcopred = []
list_labels_hivcopred = []

# Convert the sequences and labels to arrays to use as data on pytorch
for index, row in df_hivcopred.iterrows():
    list_data_hivcopred.append(get_array_from_sequence(str(row.sequence_aligned)))
    list_labels_hivcopred.append(np.array(int(row.label_numeric)))

hivcopred_tensor_x = torch.stack([torch.from_numpy(i) for i in list_data_hivcopred]) # transform to torch tensors
hivcopred_tensor_y = torch.stack([torch.from_numpy(i) for i in list_labels_hivcopred])

hivcopred_dataset = torch.utils.data.TensorDataset(hivcopred_tensor_x,hivcopred_tensor_y) # create your test dataset
hivcopred_dataloader = torch.utils.data.DataLoader(hivcopred_dataset, batch_size=64) # create your dataloader

In [231]:
correct = 0
total = 0
error = 0
labels_array = np.empty([0])
predict_array = np.empty([0])

with torch.no_grad():
    for data in hivcopred_dataloader:
        images, labels = data
        outputs = model(images.float())
        _, predicted = torch.max(outputs.data, 1)

        labels_array = np.concatenate([labels_array, labels])
        predict_array = np.concatenate([predict_array, predicted])

        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        error += (predicted != labels).sum().item()

print(f'Neural Network accuracy on hivcopred set: {round(100.0 * correct/total, 2)}%')
show_metrics(labels_array, predict_array)

Neural Network accuracy on hivcopred set: 98.72%
True positive:  551.0
False positive:  14.0
True negative:  1754
False negative:  16.0
True positive rate (recall):  0.9717813051146384
False positive rate:  0.007918552036199095
Precision:  0.9752212389380531
True negative rate (Specificity):  0.9920814479638009
F1:  0.9734982332155477
ROC-AUC:  0.9819313765392197
MCC:  0.9650215254489153


## For geno2pheno dataset

In [232]:
# Create list to append data from the df
list_data_geno2pheno = []
list_labels_geno2pheno = []

# Convert the sequences and labels to arrays to use as data on pytorch
for index, row in df_geno2pheno.iterrows():
    list_data_geno2pheno.append(get_array_from_sequence(str(row.sequence_aligned)))
    list_labels_geno2pheno.append(np.array(int(row.label_numeric)))

geno2pheno_tensor_x = torch.stack([torch.from_numpy(i) for i in list_data_geno2pheno]) # transform to torch tensors
geno2pheno_tensor_y = torch.stack([torch.from_numpy(i) for i in list_labels_geno2pheno])

geno2pheno_dataset = torch.utils.data.TensorDataset(geno2pheno_tensor_x,geno2pheno_tensor_y) # create your test dataset
geno2pheno_dataloader = torch.utils.data.DataLoader(geno2pheno_dataset, batch_size=64) # create your dataloader

In [233]:
correct = 0
total = 0
error = 0
labels_array = np.empty([0])
predict_array = np.empty([0])

with torch.no_grad():
    for data in geno2pheno_dataloader:
        images, labels = data
        outputs = model(images.float())
        _, predicted = torch.max(outputs.data, 1)

        labels_array = np.concatenate([labels_array, labels])
        predict_array = np.concatenate([predict_array, predicted])

        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        error += (predicted != labels).sum().item()

print(f'Neural Network accuracy on hivcopred set: {round(100.0 * correct/total, 2)}%')
show_metrics(labels_array, predict_array)

Neural Network accuracy on hivcopred set: 97.98%
True positive:  199.0
False positive:  8.0
True negative:  965
False negative:  16.0
True positive rate (recall):  0.9255813953488372
False positive rate:  0.008221993833504625
Precision:  0.961352657004831
True negative rate (Specificity):  0.9917780061664954
F1:  0.943127962085308
ROC-AUC:  0.9586797007576663
MCC:  0.931098205937677


## For webpssm

In [234]:
# Create list to append data from the df
list_data_webpssm = []
list_labels_webpssm = []

# Convert the sequences and labels to arrays to use as data on pytorch
for index, row in df_webpssm.iterrows():
    list_data_webpssm.append(get_array_from_sequence(str(row.sequence_aligned)))
    list_labels_webpssm.append(np.array(int(row.label_numeric)))

webpssm_tensor_x = torch.stack([torch.from_numpy(i) for i in list_data_webpssm]) # transform to torch tensors
webpssm_tensor_y = torch.stack([torch.from_numpy(i) for i in list_labels_webpssm])

webpssm_dataset = torch.utils.data.TensorDataset(webpssm_tensor_x,webpssm_tensor_y) # create your test dataset
webpssm_dataloader = torch.utils.data.DataLoader(webpssm_dataset, batch_size=64) # create your dataloader

In [235]:
correct = 0
total = 0
error = 0
labels_array = np.empty([0])
predict_array = np.empty([0])

with torch.no_grad():
    for data in webpssm_dataloader:
        images, labels = data
        outputs = model(images.float())
        _, predicted = torch.max(outputs.data, 1)

        labels_array = np.concatenate([labels_array, labels])
        predict_array = np.concatenate([predict_array, predicted])

        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        error += (predicted != labels).sum().item()

print(f'Neural Network accuracy on webpssm set: {round(100.0 * correct/total, 2)}%')
show_metrics(labels_array, predict_array)

Neural Network accuracy on webpssm set: 95.71%
True positive:  64.0
False positive:  4.0
True negative:  271
False negative:  11.0
True positive rate (recall):  0.8533333333333334
False positive rate:  0.014545454545454545
Precision:  0.9411764705882353
True negative rate (Specificity):  0.9854545454545455
F1:  0.8951048951048951
ROC-AUC:  0.9193939393939394
MCC:  0.8699015686885712


# Testing LeNet Architecture

In [128]:
def get_matrix_from_sequence(protein_sequence):
    """
    Function to convert a protein sequence into a tensor.
    Each amino acid is represented by an numpy array of zeros of size 26,
    and the dict_aa_pos defines the position to be converted to 1.
    
    The function iterates over the protein sequences and stacks the arrays.
    At the end the arrays are linearized and converted to a tensor of size
    n x 26, with n the size of the protein.
    
    If the character is not present on the dict_aa_pos (eg. '-') the respective
    array is formed by zeros, and represents a missing value.
    """
    dict_aa_pos = {
    'A':1, 'R':2, 'N':3, 'D':4, 'C':5, 'Q':6, 'E':7, 'G':8,
    'H':9, 'I':10, 'L':11, 'K':12, 'M':13, 'F':14, 'P':15, 
    'O':16, 'S':17, 'U':18, 'T':19, 'W':20, 'Y':21, 'V':22, 
    'B':23, 'Z':24, 'J':25, 'X':0}
    
    f_array = np.zeros(26)
    for aa in protein_sequence:
        arr = np.zeros(26)
        if dict_aa_pos.get(aa):
            arr[dict_aa_pos.get(aa)] = 1
        f_array = np.vstack((f_array, arr))
    f_array = np.delete(f_array, 0,0)
    
    #return torch.from_numpy((f_array.flatten()).astype(float))
    return f_array.astype(float).transpose()

In [94]:
# Create list to append data from the df
list_data = []
list_labels = []

# Convert the sequences and labels to arrays to use as data on pytorch
for index, row in df_labeled.iterrows():
    list_data.append(get_matrix_from_sequence(str(row.sequence_aligned)))
    list_labels.append(int(row.label_numeric))

In [96]:
# For Trainning and validation set
# Define the cross validation indices for trainning and validation sets
crossval_val_indices = np.array_split(train_val_indices,5)

# Iterate over crossval_val_indices defining the Dataloaders
for n in range(len(crossval_val_indices)):
    print(f'Cross Validation: {n + 1}')
    trainning_data = []
    trainning_label = []
    validation_data = []
    validation_label = []

    validation_indices = list(crossval_val_indices[n])
    trainning_indices = list(set(train_val_indices) - set(validation_indices))

    for j in validation_indices:
        validation_data.append(list_data[j])
        validation_label.append(np.array(list_labels[j]))

    validation_tensor_x = torch.stack([torch.from_numpy(i) for i in validation_data]) # transform to torch tensors
    validation_tensor_y = torch.stack([torch.from_numpy(i) for i in validation_label])

    validation_dataset = torch.utils.data.TensorDataset(validation_tensor_x,validation_tensor_y) # create your test dataset
    validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=64) # create your dataloader
    
    for k in trainning_indices:
        trainning_data.append(list_data[k])
        trainning_label.append(np.array(list_labels[k]))

    trainning_tensor_x = torch.stack([torch.from_numpy(i) for i in trainning_data]) # transform to torch tensors
    trainning_tensor_y = torch.stack([torch.from_numpy(i) for i in trainning_label])

    trainning_dataset = torch.utils.data.TensorDataset(trainning_tensor_x,trainning_tensor_y) # create your test dataset
    trainning_dataloader = torch.utils.data.DataLoader(trainning_dataset, batch_size=64) # create your dataloader

Cross Validation: 1
Cross Validation: 2
Cross Validation: 3
Cross Validation: 4
Cross Validation: 5


In [92]:
get_matrix_from_sequence('CTRPNNNTRRSMRI----RPGQTFYAT-----GEIIGDIRQAYC').shape

(26, 44)

In [102]:
# Defining the network (LeNet-5)  
class LeNet5(torch.nn.Module):
     
    def __init__(self):   
        super(LeNet5, self).__init__()
        # Convolution (In LeNet-5, 32x32 images are given as input. Hence padding of 2 is done below)
        self.conv1 = torch.nn.Conv2d(in_channels=1, out_channels=6, kernel_size=(26,3), stride=1, padding=0, bias=True)
        # Max-pooling
        self.max_pool_1 = torch.nn.MaxPool2d(kernel_size=2)
        # Convolution
        self.conv2 = torch.nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1, padding=0, bias=True)
        # Max-pooling
        self.max_pool_2 = torch.nn.MaxPool2d(kernel_size=2)
        # Fully connected layer
        self.fc1 = torch.nn.Linear(16*5*5, 120)   # convert matrix with 16*5*5 (= 400) features to a matrix of 120 features (columns)
        self.fc2 = torch.nn.Linear(120, 84)       # convert matrix with 120 features to a matrix of 84 features (columns)
        self.fc3 = torch.nn.Linear(84, 2)        # convert matrix with 84 features to a matrix of 10 features (columns)
        
    def forward(self, x):
        # convolve, then perform ReLU non-linearity
        x = torch.nn.functional.relu(self.conv1(x))  
        # max-pooling with 2x2 grid
        x = self.max_pool_1(x)
        # convolve, then perform ReLU non-linearity
        x = torch.nn.functional.relu(self.conv2(x))
        # max-pooling with 2x2 grid
        x = self.max_pool_2(x)
        # first flatten 'max_pool_2_out' to contain 16*5*5 columns
        # read through https://stackoverflow.com/a/42482819/7551231
        x = x.view(-1, 16*5*5)
        # FC-1, then perform ReLU non-linearity
        x = torch.nn.functional.relu(self.fc1(x))
        # FC-2, then perform ReLU non-linearity
        x = torch.nn.functional.relu(self.fc2(x))
        # FC-3
        x = self.fc3(x)
        
        return x
     
#net = LeNet5()
model = LeNet5().float()
#model = model.float()
model

LeNet5(
  (conv1): Conv2d(1, 6, kernel_size=(26, 3), stride=(1, 1))
  (max_pool_1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (max_pool_2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=2, bias=True)
)

In [105]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 3x3 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 6 * 6, 120)  # 6*6 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square you can only specify a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

model = Net().float()
#model = model.float()
model

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

In [108]:
# For Trainning and validation set DeepTropism_1
# Define the cross validation indices for trainning and validation sets
crossval_val_indices = np.array_split(train_val_indices,5)

# Iterate over crossval_val_indices defining the Dataloaders
for n in range(len(crossval_val_indices)):
    print(f'Cross Validation: {n + 1}')
    trainning_data = []
    trainning_label = []
    validation_data = []
    validation_label = []

    validation_indices = list(crossval_val_indices[n])
    trainning_indices = list(set(train_val_indices) - set(validation_indices))

    for j in validation_indices:
        validation_data.append(list_data[j])
        validation_label.append(np.array(list_labels[j]))

    validation_tensor_x = torch.stack([torch.from_numpy(i) for i in validation_data]) # transform to torch tensors
    validation_tensor_y = torch.stack([torch.from_numpy(i) for i in validation_label])

    validation_dataset = torch.utils.data.TensorDataset(validation_tensor_x,validation_tensor_y) # create your test dataset
    validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=64) # create your dataloader
    
    for k in trainning_indices:
        trainning_data.append(list_data[k])
        trainning_label.append(np.array(list_labels[k]))

    trainning_tensor_x = torch.stack([torch.from_numpy(i) for i in trainning_data]) # transform to torch tensors
    trainning_tensor_y = torch.stack([torch.from_numpy(i) for i in trainning_label])

    trainning_dataset = torch.utils.data.TensorDataset(trainning_tensor_x,trainning_tensor_y) # create your test dataset
    trainning_dataloader = torch.utils.data.DataLoader(trainning_dataset, batch_size=64) # create your dataloader
    
    # Instantiante new model
    #model = DeepTropism_1().float()
        
    # Define Cross Validation Trainning Loop
    for epoch in range(40):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, data in enumerate(trainning_dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            #inputs = inputs.unsqueeze(0)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs.float())
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            #print(running_loss)
            #if i % 3239 == 0:    # print every 3239 mini-batches
            if epoch % 20 == 0 and i % 3248 == 0:
                print('[%d, %3d] loss: %.9f' %
                      (epoch + 1, i + 1, running_loss / 50))
                correct = 0
                total = 0
                error = 0
                labels_array = np.empty([0])
                predict_array = np.empty([0])

                with torch.no_grad():
                    for data in validation_dataloader:
                        images, labels = data
                        outputs = model(images.float())
                        _, predicted = torch.max(outputs.data, 1)

                        labels_array = np.concatenate([labels_array, labels])
                        predict_array = np.concatenate([predict_array, predicted])

                        total += labels.size(0)
                        correct += (predicted == labels).sum().item()
                        error += (predicted != labels).sum().item()

                print(f'Neural Network accuracy for validation set {n + 1}: {round(100.0 * correct/total, 2)}%')
                
                # Evaluate model against test set
                correct = 0
                total = 0
                error = 0
                labels_array = np.empty([0])
                predict_array = np.empty([0])

                with torch.no_grad():
                    for data in test_dataloader:
                        images, labels = data
                        outputs = model(images.float())
                        _, predicted = torch.max(outputs.data, 1)

                        labels_array = np.concatenate([labels_array, labels])
                        predict_array = np.concatenate([predict_array, predicted])

                        total += labels.size(0)
                        correct += (predicted == labels).sum().item()
                        error += (predicted != labels).sum().item()

                print(f'Neural Network accuracy on test set: {round(100.0 * correct/total, 2)}%')

                
            running_loss = 0.0
            
            
    torch.save(model.state_dict(), f'model_cv{n+1}.ptb')
    print('Finished Training')

Cross Validation: 1


RuntimeError: Expected 4-dimensional input for 4-dimensional weight 6 1 3, but got 3-dimensional input of size [64, 26, 44] instead

## Using Keras (TensorFlow) to train a CNN for DeepTropism

In [129]:
def get_matrix_from_sequence(protein_sequence):
    """
    Function to convert a protein sequence into a tensor.
    Each amino acid is represented by an numpy array of zeros of size 26,
    and the dict_aa_pos defines the position to be converted to 1.
    
    The function iterates over the protein sequences and stacks the arrays.
    At the end the arrays are linearized and converted to a tensor of size
    n x 26, with n the size of the protein.
    
    If the character is not present on the dict_aa_pos (eg. '-') the respective
    array is formed by zeros, and represents a missing value.
    """
    dict_aa_pos = {
    'A':1, 'R':2, 'N':3, 'D':4, 'C':5, 'Q':6, 'E':7, 'G':8,
    'H':9, 'I':10, 'L':11, 'K':12, 'M':13, 'F':14, 'P':15, 
    'O':16, 'S':17, 'U':18, 'T':19, 'W':20, 'Y':21, 'V':22, 
    'B':23, 'Z':24, 'J':25, 'X':0}
    
    f_array = np.zeros(26)
    for aa in protein_sequence:
        arr = np.zeros(26)
        if dict_aa_pos.get(aa):
            arr[dict_aa_pos.get(aa)] = 1
        f_array = np.vstack((f_array, arr))
    f_array = np.delete(f_array, 0,0)
    #return f_array.astype(float)
    return f_array.astype(float)

In [130]:
get_matrix_from_sequence('CTRPNNN-TRKRI-RIQRGPGRAFVTI-----GK-IGNMRQAHC').shape

(44, 26)

In [133]:
# Create list to append data from the df
list_data = []
list_labels = []

# Convert the sequences and labels to arrays to use as data on pytorch
for index, row in df_unique.iterrows():
    list_labels.append(int(row.label_numeric))
    list_data.append(get_matrix_from_sequence(row.sequence_aligned))

In [134]:
np.array(list_data).shape

(3608, 44, 26)

In [95]:
# For Trainning and validation set
# Define the cross validation indices for trainning and validation sets
crossval_val_indices = np.array_split(train_val_indices,5)

# Iterate over crossval_val_indices defining the Dataloaders
for n in range(len(crossval_val_indices)):
    print(f'Cross Validation: {n + 1}')
    trainning_data = []
    trainning_label = []
    validation_data = []
    validation_label = []

    validation_indices = list(crossval_val_indices[n])
    trainning_indices = list(set(train_val_indices) - set(validation_indices))

    for j in validation_indices:
        validation_data.append(list_data[j])
        validation_label.append(np.array(list_labels[j]))

    validation_tensor_x = torch.stack([torch.from_numpy(i) for i in validation_data]) # transform to torch tensors
    validation_tensor_y = torch.stack([torch.from_numpy(i) for i in validation_label])

    validation_dataset = torch.utils.data.TensorDataset(validation_tensor_x,validation_tensor_y) # create your test dataset
    validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=64) # create your dataloader
    
    for k in trainning_indices:
        trainning_data.append(list_data[k])
        trainning_label.append(np.array(list_labels[k]))

    trainning_tensor_x = torch.stack([torch.from_numpy(i) for i in trainning_data]) # transform to torch tensors
    trainning_tensor_y = torch.stack([torch.from_numpy(i) for i in trainning_label])

    trainning_dataset = torch.utils.data.TensorDataset(trainning_tensor_x,trainning_tensor_y) # create your test dataset
    trainning_dataloader = torch.utils.data.DataLoader(trainning_dataset, batch_size=64) # create your dataloader

Cross Validation: 1
Cross Validation: 2
Cross Validation: 3
Cross Validation: 4
Cross Validation: 5


In [160]:
from tensorflow.keras.layers import Conv1D, Dense, MaxPooling1D, Flatten
from tensorflow.keras.models import Sequential

model = Sequential()
model.add(Conv1D(filters=38, kernel_size=7 ,
                 input_shape=(44, 26)))
model.add(MaxPooling1D(pool_size=4))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dense(2, activation='softmax'))

model.compile(loss='binary_crossentropy', optimizer='adam', 
              metrics=['binary_accuracy'])
model.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_13 (Conv1D)           (None, 38, 38)            6954      
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 9, 38)             0         
_________________________________________________________________
flatten_13 (Flatten)         (None, 342)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 16)                5488      
_________________________________________________________________
dense_27 (Dense)             (None, 2)                 34        
Total params: 12,476
Trainable params: 12,476
Non-trainable params: 0
_________________________________________________________________


In [161]:
history = model.fit(np.array(list_data), list_labels, 
                    epochs=50, verbose=0, validation_split=0.25)

plt.figure()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'])
plt.show()

ValueError: A target array with shape (3608, 1) was passed for an output of shape (None, 2) while using as loss `binary_crossentropy`. This loss expects targets to have the same shape as the output.

In [146]:
np.array(list_data)[0].shape

(44, 26)