In [1]:
import copy
import warnings
warnings.filterwarnings('ignore')

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import sklearn.metrics

In [2]:
folder = "MLNSassignment_data/"


def _read_stripped_lines(file_name):
    """Return every line of ``folder + file_name`` with surrounding whitespace removed."""
    with open(folder + file_name, 'r') as handle:
        return [raw.strip() for raw in handle]


# Data and label files for the training and validation splits; only a data
# file is read for the test split.
train_set = _read_stripped_lines("allData_trn.txt")
train_labels = _read_stripped_lines("allLabels_trn.txt")

val_set = _read_stripped_lines("allData_val.txt")
val_labels = _read_stripped_lines("allLabels_val.txt")

test_set = _read_stripped_lines("allData_tst.txt")

In [3]:
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(['A', 'T', 'G', 'C'])

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; try the current keyword first and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# Fit ONCE on the full alphabet. Re-fitting on every sequence (as before) makes
# the column layout depend on which letters happen to occur in that sequence:
# a sequence missing one nucleotide would yield fewer, shifted columns.
onehot_encoder.fit(
    label_encoder.transform(label_encoder.classes_).reshape(-1, 1)
)


def _encode_sequences(sequences):
    """Encode an iterable of nucleotide strings.

    Returns a pair ``(onehot_list, integer_list)`` with one entry per sequence:
    ``onehot_list[i]`` is a one-element list holding the transposed one-hot
    matrix (one row per fitted class), and ``integer_list[i]`` is the
    transposed column of integer class codes (shape ``(1, len(seq))``).
    """
    onehot_list = []
    integer_list = []
    for seq in sequences:
        integer_encoded = label_encoder.transform(list(seq)).reshape(-1, 1)
        onehot_encoded = onehot_encoder.transform(integer_encoded)
        # The extra list level becomes the singleton channel axis once the
        # whole split is stacked into an array.
        onehot_list.append([onehot_encoded.T])
        integer_list.append(integer_encoded.T)
    return onehot_list, integer_list


train_data, le_train = _encode_sequences(train_set)
val_data, le_val = _encode_sequences(val_set)
test_data, le_test = _encode_sequences(test_set)

In [4]:
# Shape of the integer-encoded form of the first training sequence.
np.shape(le_train[0])

(1, 900)

In [5]:
# Stack each split into one float32 array a single time. Handing the nested
# Python list of ndarrays straight to torch.FloatTensor converts element by
# element, which is far slower, and the arrays were previously rebuilt just
# for printing.
train_array = np.asarray(train_data, dtype=np.float32)
val_array = np.asarray(val_data, dtype=np.float32)
test_array = np.asarray(test_data, dtype=np.float32)

print(train_array.shape)
print(val_array.shape)
print(test_array.shape)

print(np.asarray(le_train).shape)
print(np.asarray(le_val).shape)
print(np.asarray(le_test).shape)

# float32 tensors, the same dtype torch.FloatTensor produced.
train_data = torch.from_numpy(train_array)
val_data = torch.from_numpy(val_array)
test_data = torch.from_numpy(test_array)

(4000, 1, 4, 900)
(4000, 1, 4, 900)
(2000, 1, 4, 900)
(4000, 1, 900)
(4000, 1, 900)
(2000, 1, 900)


In [6]:
# Labels are read from disk as strings; convert them to integers.
trainlabels = [int(raw_label) for raw_label in train_labels]

vallabels = [int(raw_label) for raw_label in val_labels]

In [7]:
# Pair every encoded sample with its integer label as a two-element list.
train_set = [[sample, target] for sample, target in zip(train_data, trainlabels)]

val_set = [[sample, target] for sample, target in zip(val_data, vallabels)]

# len(train_set)
# print(train_data)

In [8]:
# Options shared by all three loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=5, num_workers=2)

trainloader = DataLoader(train_set, shuffle=True, **loader_kwargs)
valloader = DataLoader(val_set, shuffle=False, **loader_kwargs)
testloader = DataLoader(test_data, shuffle=False, **loader_kwargs)
len(trainloader)

800

In [9]:
class Cnn(nn.Module):
    """Two-layer convolutional classifier with two output classes.

    The first linear layer takes ``16 * 1 * 222`` features, which is what the
    two conv/pool stages produce for an input of shape ``(N, 1, 4, 900)``.

    ``forward`` returns raw, unnormalised class scores (logits) of shape
    ``(N, 2)``. ``nn.CrossEntropyLoss`` applies log-softmax internally, so
    feeding it already-softmaxed values squashes the gradients and stalls
    training. Use ``model.softmax(logits)`` when probabilities are needed;
    the arg-max class is the same either way.
    """

    def __init__(self):
        super(Cnn, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, (1, 5))
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, (1, 5))
        self.fc1 = nn.Linear(16 * 1 * 222, 120)
        self.fc2 = nn.Linear(120, 2)
        # Retained for callers that want probabilities; deliberately not
        # applied inside forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Map a batch ``x`` to per-class logits of shape ``(batch, 2)``."""
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool1(F.relu(self.conv2(x)))
        # Flatten everything except the batch axis. Unlike view(-1, 3552),
        # a feature-size mismatch can never be folded into the batch axis; it
        # surfaces as a shape error in fc1 instead.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [10]:
num_epochs = 40
batch_size = 5

model = Cnn()

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)


def count_parameters(model):
    """Return the total element count over the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


print(model)

# Materialise the parameter list once, then report its length and each shape.
param_list = list(model.parameters())
print(len(param_list))
for param in param_list:
    print(param.size())

print(f'The model has {count_parameters(model):,} trainable parameters')

len(trainloader)

Cnn(
  (conv1): Conv2d(1, 6, kernel_size=(1, 5), stride=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(1, 5), stride=(1, 1))
  (fc1): Linear(in_features=3552, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=2, bias=True)
  (softmax): Softmax(dim=1)
)
8
torch.Size([6, 1, 1, 5])
torch.Size([6])
torch.Size([16, 6, 1, 5])
torch.Size([16])
torch.Size([120, 3552])
torch.Size([120])
torch.Size([2, 120])
torch.Size([2])
The model has 427,134 trainable parameters


800

In [13]:
train_losses = []
val_losses = []

best_model = None
# Start at +inf so the first validation pass always records a snapshot,
# however large the summed validation loss is.
best_loss = float('inf')

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    loss_acc = 0.0
    for i, data in enumerate(trainloader):
        # get the inputs; data is a list of [inputs, labels]
        x_train, y_train = data
        x_train = x_train.float()

        # zero the parameter gradients
        optimizer.zero_grad()
        output_train = model(x_train)

        # The output is passed as-is: squeezing it would drop the batch axis
        # for a batch of one and make the loss call fail.
        loss_train = criterion(output_train, y_train)

        loss_train.backward()
        optimizer.step()

        batch_loss = loss_train.item()
        loss_acc += batch_loss

        # print statistics
        running_loss += batch_loss
        if i % 200 == 199:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 200))
            running_loss = 0.0
    # Mean loss per batch, using the real batch count rather than a constant.
    train_losses.append(loss_acc / max(len(trainloader), 1))

    model.eval()
    val_running_loss = 0.0
    val_loss_acc = 0.0
    with torch.no_grad():
        for i, data in enumerate(valloader):
            # get the inputs; data is a list of [inputs, labels]
            x_val, y_val = data
            x_val = x_val.float()

            output_val = model(x_val)

            loss_val = criterion(output_val, y_val)
            batch_loss = loss_val.item()
            val_loss_acc += batch_loss

            # print statistics
            val_running_loss += batch_loss
            if i % 200 == 199:
                print('[%d, %5d] Val loss: %.3f' %
                      (epoch + 1, i + 1, val_running_loss / 200))
                val_running_loss = 0.0

    if val_loss_acc < best_loss:
        best_loss = val_loss_acc
        # Snapshot the weights. A plain `best_model = model` only aliases the
        # live model, so later epochs would silently overwrite the "best" one.
        best_model = copy.deepcopy(model)

    val_losses.append(val_loss_acc / max(len(valloader), 1))

[1,   200] loss: 0.693
[1,   400] loss: 0.693
[1,   600] loss: 0.692


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# plotting the training and validation loss
plt.xlabel('epochs', fontsize=18)
plt.ylabel('loss', fontsize=18)
plt.plot(train_losses, label='Training loss')
plt.plot(val_losses, label='Validation loss')
plt.legend()
plt.show()


In [None]:
correct = 0
total = 0
outputl = []   # hard class predictions as Python ints
scores = []    # per-sample score for class 1, used by the ranking metric
outputs = []

# Training may have been interrupted before any snapshot was stored; fall back
# to the live model in that case instead of calling None.
eval_model = best_model if best_model is not None else model
eval_model.eval()

with torch.no_grad():
    for data in valloader:
        inputs, labels = data
        inputs = inputs.float()
        output = eval_model(inputs)

        _, predicted = torch.max(output.data, 1)
        outputl.extend(predicted.tolist())
        # Softmax over the two outputs increases with the model's preference
        # for class 1, so it is a valid ranking score whether the model emits
        # logits or probabilities.
        scores.extend(F.softmax(output, dim=1)[:, 1].tolist())

        outputs.extend(predicted)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# F1 is computed from hard predictions; average precision (AUPRC) needs
# continuous scores, since 0/1 predictions collapse the PR curve to one point.
print('F1 score: ', sklearn.metrics.f1_score(vallabels, outputl, average='macro'))
print('AUPRC: ', sklearn.metrics.average_precision_score(vallabels, scores, average='macro', pos_label=1))
print('Accuracy of the network on the validation set: %d %%' % (
    100 * correct / max(total, 1)))

In [79]:
# Test the model
model.eval()
# Put the model that is actually queried into eval mode; fall back to the live
# model when no best snapshot was recorded.
test_model = best_model if best_model is not None else model
test_model.eval()

outputs = []
with torch.no_grad():
    for seq in testloader:
        output = test_model(seq)
        # No squeeze: for a batch of one it would drop the batch axis and the
        # max over dim 1 below would fail.
        _, predicted = torch.max(output.data, 1)
        # One 0-dim tensor per sample, as before.
        outputs.extend(predicted)

In [80]:
# Write mode, so re-running the cell replaces the file instead of appending a
# second copy of the predictions; `with` also closes the file if a write fails.
with open("cnn_result.txt", "w") as f:
    for output in outputs:
        f.write(str(output.item()))
        f.write("\n")