In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F

from sklearn import preprocessing
import pandas as pd
import numpy as np

from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1,out_channels=16, kernel_size=4,stride=1,padding=0)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=1)
        self.conv2 = nn.Conv2d(in_channels=16,out_channels=64, kernel_size=2,stride=1,padding=0)
        self.fc1 = nn.Linear(592, 320)
        self.fc2 = nn.Linear(320, 160)
        self.fc3 = nn.Linear(160, 80)
        self.fc4 = nn.Linear(80, 2)
        self.soft = nn.Softmax()

    def forward(self, x):
        x = F.relu(self.conv1(x.float()))
        #x = F.relu(self.conv2(x))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        x = self.fc4(x)
        #x = self.soft(x)
        return x

In [3]:
class SeqDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        sample = {'seq': self.data.iloc[idx]["seq"], 'label': self.data.iloc[idx]["label"]}
        
        return sample

In [4]:
def seq_to_tensor(seq):    
    seq = list(seq[0])
    le = preprocessing.LabelEncoder()
    le.fit(['A', 'T', 'C', 'G'])
    test = le.transform(seq)
    targets = torch.as_tensor(test,dtype=int)
    targets = F.one_hot(targets, num_classes=4)
    return targets.reshape(1,40,4)

In [9]:
def create_data(tf):
    pos_set = pd.read_csv("data/"+tf+".fa",header=None).iloc[1::2]
    pos_set = pos_set[pos_set[0].str.contains("N")==False]
    neg_set = pd.read_csv("data/"+tf+"_neg.fa",header=None).iloc[1::2]
    neg_set = neg_set[neg_set[0].str.contains("N")==False]
    pos_set["seq"] = pos_set.apply(lambda x: seq_to_tensor(x), axis=1)
    neg_set["seq"] = neg_set.apply(lambda x: seq_to_tensor(x), axis=1)
    pos_set["label"] = torch.as_tensor(1)
    neg_set["label"] = torch.as_tensor(0)
    data = pos_set.append(neg_set)
    data.drop(columns=[0],inplace=True)
    data = data.sample(frac=1)
    return data

In [10]:
def get_train_test(data,batchsize=1,split=0.2):
    batch_size = batchsize
    trainset = SeqDataset(data.iloc[:int(len(data)*(1-split))])
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=0)
    testset = SeqDataset(data.iloc[int(len(data)*(1-split)):])
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=True, num_workers=0)
    return trainloader,testloader

In [11]:
tfs = ["ALX1","ALX3","DLX1"]
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [12]:
for tf in tfs:
    data = create_data(tf)
    trainloader,testloader = get_train_test(data,batchsize=16,split=0.2)

    net = Net()
    net.to(device)
        
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    for epoch in range(10):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            input_key,label_key = data
            inputs = data[input_key]
            labels = data[label_key]

            inputs, labels = inputs.to(device), labels.to(device)
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0

    print('Finished Training', tf)
    torch.save(net.state_dict(), './models/'+tf+'.pth')


[1,  2000] loss: 0.440
[1,  4000] loss: 0.252
[1,  6000] loss: 0.185
[1,  8000] loss: 0.162
[1, 10000] loss: 0.148
[1, 12000] loss: 0.141
[1, 14000] loss: 0.137
[1, 16000] loss: 0.135
[1, 18000] loss: 0.128
[1, 20000] loss: 0.131
[2,  2000] loss: 0.122
[2,  4000] loss: 0.120
[2,  6000] loss: 0.122
[2,  8000] loss: 0.122
[2, 10000] loss: 0.118
[2, 12000] loss: 0.123
[2, 14000] loss: 0.119
[2, 16000] loss: 0.119
[2, 18000] loss: 0.117
[2, 20000] loss: 0.118
[3,  2000] loss: 0.113
[3,  4000] loss: 0.116
[3,  6000] loss: 0.116
[3,  8000] loss: 0.111
[3, 10000] loss: 0.117
[3, 12000] loss: 0.112
[3, 14000] loss: 0.116
[3, 16000] loss: 0.111
[3, 18000] loss: 0.112
[3, 20000] loss: 0.107
[4,  2000] loss: 0.111
[4,  4000] loss: 0.107
[4,  6000] loss: 0.105
[4,  8000] loss: 0.107
[4, 10000] loss: 0.105
[4, 12000] loss: 0.110
[4, 14000] loss: 0.112
[4, 16000] loss: 0.109
[4, 18000] loss: 0.110
[4, 20000] loss: 0.107
[5,  2000] loss: 0.103
[5,  4000] loss: 0.104
[5,  6000] loss: 0.103
[5,  8000] 

In [17]:
for tf in tfs:
    print(tf)
    data = create_data(tf)
    trainloader,testloader = get_train_test(data,batchsize=16,split=0.2)

    net = Net()
    net.load_state_dict(torch.load('./models/'+tf+'.pth'))

    correct = 0
    total = 0
    classes = [0,1]
    correct_pred = {classname: 0 for classname in classes}
    total_pred = {classname: 0 for classname in classes}

    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for data in testloader:
            input_key,label_key = data
            inputs = data[input_key]
            labels = data[label_key]
            # calculate outputs by running images through the network
            outputs = net(inputs)
            # the class with the highest energy is what we choose as prediction
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            for label, prediction in zip(labels, predicted):
                if label == prediction:
                    correct_pred[classes[label]] += 1
                total_pred[classes[label]] += 1

    print('Accuracy of the network on the 10000 test images: %d %%' % (100 * correct / total))

    for classname, correct_count in correct_pred.items():
        accuracy = 100 * float(correct_count) / total_pred[classname]
        print("Accuracy for class {:5f} is: {:.1f} %".format(classname, accuracy))

ALX1
Accuracy of the network on the 10000 test images: 96 %
Accuracy for class 0.000000 is: 96.8 %
Accuracy for class 1.000000 is: 96.7 %
ALX3
Accuracy of the network on the 10000 test images: 94 %
Accuracy for class 0.000000 is: 91.7 %
Accuracy for class 1.000000 is: 95.5 %
DLX1
Accuracy of the network on the 10000 test images: 92 %
Accuracy for class 0.000000 is: 93.0 %
Accuracy for class 1.000000 is: 92.7 %
