In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F

from sklearn import preprocessing
import pandas as pd
import numpy as np

In [2]:
def seq_to_tensor(seq):    
    seq = list(seq[0])
    le = preprocessing.LabelEncoder()
    le.fit(['A', 'T', 'C', 'G'])
    test = le.transform(seq)
    targets = torch.as_tensor(test,dtype=int)
    targets = F.one_hot(targets, num_classes=4)
    return targets.reshape(1,40,4)

In [3]:
pos_set = pd.read_csv("data/ATF2.fa",header=None).iloc[1::2]
pos_set = pos_set[pos_set[0].str.contains("N")==False]
neg_set = pd.read_csv("data/ATF2_neg.fa",header=None).iloc[1::2]
neg_set = neg_set[neg_set[0].str.contains("N")==False]


In [4]:
pos_set["seq"] = pos_set.apply(lambda x: seq_to_tensor(x), axis=1)
neg_set["seq"] = neg_set.apply(lambda x: seq_to_tensor(x), axis=1)
pos_set["label"] = torch.as_tensor(1)
neg_set["label"] = torch.as_tensor(0)
data = pos_set.append(neg_set)
data.drop(columns=[0],inplace=True)

In [5]:
data = data.sample(frac=1)
data

Unnamed: 0,seq,label
57737,"[[[tensor(0), tensor(0), tensor(1), tensor(0)]...",tensor(1)
491087,"[[[tensor(0), tensor(0), tensor(0), tensor(1)]...",tensor(1)
513579,"[[[tensor(0), tensor(0), tensor(0), tensor(1)]...",tensor(1)
115745,"[[[tensor(1), tensor(0), tensor(0), tensor(0)]...",tensor(1)
40975,"[[[tensor(1), tensor(0), tensor(0), tensor(0)]...",tensor(1)
...,...,...
61393,"[[[tensor(0), tensor(0), tensor(0), tensor(1)]...",tensor(0)
358767,"[[[tensor(0), tensor(0), tensor(0), tensor(1)]...",tensor(1)
311909,"[[[tensor(0), tensor(1), tensor(0), tensor(0)]...",tensor(0)
221915,"[[[tensor(0), tensor(0), tensor(0), tensor(1)]...",tensor(1)


In [6]:
from torch.utils.data import Dataset

class SeqDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        sample = {'seq': self.data.iloc[idx]["seq"], 'label': self.data.iloc[idx]["label"]}
        
        return sample

In [7]:
batch_size = 32
trainset = SeqDataset(data.iloc[:int(len(data)*0.8)])
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=0)

In [8]:
testset = SeqDataset(data.iloc[int(len(data)*0.8):])
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=True, num_workers=0)

In [9]:
# for i, data in enumerate(trainloader, 0):
#     input_key,label_key = data
#     inputs = data[input_key]
#     labels = data[label_key]
#     print(inputs.shape)

In [10]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [11]:
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1,out_channels=16, kernel_size=4,stride=1,padding=0)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=1)
        self.conv2 = nn.Conv2d(in_channels=16,out_channels=64, kernel_size=2,stride=1,padding=0)
        self.fc1 = nn.Linear(592, 320)
        self.fc2 = nn.Linear(320, 160)
        self.fc3 = nn.Linear(160, 80)
        self.fc4 = nn.Linear(80, 2)
        self.soft = nn.Softmax()

    def forward(self, x):
        x = F.relu(self.conv1(x.float()))
        #x = F.relu(self.conv2(x))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        x = self.fc4(x)
        #x = self.soft(x)
        return x


net = Net()
net.to(device)

Net(
  (conv1): Conv2d(1, 16, kernel_size=(4, 4), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=1, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(16, 64, kernel_size=(2, 2), stride=(1, 1))
  (fc1): Linear(in_features=592, out_features=320, bias=True)
  (fc2): Linear(in_features=320, out_features=160, bias=True)
  (fc3): Linear(in_features=160, out_features=80, bias=True)
  (fc4): Linear(in_features=80, out_features=2, bias=True)
  (soft): Softmax(dim=None)
)

In [12]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [13]:
for epoch in range(10):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        input_key,label_key = data
        inputs = data[input_key]
        labels = data[label_key]

        inputs, labels = inputs.to(device), labels.to(device)
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')

[1,  2000] loss: 0.665
[1,  4000] loss: 0.663
[1,  6000] loss: 0.665
[1,  8000] loss: 0.661
[1, 10000] loss: 0.656
[2,  2000] loss: 0.599
[2,  4000] loss: 0.551
[2,  6000] loss: 0.536
[2,  8000] loss: 0.531
[2, 10000] loss: 0.520
[3,  2000] loss: 0.517
[3,  4000] loss: 0.515
[3,  6000] loss: 0.514
[3,  8000] loss: 0.513
[3, 10000] loss: 0.510
[4,  2000] loss: 0.507
[4,  4000] loss: 0.506
[4,  6000] loss: 0.505
[4,  8000] loss: 0.504
[4, 10000] loss: 0.502
[5,  2000] loss: 0.499
[5,  4000] loss: 0.499
[5,  6000] loss: 0.498
[5,  8000] loss: 0.497
[5, 10000] loss: 0.497
[6,  2000] loss: 0.492
[6,  4000] loss: 0.490
[6,  6000] loss: 0.490
[6,  8000] loss: 0.494
[6, 10000] loss: 0.491
[7,  2000] loss: 0.483
[7,  4000] loss: 0.487
[7,  6000] loss: 0.483
[7,  8000] loss: 0.484
[7, 10000] loss: 0.485
[8,  2000] loss: 0.476
[8,  4000] loss: 0.477
[8,  6000] loss: 0.477
[8,  8000] loss: 0.478
[8, 10000] loss: 0.481
[9,  2000] loss: 0.465
[9,  4000] loss: 0.469
[9,  6000] loss: 0.472
[9,  8000] 

In [21]:
PATH = './atf2.pth'
torch.save(net.state_dict(), PATH)

In [15]:
dataiter = iter(testloader)
input_key,label_key = dataiter.next()
inputs = data[input_key]
labels = data[label_key]

In [16]:
net = Net()
net.load_state_dict(torch.load(PATH))

<All keys matched successfully>

In [17]:
outputs = net(inputs)

test, predicted = torch.max(outputs, 1)
print(test)

tensor([ 1.2419e+00,  4.8082e-01,  3.1543e-01,  1.1310e-01,  1.0642e+00,
         7.3113e-01, -1.3869e-02,  1.4972e+00,  1.7799e-01,  3.3801e+00,
         8.0673e-02,  9.7669e-01,  2.7443e-01, -5.1136e-03,  2.7757e-01,
         4.0011e-01,  8.5468e-01,  8.1810e-01,  1.1333e+00,  1.5063e-01,
         2.3719e-01,  3.0003e-01, -2.9465e-03,  3.2222e+00,  8.0507e-01,
         5.9400e-01], grad_fn=<MaxBackward0>)


In [18]:
print(predicted,labels)

tensor([1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
        0, 0]) tensor([1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
        0, 0])


In [19]:
correct = 0
total = 0

# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for data in testloader:
        input_key,label_key = data
        inputs = data[input_key]
        labels = data[label_key]
        # calculate outputs by running images through the network
        outputs = net(inputs)
        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))

Accuracy of the network on the 10000 test images: 74 %


In [20]:
# prepare to count predictions for each class
classes = [0,1]
correct_pred = {classname: 0 for classname in classes}
total_pred = {classname: 0 for classname in classes}

# again no gradients needed
with torch.no_grad():
    for data in testloader:
        input_key,label_key = data
        inputs = data[input_key]
        labels = data[label_key]

        outputs = net(inputs)
        _, predictions = torch.max(outputs, 1)
        # collect the correct predictions for each class
        for label, prediction in zip(labels, predictions):
            if label == prediction:
                correct_pred[classes[label]] += 1
            total_pred[classes[label]] += 1


# print accuracy for each class
for classname, correct_count in correct_pred.items():
    accuracy = 100 * float(correct_count) / total_pred[classname]
    print("Accuracy for class {:5f} is: {:.1f} %".format(classname,
                                                   accuracy))

Accuracy for class 0.000000 is: 63.5 %
Accuracy for class 1.000000 is: 80.9 %
