In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
import seaborn as sns


In [3]:
"""
Loading data.
"""

# Each dataset is read from a CSV file in the current working directory.
# Train and test splits are grouped per dataset; only train files are read
# for REGN33 and REGN87.
ACE2_train = pd.read_csv("ACE2_train_data.csv")
ACE2_test = pd.read_csv("ACE2_test_data.csv")

LY16_train = pd.read_csv("LY16_train_data.csv")
LY16_test = pd.read_csv("LY16_test_data.csv")

LY555_train = pd.read_csv("LY555_train_data.csv")
LY555_test = pd.read_csv("LY555_test_data.csv")

REGN33_train = pd.read_csv("REGN33_train_data.csv")
REGN87_train = pd.read_csv("REGN87_train_data.csv")


"""
This function creates a vocabulary of letters in the dataset
"""
def vocabulary(series):
    """Return the set of distinct characters found across all strings in ``series``."""
    letters = set()
    for sequence in series:
        # Adding a string to a set via update() inserts its individual characters.
        letters.update(sequence)
    return letters

# Characters observed in the 'junction_aa' column of the ACE2 training frame.
# NOTE: this is a set, so it has no defined element order.
vocab = vocabulary(ACE2_train['junction_aa'])

"""
One-hot encoding. No padding is needed, because all strings have a length of 24
"""
def one_hot_encode(series, aa_vocab):
    """One-hot encode every string in ``series`` against the alphabet ``aa_vocab``.

    Parameters
    ----------
    series : pandas.Series of str
        Sequences to encode. No padding is applied, so sequences of different
        lengths produce matrices with different numbers of rows.
    aa_vocab : iterable of str
        Alphabet of single characters. Its iteration order defines the column
        order of the encoding, so pass an ordered container (list/tuple) to
        get a stable layout.

    Returns
    -------
    pandas.Series
        Same index as ``series``. Each element is a boolean ``numpy.ndarray``
        of shape ``(len(sequence), len(aa_vocab))`` where entry ``[i, j]`` is
        True iff ``sequence[i] == aa_vocab[j]``. Characters that are not in the
        alphabet give an all-False row.
    """
    # Build the alphabet array once instead of re-indexing the vocabulary
    # for every character of every sequence.
    alphabet = np.array(list(aa_vocab), dtype=str)

    def encode(sequence):
        # dtype=str keeps the array well-typed even for an empty sequence.
        letters = np.array(list(sequence), dtype=str)
        # Broadcast (L, 1) against (1, V) -> (L, V) boolean matrix.
        return letters[:, None] == alphabet[None, :]

    return series.apply(encode)

In [17]:
# Ratio of the number of rows in the LY555 train frame to the LY555 test frame.
len(LY555_train)/len(LY555_test)

8.996422182468693

In [5]:
"""
Pre-process test and train data
"""
# `vocab` is a set of strings, and the iteration order of such a set can
# differ between interpreter sessions (string hashing is randomised per
# process). Sorting pins the column order of the one-hot encoding so that the
# feature layout is reproducible across kernel restarts.
aa_alphabet = sorted(vocab)

ACE2_train['junction_aa_encoded'] = one_hot_encode(ACE2_train['junction_aa'], aa_alphabet)
ACE2_test['junction_aa_encoded'] = one_hot_encode(ACE2_test['junction_aa'], aa_alphabet)

# Stack the per-sequence matrices into one (num_sequences, seq_len, vocab_size)
# array; np.stack requires every sequence to have the same length.
X_train = torch.tensor(np.stack(ACE2_train['junction_aa_encoded'].values), dtype=torch.float32)
y_train = torch.tensor(ACE2_train['Label'].values, dtype=torch.long)
X_test = torch.tensor(np.stack(ACE2_test['junction_aa_encoded'].values), dtype=torch.float32)
y_test = torch.tensor(ACE2_test['Label'].values, dtype=torch.long)


In [7]:
"""
Create Datasets
"""
from torch.utils.data import Dataset, DataLoader

class OneHotDataset(Dataset):
    """Map-style dataset that pairs encoded inputs ``X`` with labels ``y``.

    Indexing returns ``(X[idx] with a new leading axis, y[idx])``.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        # The label container defines the number of samples.
        return len(self.y)

    def __getitem__(self, idx):
        # Prepend a singleton axis to the sample. With the 24x20 encodings and
        # batch_size=64 used in this notebook, a batch then has the shape
        # [64, 1, 24, 20].
        features = torch.unsqueeze(self.X[idx], dim=0)
        return features, self.y[idx]

    
# Wrap the train and test tensors so that a DataLoader can batch them.
train_dataset = OneHotDataset(X=X_train, y=y_train)
test_dataset = OneHotDataset(X=X_test, y=y_test)


In [8]:
"""
Data loaders:
"""
# Training batches are reshuffled every epoch; test batches keep dataset order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=64,
    shuffle=False,
)

In [9]:
"""
CNN, 1x24x20 input. Kernels of size (3, 3), 2 layers, with BatchNorm, 6 and 9 channels. 
All the sizes are hardcoded for now.
"""
class ConvNet(nn.Module):
    """Two-layer CNN that maps a one-hot encoded sequence to a single logit.

    Layer stack::

        Conv2d(1->6, 3x3) -> ReLU -> BatchNorm2d -> MaxPool2d(2x2)
        Conv2d(6->9, 3x3) -> ReLU -> BatchNorm2d -> Flatten
        Linear(->64) -> ReLU -> BatchNorm1d
        Linear(64->10) -> ReLU -> BatchNorm1d
        Linear(10->1)

    Parameters
    ----------
    seq_len : int, default 24
        Height of the input matrix (number of positions in the sequence).
    vocab_size : int, default 20
        Width of the input matrix (size of the one-hot alphabet).

    Raises
    ------
    ValueError
        If the input is too small to survive the two unpadded 3x3
        convolutions and the 2x2 pooling.

    The forward pass expects input of shape ``(batch, 1, seq_len, vocab_size)``
    and returns raw logits of shape ``(batch, 1)``.
    """

    def __init__(self, seq_len=24, vocab_size=20):
        super().__init__()

        # Spatial sizes in the comments below are for the default 24x20 input.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=(3, 3))  # 6 x 22 x 18
        self.bn1 = nn.BatchNorm2d(6)
        self.pool1 = nn.MaxPool2d(kernel_size=(2, 2))  # 6 x 11 x 9
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=9, kernel_size=(3, 3))  # 9 x 9 x 7
        self.bn2 = nn.BatchNorm2d(9)
        self.flatten = nn.Flatten()  # 9 * 9 * 7 = 567

        # Derive the flattened feature count instead of hardcoding it: each
        # unpadded 3x3 convolution removes 2 from every spatial dimension and
        # the 2x2 max-pool halves it (rounding down).
        feat_h = (seq_len - 2) // 2 - 2
        feat_w = (vocab_size - 2) // 2 - 2
        if feat_h < 1 or feat_w < 1:
            raise ValueError(
                f"input of size {seq_len}x{vocab_size} is too small for this architecture"
            )

        self.fc1 = nn.Linear(9 * feat_h * feat_w, 64)
        self.bn3 = nn.BatchNorm1d(64)
        self.fc2 = nn.Linear(64, 10)
        self.bn4 = nn.BatchNorm1d(10)
        self.out = nn.Linear(10, 1)

    def forward(self, x):
        # Convolutional feature extractor (activation is applied before BatchNorm).
        x = F.relu(self.conv1(x))
        x = self.bn1(x)
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.bn2(x)
        x = self.flatten(x)

        # Fully connected head; the final layer returns an unnormalised logit.
        x = F.relu(self.fc1(x))
        x = self.bn3(x)
        x = F.relu(self.fc2(x))
        x = self.bn4(x)
        x = self.out(x)

        return x

In [11]:
"""
Model setup
"""
# Pick the device first and move the model onto it BEFORE building the
# optimizer, so the optimizer is constructed from the parameters in their
# final location (the order recommended by the torch.optim documentation).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ConvNet().to(device)

# The network returns one raw logit per sample, so the loss that applies the
# sigmoid internally is used.
criterion = F.binary_cross_entropy_with_logits
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [12]:
"""
Tracking the training process (copypasted from MSU lectures on DL)
"""

import os

# Create a folder named 'logs' on disk if it does not exist yet.
if not os.path.exists('logs'):
    os.mkdir('logs')
    
# Load the TensorBoard notebook extension so that the %tensorboard magic is available.
%load_ext tensorboard

from torch.utils.tensorboard import SummaryWriter
# Writer pointed at the 'logs' directory; the training loop logs scalars through it.
writer = SummaryWriter("logs")

# Show the TensorBoard UI inline, reading from the same 'logs' directory.
%tensorboard --logdir=./logs



Reusing TensorBoard on port 6006 (pid 21307), started 0:21:14 ago. (Use '!kill 21307' to kill it.)

In [15]:
"""
Another minutely changed copypaste
"""

def evaluate(model, dataloader, criterion):
    """Compute accuracy and mean batch loss of a single-logit binary classifier.

    Parameters
    ----------
    model : torch.nn.Module
        Network returning one raw logit per sample (output shape ``(B, 1)``
        or ``(B,)``).
    dataloader : iterable
        Yields ``(X_batch, y_batch)`` pairs; ``y_batch`` holds 0/1 labels.
    criterion : callable
        ``criterion(logits, float_targets)`` returning a scalar loss tensor.

    Returns
    -------
    accuracy : torch.Tensor
        0-d tensor: correctly classified samples divided by the total number
        of samples seen.
    mean_loss : numpy.float64
        Mean of the per-batch loss values.

    The model is switched to eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards.
    """
    # Run on whatever device the model's parameters live on, so the function
    # does not depend on a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()  # BatchNorm must use its running statistics during evaluation

    losses = []
    num_correct = torch.tensor(0)
    num_elements = 0  # counted in samples, not in batches

    try:
        with torch.no_grad():
            for X_batch, y_batch in dataloader:
                logits = model(X_batch.to(model_device)).flatten()

                loss = criterion(logits, y_batch.float().to(model_device))
                losses.append(loss.item())

                # With a single output logit, argmax over dim=1 would always
                # be 0. The predicted class is 1 iff sigmoid(logit) > 0.5,
                # i.e. iff logit > 0.
                y_pred = (logits > 0).long().cpu()

                num_correct += torch.sum(y_pred == y_batch.cpu())
                num_elements += y_batch.numel()
    finally:
        model.train(was_training)

    accuracy = num_correct / num_elements

    return accuracy, np.mean(losses)



def train(model, loss_fn, optimizer, n_epoch=3):
    """Train ``model`` and log train/validation metrics to TensorBoard.

    Uses these notebook-level names: ``train_dataloader``, ``test_dataloader``,
    ``device``, ``writer`` and ``evaluate``.

    Parameters
    ----------
    model : torch.nn.Module
        Network returning one raw logit per sample.
    loss_fn : callable
        ``loss_fn(logits, float_targets)`` returning a scalar loss tensor. It is
        used for both the training step and the per-epoch validation.
    optimizer : torch.optim.Optimizer
        Optimizer constructed over ``model``'s parameters.
    n_epoch : int, default 3
        Number of passes over ``train_dataloader``.

    Returns
    -------
    torch.nn.Module
        The same ``model`` object, after training.
    """
    num_iter = 0

    # training loop
    for epoch in range(n_epoch):

        print("Epoch:", epoch)

        model.train(True)

        for X_batch, y_batch in train_dataloader:
            # Reset gradients (not weights) before the backward pass so that
            # nothing accumulated earlier leaks into this step.
            optimizer.zero_grad()

            # forward pass: one logit per sample
            logits = model(X_batch.to(device)).flatten()

            # loss between the network outputs and the true labels of the batch
            loss = loss_fn(logits, y_batch.float().to(device))

            loss.backward()   # backpropagation (compute gradients)
            optimizer.step()  # update the network weights

            #########################
            # Logging
            num_iter += 1
            writer.add_scalar('Loss/train', loss.item(), num_iter)

            # Accuracy on the current train batch. With a single output logit,
            # argmax over dim=1 would always be 0; the predicted class is 1
            # iff the logit is positive.
            model_answers = (logits.detach() > 0).long().cpu()
            train_accuracy = torch.sum(y_batch == model_answers) / len(y_batch)
            writer.add_scalar('Accuracy/train', train_accuracy, num_iter)
            #########################

        # after every epoch, compute the metrics on the validation data
        model.train(False)

        # Validate with the loss that was passed in, not the global `criterion`.
        val_accuracy, val_loss = evaluate(model, test_dataloader, criterion=loss_fn)

        writer.add_scalar('Loss/val', float(val_loss), num_iter)
        writer.add_scalar('Accuracy/val', val_accuracy, num_iter)

    return model


In [None]:
"""
Train the network
"""
# Run 10 epochs; `train` returns the model it was given, which is re-bound here.
model = train(
    model=model,
    loss_fn=criterion,
    optimizer=optimizer,
    n_epoch=10,
)

In [None]:
"""
Evaluate quality metrics (copypaste continues)
"""
# Accuracy on the training data; the second return value of evaluate (mean loss) is discarded.
train_accuracy, _ = evaluate(model, train_dataloader, criterion)
print('Train accuracy is', train_accuracy)

# Accuracy on the held-out test data, computed the same way.
test_accuracy, _ = evaluate(model, test_dataloader, criterion)
print('Test accuracy is', test_accuracy)

# Shell command: upload the contents of ./logs with the given experiment name and description.
# NOTE(review): this targets the hosted TensorBoard.dev service; confirm it is still
# available before relying on this cell.
!tensorboard dev upload --logdir=./logs \
--name "My latest experiment" \
--description "Simple comparison of several hyperparameters"