In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
import seaborn as sns


In [3]:
ACE2_train = pd.read_csv("ACE2_train_data.csv")
ACE2_test = pd.read_csv("ACE2_test_data.csv")
LY16_train = pd.read_csv("LY16_test_data.csv")
LY16_test = pd.read_csv("LY16_train_data.csv")
LY555_train = pd.read_csv("LY555_test_data.csv")
LY555_test = pd.read_csv("LY555_train_data.csv")


In [12]:
ACE2_train
test = ACE2_train['junction_aa'].iloc[0]


In [22]:
from functools import reduce

In [43]:
unique_values = ACE2_train['junction_aa'].apply(lambda x: set(list(x))).tolist()

In [44]:
unique_values = set().union(*unique_values)

In [90]:
def vocabulary(series):
    un_val = series.apply(lambda x: set(list(x))).tolist()
    un_val = set().union(*un_val)
    return un_val

In [49]:
vocab = vocabulary(ACE2_train['junction_aa'])

In [50]:
test = ACE2_train['junction_aa'].iloc[0]

In [77]:
ACE2_train

Unnamed: 0.1,Unnamed: 0,junction_aa,consensus_count,Label,Distance
0,287261,KNAGFNCYNPLETYGFWRTGGVDW,1,1,9
1,467439,KNEQFNCYGPINAYGFQRTGGEDW,1,0,10
2,414422,KNQKFNCYVPLFHYGFWPTVGVGF,1,1,8
3,103144,KNQGFNCYNPLVNYGFYRTNGRSF,1,1,9
4,478954,KNRGFNCYKPLPGYGFQRTDGINW,2,0,9
...,...,...,...,...,...
406881,16530,KNKGFNCYIPIEDYGFQRTSGRSY,2,0,9
406882,48280,KNEGFNCYNPITEYGFWTTSGLDW,2,1,10
406883,420449,KNGKFNCYHPIVRYGFHPTVGRGY,2,1,9
406884,173734,KNGQFNCYIPIAGYGFLPTLGVSY,1,0,9


In [None]:
ACE2

In [65]:
unique_lengths = lambda x: np.unique(np.array(x.apply(list).apply(len)))

In [67]:
unique_lengths(ACE2_train['junction_aa'])

array([24])

In [None]:
unique

In [80]:
def one_hot_encode(series, aa_vocab):
    length = 24
    encoding = lambda x: np.array([[letter == aa_vocab[i] for i in range(len(aa_vocab))] for letter in x])
    
    return series.apply(encoding)

In [83]:
ACE2_train['junction_aa_encoded'] = one_hot_encode(ACE2_train['junction_aa'], list(vocab))

In [89]:
ACE2_train['junction_aa_encoded'][0].shape


(24, 20)

True

In [60]:
one_hot_encode_with_padding(ACE2_train['junction_aa'], 'lol')

24

In [20]:
test_one_hot[0]

array([1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 0., 0., 0., 0., 1., 0.])

In [4]:
"""
One-hot ecoding
"""

Unnamed: 0.1,Unnamed: 0,consensus_count,clonal_frequency,Label,Distance
count,26881.0,26881.0,26881.0,26881.0,26881.0
mean,22065.28124,1.767494,6.9e-05,0.500688,8.346676
std,15321.518318,7.168266,0.000421,0.500009,1.564086
min,3.0,1.0,7e-06,0.0,2.0
25%,9088.0,1.0,2.8e-05,0.0,8.0
50%,18442.0,1.0,4.7e-05,1.0,9.0
75%,34658.0,1.0,4.7e-05,1.0,9.0
max,57203.0,482.0,0.030311,1.0,14.0


In [91]:
X_train = torch.tensor(np.stack(ACE2_train['junction_aa_encoded'].values), dtype=torch.float32)
y_train = torch.tensor(ACE2_train['Label'].values, dtype=torch.long)

In [93]:
ACE2_test['junction_aa_encoded'] = one_hot_encode(ACE2_test['junction_aa'], list(vocab))


In [94]:
X_test = torch.tensor(np.stack(ACE2_test['junction_aa_encoded'].values), dtype=torch.float32)
y_test = torch.tensor(ACE2_test['Label'].values, dtype=torch.long)

In [133]:
from torch.utils.data import Dataset, DataLoader

class OneHotDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        X_data = self.X[idx]
        X_data = torch.unsqueeze(X_data, 0)  # Add an extra dimension at position 0
        y_data = self.y[idx]
        
        return X_data, y_data

    
train_dataset = OneHotDataset(X_train, y_train)
test_dataset = OneHotDataset(X_test, y_test)


In [134]:
"""
Data loaders:
"""
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [141]:
"""
Class, 24x20 input 
"""
class ConvNet(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=(3, 3)) # 22x18
        self.bn1 = nn.BatchNorm2d(6)
        self.pool1 = nn.MaxPool2d(kernel_size=(2,2)) # 11x9xx6
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=9, kernel_size=(3, 3)) # 9x7xx9
        self.bn2 = nn.BatchNorm2d(9)
        self.flatten = nn.Flatten() # 9*7*9
        
        self.fc1 = nn.Linear(9*7*9, 64) # HARDCODE!!!! <<<<<
        self.bn3 = nn.BatchNorm1d(64)
        self.fc2 = nn.Linear(64, 10)
        self.bn4 = nn.BatchNorm1d(10)
        self.out = nn.Linear(10, 1)
        
    def forward(self, x):
        
        x = F.relu(self.conv1(x))
        x = self.bn1(x)
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.bn2(x)
        x = self.flatten(x)
        
        x = F.relu(self.fc1(x))
        x = self.bn3(x)
        x = F.relu(self.fc2(x))
        x = self.bn4(x)
        x = self.out(x)
        
        return x
        

In [142]:
model = ConvNet()

In [143]:
criterion = F.binary_cross_entropy_with_logits
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [144]:
import os

# этот код создает папку на диске с названием 'logs'
if not os.path.exists('logs'):
    os.mkdir('logs')

In [145]:
%load_ext tensorboard

from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter("logs")

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [146]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter("logs")

In [147]:
%tensorboard --logdir=./logs

Reusing TensorBoard on port 6006 (pid 21307), started 0:05:42 ago. (Use '!kill 21307' to kill it.)

In [149]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [153]:
for i, batch in enumerate(train_dataloader):
    # так получаем текущий батч
    X_batch, y_batch = batch
    break
    
X_batch.shape

torch.Size([64, 1, 24, 20])

In [166]:
def evaluate(model, dataloader, criterion):
    
    losses = []

    num_correct = 0
    num_elements = len(dataloader)

    for i, batch in enumerate(dataloader):
        
        # так получаем текущий батч
        X_batch, y_batch = batch
        
        with torch.no_grad():
            logits = model(X_batch.to(device))
            
            loss = criterion(logits, y_batch.to(device))
            losses.append(loss.item())
            
            y_pred = torch.argmax(logits, dim=1).cpu()
            
            num_correct += torch.sum(y_pred == y_batch)
    
    accuracy = num_correct / num_elements
            
    return accuracy, np.mean(losses)



def train(model, loss_fn, optimizer, n_epoch=3):

    num_iter = 0
    
    # цикл обучения сети
    for epoch in range(n_epoch):

        print("Epoch:", epoch)

        model.train(True)
        
        for i, batch in enumerate(train_dataloader):
            # так получаем текущий батч
            X_batch, y_batch = batch 
            
            # forward pass (получение ответов на батч картинок)
            logits = model(X_batch.to(device)) 
            
            # вычисление лосса от выданных сетью ответов и правильных ответов на батч
            loss = loss_fn(logits.flatten(), y_batch.float().to(device)) 
            
            
            loss.backward() # backpropagation (вычисление градиентов)
            optimizer.step() # обновление весов сети
            optimizer.zero_grad() # обнуляем веса

            #########################
            # Логирование результатов
            num_iter += 1
            writer.add_scalar('Loss/train', loss.item(), num_iter)

            # вычислим accuracy на текущем train батче
            model_answers = torch.argmax(logits, dim=1).cpu()
            train_accuracy = torch.sum(y_batch == model_answers) / len(y_batch)
            writer.add_scalar('Accuracy/train', train_accuracy, num_iter)
            #########################

        # после каждой эпохи получаем метрику качества на валидационной выборке
        model.train(False)

        val_accuracy, val_loss = evaluate(model, test_dataloader, criterion=criterion)

        writer.add_scalar('Loss/val', val_loss.item(), num_iter)
        writer.add_scalar('Accuracy/val', val_accuracy, num_iter)
        
        
    return model


In [167]:
model = train(model, criterion, optimizer, n_epoch=10)

Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9


In [None]:
"""
Evaluate quality metrics (copypaste continues)
"""
train_accuracy, _ = evaluate(model, train_dataloader, criterion)
print('Train accuracy is', train_accuracy)

test_accuracy, _ = evaluate(model, test_dataloader, criterion)
print('Test accuracy is', test_accuracy)

!tensorboard dev upload --logdir=./logs \
--name "My latest experiment" \
--description "Simple comparison of several hyperparameters"

Train accuracy is tensor(0.0055)
Test accuracy is tensor(0.0495)
TensorFlow installation not found - running with reduced feature set.

***** TensorBoard Uploader *****

This will upload your TensorBoard logs to https://tensorboard.dev/ from
the following directory:

./logs

This TensorBoard will be visible to everyone. Do not upload sensitive
data.

Your use of this service is subject to Google's Terms of Service
<https://policies.google.com/terms> and Privacy Policy
<https://policies.google.com/privacy>, and TensorBoard.dev's Terms of Service
<https://tensorboard.dev/policy/terms/>.

This notice will not be shown again while you are logged into the uploader.
To log out, run `tensorboard dev auth revoke`.

Continue? (yes/NO) 