In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
import torch
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
import time
from lookahead import Lookahead
from models_n_training import *
from utilities import sampling, one_hot_encoding, curtail, get_training_data, load_data, data_split, dianostic_plots, pad_for_detector
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
if use_cuda:
    print(torch.cuda.get_device_name(0))

Tesla K80


### Loading model from checkpoint

In [8]:
batch_size = 256
_, C, L = train_x.shape
pesudo_input = torch.rand(batch_size, C, L, dtype=train_x.dtype)
model = HybridNet(pesudo_input,
                   num_filters=150, 
                   filter_size=30, 
                   rnn_size=100, 
                   fc_out1=500, fc_out2= 250,
                   dp1=0.8, dp2=0.8, dp3=0.8).to(device)

model.load_state_dict(torch.load('/home/ubuntu/data/models/hybrid_net3.state'))
model.eval()

shape after conv1d torch.Size([256, 150, 227])
shape before lstm torch.Size([256, 227, 150])
shape after lstm torch.Size([256, 227, 200])
shape after flattening torch.Size([256, 45400])


### Validation accuracy on the original validation set

In [7]:
data_x = pickle.load(open('/home/ubuntu/data/dmel_seq/data_x_nlc.np', 'rb'))
data_y = pickle.load(open('/home/ubuntu/data/dmel_seq/data_y.np', 'rb'))

train_x, train_y, val_x, val_y = data_split(data_x, data_y, seed = 157)

train_x, val_x = pad_for_detector(train_x, 30), pad_for_detector(val_x, 30)
train_x.shape, train_y.shape, val_x.shape, val_y.shape

train_x, val_x = torch.from_numpy(train_x.transpose([0, 2, 1])).float(), torch.from_numpy(val_x.transpose([0, 2, 1])).float()
train_y, val_y = torch.from_numpy(train_y).float(), torch.from_numpy(val_y).float()

# Generate dataset for data loader
train_dataset = data.TensorDataset(train_x, train_y)
val_dataset = data.TensorDataset(val_x, val_y)

15780 (12624, 3388, 4) (12624,) (3156, 3388, 4) (3156,)


In [13]:
# Generate data loaders
def get_acc(y_hat, y):
    y_pred = np.where(y_hat >=0.5, 1, 0)
    return np.mean(y_pred == y)

criterion = nn.BCELoss()

val_loss_list, val_acc_list = [], []
val_loader = data.DataLoader(val_dataset, batch_size=batch_size)
total_val_steps = len(val_loader)
with torch.no_grad():
    val_loss_sum, val_acc_sum = 0, 0
    for j, (batch, labels) in enumerate(val_loader):
        batch, labels = batch.to(device), labels.to(device)
        y_hat = model(batch)
        loss = criterion(y_hat, labels)
        val_loss_sum += loss.item()
        val_acc_sum += get_acc(y_hat.cpu().detach().numpy(), labels.cpu().detach().numpy())
    avg_val_loss = val_loss_sum/total_val_steps
    avg_val_acc = val_acc_sum/total_val_steps
    val_loss_list.append(avg_val_loss)
    val_acc_list.append(avg_val_acc)
    print('[Validation loss {}, validation acc {}] \n'.format(avg_val_loss, avg_val_acc))

[Validation loss 0.6765187887045053, validation acc 0.8980511675824177] 



### Validation accuracy on a different validation set

In [14]:
train_x, train_y, val_x, val_y = data_split(data_x, data_y, seed = 42)

train_x, val_x = pad_for_detector(train_x, 30), pad_for_detector(val_x, 30)
train_x.shape, train_y.shape, val_x.shape, val_y.shape

train_x, val_x = torch.from_numpy(train_x.transpose([0, 2, 1])).float(), torch.from_numpy(val_x.transpose([0, 2, 1])).float()
train_y, val_y = torch.from_numpy(train_y).float(), torch.from_numpy(val_y).float()

# Generate dataset for data loader
train_dataset = data.TensorDataset(train_x, train_y)
val_dataset = data.TensorDataset(val_x, val_y)

15780 (12624, 3388, 4) (12624,) (3156, 3388, 4) (3156,)


In [15]:
# Generate data loaders
def get_acc(y_hat, y):
    y_pred = np.where(y_hat >=0.5, 1, 0)
    return np.mean(y_pred == y)

criterion = nn.BCELoss()

val_loss_list, val_acc_list = [], []
val_loader = data.DataLoader(val_dataset, batch_size=batch_size)
total_val_steps = len(val_loader)
with torch.no_grad():
    val_loss_sum, val_acc_sum = 0, 0
    for j, (batch, labels) in enumerate(val_loader):
        batch, labels = batch.to(device), labels.to(device)
        y_hat = model(batch)
        loss = criterion(y_hat, labels)
        val_loss_sum += loss.item()
        val_acc_sum += get_acc(y_hat.cpu().detach().numpy(), labels.cpu().detach().numpy())
    avg_val_loss = val_loss_sum/total_val_steps
    avg_val_acc = val_acc_sum/total_val_steps
    val_loss_list.append(avg_val_loss)
    val_acc_list.append(avg_val_acc)
    print('[Validation loss {}, validation acc {}] \n'.format(avg_val_loss, avg_val_acc))

[Validation loss 0.1715787210716651, validation acc 0.969622825091575] 



### ROC curve (on the original validation set)

In [17]:
def hybrid_net_infer(model, seq, thresh=0.5):
    """ final score = avg(score(seq), score(seq comp))
    - Input:
    model (pytorch model): hybrid_net
    seq (torch tensor): seq to be evaluated
    thresh (float): threshold for judgement
    -Output: 1 for positive or 0 for negative
    -Note: base_pairs = {'A': [1, 0, 0, 0], 
                          'C': [0, 1, 0, 0],
                          'G': [0, 0, 1, 0],
                          'T': [0, 0, 0, 1],
                          'a': [1, 0, 0, 0],
                          'c': [0, 1, 0, 0],
                          'g': [0, 0, 1, 0],
                          't': [0, 0, 0, 1],
                          'n': [0, 0, 0, 0],
                          'N': [0, 0, 0, 0]}
    """
    char_seq = get_char_list(list(seq.cpu().numpy()))
    complement_dict = {'A': [0, 0, 0, 1], 
                       'T': [1, 0, 0, 0],
                       'C': [0, 0, 1, 0], 
                       'G': [0, 0, 1, 0]}
    seq_comp = [complement_dict[base] for base in seq]
    seq_comp = torch.Tensor(seq_comp).cuda()
    y_hat1, y_hat2 = model(seq), model(seq_comp)
    y_hat = (y_hat1. + y_hat2)/2
    if y_hat >= thresh:
        return 1
    else: 
        return 0

In [18]:
data_x = pickle.load(open('/home/ubuntu/data/dmel_seq/data_x_nlc.np', 'rb'))
data_y = pickle.load(open('/home/ubuntu/data/dmel_seq/data_y.np', 'rb'))

train_x, train_y, val_x, val_y = data_split(data_x, data_y, seed = 157)

train_x, val_x = pad_for_detector(train_x, 30), pad_for_detector(val_x, 30)
train_x.shape, train_y.shape, val_x.shape, val_y.shape

train_x, val_x = torch.from_numpy(train_x.transpose([0, 2, 1])).float(), torch.from_numpy(val_x.transpose([0, 2, 1])).float()
train_y, val_y = torch.from_numpy(train_y).float(), torch.from_numpy(val_y).float()

# Generate dataset for data loader
train_dataset = data.TensorDataset(train_x, train_y)
val_dataset = data.TensorDataset(val_x, val_y)

15780 (12624, 3388, 4) (12624,) (3156, 3388, 4) (3156,)
