In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
import math
import torch
from torch.utils import model_zoo
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
import time
from models_n_training import *
from utilities import sampling, one_hot_encoding, curtail, get_training_data, load_data, data_split, dianostic_plots, pad_for_detector
# Pick the first CUDA device when one is available, otherwise fall back to CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
    # Report which GPU the notebook is running on.
    print(torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")

Tesla K80


In [3]:
# Directory holding the raw input; each entry is counted as one region below.
data_dir = "/home/ubuntu/group_volume/team_neural_network/data/input/3.24_species_only"

# Subset sampling results are saved to me_samples ("me" stands for "mutually exclusive").
output_folder_path = "../../../../temp/buffers/me_samples"

# Total number of regions (shown as the cell output).
len(os.listdir(data_dir))

3543

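Uncomment the following code chunk to resample the training and validation data. Note that the chunk calls `shutil.copy`, and `shutil` is not imported explicitly in the import cell above, so add `import shutil` before running it.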

In [4]:
# !rm -r /home/ubuntu/data/temp/train
# !mkdir /home/ubuntu/data/temp/train
# !rm -r /home/ubuntu/data/temp/val
# !mkdir /home/ubuntu/data/temp/val

# ###############################################################################
# # Sample training and validation data
# # IMPORTANT: Make sure that training and validation don't have intersection!!!
# ###############################################################################
# all_data_lst = np.array(os.listdir(data_dir))
# n = len(all_data_lst)
# num_trained_regions = int(n * 0.8)
# train_files = all_data_lst[:num_trained_regions]
# num_val = n - num_trained_regions
# val_indices = np.random.choice(np.arange(num_trained_regions, n), num_val, replace = False)
# val_files = all_data_lst[val_indices]

# train_dest = '/home/ubuntu/data/temp/train/'
# for file in train_files:
#     shutil.copy(os.path.join(data_dir, file),
#                           train_dest)
# print('copied training samples to {}'.format(train_dest))

# val_dest = '/home/ubuntu/data/temp/val/'
# for file in val_files:
#     shutil.copy(os.path.join(data_dir, file),
#                           val_dest)
# print('copied validation samples to {}'.format(val_dest))

# # Preprocess train and val data so that they are ready to be fed to models
# train_output_path = os.path.join(output_folder_path, 'train.data')
# val_output_path = os.path.join(output_folder_path, 'val.data')

# train_regions = one_hot_encoding(train_dest, train_output_path)
# val_regions = one_hot_encoding(val_dest, val_output_path)
# train_x, train_y = get_training_data(train_regions, output_folder_path,
#                                    max_len = 1000, 
#                                    train_x_name = 'train_x.data', 
#                                    train_y_name = 'train_y.data')
# val_x, val_y = get_training_data(val_regions, output_folder_path,
#                                    max_len = 1000, 
#                                    train_x_name = 'val_x.data', 
#                                    train_y_name = 'val_y.data')
# # Pad for motif detectors
# train_x, val_x = pad_for_detector(train_x, 15), pad_for_detector(val_x, 15)

In [5]:
# data_x = pickle.load(open('../../../../temp/buffers/ss_samples/train_x.data', 'rb'))
# data_y = pickle.load(open('../../../../temp/buffers/ss_samples/train_y.data', 'rb'))
# train_x, train_y, val_x, val_y = data_split(data_x, data_y, seed = 157)
# train_x, val_x = pad_for_detector(train_x, 10), pad_for_detector(val_x, 10)

In [6]:
def _load_pickle(file_name):
    """Unpickle ``file_name`` from ``output_folder_path`` and close the file afterwards.

    The previous ``pickle.load(open(...))`` form never closed its file handles.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that were produced by this project's own preprocessing.
    with open(os.path.join(output_folder_path, file_name), 'rb') as handle:
        return pickle.load(handle)


train_x = _load_pickle('train_x.data')
train_y = _load_pickle('train_y.data')
val_x = _load_pickle('val_x.data')
val_y = _load_pickle('val_y.data')

In [7]:
m = 30  # filter size
n = 15  # number of filters
# NOTE(review): the model cell below is built with num_filters=50 and
# filter_size=10, so these values do not match it, and `n` is not used in the
# visible cells -- confirm that padding by m=30 is intended.
# NOTE(review): this cell overwrites its own inputs, so re-running it pads again.
train_x = pad_for_detector(train_x, m)
val_x = pad_for_detector(val_x, m)

In [8]:
# Swap the last two axes of the inputs and convert everything to float tensors.
# NOTE(review): assumes the arrays are (samples, length, channels) so that the
# result is channels-first for Conv1d -- confirm against get_training_data.
train_x = torch.from_numpy(train_x.transpose([0, 2, 1])).float()
val_x = torch.from_numpy(val_x.transpose([0, 2, 1])).float()
train_y = torch.from_numpy(train_y).float()
val_y = torch.from_numpy(val_y).float()

# Wrap the (input, label) pairs so they can be handed to a data loader.
train_dataset = data.TensorDataset(train_x, train_y)
val_dataset = data.TensorDataset(val_x, val_y)

In [9]:
class HybridNet(nn.Module):
    """Conv1d -> max-pool -> bidirectional LSTM -> two linear layers -> sigmoid.

    The sizes of the LSTM input and of the first linear layer depend on the
    input length, so they are inferred once at construction time by pushing
    ``pesudo_input`` through the convolution, pooling and LSTM stages.

    Args:
        pesudo_input: float tensor of shape (N, 4, L) used only for shape
            inference; its values are irrelevant.
        num_filters: number of convolution output channels.
        filter_size: convolution kernel size.
        rnn_size: hidden size of each LSTM direction.
        fc_out: width of the hidden fully connected layer.
        dp1: dropout probability applied after ``fc1``.
        dp2: dropout probability applied after ``fc2``.
        num_rnn_layers: number of stacked LSTM layers.
        rnn_dropout: dropout between stacked LSTM layers.
    """

    # Kernel size and stride of the max-pooling applied after the convolution.
    POOL_SIZE = 15

    def __init__(self, pesudo_input, num_filters, filter_size, rnn_size, fc_out, dp1, dp2,
                 num_rnn_layers=1, rnn_dropout=0):
        super(HybridNet, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=4, out_channels=num_filters, kernel_size=filter_size)
        # The probe passes are only needed for their shapes, so skip autograd.
        with torch.no_grad():
            out = F.max_pool1d(self.conv1(pesudo_input),
                               kernel_size=self.POOL_SIZE, stride=self.POOL_SIZE)
        ####################################################
        print('shape after conv1d {}'.format(out.shape))
        N, C, T = out.shape
        # (N, C, T) -> (N, T, C): the LSTM below is batch_first.
        out = torch.transpose(out, 1, 2)
        print('shape before lstm {}'.format(out.shape))
        ####################################################
        self.bi_lstm = nn.LSTM(input_size=C, hidden_size=rnn_size, num_layers=num_rnn_layers,
                               batch_first=True, dropout=rnn_dropout, bidirectional=True)
        with torch.no_grad():
            out, _ = self.bi_lstm(out)
        print('shape after lstm {}'.format(out.shape))
        N, T, C = out.shape
        out = out.reshape(N, -1)
        print('shape after flattening {}'.format(out.shape))
        # The flattened LSTM output (T time steps x C features) feeds fc1.
        self.fc1 = nn.Linear(T*C, fc_out, bias=True)
        self.fc2 = nn.Linear(fc_out, 1)
        self.p1 = dp1
        self.p2 = dp2

    def forward(self, seq):
        """Return one sigmoid output per sample.

        Args:
            seq: float tensor of shape (N, 4, L).

        Returns:
            Tensor of shape (N,) with values in [0, 1].
        """
        # Kept on the module so the convolution activations can be inspected later.
        self.activation_seq = F.relu(self.conv1(seq))
        out = F.max_pool1d(self.activation_seq,
                           kernel_size=self.POOL_SIZE, stride=self.POOL_SIZE)

        #################################################################################
        # The LSTM is batch_first, so its input has shape
        # (batch_size, sequence_length, input_size)
        #     - Sequence length is the length of the activation after downsampling
        #     - Input size is the number of filters
        #################################################################################
        out = torch.transpose(out, 1, 2)
        out, _ = self.bi_lstm(out)
        out = F.relu(out)

        #################################################################################
        # Flatten the sequence before feeding it into the fully connected layers
        #################################################################################
        out = out.reshape(out.shape[0], -1)
        out = self.fc1(out)
        # Functional dropout tied to self.training: a Dropout module created
        # inside forward() is always in training mode and would keep dropping
        # units after model.eval().
        out = F.dropout(out, p=self.p1, training=self.training)
        out = F.relu(out)
        out = self.fc2(out)
        # NOTE(review): this drops the single output logit itself during
        # training (the sigmoid then yields 0.5); kept for compatibility.
        out = F.dropout(out, p=self.p2, training=self.training)
        # Drop only the trailing singleton dimension so a batch of one stays (1,).
        out = out.squeeze(-1)
        return torch.sigmoid(out)

In [None]:
batch_size = 256

# Random batch matching the training tensors' channel count, length and dtype;
# HybridNet uses it only to infer its layer sizes.
_, C, L = train_x.shape
pesudo_input = torch.rand(batch_size, C, L, dtype=train_x.dtype)

hybrid_net = HybridNet(
    pesudo_input,
    num_filters=50,
    filter_size=10,
    rnn_size=50,
    fc_out=20,
    dp1=0.8,
    dp2=0.8,
)
hybrid_net = hybrid_net.to(device)

# Candidate optimizers; only the one referenced in `config` is handed to train().
optimizers = {
    'adam': torch.optim.Adam(hybrid_net.parameters(), lr=1e-3, weight_decay=1e-4),
    'rmsprop': torch.optim.RMSprop(hybrid_net.parameters(), lr=1e-3, weight_decay=1e-4),
}

# Training settings passed to train().
config = dict(
    epochs=250,
    device=device,
    opt=optimizers['adam'],
    criterion=nn.BCELoss(),
    batch_size=batch_size,
    log_interval=1,
)

(hybrid_net,
 train_loss_list, val_loss_list,
 train_acc_list, val_acc_list) = train(hybrid_net, train_dataset, val_dataset, config)

shape after conv1d torch.Size([256, 50, 69])
shape before lstm torch.Size([256, 69, 50])
shape after lstm torch.Size([256, 69, 100])
shape after flattening torch.Size([256, 6900])
Train on 33984 samples, validate on 8520 samples
***************************************
Epoch 1: training loss 0.6945885285399014, training acc 0.45184249686716793
Time: 8.944575309753418 

[Validation loss 0.6917566727189457, validation acc 0.5013148488562091] 

***************************************
Epoch 2: training loss 0.6919588666213187, training acc 0.46284656954887216
Time: 8.870245218276978 

[Validation loss 0.6917154613663169, validation acc 0.5055147058823529] 

***************************************
Epoch 3: training loss 0.6914251841100535, training acc 0.46656680764411024
Time: 8.873063564300537 

[Validation loss 0.6928302733337178, validation acc 0.4980596405228758] 



In [None]:
# Keep every `log_interval`-th training record; validation records are passed unchanged.
log_interval = 1
dianostic_plots(
    train_acc_list[slice(None, None, log_interval)],
    train_loss_list[slice(None, None, log_interval)],
    val_acc_list,
    val_loss_list,
)