In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
import math
import torch
from torch.utils import model_zoo
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
import time
from utilities import sampling, one_hot_encoding, curtail, get_training_data, load_data, data_split, dianostic_plots, pad_for_detector
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
if use_cuda:
    print(torch.cuda.get_device_name(0))

Tesla K80


In [3]:
# Subset sampling: save to me_samples; "me" for "mutually exclusive"
output_folder_path = "../../../../temp/buffers/me_samples"

data_dir = "/home/ubuntu/group_volume/team_neural_network/data/input/3.24_species_only"

len(os.listdir(data_dir)) # total number of regions

3543

Uncomment the following code chunk to resample training and validation data

In [4]:
# !rm -r /home/ubuntu/data/temp/train
# !mkdir /home/ubuntu/data/temp/train
# !rm -r /home/ubuntu/data/temp/val
# !mkdir /home/ubuntu/data/temp/val

# ###############################################################################
# # Sample training and validation data
# # IMPORTANT: Make sure that training and validation don't have intersection!!!
# ###############################################################################
# all_data_lst = np.array(os.listdir(data_dir))
# n = len(all_data_lst)
# num_trained_regions = int(n * 0.8)
# train_files = all_data_lst[:num_trained_regions]
# num_val = n - num_trained_regions
# val_indices = np.random.choice(np.arange(num_trained_regions, n), num_val, replace = False)
# val_files = all_data_lst[val_indices]

# train_dest = '/home/ubuntu/data/temp/train/'
# for file in train_files:
#     shutil.copy(os.path.join(data_dir, file),
#                           train_dest)
# print('copied training samples to {}'.format(train_dest))

# val_dest = '/home/ubuntu/data/temp/val/'
# for file in val_files:
#     shutil.copy(os.path.join(data_dir, file),
#                           val_dest)
# print('copied validation samples to {}'.format(val_dest))

# # Preprocess train and val data so that they are ready to be fed to models
# train_output_path = os.path.join(output_folder_path, 'train.data')
# val_output_path = os.path.join(output_folder_path, 'val.data')

# train_regions = one_hot_encoding(train_dest, train_output_path)
# val_regions = one_hot_encoding(val_dest, val_output_path)
# train_x, train_y = get_training_data(train_regions, output_folder_path,
#                                    max_len = 1000, 
#                                    train_x_name = 'train_x.data', 
#                                    train_y_name = 'train_y.data')
# val_x, val_y = get_training_data(val_regions, output_folder_path,
#                                    max_len = 1000, 
#                                    train_x_name = 'val_x.data', 
#                                    train_y_name = 'val_y.data')
# # Pad for motif detectors
# train_x, val_x = pad_for_detector(train_x, 15), pad_for_detector(val_x, 15)

In [5]:
# data_x = pickle.load(open('../../../../temp/buffers/ss_samples/train_x.data', 'rb'))
# data_y = pickle.load(open('../../../../temp/buffers/ss_samples/train_y.data', 'rb'))
# train_x, train_y, val_x, val_y = data_split(data_x, data_y, seed = 157)
# train_x, val_x = pad_for_detector(train_x, 10), pad_for_detector(val_x, 10)

In [6]:
train_x = pickle.load(open(os.path.join(output_folder_path, 'train_x.data'), 'rb'))
train_y = pickle.load(open(os.path.join(output_folder_path, 'train_y.data'), 'rb'))
val_x = pickle.load(open(os.path.join(output_folder_path, 'val_x.data'), 'rb'))
val_y = pickle.load(open(os.path.join(output_folder_path, 'val_y.data'), 'rb'))

In [7]:
n = 15 # number of filters
m = 30 # filter size
train_x, val_x = pad_for_detector(train_x, m), pad_for_detector(val_x, m)

In [8]:
train_x, val_x = torch.from_numpy(train_x).float(), torch.from_numpy(val_x).float()
train_y, val_y = torch.from_numpy(train_y).float(), torch.from_numpy(val_y).float()

# Convert data format from channel_last to channer_first
N, L, C = train_x.shape
n, l, _ = val_x.shape
train_x = train_x.reshape(N, C, L)
val_x = val_x.reshape(n, C, l)

# Generate dataset for data loader
train_dataset = data.TensorDataset(train_x, train_y)
val_dataset = data.TensorDataset(val_x, val_y)

In [9]:
# Reference for connecting Conv1D and LSTM: https://mxnet.incubator.apache.org/versions/master/tutorials/basic/reshape_transpose.html
class HybridNet(nn.Module):
    
    def __init__(self, pesudo_input, num_filters, filter_size, rnn_size, fc_out, dp1, dp2, 
                 num_rnn_layers=1, rnn_dropout=0):
        super(HybridNet, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=4, out_channels=num_filters, kernel_size=filter_size)
        out = self.conv1(pesudo_input)
        out = nn.MaxPool1d(kernel_size=5, stride=5)(out)
        ####################################################
        print('shape after conv1d {}'.format(out.shape))
        N, C, T = out.shape
        out = torch.transpose(out, 1, 2)
        print('shape before lstm {}'.format(out.shape))
        ####################################################
        # or input_size*seq_len
        self.bi_lstm = nn.LSTM(input_size=C, hidden_size=rnn_size, num_layers=num_rnn_layers,
                              batch_first=True, dropout=rnn_dropout, bidirectional=True)
        out, _ = self.bi_lstm(out)
        print('shape after lstm {}'.format(out.shape))
        N, T, C = out.shape
        out = torch.transpose(out, 1, 2)
        out = out.reshape(N, -1)
        print('shape after flattening {}'.format(out.shape))
        self.fc1 = nn.Linear(T*C, fc_out, bias=True)
        self.fc2 = nn.Linear(fc_out, 1)
        self.p1 = dp1
        self.p2 = dp2
        
    def forward(self, seq):
        self.activation_seq = F.relu(self.conv1(seq))
        out = nn.MaxPool1d(kernel_size=5, stride=5)(self.activation_seq)
        out = nn.Dropout(p=self.p1)(out)
        
        #################################################################################
        # Input of LSTM layer should have shape (sequence_length, batch_size, input_size)
        #     - Sequence length here should be the length of activation after downsampling
        #     - Input size should be the number of filters
        #################################################################################
        N, C, T = out.shape
#         out = out.view(bs, 1, -1)
        out = torch.transpose(out, 1, 2)
        out, _ = self.bi_lstm(out)
        out = F.relu(out)
        
        #################################################################################
        # Need to flatten the sequence before feeding them into fully connected layer
        #################################################################################
        N, T, C = out.shape
        out = torch.transpose(out, 1, 2)
        out = out.reshape(N, -1)
        out = nn.Dropout(p=self.p2)(out)
        out = self.fc1(out)
        out = F.relu(out)
        out = self.fc2(out)
        out = torch.squeeze(out)
        return nn.Sigmoid()(out)

In [10]:
batch_size = 32
pesudo_input = torch.rand(batch_size, C, L, dtype=train_x.dtype)
hybrid_net = HybridNet(pesudo_input,
                       num_filters=n, 
                       filter_size=m, 
                       rnn_size=10, 
                       fc_out=20, 
                       dp1=0.6, dp2=0.7).to(device)

shape after conv1d torch.Size([32, 17016, 205])
shape before lstm torch.Size([32, 205, 17016])
shape after lstm torch.Size([32, 205, 20])
shape after flattening torch.Size([32, 4100])


In [11]:
def train(model, train_dataset, val_dataset, config):
    # Unpack config
    epochs = config['epochs']
    device = config['device']
    optimizer = config['opt']
    criterion = config['criterion']
    log_interval = config['log_interval']
    batch_size = config['batch_size']
    
    def get_acc(y_hat, y):
        y_pred = np.where(y_hat >=0.5, 1, 0)
        return np.mean(y_pred == y)
    
    # Generate data loaders
    val_loader = None
    train_loader = data.DataLoader(train_dataset, batch_size=batch_size)
    if val_dataset is not None:
        val_loader = data.DataLoader(val_dataset, batch_size=batch_size)
    total_train_steps = len(train_loader)
    total_val_steps = len(val_loader)
    
    train_loss_list, val_loss_list = [], []
    train_acc_list, val_acc_list = [], []
    print("Train on {} samples, validate on {} samples".format(len(train_dataset), len(val_dataset)))
    # Start training
    for epoch in range(1, epochs+1):
        train_loss_sum, train_acc_sum = 0, 0
        tic = time.time()
        for i, (batch, labels) in enumerate(train_loader):
            # Forward pass and calculating loss
            batch, labels = batch.to(device), labels.to(device)
            y_hat = model(batch)
            loss = criterion(y_hat, labels)
            
            # Backward pass and updating weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss_sum += loss.item()
            train_acc_sum += get_acc(y_hat.cpu().detach().numpy(), labels.cpu().detach().numpy())
        tac = time.time()
        avg_train_loss = train_loss_sum/total_train_steps
        avg_train_acc = train_acc_sum/total_train_steps
        train_loss_list.append(avg_train_loss)
        train_acc_list.append(avg_train_acc)
        print('***************************************')
        print('Epoch {}: training loss {}, training acc {}'.format(epoch, avg_train_loss, avg_train_acc))
        print('Time: {} \n'.format(tac-tic))
        
        # Validation
        if val_loader is not None:
            if epoch % log_interval == 0 or epoch == epochs:
                with torch.no_grad():
                    val_loss_sum, val_acc_sum = 0, 0
                    for j, (batch, labels) in enumerate(val_loader):
                        batch, labels = batch.to(device), labels.to(device)
                        y_hat = model(batch)
                        loss = criterion(y_hat, labels)
                        val_loss_sum += loss.item()
                        val_acc_sum += get_acc(y_hat.cpu().detach().numpy(), labels.cpu().detach().numpy())
                    avg_val_loss = val_loss_sum/total_val_steps
                    avg_val_acc = val_acc_sum/total_val_steps
                    val_loss_list.append(avg_val_loss)
                    val_acc_list.append(avg_val_acc)
                    print('[Validation loss {}, validation acc {}] \n'.format(avg_val_loss, avg_val_acc))
    return model, train_loss_list, val_loss_list, train_acc_list, val_acc_list

In [12]:
optimizers = {'adam': torch.optim.Adam(hybrid_net.parameters(), lr=1e-3),
              'rmsprop': torch.optim.RMSprop(hybrid_net.parameters(), lr=1e-3, weight_decay=1e-4)}
config = {'epochs':200, 'device':device, 
          'opt': optimizers['adam'],
          'criterion':nn.BCELoss(),
          'batch_size': batch_size,
          'log_interval':1}
C, L = train_x[0].shape

In [13]:
hybrid_net, train_loss_list, val_loss_list, train_acc_list, val_acc_list = train(hybrid_net, train_dataset, val_dataset, config)

Train on 68016 samples, validate on 17016 samples


KeyboardInterrupt: 