In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from lookahead import Lookahead
from models_n_training import train
import torch
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
import time
import math
from utilities import sampling, one_hot_encoding, curtail, get_training_data, load_data, data_split, dianostic_plots, pad_for_detector
from transformer import*
# Pick the compute device once: the first CUDA GPU when one is visible,
# otherwise the CPU. The GPU name is echoed so the notebook records the hardware.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
    print(torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")

Tesla K80


In [3]:
# Directory holding the pickled train/validation splits.
DATA_DIR = '/home/ubuntu/data/dmel_seq'


def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    loading finishes, even if unpickling raises.

    SECURITY: pickle can execute arbitrary code while loading; only use this
    on files from a trusted source.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


train_x = _load_pickle(os.path.join(DATA_DIR, 'train_tk_x_ex.np'))
train_y = _load_pickle(os.path.join(DATA_DIR, 'train_tk_y_ex.np'))
val_x = _load_pickle(os.path.join(DATA_DIR, 'val_tk_x_ex.np'))
val_y = _load_pickle(os.path.join(DATA_DIR, 'val_tk_y_ex.np'))

In [4]:
# torch.as_tensor wraps a NumPy array without copying (the same result as
# torch.from_numpy) and returns an existing tensor unchanged, so this cell can
# be re-run safely even though it rebinds train_x / val_x / train_y / val_y.
train_x, val_x = torch.as_tensor(train_x), torch.as_tensor(val_x)
train_y, val_y = torch.as_tensor(train_y), torch.as_tensor(val_y)

# Generate dataset for data loader
train_dataset = data.TensorDataset(train_x, train_y)
val_dataset = data.TensorDataset(val_x, val_y)
train_x.shape, train_y.shape, val_x.shape, val_y.shape

(torch.Size([11408, 3388]),
 torch.Size([11408]),
 torch.Size([2854, 3388]),
 torch.Size([2854]))

In [5]:
# reference: http://nlp.seas.harvard.edu/2018/04/03/attention.html
# class PositionalEncoding(nn.Module):
#     "Implement the PE function."
#     def __init__(self, d_model, dropout, max_len=5000):
#         super(PositionalEncoding, self).__init__()
#         self.dropout = nn.Dropout(p=dropout)
        
#         # Compute the positional encodings once in log space.
#         pe = torch.zeros(max_len, d_model)
#         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
# #         div_term = torch.exp(torch.arange(0, d_model, 2) *
# #                              -(math.log(10000.0) / d_model))
#         div_term = 1 / (10000 ** (torch.arange(0., d_model, 2) / d_model)) 
#         pe[:, 0::2] = torch.sin(position * div_term).cuda()
#         pe[:, 1::2] = torch.cos(position * div_term).cuda()
#         pe = pe.unsqueeze(0)
#         self.register_buffer('pe', pe)
        
#     def forward(self, x):
#         x = x + torch.Tensor(self.pe[:, :x.size(1)]).cuda()
#         return self.dropout(x)

# https://towardsdatascience.com/how-to-code-the-transformer-in-pytorch-24db27c8f9ec
class PositionalEncoder(nn.Module):
    """Scale embeddings by sqrt(d_model) and add a fixed sinusoidal position table.

    Args:
        d_model: Size of the last (embedding) dimension of the input.
        dropout: Accepted for signature compatibility only; this module does
            not apply dropout.
        max_seq_len: Number of positions for which encodings are precomputed.

    The table is registered as a buffer named ``pe`` with shape
    ``(1, max_seq_len, d_model)``, so it follows the module through
    ``.to(device)`` / ``.cuda()`` instead of being pinned to a CUDA device at
    construction time. As a buffer it also appears in ``state_dict()``.

    NOTE(review): the exponent is ``2 * column / d_model`` with ``column`` being
    the actual column index, and the cosine column uses its own (odd) index.
    This reproduces the formulation this class was adapted from; it is not the
    ``2k / d_model`` pairing of the original Transformer paper. It is kept
    unchanged here so the numerical behaviour of the model is preserved.
    """

    def __init__(self, d_model, dropout, max_seq_len = 5000):
        super().__init__()
        self.d_model = d_model

        # Build the table in float64 so the values agree with a scalar
        # math.sin / math.cos evaluation before casting to the default dtype.
        position = torch.arange(max_seq_len, dtype=torch.float64).unsqueeze(1)
        sin_cols = torch.arange(0, d_model, 2, dtype=torch.float64)
        cos_cols = torch.arange(1, d_model, 2, dtype=torch.float64)

        pe = torch.zeros(max_seq_len, d_model, dtype=torch.float64)
        # Even columns hold sines, odd columns hold cosines. Indexing the two
        # column sets separately also makes an odd d_model work.
        pe[:, 0::2] = torch.sin(position / (10000.0 ** (2.0 * sin_cols / d_model)))
        pe[:, 1::2] = torch.cos(position / (10000.0 ** (2.0 * cos_cols / d_model)))

        # Leading axis of size 1 broadcasts over the batch dimension.
        pe = pe.to(torch.get_default_dtype()).unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x * sqrt(d_model)`` plus the encodings for its positions.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model``.

        Raises:
            ValueError: If the sequence is longer than ``max_seq_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                "sequence length {} exceeds max_seq_len {}".format(seq_len, self.pe.size(1))
            )
        # make embeddings relatively larger
        x = x * math.sqrt(self.d_model)
        # add only the first seq_len position rows, so shorter inputs work
        return x + self.pe[:, :seq_len]

In [6]:
# Self-attention encoder net
class SAENet(nn.Module):
    """Self-attention encoder: embed tokens, add positions, encode, project.

    Args:
        d_model: Embedding width; also the output width of the final linear layer.
        vocab: Number of distinct token ids accepted by the embedding table.
        nhead: Passed to ``TransformerEncoderLayer`` as its second argument.
        num_layers: Passed to ``TransformerEncoder`` as its second argument.
        fc_dim: Hidden width of the two-layer feed-forward projection.
        dropout: Dropout probability used between the two linear layers (and
            handed to ``PositionalEncoder``).
        max_seq_len: Number of positions the positional encoder precomputes.
            Defaults to 3388, the value that was previously hard-coded.

    NOTE(review): ``dropout`` is not forwarded to ``TransformerEncoderLayer``;
    whether that layer accepts a dropout argument is not visible here.
    """

    def __init__(self, d_model, vocab, nhead, num_layers, fc_dim, dropout=0.1,
                 max_seq_len=3388):
        super(SAENet, self).__init__()

        encoder_layer = TransformerEncoderLayer(d_model, nhead)
        self.encoder = TransformerEncoder(encoder_layer, num_layers)
        self.src_embed = nn.Embedding(vocab, d_model)
        self.position = PositionalEncoder(d_model, dropout, max_seq_len=max_seq_len)
        self.fc1 = nn.Linear(d_model, fc_dim)
        self.fc2 = nn.Linear(fc_dim, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, seq):
        """Encode a batch of token-id sequences.

        Args:
            seq: Integer tensor of token ids (input to ``nn.Embedding``).

        Returns:
            Tensor whose last dimension is ``d_model``.
        """
        seq = self.src_embed(seq)
        seq = self.position(seq)
        seq = self.encoder(seq)
        # Position-wise feed-forward: d_model -> fc_dim -> d_model.
        out = self.fc2(self.dropout(F.relu(self.fc1(seq))))
        return out

In [7]:
class TransformerClassifier(nn.Module):
    """Binary sequence classifier built on top of ``SAENet``.

    The encoder output is reduced to one score per position by ``fc``, and the
    per-position scores are combined by ``head`` into a single probability per
    sequence.

    Args:
        d_model, vocab, nhead, num_layers, fc_dim: Forwarded to ``SAENet``.
        dropout: Dropout probability applied to the per-position scores. It is
            not forwarded to ``SAENet``, which is built with its own default.

    ``head`` is ``nn.Linear(3388, 1)``, so inputs must have exactly 3388
    positions.
    """

    def __init__(self, d_model, vocab, nhead, num_layers, fc_dim, dropout=0.5):
        super(TransformerClassifier, self).__init__()
        self.encoder = SAENet(d_model, vocab, nhead, num_layers, fc_dim)
        self.fc = nn.Linear(d_model, 1)
        self.head = nn.Linear(3388, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, seq):
        """Return one probability in (0, 1) per input sequence.

        Returns:
            Tensor of shape ``(batch,)``; the batch axis is kept even when the
            batch holds a single sequence.
        """
        seq = self.encoder(seq)
        # Squeeze only the trailing singleton axis. A bare torch.squeeze would
        # also drop the batch axis when batch size is 1, yielding a 0-d output
        # that no longer lines up with the labels.
        seq = self.dropout(F.relu(self.fc(seq))).squeeze(-1)
        seq = self.head(seq).squeeze(-1)
        return torch.sigmoid(seq)

In [8]:
# Build the classifier with the hyperparameters for this run, then move its
# parameters to the selected device.
tclassifier = TransformerClassifier(
    d_model=100,
    vocab=5,
    nhead=5,
    num_layers=1,
    fc_dim=150,
)
tclassifier = tclassifier.to(device)

In [9]:
# Smoke test: push the first training batch through the model without
# tracking gradients and print the output shape.
train_loader = data.DataLoader(train_dataset, batch_size=64)
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    with torch.no_grad():
        batch, labels = (tensor.to(device) for tensor in first_batch)
        print(tclassifier(batch).shape)

torch.Size([64])


In [10]:
# Lookahead wraps a base optimizer (any optimizer); here the base is Adam.
base_opt = torch.optim.Adam(
    tclassifier.parameters(),
    lr=1e-4,
    betas=(0.9, 0.999),
    weight_decay=1e-5,
)
lookahead = Lookahead(base_opt, k=5, alpha=0.5)

# Optimizers available for this model, keyed by name; `config` selects one.
optimizers = dict(
    adam=torch.optim.Adam(tclassifier.parameters(), lr=1e-3, weight_decay=1e-4),
    rmsprop=torch.optim.RMSprop(tclassifier.parameters(), lr=1e-3, weight_decay=1e-4),
    lookahead=lookahead,
)

# Settings passed to `train`.
config = dict(
    epochs=500,
    device=device,
    opt=optimizers['lookahead'],
    criterion=nn.BCELoss(),
    batch_size=64,
    log_interval=1,
)

In [11]:
# Start training with the settings in `config` (500 epochs requested). In the
# recorded output below, one epoch took about 185 s and the run was stopped by
# hand (KeyboardInterrupt) after the first epoch.
train(tclassifier, train_dataset, val_dataset, config)

Train on 11408 samples, validate on 2854 samples
***************************************
Epoch 1: training loss 0.69268559976663, training acc 0.5334322625698324
Time: 185.32201981544495 

[Validation loss 0.6926648179690044, validation acc 0.5322368421052631] 



KeyboardInterrupt: 