In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from sentimentanalyser.utils.data  import Path, pad_collate, grandparent_splitter
from sentimentanalyser.utils.data  import parent_labeler, listify
from sentimentanalyser.data.text   import TextList, ItemList, SplitData
from sentimentanalyser.utils.files import pickle_dump, pickle_load

from sentimentanalyser.preprocessing.processor import TokenizerProcessor, NuemericalizeProcessor
from sentimentanalyser.preprocessing.processor import CategoryProcessor

In [3]:
from functools import partial

In [4]:
# Root folder of the IMDB dataset; TextList.from_files below reads its 'train' and 'test' folders.
# NOTE(review): absolute, machine-specific path — the notebook only runs unchanged on a machine
# that has the dataset at exactly this location.
path_imdb = Path("/home/anukoolpurohit/Documents/AnukoolPurohit/Datasets/imdb")

In [5]:
# Processors handed to label_by_func below: the tokenizer followed by the numericalizer
# for the inputs (proc_x), and the category processor for the labels (proc_y).
proc_tok = TokenizerProcessor()
proc_num = NuemericalizeProcessor()
proc_cat = CategoryProcessor()

In [6]:
# Collect the items found under the 'train' and 'test' folders of path_imdb.
tl_imdb = TextList.from_files(path=path_imdb, folders=['train','test'])
# Split with grandparent_splitter, using 'test' as the validation name
# (presumably items whose grandparent folder is 'test' form the validation set — confirm in utils.data).
sd_imdb = tl_imdb.split_by_func(partial(grandparent_splitter, valid_name='test'))
# Label with parent_labeler (presumably the parent folder name — confirm) and run the
# processors: tokenize + numericalize the inputs, categorize the labels.
ll_imdb = sd_imdb.label_by_func(parent_labeler, proc_x=[proc_tok, proc_num], proc_y=proc_cat)

HBox(children=(IntProgress(value=0, max=13), HTML(value='')))




HBox(children=(IntProgress(value=0, max=13), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [7]:
# Persist the labelled dataset to a pickle file; the path is relative to the working directory.
pickle_dump(ll_imdb, 'dumps/variable/ll_imdb.pickle')

In [8]:
# Reload the labelled dataset from the pickle file written in the previous cell.
# NOTE(review): pickle_load presumably unpickles the file — only load pickles you created yourself.
ll_imdb = pickle_load('dumps/variable/ll_imdb.pickle')

In [9]:
# Build the data bunch used for classification. 64 is presumably the batch size (the batch
# inspected further down has a leading dimension of 64) — confirm against clas_databunchify.
imdb_data = ll_imdb.clas_databunchify(64)

In [12]:
from sentimentanalyser.utils.callbacks import sched_cos, combine_scheds
from sentimentanalyser.callbacks.training import LR_Find, CudaCallback
from sentimentanalyser.callbacks.progress import ProgressCallback
from sentimentanalyser.callbacks.scheduler import ParamScheduler
from sentimentanalyser.callbacks.stats import AvgStatsCallback
from sentimentanalyser.callbacks.recorder import Recorder
from sentimentanalyser.training.trainer import Trainer

In [13]:
from sentimentanalyser.utils.metrics import accuracy
from sentimentanalyser.callbacks.core import Callback

In [14]:
import torch
import torchtext
import torch.nn.functional as F
from torch import nn

#### Some helper functions and variables

In [15]:
def get_lens_and_masks(x, pad_id=1):
    """Return the per-row count of non-padding entries and the padding mask of `x`.

    Args:
        x: tensor with at least two dimensions; entries equal to `pad_id` are
           treated as padding (presumably a (batch, seq_len) tensor of token ids).
        pad_id: value that marks padding.

    Returns:
        (lengths, mask): `lengths` is `x.size(1)` minus the number of padding
        entries counted along dimension 1; `mask` is the element-wise
        `x == pad_id` comparison.
    """
    is_pad = x == pad_id
    n_real = x.size(1) - is_pad.sum(1)
    return n_real, is_pad

In [16]:
def print_dims(name, tensor):
    """Print the shape of `tensor`, labelled with `name` (debugging helper)."""
    shape = tensor.shape
    print(f'size of {name} is {shape}')

In [17]:
# Grab a single batch from the training dataloader; x1 is used further down for shape checks.
x1, y1 = next(iter(imdb_data.train_dl))

## Models

### Simple LSTM model

In [20]:
class SimpleLSTMModel(nn.Module):
    """Embedding -> dropout -> LSTM -> linear classifier over padded token ids.

    The final hidden state of the last LSTM layer is passed to the linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: width of each embedding vector (the LSTM input size).
        hidden_size: LSTM hidden state size.
        output_size: number of output logits.
        dropout_rate: dropout probability applied to the embeddings.
        pad_idx: token id used for padding; used both as the embedding
            `padding_idx` and to compute the sequence lengths.
        num_layers: number of stacked LSTM layers.
    """
    def __init__(self, vocab_size=proc_num.vocab_size, embedding_size=50,
                 hidden_size=50, output_size=2, dropout_rate=0.5,
                 pad_idx=1, num_layers=1):
        super().__init__()
        self.pad_idx   = pad_idx
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        # The LSTM consumes the embeddings, so its input width must be
        # embedding_size (it was hidden_size, which only worked when both were equal).
        self.lstm      = nn.LSTM(embedding_size, hidden_size,
                                 num_layers=num_layers,
                                 batch_first=True)
        self.dropout   = nn.Dropout(dropout_rate)
        self.linear    = nn.Linear(hidden_size, output_size)

    def forward(self, xb):
        """Return logits for a batch of padded token ids `xb`.

        `xb` is indexed as (batch, seq_len) by get_lens_and_masks and by the
        batch_first LSTM. With the default `enforce_sorted=True` of
        pack_padded_sequence the rows must be ordered by decreasing length —
        presumably guaranteed by the collate function; confirm.
        """
        seq_lens, _ = get_lens_and_masks(xb, self.pad_idx)

        embedded = self.dropout(self.embedding(xb))

        # pack_padded_sequence expects the lengths on the CPU even when the batch is on the GPU.
        packed_emb = nn.utils.rnn.pack_padded_sequence(embedded, seq_lens.cpu(),
                                                       batch_first=True)

        _, (hidden_state, _) = self.lstm(packed_emb)
        # hidden_state is (num_layers, batch, hidden_size): keep the last layer only,
        # which equals squeeze(0) for a single layer and stays valid for deeper stacks.
        return self.linear(hidden_state[-1])

### Bidirectional multi-layered LSTM

In [21]:
class WordSentenceAttention(nn.Module):
    """Attention pooling that collapses a sequence of vectors into one vector.

    Each time step is projected through a linear layer and tanh, scored against
    a learned context vector, and the softmax of those scores weights the sum
    of the original time-step vectors.

    Args:
        hidden_sz: size of the last dimension of the vectors being pooled.
    """
    def __init__(self, hidden_sz):
        super().__init__()
        self.hidden_sz = hidden_sz
        self.context_weight     = nn.Parameter(torch.empty(hidden_sz).uniform_(-0.1, 0.1))
        self.context_projection = nn.Linear(hidden_sz, hidden_sz)

    def forward(self, context, mask=None):
        """Pool `context` (bs x seq_len x hidden_sz) into bs x hidden_sz.

        Args:
            context: tensor of shape bs x seq_len x hidden_sz.
            mask: optional boolean tensor of shape bs x seq_len; positions that
                are True (e.g. padding) are excluded from the softmax. When
                omitted every position takes part, as before. A row that is
                masked everywhere yields NaN, so keep at least one position.

        Returns:
            Tensor of shape bs x hidden_sz holding the attention-weighted sums.
        """
        context_proj = torch.tanh(self.context_projection(context))
        scores = context_proj.matmul(self.context_weight)  # bs x seq_len
        if mask is not None:
            # -inf scores become exactly zero weight after the softmax.
            scores = scores.masked_fill(mask, float('-inf'))
        attn_score = F.softmax(scores, dim=1).unsqueeze(2)  # bs x seq_len x 1
        # (bs x hidden_sz x seq_len) @ (bs x seq_len x 1) -> bs x hidden_sz x 1
        return context.transpose(1, 2).bmm(attn_score).squeeze(2)

In [22]:
class BiLSTMModel(nn.Module):
    """Embedding -> dropout -> (bi)LSTM -> attention pooling -> linear classifier.

    Args:
        vocab_sz: number of rows in the embedding table.
        embed_sz: width of each embedding vector.
        hidden_sz: LSTM hidden state size per direction.
        output_sz: number of output logits.
        dropout: dropout probability applied to the embeddings and, when
            `num_layers > 1`, between the stacked LSTM layers.
        pad_idx: token id used for padding.
        num_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
    """
    def __init__(self, vocab_sz=proc_num.vocab_size, embed_sz= 50,
                 hidden_sz=50, output_sz=2, dropout=0.5, pad_idx=1,
                 num_layers=2, bidirectional=True):
        super().__init__()

        self.pad_idx = pad_idx
        # The attribute name keeps its original spelling so saved state-dict keys stay valid.
        self.embededing = nn.Embedding(vocab_sz, embed_sz, padding_idx=pad_idx)
        self.dropout    = nn.Dropout(dropout)

        # nn.LSTM only applies dropout between stacked layers and warns when a
        # non-zero value is combined with a single layer, so pass 0 in that case.
        self.rnn        = nn.LSTM(embed_sz, hidden_sz, batch_first=True,
                                  bidirectional=bidirectional,
                                  dropout=dropout if num_layers > 1 else 0.0,
                                  num_layers=num_layers)

        self.bidir  = 2 if bidirectional else 1

        self.attn   = WordSentenceAttention(hidden_sz*self.bidir)

        self.linear = nn.Linear(self.bidir*hidden_sz, output_sz)

    def forward(self, xb):
        """Return logits for a batch of padded token ids `xb`.

        `xb` is indexed as (batch, seq_len). With the default
        `enforce_sorted=True` of pack_padded_sequence the rows must be ordered
        by decreasing length — presumably guaranteed by the collate function; confirm.
        """
        seq_lens, _ = get_lens_and_masks(xb, self.pad_idx)

        # self.dropout was constructed but never used; apply it to the embeddings,
        # as SimpleLSTMModel does. It is the identity in eval mode.
        embeded = self.dropout(self.embededing(xb))
        # pack_padded_sequence expects the lengths on the CPU even when the batch is on the GPU.
        packed_embd = nn.utils.rnn.pack_padded_sequence(embeded, seq_lens.cpu(),
                                                        batch_first=True)
        packed_out, _ = self.rnn(packed_embd)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        # NOTE(review): only `outputs` is handed to the attention layer, so padded
        # time steps take part in its softmax as well.
        sentence = self.attn(outputs)
        return self.linear(sentence)

## Testing

In [23]:
# Instantiate the bidirectional LSTM with its default hyper-parameters for experimentation.
# This instance is not the one trained below: the Trainer builds its own via get_basic.
model = BiLSTMModel()

In [32]:
# Scratch embedding table (50000 rows, 100 dimensions) used only for the shape check in the next cell.
a = nn.Embedding(50000, 100)

In [33]:
# Embed the sample batch and display the resulting shape (last dimension is the embedding width, 100).
a(x1).shape

torch.Size([64, 3352, 100])

## Training

### Gradient Clipping

In [None]:
class GradientClipping(Callback):
    """Callback that clips the gradient norm of the trainer's model.

    Args:
        clip: maximum gradient norm; a falsy value (None or 0) disables clipping.
    """
    def __init__(self, clip=None):
        self.clip = clip

    def after_backward(self):
        """Clip the gradient norm of `self.trainer.model` in place when `clip` is set.

        Presumably invoked by the Trainer once gradients have been computed —
        confirm against the Callback base class.
        """
        if not self.clip:
            return
        params = self.trainer.model.parameters()
        nn.utils.clip_grad_norm_(params, self.clip)

In [None]:
def get_basic(Model, num_layers=1, lr=1e-3):
    """Build a model together with its loss function and optimizer.

    Args:
        Model: callable that returns an `nn.Module` and accepts a
            `num_layers` keyword argument.
        num_layers: forwarded to `Model`.
        lr: learning rate given to the Adam optimizer (1e-3 by default, the
            value that used to be hard-coded).

    Returns:
        (model, loss_func, opt): the instantiated model, an
        `nn.CrossEntropyLoss` and a `torch.optim.Adam` over the model parameters.
    """
    model = Model(num_layers=num_layers)
    loss_func = nn.CrossEntropyLoss()
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    return model, loss_func, opt

In [None]:
# Learning-rate schedule combining two cosine segments, used by ParamScheduler below.
# Presumably [0.3, 0.7] are the fractions of training given to each segment, the first going
# from 3e-3 to 1e-2 and the second from 1e-2 to 3e-5 — confirm against combine_scheds / sched_cos.
sched = combine_scheds([0.3, 0.7], [sched_cos(3e-3, 1e-2), sched_cos(1e-2, 3e-5)])

In [None]:
# Assemble the Trainer: the data, then (model, loss_func, opt) unpacked from get_basic,
# plus the callback factories. get_basic defaults to num_layers=1, so this trains a
# single-layer BiLSTMModel.
trainer = Trainer(imdb_data, *get_basic(BiLSTMModel), cb_funcs=[
    partial(AvgStatsCallback, [accuracy]),   # [accuracy] is bound as the first positional argument
    partial(ParamScheduler,'lr', sched),     # binds 'lr' and the schedule defined above
    partial(GradientClipping, clip=0.1),     # clip the gradient norm to 0.1
    ProgressCallback,
    CudaCallback,
    Recorder
])

In [None]:
# Start training; fit is called without arguments, so the Trainer's own defaults apply.
trainer.fit()

In [None]:
# Display the `vocab` attribute of the numericalize processor.
proc_num.vocab