# ULMFiT Notebook

This notebook assumes that you have finished finetuning the language model using the LM training scripts.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils

import numpy as np
import pandas as pd
from tqdm import tqdm

from finetuning import one_cycle
from utils import produce_dataloaders, count_parameters, drop_mult, get_param_groups
from layers import AWDLSTMEncoder, ConcatPoolingDecoder, RNNClassifier
#from transformers import WarmupLinearSchedule

# One seed shared by every random number generator seeded below.
SEED = 42

# Use the GPU when PyTorch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp /content/drive/MyDrive/imdb_finetuned_part.pth /content

In [None]:
!cp /content/drive/MyDrive/cache.pth /content

In [None]:
from google.colab import files 
upload = files.upload()
!mkdir -p ~/.kaggle && mv kaggle.json ~/.kaggle/

Saving kaggle.json to kaggle.json


In [None]:
!kaggle datasets download -d jcblaise/imdb-sentiments

Downloading imdb-sentiments.zip to /content
 95% 35.0M/36.9M [00:00<00:00, 69.5MB/s]
100% 36.9M/36.9M [00:00<00:00, 104MB/s] 


In [None]:
!unzip imdb-sentiments.zip

Archive:  imdb-sentiments.zip
  inflating: test.csv                
  inflating: test.txt                
  inflating: train.csv               
  inflating: train.txt               
  inflating: valid.txt               


We load the dataset and split it into training and validation sets.

In [None]:
# Read the training CSV and shuffle its rows with a fixed seed.
df = pd.read_csv('train.csv').sample(frac=1, random_state=42)
text = list(df['text'])
sentiment = list(df['sentiment'])

# The first 70% of the shuffled rows are used for training, the rest for validation.
tr_sz = int(len(text) * 0.7)

X_train, X_val = text[:tr_sz], text[tr_sz:]
y_train, y_val = sentiment[:tr_sz], sentiment[tr_sz:]

We just need to tokenize our dataset. We use spaCy for this.

In [None]:
import spacy
# Load the English pipeline by its package name; the bare 'en' shortcut link
# is no longer resolved by spaCy 3 and later.
en = spacy.load('en_core_web_sm')

def tokenize(t):
    """Split a raw text string into a list of token strings.

    Only the pipeline's tokenizer is called. The caller uses nothing but the
    token texts, so running the remaining pipeline components on every
    document would only add time.
    NOTE(review): assumes no component in the loaded pipeline re-tokenizes
    the document; that holds for the stock English pipelines, but confirm if
    a custom pipeline is loaded.

    Args:
        t: The text to tokenize.

    Returns:
        A list holding the text of each token, in document order.
    """
    return [token.text for token in en.tokenizer(t)]

The next cell will take a while. We'll save the result so we can just load the tokenized data in the future.

In [None]:
# Tokenize every document in both splits; tqdm shows progress for this slow pass.
X_train = list(map(tokenize, tqdm(X_train)))
X_val = list(map(tokenize, tqdm(X_val)))



100%|██████████| 17500/17500 [13:22<00:00, 21.81it/s]
 99%|█████████▉| 7419/7500 [05:37<00:03, 22.32it/s]

In [None]:
# Persist both tokenized splits so a later session can skip tokenization.
torch.save([X_train, X_val], 'cache.pth')

In [None]:
!kaggle datasets download -d abee82/fastai-wikitext-wt103-pretrained-model

Downloading fastai-wikitext-wt103-pretrained-model.zip to /content
 96% 198M/206M [00:01<00:00, 121MB/s]
100% 206M/206M [00:01<00:00, 129MB/s]


In [None]:
!unzip fastai-wikitext-wt103-pretrained-model.zip


Archive:  fastai-wikitext-wt103-pretrained-model.zip
  inflating: wt103-bwd/itos_wt103.pkl  
  inflating: wt103-bwd/lstm_bwd.pth  
  inflating: wt103-fwd/itos_wt103.pkl  
  inflating: wt103-fwd/lstm_fwd.pth  


Load the data.

In [None]:
# Restore the tokenized train/validation splits stored in cache.pth.
cached_splits = torch.load('cache.pth')
X_train, X_val = cached_splits

In [None]:
import pickle


# Unpickle the contents of the pretrained model's itos file.
# NOTE: pickle.load can execute arbitrary code; only use it on files from a trusted source.
itos_path = '/content/wt103-fwd/itos_wt103.pkl'
with open(itos_path, 'rb') as itos_file:
    mylist = pickle.load(itos_file)

In [None]:
# Start from empty lookup tables; the following cells fill them in.
word2idx, idx2word = {}, {}

In [None]:
# Map each entry of the loaded list to its position in that list.
word2idx = {}
for position, word in enumerate(mylist):
    word2idx[word] = position

In [None]:
# Invert word2idx so that an index looks up its word.
idx2word = {index: word for word, index in word2idx.items()}

In [None]:
# Append '<unk>' and '<pad>' at the two indices directly after the loaded entries.
# NOTE(review): the printed model shows padding_idx=1 while '<pad>' is placed at
# len(mylist) + 1 here — confirm which index the dataloaders pad with.
unk_idx = len(mylist)
pad_idx = unk_idx + 1
word2idx.update({'<unk>': unk_idx, '<pad>': pad_idx})
idx2word.update({unk_idx: '<unk>', pad_idx: '<pad>'})

We'll truncate the data to a maximum sequence length and pad shorter sequences. We also opt to drop the last batch, which has an irregular batch size.

In this step, we load the vocabulary of the finetuned language model.

In [None]:
# Maximum sequence length and batch size passed to the dataloader helper.
msl = 512
bs = 64

# Set of known words. idx2word is a dict keyed by integer index, so iterating
# it yields indices, not words; the words themselves are the keys of word2idx.
# NOTE(review): presumably produce_dataloaders tests each string token for
# membership in vocab_set — verify in utils.
vocab_set = set(word2idx)

# Produce dataloaders
train_loader, val_loader = produce_dataloaders(X_train, y_train, X_val, y_val,
                                               word2idx, vocab_set, msl, bs, drop_last=True)

100%|██████████| 17500/17500 [00:02<00:00, 5935.39it/s]
100%|██████████| 7500/7500 [00:01<00:00, 5907.51it/s]


In [None]:
# Save the (word2idx, idx2word) pair to vocab_obj.pth, the same file name that
# args.vocab_file holds.
torch.save((word2idx,idx2word), 'vocab_obj.pth')

In [None]:
len(idx2word)

60002

In [None]:
class args:
    """Plain namespace of settings, read as class attributes (e.g. ``args.emb_dim``).

    In the visible cells only the model-size, dropout and ``tie_weights``
    entries are read, when the encoder is built. The remaining entries are
    not referenced in this notebook.
    NOTE(review): they look like options for the language-model training
    scripts — confirm before relying on them.
    """

    # Data locations and output name
    path = '/content'
    train = 'train.txt'
    valid = 'valid.txt'
    test = 'test.txt'
    output = 'imdb_finetuned_part'

    # Batching
    bs = 70
    eval_bs = 10
    bptt = 70
    use_var_bptt = False
    rebuild_dataset = False

    # Vocabulary
    load_vocab = True
    vocab_file = 'vocab_obj.pth'
    save_vocab = True

    # Architecture
    encoder = 'awd_lstm'
    decoder = 'dropoutLinear'
    emb_dim = 400
    hidden_dim = 1152
    num_layers = 3
    emb_dp = 0.1
    hidden_dp = 0.3
    input_dp = 0.3
    weight_dp = 0.5
    out_dp = 0.4
    initrange = 0.05
    tie_weights = False
    dm = 1
    use_pretrained = True

    # Training schedule. `epochs` used to be assigned twice (10, then 2); only
    # the later value ever took effect, so the single definition keeps 2.
    epochs = 2
    clip = 0.25
    alpha = 2.0
    beta = 1.0
    anneal_factor = 4
    no_warmup = False
    warmup_pct = 0.1
    disc_rate = 1.0

    no_cuda = False

    # Pretrained weights and optimisation
    pretrained_file = 'wt103-fwd/lstm_fwd.pth'
    freeze_encoder = True
    optimizer = 'adam'
    no_lr_scaling = True
    lr = 1e-2
    gpu = 0
    save_graphs = True
    seed = 42



We construct the model and load the pretrained weights, scaling the dropout rates.

In [None]:
# Build the encoder from the vocabulary size and the sizes/dropout rates held in args.
encoder_kwargs = dict(
    vocab_sz=len(idx2word),
    emb_dim=args.emb_dim,
    hidden_dim=args.hidden_dim,
    num_layers=args.num_layers,
    emb_dp=args.emb_dp,
    weight_dp=args.weight_dp,
    input_dp=args.input_dp,
    hidden_dp=args.hidden_dp,
    tie_weights=args.tie_weights,
)
encoder = AWDLSTMEncoder(**encoder_kwargs)

In [None]:
# Classifier head; its hidden size is taken from the same setting the encoder
# was built with, so the two cannot drift apart.
decoder = ConcatPoolingDecoder(hidden_dim=args.hidden_dim, bneck_dim=50, out_dim=2)
model = RNNClassifier(encoder, decoder).to(device)

# Load weights. map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
with open('/content/imdb_finetuned_part.pth', 'rb') as f:
    state_dict = torch.load(f, map_location=device)

# strict=False tolerates key mismatches, so report them rather than discard
# the result: a silently unloaded encoder would train from random weights.
inc = model.load_state_dict(state_dict, strict=False)
if inc.missing_keys or inc.unexpected_keys:
    print(f'Missing keys: {inc.missing_keys}')
    print(f'Unexpected keys: {inc.unexpected_keys}')

# Scale dropout
model = drop_mult(model, dm=0.5)

We set the parameter groups for discriminative learning rates. We set up an optimizer with a default learning rate.

In [None]:
# Loss used for both training and validation.
criterion = nn.CrossEntropyLoss()

# Parameter groups come from the project helper; Adam receives them with a default lr.
p_groups = get_param_groups(model)
optimizer = optim.Adam(params=p_groups, lr=5e-3)

Then we set up the scheduling. Should we want to use linear warmups, we can supply it. If no scheduler is supplied to the ```one_cycle``` function, it uses Cyclic Learning Rates like in the paper.

In [None]:
scheduler = None
use_linear_warmup = False

if use_linear_warmup:
    epochs = 5
    steps = len(train_loader) * epochs
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=int(steps * 0.1), t_total=steps)

Freeze the model.

In [None]:
model.freeze()

And gradually unfreeze while finetuning.

```lr_decrease``` refers to how much the learning rate is decreased for lower layers in discriminative learning rates. In the ```one_cycle``` function, if the scheduler is set to ```None```, then it uses Cyclic Learning Rate scheduling, rising from 0 to the ```lr``` supplied to the function over the first ```stlr_warmup``` fraction of steps (default 0.1).

In [None]:
model

RNNClassifier(
  (encoder): AWDLSTMEncoder(
    (embeddings): Embedding(60002, 400, padding_idx=1)
    (emb_dp): EmbeddingDropout(
      (emb): Embedding(60002, 400, padding_idx=1)
    )
    (rnn): ModuleList(
      (0): LSTM(400, 1152)
      (1): LSTM(1152, 1152)
      (2): LSTM(1152, 1152)
    )
    (weight_dp): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1152)
      )
      (1): WeightDropout(
        (module): LSTM(1152, 1152)
      )
      (2): WeightDropout(
        (module): LSTM(1152, 1152)
      )
    )
    (hidden_dp): RNNDropout()
    (input_dp): RNNDropout()
  )
  (decoder): ConcatPoolingDecoder(
    (bn1): BatchNorm1d(3456, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn2): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (linear1): Linear(in_features=3456, out_features=50, bias=True)
    (linear2): Linear(in_features=50, out_features=2, bias=True)
    (dropout_pool): Dropout(p=0.2, inplace=Fa

In [None]:
# First stage: unfreeze with index -1, then run one cycle with lr_decrease=1.
model.unfreeze(-1)

one_cycle(
    model, criterion, optimizer, train_loader, val_loader,
    scheduler=scheduler,
    device=device,
    lr_decrease=1,
    lr=1e-2,
)

100%|██████████| 273/273 [05:39<00:00,  1.24s/it, lr0=4.2e-5, lr1=4.2e-5]
100%|██████████| 117/117 [02:22<00:00,  1.22s/it]

Train Loss: 0.7031 | Train Acc: 0.4964 | Val Loss: 15.9128 | Val Acc: 0.5036





In [None]:
# Second stage: unfreeze with index -2, then run one cycle with lr_decrease=2.6.
model.unfreeze(-2)

one_cycle(
    model, criterion, optimizer, train_loader, val_loader,
    scheduler=scheduler,
    device=device,
    lr_decrease=2.6,
    lr=1e-2,
)

  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad


Train Loss: 0.6969 | Train Acc: 0.5048 | Val Loss: 0.7044 | Val Acc: 0.5043





In [None]:
# Third stage: unfreeze with index -3, then run one cycle at a lower peak lr.
model.unfreeze(-3)

one_cycle(
    model, criterion, optimizer, train_loader, val_loader,
    scheduler=scheduler,
    device=device,
    lr_decrease=2.6,
    lr=5e-3,
)

  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad


Train Loss: 0.6949 | Train Acc: 0.5002 | Val Loss: 0.7124 | Val Acc: 0.4931





In [None]:
# Final stage: unfreeze everything, then run one cycle at the lowest peak lr.
model.unfreeze_all()

one_cycle(
    model, criterion, optimizer, train_loader, val_loader,
    scheduler=scheduler,
    device=device,
    lr_decrease=2.6,
    lr=1e-3,
)

  return self._grad
  return self._grad
  return self._grad
  return self._grad
  return self._grad
  2%|▏         | 5/273 [00:21<19:04,  4.27s/it, lr0=0.000114, lr1=4.4e-5]


KeyboardInterrupt: ignored

In [None]:
# Run one more cycle with the same settings as the previous cell.
one_cycle(
    model, criterion, optimizer, train_loader, val_loader,
    scheduler=scheduler,
    device=device,
    lr_decrease=2.6,
    lr=1e-3,
)