In [0]:

%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import spacy

This notebook performs sentiment analysis on the IMDB dataset using the fastai library with PyTorch as the backend. The data can be downloaded from http://files.fast.ai/data/aclImdb.tgz

In [0]:
PATH='data/aclImdb/'

TRN_PATH = 'train/all/'
VAL_PATH = 'test/all
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

%ls {PATH}

In [0]:
review = !cat {TRN}{trn_files[6]}
review[0]   

In [0]:
# Concatenate every .txt file under the training directory and count the words.
!find {TRN} -name '*.txt' | xargs cat | wc -w

The command above reports how many words are available for training; the next cell does the same for the validation set.

In [0]:
# Concatenate every .txt file under the validation directory and count the words.
!find {VAL} -name '*.txt' | xargs cat | wc -w

For the analysis we need the spaCy tokenizer, which turns the raw review text into an array of tokens.

In [0]:
# Load spaCy's 'en' pipeline and run it over the first line of the sample review.
spacy_imdb = spacy.load('en')
# Iterating over the processed text yields tokens (not sentences); keep each
# token's string with surrounding whitespace stripped.
imdb_review = [
    tok.string.strip()
    for tok in spacy_imdb(review[0])
]

In [0]:
# Display the tokenized review as one space-separated string.
' '.join(imdb_review)

In [0]:
# torchtext field describing how raw text is preprocessed:
# tokenize with spaCy and lower-case everything.
TEXT = data.Field(tokenize="spacy", lower=True)

We use torchtext (PyTorch's text library) to preprocess the data and tell it to use spaCy as the tokenizer.

In [0]:
# Mini-batch shape for the language model: batch size and the number of
# words processed at a time in each row of the mini-batch.
bs, bptt = 64, 70

- `bs`: batch size
- `bptt`: number of words processed at a time in each row of the mini-batch (back-propagation through time)

In [0]:
# Map each dataset role to its sub-directory under PATH; the validation
# folder is reused as the test set.
FILES = {'train': TRN_PATH, 'validation': VAL_PATH, 'test': VAL_PATH}
# Build the language-model data object from the text files.
# NOTE(review): min_freq=10 presumably drops rare tokens from the vocabulary — confirm.
md = LanguageModelData.from_text_files(
    PATH, TEXT, bs=bs, bptt=bptt, min_freq=10, **FILES
)

In [0]:
# Persist the TEXT field so later sections can reload it
# (`pickle` is dill here, see the imports cell).
import os

# This cell runs before any model is created, so make sure the target
# folder exists instead of failing with FileNotFoundError.
os.makedirs(f'{PATH}models', exist_ok=True)
# The context manager guarantees the file is flushed and closed.
with open(f'{PATH}models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

In [0]:
# Peek at the first mini-batch produced by the training data loader.
next(iter(md.trn_dl))

**Train**

In [0]:
# Language-model architecture sizes:
#   em_sz - size of each embedding vector
#   nh    - number of hidden activations per layer
#   nl    - number of layers
em_sz, nh, nl = 200, 500, 3

In [0]:
# Partially-applied Adam constructor with custom betas; it is handed to
# get_model() as the optimizer factory.
adam_betas = (0.7, 0.99)
opt_fn = partial(optim.Adam, betas=adam_betas)

We use fastai's built-in LSTM language model.

In [0]:
# Dropout settings passed through to the language model.
lm_dropouts = dict(dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05)
learner = md.get_model(opt_fn, em_sz, nh, nl, **lm_dropouts)
# Regularisation function applied during training.
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
# NOTE(review): `clip` is presumably the gradient-clipping threshold — confirm.
learner.clip = 0.3

In [0]:
# Train the language model with weight decay 1e-6.
# NOTE(review): the positional arguments are presumably the learning rate (3e-3)
# and the number of cycles (4), with cycle_mult=2 doubling the cycle length
# each time — confirm against the fastai version in use.
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)

In [0]:
# Save the encoder part of the trained language model under the name 'adam1_enc'.
learner.save_encoder('adam1_enc')

In [0]:
# Reload the encoder written by the save_encoder('adam1_enc') cell above.
# The name has to match a checkpoint this notebook actually produces,
# otherwise a fresh "Restart & Run All" fails on a missing file.
learner.load_encoder('adam1_enc')

In [0]:
# Persist the TEXT field again after training; the context manager makes sure
# the file handle is flushed and closed.
with open(f'{PATH}models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

**Test**

In [0]:
# Underlying model of the learner, plus a short prompt to feed it.
m = learner.model
ss = """.  The best thrilling movie"""
# Run the prompt through the TEXT field's preprocessing, wrap it as a
# one-element batch and convert it to vocabulary indices.
prompt_tokens = TEXT.preprocess(ss)
s = [prompt_tokens]
t = TEXT.numericalize(s)
# Display the preprocessed prompt.
' '.join(prompt_tokens)

In [0]:
# Set batch size to 1
m[0].bs=1
# Turn off dropout
m.eval()
# Reset hidden state
m.reset()
# Get predictions from model
res,*_ = m(t)
# Put the batch size back to what it was
m[0].bs=bs

In [0]:
# Indices of the 10 highest-scoring entries in the last row of the model
# output, mapped back to vocabulary strings.
nexts = res[-1].topk(10)[1]
[TEXT.vocab.itos[idx] for idx in to_np(nexts)]

In [0]:
# Generate text: print the prompt, then emit 50 tokens one at a time, always
# feeding the chosen token back into the model.
print(ss,"\n")
for i in range(50):
    # Indices of the two highest-scoring entries for the last position.
    n=res[-1].topk(2)[1]
    # Fall back to the runner-up when the top pick is vocabulary index 0
    # (NOTE(review): presumably the unknown-word token — confirm).
    n = n[1] if n.data[0]==0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    # Feed the chosen token back in to obtain the scores for the next step.
    res,*_ = m(n[0].unsqueeze(0))
print('...')

**Sentiment**

In [0]:
# Reload the TEXT field pickled earlier in this notebook.
# NOTE: unpickling can execute arbitrary code — only load files you created yourself.
# The context manager makes sure the file handle is closed.
with open(f'{PATH}models/TEXT.pkl', 'rb') as text_file:
    TEXT = pickle.load(text_file)

In [0]:
# Label field: one non-sequential value per review.
IMDB_LABEL = data.Field(sequential=False)
# Build the IMDB dataset splits from the text and label fields, rooted at 'data/'.
splits = torchtext.datasets.IMDB.splits(TEXT, IMDB_LABEL, 'data/')

In [0]:
# First example of the first split, used to eyeball the data.
# NOTE(review): this rebinds `t`, which earlier held the numericalized prompt;
# re-running the prediction cell after this one will not work as before.
t = splits[0].examples[0]

In [0]:
# Show the example's label together with the first 16 tokens of its text.
t.label, ' '.join(t.text[:16])

In [0]:
# Wrap the torchtext splits in a fastai TextData object using batch size `bs`.
md2 = TextData.from_splits(PATH, splits, bs)

In [0]:
# Build the sentiment classifier with the same embedding/hidden sizes as the
# language model so the pretrained encoder weights fit.
# NOTE(review): 1500 is presumably the maximum sequence length — confirm.
m3 = md2.get_model(opt_fn, 1500, bptt, emb_sz=em_sz, n_hid=nh, n_layers=nl,
           dropout=0.1, dropouti=0.4, wdrop=0.5, dropoute=0.05, dropouth=0.3)
m3.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
# Start from the encoder written by the language-model section
# (save_encoder('adam1_enc')) so a fresh run can find the checkpoint.
m3.load_encoder('adam1_enc')

In [0]:
# Clip value for the classifier and one learning rate per layer group:
# the first three groups share 1e-4, the last two use 1e-3 and 1e-2.
m3.clip = 25.
lrs = np.array([1e-4] * 3 + [1e-3, 1e-2])

In [0]:
# Staged fine-tuning: first train with everything frozen up to the last layer
# group at half the learning rates, then unfreeze and train the whole model.
m3.freeze_to(-1)
m3.fit(lrs/2, 1, metrics=[accuracy])
m3.unfreeze()
m3.fit(lrs, 1, metrics=[accuracy], cycle_len=1)

In [0]:
# Main classifier training run, reporting accuracy.
# NOTE(review): cycle_save_name='imdb2' presumably checkpoints the model after
# each cycle under that name — confirm.
m3.fit(lrs, 7, metrics=[accuracy], cycle_len=2, cycle_save_name='imdb2')

In [0]:
# Identical call to the previous cell: continues training the same model.
# NOTE(review): it reuses cycle_save_name='imdb2', so any checkpoints written by
# the previous run are presumably overwritten — confirm this is intended.
m3.fit(lrs, 7, metrics=[accuracy], cycle_len=2, cycle_save_name='imdb2')

In [0]:
# Final accuracy: unpack the result of predict_with_targs() straight into accuracy_np.
accuracy_np(*m3.predict_with_targs())