In [None]:
from local.imports import *
from local.test import *
from local.core import *
from local.layers import *
from local.data.all import *
from local.notebook.showdoc import show_doc
from local.optimizer import *
from local.learner import *
from local.metrics import *
from local.text.core import *
from local.text.data import *
from local.text.models.core import *
from local.text.models.awdlstm import *
from local.callback.rnn import *
from local.callback.all import *

# Integration test on Wikitext-2

> Training a Language Model on WT2

## Data

In [None]:
# Fetch the dataset referenced by `URLs.WIKITEXT_TINY`; `path` is later joined with
# 'train.txt' and 'valid.txt' to locate the raw text files.
# NOTE(review): presumably downloads/extracts on first use and reuses a local copy afterwards — confirm against `untar_data`.
path = untar_data(URLs.WIKITEXT_TINY)

The dataset comes with all the articles concatenated. We split them to be able to shuffle at the beginning of each epoch.

In [None]:
def istitle(line):
    "Whether `line` is a top-level article title, i.e. has the form ' = Some Title = '."
    # `[^=]*` rejects sub-section headings such as ' = = Gameplay = = '.
    # `$` also matches just before a trailing newline, so raw lines from `readlines()` work as-is.
    title_match = re.search(r'^ = [^=]* = $', line)
    return title_match is not None

def read_file(filename):
    "Read `filename` (utf8) and return an `L` of articles, each one a list of the pieces obtained by splitting on ' '."
    with open(filename, encoding='utf8') as f:
        lines = f.readlines()
    n_lines = len(lines)
    articles, pieces = L(), []
    for idx in range(n_lines):
        # Swap the raw '<unk>' marker for the library's `UNK` token.
        pieces.append(lines[idx].replace('<unk>', UNK))
        # An article ends when the next line is a lone ' \n' and the one after it is a title.
        ends_article = idx + 2 < n_lines and lines[idx+1] == ' \n' and istitle(lines[idx+2])
        if ends_article:
            articles.append(''.join(pieces).split(' '))
            pieces = []
    # Whatever is left after the last boundary forms the final article.
    articles.append(''.join(pieces).split(' '))
    return articles

Then we put our list of tokenized texts together in an `LMDataLoader`. It will return tuples of sequences of length `seq_len`, with the second sequence being the first one shifted by one to the right.

In [None]:
# Parse the training and validation files (in that order) into lists of tokenized articles.
trn_txt, val_txt = map(read_file, (path/'train.txt', path/'valid.txt'))

In [None]:
# Token frequencies are counted over the training articles only; the vocabulary is built from them.
count = Counter(tok for article in trn_txt for tok in article)
vocab = make_vocab(count)

In [None]:
# The items handed to `DataSource` are `val_txt+trn_txt`, i.e. validation articles first,
# then training articles. `splits` therefore holds two index lists: the first covers the
# training articles (offset by the number of validation articles), the second covers the
# validation articles.
n_val, n_trn = len(val_txt), len(trn_txt)
splits = [list(range(n_val, n_val+n_trn)), list(range(n_val))]
# Reuse the `vocab` already built from `count` instead of calling `make_vocab(count)` a
# second time: the model is sized with `len(vocab)`, so the numericalizer must use that
# exact vocabulary, and the redundant pass over the counts is avoided.
tfm = Numericalize(vocab)

In [None]:
# Validation articles are concatenated before the training ones, which is the layout the
# index ranges in `splits` were built for; `[tfm]` is the transform list passed along.
# NOTE(review): presumably `filts[0]` becomes `dsrc.train` and `filts[1]` becomes `dsrc.valid` — confirm against `DataSource`.
dsrc = DataSource(val_txt+trn_txt, [tfm], filts=splits)

In [None]:
# bs: batch size for training; sl: value passed as `seq_len` to both loaders.
bs,sl = 104,72
# The training loader is created with shuffle=True; the validation loader uses twice the
# batch size and is not given a shuffle argument.
# NOTE(review): `Cuda()` presumably moves each batch to the GPU, so this cell likely needs a CUDA device — confirm.
train_dl = LMDataLoader(dsrc.train, bs=bs,   seq_len=sl, after_batch=[Cuda()], shuffle=True)
valid_dl = LMDataLoader(dsrc.valid, bs=2*bs, seq_len=sl, after_batch=[Cuda()])

In [None]:
# Bundle the training and validation loaders into one object, then display a sample batch
# as a visual sanity check of the data pipeline.
dbch = DataBunch(train_dl, valid_dl)
dbch.show_batch()

Unnamed: 0,text
0,"\n = Zygoballus sexpunctatus = \n \n Zygoballus sexpunctatus is a species of jumping spider which occurs in the southeastern United States where it can be found in a variety of grassy habitats . Adult spiders measure between 3 and 4 @.@ 5 mm in length . The cephalothorax and abdomen are bronze to black in color , with reddish brown or yellowish legs . The male has distinctive enlarged chelicerae"
1,"leading up to the airing of "" August "" in another interview , Roberto Orci elaborated that "" xxunk will be one of the things that they will be xxunk struggling with , actually . That was a fun one , because that one was one where you 're finally getting to pay off things you 've been setting up for a year . You finally get to open the toy box"
2,"would not be returning as Cobra Commander in the sequel . \n In June 2011 , Dwayne Johnson was cast as Roadblock , D.J. Cotrona and RZA were cast as Flint and Blind Master respectively , while Élodie Yung was in talks for the role of Jinx . In July 2011 , Adrianne Palicki was confirmed for the lead female role of Lady Jaye , and Ray Stevenson was confirmed to portray"
3,"you forever if you do things like this . "" Taylor believed the HNC to be an essential part of the government , because as an American , he believed civilian legitimacy was a must . For him , the HNC was a necessary step in a progression towards an elected civilian legislature , which he regarded as critical for national and military morale . The historian Mark xxunk regarded Taylor 's"
4,"promising in many respects , than any the present age has been called upon to encourage . We have not found it to be quite all that we wished in this xxunk it would have been very extraordinary if we had , for our wishes went far beyond reasonable expectations . But we have found it of a nature to present to common xxunk the poetical power with which the author 's"
5,". \n \n = = = Water quality = = = \n \n The clear @-@ cutting of forests in the 19th century adversely affected the ecology of the Plunketts Creek watershed and its water quality . xxunk industries on the creek and its tributaries then included a coal mine and tannery ( which are long since departed ) . In the autumn of 1897 , three men working with hides at"
6,"Carter , Andrew Goldberg and Elaine xxunk . It is an hour @-@ long special with three musical numbers . Ron MacFarlane , Seth MacFarlane 's father , served as the episode 's narrator . This is also the first "" Road to "" episode to be composed by Ron Jones . \n Two of the musical numbers , "" All I Really Want for Christmas "" and "" Christmastime is Killing"
7,"xxunk ' notes , low pitched and harsh , occurring at low and high levels of intensity . The narrow @-@ band call is used in situations where the bird signals the presence of a predator and xxunk information about its own location , while the broad @-@ band alarm is used to attract attention , and can initiate mobbing behaviour . These xxunk calls vary between individuals , and laboratory tests"
8,"Rosebery also began to befriend those politicians such as Lord xxunk who xxunk with her husband , while others such as Lord Granville and Lord Hartington she identified as aloof . She dismissed Lord Spencer with "" I can never look on him as a great motive power , besides he does not mention Archie [ Rosebery ] to me . "" This was the same Lord Spencer who had advised the"
9,"singles chart , and charted for a total of thirteen weeks . \n Throughout xxunk and Europe , the song peaked outside the top @-@ twenty in most countries . "" Loverboy "" debuted at its peak position of number seven on the Australian Singles Chart , during the week of July 29 , 2001 . The following week , the song began its decline , and experienced a total chart trajectory"


## Model

In [None]:
# Dropout probabilities that replace the corresponding entries of the default LM config.
dropout_overrides = dict(input_p=0.6, output_p=0.4, weight_p=0.5, embed_p=0.1, hidden_p=0.2)
# Work on a copy so the library-level default config is left untouched.
config = awd_lstm_lm_config.copy()
config.update(dropout_overrides)
# The vocabulary size is passed as the second argument when building the language model.
model = get_language_model(AWD_LSTM, len(vocab), config=config)

In [None]:
# Optimizer factory: `Adam` with wd=0.1 and eps=1e-7 pre-bound through `partial`.
opt_func = partial(Adam, wd=0.1, eps=1e-7)
# Callback factories later passed to the Learner as `cb_funcs`: `MixedPrecision` with
# clip=0.1 and `RNNTrainer` with alpha=3, beta=2.
# NOTE(review): alpha/beta presumably weight the activation-regularization penalties — confirm against `RNNTrainer`.
cb_funcs = [partial(MixedPrecision, clip=0.1), partial(RNNTrainer, alpha=3, beta=2)]

In [None]:
# Assemble the Learner from the model, the data bunch, a flattened cross-entropy loss and
# the optimizer/callback factories defined above; accuracy and perplexity are passed as metrics.
learn = Learner(model, dbch, loss_func=CrossEntropyLossFlat(), opt_func=opt_func, cb_funcs=cb_funcs, metrics=[accuracy, Perplexity()])

In [None]:
# Run the integration test: a single `fit_one_cycle` call with 1 and 5e-3 as positional arguments.
# NOTE(review): presumably 1 is the number of epochs, 5e-3 the peak learning rate, `moms` the
# momentum schedule and `div` the ratio between peak and starting learning rate — confirm against `fit_one_cycle`.
learn.fit_one_cycle(1, 5e-3, moms=(0.8,0.7,0.8), div=10)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,7.238688,6.339022,0.135431,566.24231,00:59
