# German ULMFiT from scratch
This notebook is based on a notebook from the fastai-nlp course (see https://github.com/fastai/course-nlp/). Because of memory issues, the wiki text files (produced by the `get_wiki` function) are manually split into manageable batches, and the model is trained on these batches one at a time.

The intention is to build several Language Models for German, in different configurations:
* forwards or backwards
* with or without subword tokenization

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.text import *

### Language setup

In [3]:
# Working directory that holds the dump, the split documents and the models
path = Path('data/dewiki')

# Wikipedia language code and the derived name of the dump
lang = 'de'
name = f'{lang}wiki'

### Download data

In [4]:
from nlputils import split_wiki,get_wiki

In [5]:
# Fetch the Wikipedia dump for `lang` into `path` (the helper reports when it already exists)
get_wiki(path, lang)

data/dewiki/dewiki already exists; not downloading


In [6]:
#! head -n4 {path}/{name}

In [7]:
# Split the dump into individual documents (the helper reports when the docs folder already exists)
dest = split_wiki(path, lang)

data/dewiki/docs already exists; not splitting


### Setup paths, hyperparams and vars

In [8]:
# Folder layout: the split documents live in `docs`, model weights and vocab in `models`
doc_path = path/'docs'
mdl_path = path/'models'

# creating `models` with parents=True also creates `path` itself if it is missing
mdl_path.mkdir(parents=True, exist_ok=True)

# Names of the manually created sub-folders of `docs`, in training order
folders = 'char digit A B C D E F G H I J K L M N O P Q R S1 S2 T U V W X Y Z'.split()

# Hyperparams: the base learning rate of 1e-2 is scaled by batch size (relative to bs = 48)
bs = 64
lr = 1e-2 * (bs/48)

# State shared across loop iterations / cell re-runs
lm_fns = ['tmp','tmp_vocab']   # file stems for the weights (.pth) and the vocab (.pkl)
tmp_vocab = None               # vocab carried over from the previously trained folder
done = []                      # folders that were already trained on

In [None]:
# Train one forward, word-level AWD_LSTM language model incrementally:
# each folder is loaded on its own, trained on for one cycle, and the resulting
# weights + vocab are stored in `mdl_path` so the next folder can continue from them.
for k,folder in enumerate(folders):
    # skip already used folders (`done` is module-level, so re-running this cell resumes)
    if folder in done:
        continue

    current_path = doc_path / folder
    print('Load DataBunch from: ', current_path)

    # load the (text) files from the current folder only;
    # `tmp_vocab` is None for the first folder and the previous learner's vocab afterwards
    data = (TextList.from_folder(current_path, vocab = tmp_vocab)
            .split_by_rand_pct(0.1, seed = 42)
            .label_for_lm()
            .databunch(bs = bs, num_workers = 0))

    # check if pretrained weights exist
    wgts_exist = (mdl_path / (lm_fns[0] + '.pth')).is_file()
    vocab_exists = (mdl_path / (lm_fns[1] + '.pkl')).is_file()
    if wgts_exist and vocab_exists:
        print('Create Learner with pretrained weights')
        # create the learner with previously trained weights
        learn = language_model_learner(data,
                                       AWD_LSTM,
                                       drop_mult = 0.5,
                                       path = path,
                                       pretrained_fnames = lm_fns).to_fp16()
    else:
        # create the learner for first batch
        # `path = path` keeps this learner consistent with the pretrained branch; without it
        # the learner's path falls back to the DataBunch folder and the backup below
        # would be written inside the docs folder instead of `path`/models
        learn = language_model_learner(data,
                                       AWD_LSTM,
                                       drop_mult = 0.5,
                                       path = path,
                                       pretrained = False).to_fp16()

    # learn on current batch
    learn.unfreeze()
    learn.fit_one_cycle(1, lr, moms=(0.8,0.7))

    # save weights and vocab (report the folder they are actually written to)
    print('Save LM learner at: ', mdl_path)
    # absolute target so the location does not depend on the learner's own path
    learn.to_fp32().save(mdl_path.resolve() / lm_fns[0], with_opt=False)

    tmp_vocab = learn.data.vocab
    tmp_vocab.save(mdl_path.resolve() / (lm_fns[1] + '.pkl'))

    done.append(folder)

    # backup every 10th folder (k = 0, 10, 20, ...)
    if (k % 10) == 0:
        fn = 'backup_at_' + str(folder)
        learn.save(fn)

    # release GPU memory before the next folder is loaded
    del(data)
    del(learn)
    torch.cuda.empty_cache()

In [9]:
def gradually_learning(folders, bs = 64, lr = 1e-2, bwd = False, sentence_piece = False):
    """Train an AWD_LSTM language model incrementally, one folder of documents at a time.

    For every entry of `folders` a DataBunch is built from `doc_path / folder`, the model
    is trained on it for one cycle, and weights + vocab are written to `mdl_path`. The next
    folder (or a later call with the same configuration) continues from those files.

    Relies on the module-level `doc_path`, `mdl_path` and `path`.

    Args:
        folders: names of the sub-folders of `doc_path` to train on, in order.
        bs: batch size.
        lr: base learning rate; it is scaled by `bs/48` before training.
        bwd: if True, build a backwards DataBunch; file names get the suffix `_bwd`.
        sentence_piece: if True, process the texts with `SPProcessor`; file names get
            the suffix `_SP`.
    """
    lr *= bs/48  # Scale learning rate by batch size

    # suffix that identifies the configuration, e.g. '', '_SP', '_bwd' or '_SP_bwd'
    suffix = ('_SP' if sentence_piece else '') + ('_bwd' if bwd else '')
    lm_fns = ['tmp' + suffix, 'tmp_vocab' + suffix]
    wgts_file = mdl_path / (lm_fns[0] + '.pth')
    vocab_file = mdl_path / (lm_fns[1] + '.pkl')

    # processor for sentencepiece; None selects the default processors
    proc = [OpenFileProcessor(), SPProcessor()] if sentence_piece else None

    # When resuming from saved weights, also resume with the saved vocab instead of
    # building a new one from the first remaining folder.
    # NOTE(review): with an explicit `processor` (sentence_piece=True) the `vocab`
    # argument is presumably ignored by `TextList.from_folder` - confirm.
    if wgts_file.is_file() and vocab_file.is_file():
        tmp_vocab = Vocab.load(vocab_file)
    else:
        tmp_vocab = None

    done = []

    for k,folder in enumerate(folders):
        # skip already used folders (guards against duplicates in `folders`)
        if folder in done:
            continue

        current_path = doc_path / folder
        print('Load DataBunch from: ', current_path)

        # load the (text) files from the current folder
        data = (TextList.from_folder(current_path, vocab = tmp_vocab, processor = proc)
                .split_by_rand_pct(0.1, seed = 42)
                .label_for_lm()
                .databunch(bs = bs, num_workers = 0, backwards = bwd))

        # check if pretrained weights exist
        if wgts_file.is_file() and vocab_file.is_file():
            print('Create Learner with pretrained weights')
            # create the learner with previously trained weights
            learn = language_model_learner(data,
                                           AWD_LSTM,
                                           drop_mult = 0.5,
                                           path = path,
                                           pretrained_fnames = lm_fns).to_fp16()
        else:
            # create the learner for first batch
            # `path = path` matches the pretrained branch; without it the learner's path
            # falls back to the DataBunch folder and the first backup would be written
            # inside the docs folder
            learn = language_model_learner(data,
                                           AWD_LSTM,
                                           drop_mult = 0.5,
                                           path = path,
                                           pretrained = False).to_fp16()

        # learn on current batch
        print('Start learning')
        learn.unfreeze()
        learn.fit_one_cycle(1, lr, moms=(0.8,0.7))

        # save weights and vocab
        print('Save LM learner')
        # absolute targets so the location does not depend on the learner's own path
        learn.to_fp32().save(mdl_path.resolve() / lm_fns[0], with_opt=False)

        tmp_vocab = learn.data.vocab
        tmp_vocab.save(mdl_path.resolve() / (lm_fns[1] + '.pkl'))

        done.append(folder)

        # backup every 10th folder; the suffix keeps backups of different
        # configurations from overwriting each other
        if (k % 10) == 0:
            fn = 'backup_at_' + str(folder) + suffix
            learn.save(fn)

        # release GPU memory before the next folder is loaded
        del data, learn
        torch.cuda.empty_cache()
        

In [11]:
# Backwards, word-level language model over all folders, with a reduced batch size
gradually_learning(folders, bs=32, bwd=True, sentence_piece=False)

Load DataBunch from:  data/dewiki/docs/char


Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.865428,3.474728,0.478004,04:20


Save LM learner
Load DataBunch from:  data/dewiki/docs/digit


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.218706,3.139839,0.491409,03:30


Save LM learner
Load DataBunch from:  data/dewiki/docs/A


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.132964,3.059079,0.510387,44:56


Save LM learner
Load DataBunch from:  data/dewiki/docs/B


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.070975,2.982961,0.517626,42:57


Save LM learner
Load DataBunch from:  data/dewiki/docs/C


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.050684,2.930524,0.523047,26:18


Save LM learner
Load DataBunch from:  data/dewiki/docs/D


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.166981,3.080536,0.505965,36:35


Save LM learner
Load DataBunch from:  data/dewiki/docs/E


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.067108,3.004277,0.513135,27:51


Save LM learner
Load DataBunch from:  data/dewiki/docs/F


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.099745,2.985244,0.515643,30:39


Save LM learner
Load DataBunch from:  data/dewiki/docs/G


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.066911,2.99432,0.515897,37:15


Save LM learner
Load DataBunch from:  data/dewiki/docs/H


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.020906,2.913008,0.523958,35:39


Save LM learner
Load DataBunch from:  data/dewiki/docs/I


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.152833,3.058972,0.50512,12:54


Save LM learner
Load DataBunch from:  data/dewiki/docs/J


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,2.921971,2.807352,0.533587,25:57


Save LM learner
Load DataBunch from:  data/dewiki/docs/K


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.061054,2.949326,0.519183,36:06


Save LM learner
Load DataBunch from:  data/dewiki/docs/L


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,2.976074,2.961319,0.521858,28:24


Save LM learner
Load DataBunch from:  data/dewiki/docs/M


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.018152,2.950745,0.521538,41:44


Save LM learner
Load DataBunch from:  data/dewiki/docs/N


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.093136,2.984341,0.516477,17:21


Save LM learner
Load DataBunch from:  data/dewiki/docs/O


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,2.954084,2.920177,0.525311,13:57


Save LM learner
Load DataBunch from:  data/dewiki/docs/P


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.12422,2.993168,0.515387,31:37


Save LM learner
Load DataBunch from:  data/dewiki/docs/Q


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.310189,3.213539,0.493798,01:22


Save LM learner
Load DataBunch from:  data/dewiki/docs/R


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time
0,3.016785,2.948112,0.521043,28:38


Save LM learner
Load DataBunch from:  data/dewiki/docs/S1


Create Learner with pretrained weights
Start learning


epoch,train_loss,valid_loss,accuracy,time


KeyboardInterrupt: 

In [11]:
# Folders that finished before the run above was interrupted
done = 'char digit A B C D E F G H I J K L M N O P Q R'.split()

# Everything that is still missing, in the original folder order
to_do = list(filter(lambda folder: folder not in done, folders))
to_do

['S1', 'S2', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

In [None]:
# Continue the backwards language model on the remaining folders
gradually_learning(to_do, bs=32, bwd=True, sentence_piece=False)