# German ULMFiT from scratch
The backbone of this notebook comes from the fastai-nlp course (see https://github.com/fastai/course-nlp/). Because of memory issues, the wiki text files (produced by the get_wiki function) are manually split into manageable batches, and the model is trained on these batches one at a time.

The intention is to build several Language Models for German, in different configurations:
* forwards or backwards
* with or without subword tokenization

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.text import *

### Language setup

In [2]:
# Wikipedia language code and the dump name derived from it.
lang = 'de'
name = lang + 'wiki'

# Root directory that holds everything belonging to this corpus.
path = Path('data/dewiki')

### Download data

In [7]:
# Data download / split steps, left disabled
# (presumably already run once for this corpus -- confirm before a fresh setup).
#from nlputils import split_wiki,get_wiki

#get_wiki(path,lang)
#! head -n4 {path}/{name}
#dest = split_wiki(path,lang)

### Setup paths, hyperparams and vars

In [8]:
# Sub-directories of the corpus root: the document folders and the saved models.
doc_path = path / 'docs'
mdl_path = path / 'models'

# Make sure the corpus root and the model directory exist.
path.mkdir(exist_ok=True, parents=True)
mdl_path.mkdir(exist_ok=True)

# Names of the sub-folders of `doc_path` that are processed one after another.
folders = 'char digit A B C D E F G H I J K L M N O P Q R S1 S2 T U V W X Y Z'.split()

# Batch size, and the base learning rate (1e-2) scaled by batch size.
bs = 64
lr = 1e-2 * (bs / 48)

# Module-level training state: current vocab, checkpoint names
# (weights, vocab) and the list of folders already handled.
tmp_vocab = None
lm_fns = ['tmp', 'tmp_vocab']
done = []

In [9]:
# Disabled inline version of the incremental training loop. It follows the same
# steps as gradually_learning() defined further down (forward model, default
# tokenization only) and works on the module-level lr / bs / tmp_vocab / lm_fns / done.
#for k,folder in enumerate(folders):
#    # skip already used folders
#    if folder in done:
#        continue
#        
#    current_path = doc_path / folder
#    print('Load DataBunch from: ', current_path)
#    
#    # load the (text) files from the current folder
#    data = (TextList.from_folder(current_path, vocab = tmp_vocab)
#            .split_by_rand_pct(0.1, seed = 42)
#            .label_for_lm()           
#            .databunch(bs = bs, num_workers = 0))
#    
#    # check if pretrained weights exist
#    if (mdl_path / (lm_fns[0] + '.pth')).is_file() and (mdl_path / (lm_fns[1] + '.pkl')).is_file():
#        print('Create Learner with pretrained weights')
#        # create the learner with previously trained weights
#        learn = language_model_learner(data, 
#                                       AWD_LSTM, 
#                                       drop_mult = 0.5,
#                                       path = path,
#                                       pretrained_fnames = lm_fns).to_fp16()
#    else:
#        # create the learner for first batch
#        learn = language_model_learner(data, 
#                                       AWD_LSTM, 
#                                       drop_mult = 0.5, 
#                                       pretrained = False).to_fp16()
#    
#    # learn on current batch
#    learn.unfreeze()
#    learn.fit_one_cycle(1, lr, moms=(0.8,0.7))
#    
#    # save weights and vocab    
#    print('Save LM learner at: ', current_path)
#    learn.to_fp32().save(mdl_path.resolve() / lm_fns[0], with_opt=False)   
#    
#    tmp_vocab = learn.data.vocab
#    tmp_vocab.save(mdl_path.resolve() / (lm_fns[1] + '.pkl'))
#    
#    done.append(folder)
#    
#    # backup
#    if (k % 10) == 0:
#        fn = 'backup_at_' + str(folder)
#        learn.save(fn)
#    
#    # release GPU memory
#    del(data)
#    del(learn)
#    torch.cuda.empty_cache()

In [97]:
def gradually_learning(folders, bs = 64, lr = 1e-2, bwd = False, sentence_piece = False):
    """Train one AWD-LSTM language model incrementally, one document folder at a time.

    For every entry of `folders`, a databunch is built from `doc_path / folder`,
    a learner is created, trained for one cycle, and its weights and vocab are
    written to `mdl_path`. If the weights (`.pth`) and vocab (`.pkl`) files for
    the selected configuration already exist in `mdl_path` (written for the
    previous folder or by an earlier call), the learner is initialised from them.

    Relies on the notebook globals `path`, `doc_path` and `mdl_path`.

    Args:
        folders: iterable of sub-folder names below `doc_path`; a name that
            occurs more than once is only processed the first time.
        bs: batch size of the databunch.
        lr: base learning rate; it is multiplied by `bs / 48` before training.
        bwd: if True, build a backwards databunch and append '_bwd' to the
            checkpoint names.
        sentence_piece: if True, process the texts with
            `[OpenFileProcessor(), SPProcessor()]` and append '_SP' to the
            checkpoint names.

    Returns:
        None. Results are the files written below `mdl_path`.
    """
    scaled_lr = lr * (bs / 48)  # Scale learning rate by batch size

    tmp_vocab = None
    lm_fns = ['tmp', 'tmp_vocab']
    done = []

    if sentence_piece:
        proc = [OpenFileProcessor(), SPProcessor()]  # processor for sentencepiece
        lm_fns = [fn + '_SP' for fn in lm_fns]
    else:
        proc = None

    if bwd:
        lm_fns = [fn + '_bwd' for fn in lm_fns]

    # Checkpoint files of the selected configuration (weights, vocab).
    weights_name, vocab_name = lm_fns
    weights_file = mdl_path / (weights_name + '.pth')
    vocab_file = mdl_path / (vocab_name + '.pkl')

    for k, folder in enumerate(folders):
        # skip already used folders
        if folder in done:
            continue

        current_path = doc_path / folder
        print('Load DataBunch from: ', current_path)

        # load the (text) files from the current folder
        data = (TextList.from_folder(current_path, vocab = tmp_vocab, processor = proc)
                .split_by_rand_pct(0.1, seed = 42)
                .label_for_lm()
                .databunch(bs = bs, num_workers = 0, backwards = bwd))

        # check if pretrained weights exist
        if weights_file.is_file() and vocab_file.is_file():
            print('Create Learner with pretrained weights')
            # create the learner with previously trained weights
            learn = language_model_learner(data,
                                           AWD_LSTM,
                                           drop_mult = 0.5,
                                           path = path,
                                           pretrained_fnames = lm_fns).to_fp16()
        else:
            # create the learner for first batch; `path = path` is passed here as
            # well so that the relative `learn.save(fn)` backup below is written
            # to the same model directory as for the resumed learners
            learn = language_model_learner(data,
                                           AWD_LSTM,
                                           drop_mult = 0.5,
                                           path = path,
                                           pretrained = False).to_fp16()

        # learn on current batch
        print('Start learning')
        learn.unfreeze()
        learn.fit_one_cycle(1, scaled_lr, moms=(0.8,0.7))

        # save weights and vocab (absolute targets, independent of learn.path)
        print('Save LM learner')
        learn.to_fp32().save(mdl_path.resolve() / weights_name, with_opt=False)

        tmp_vocab = learn.data.vocab
        tmp_vocab.save(mdl_path.resolve() / (vocab_name + '.pkl'))

        done.append(folder)

        # backup at every 10th position of `folders`
        if (k % 10) == 0:
            fn = 'backup_at_' + str(folder)
            learn.save(fn)

        # release GPU memory
        del data
        del learn
        torch.cuda.empty_cache()
        

In [101]:
# Example run, left disabled: backwards SentencePiece model with batch size 32.
#gradually_learning(folders, bs = 32, bwd = True, sentence_piece = True)

Take a look at the model

In [103]:
# Load the pickled vocab content and wrap it in a Vocab. The file is opened in a
# `with` block so the handle is closed again right after reading.
# NOTE: pickle.load can execute arbitrary code -- only load files written by this project.
with open(mdl_path / 'de_fwd_sent_vocab.pkl', 'rb') as vocab_file:
    vocab = Vocab(pickle.load(vocab_file))

# Processor chain for SentencePiece tokenization.
# NOTE(review): SPProcessor() is created without an explicit model/vocab here --
# confirm it uses the same SentencePiece model as during training.
proc = [OpenFileProcessor(), SPProcessor()]

In [104]:
# Language-model databunch over the 'char' folder: 10% random validation split
# (fixed seed), batch size taken from the module-level `bs`.
data = (
    TextList
    .from_folder(doc_path / 'char', vocab=vocab, processor=proc)
    .split_by_rand_pct(0.1, seed=42)
    .label_for_lm()
    .databunch(bs=bs, num_workers=0)
)

In [105]:
# Display a sample batch of the databunch built above.
data.show_batch()

idx,text
0,"ha ▁in ▁der ▁xxmaj ▁ li schau er ▁xxmaj ▁ schwelle . ▁xxmaj ▁am ▁nordöstlichen ▁xxmaj ▁ortsausgang ▁erstreckt ▁sich ▁der ▁xxmaj ▁teich ▁xxmaj ▁žim ut ický ▁rybník ▁mit ▁der ▁xxmaj ▁mühle ▁xxmaj ▁žim ut ický ▁mlýn ▁unter halb ▁des ▁xxmaj ▁ damm es , ▁südlich ▁liegen ▁der ▁xxmaj ▁far ský ▁rybník ▁und ▁der ▁xxmaj ▁m n ich ovec . ▁xxmaj ▁im ▁xxmaj ▁osten ▁erhebt ▁sich ▁der ▁xxmaj ▁so bě t"
1,"▁xxmaj ▁dabei ▁wurde ▁er ▁1965 ▁und ▁1967 ▁mit ▁der ▁xxmaj ▁mannschaft ▁xxmaj ▁fünfter . ▁xxmaj ▁bei ▁der ▁xxmaj ▁europa meisterschaft ▁1966 ▁erreichte ▁er ▁im ▁xxmaj ▁einzel ▁nach ▁xxmaj ▁siegen ▁über ▁xxmaj ▁ist van ▁xxmaj ▁kor pa ▁( ju go slaw ien ), ▁xxmaj ▁felix ▁xxmaj ▁ fel ten ▁( l ux em burg ), ▁xxmaj ▁pen ti i ▁xxmaj ▁tu o minen ▁( fin n land ), ▁xxmaj ▁stuart ▁xxmaj"
2,"▁leben ▁des ▁xxmaj ▁volkes ▁zu ▁richten ▁habe , ▁würden ▁erneut ▁zur ▁xxmaj ▁ disposition ▁gestellt . ▁xxmaj ▁aufgrund ▁des ▁xxmaj ▁todes ▁xxmaj ▁johannes ▁xxmaj ▁paul s ▁xxup ▁i . , ▁welcher ▁der ▁xxmaj ▁konferenz ▁hätte ▁vor stehen ▁sollen , ▁wurde ▁sie ▁vom ▁xxmaj ▁oktober ▁auf ▁den ▁xxmaj ▁januar ▁1979 ▁verschoben ▁und ▁von ▁xxmaj ▁johannes ▁xxmaj ▁paul ▁xxup ▁ii . ▁eröffnet . ▁xxmaj ▁rom ero ▁nutzte ▁diese ▁xxmaj ▁zeit , ▁um"
3,"▁xxmaj ▁ukraine ▁zu ▁verein igen . ▁xxmaj ▁die ▁xxmaj ▁synode ▁der ▁russisch - orthodoxen ▁xxmaj ▁kirche ▁erklärte ▁daraufhin ▁am ▁15. ▁xxmaj ▁oktober ▁2018 , ▁einseitig ▁die ▁xxmaj ▁ gottesdienst gemeinschaft ▁mit ▁dem ▁xxmaj ▁ökumenische n ▁xxmaj ▁patriarchat ▁abzu brechen . ▁xxmaj ▁zum ▁xxmaj ▁ökumenische n ▁xxmaj ▁patriarchat ▁von ▁xxmaj ▁konstantinopel ▁gehören ▁sechs ▁xxmaj ▁er z diözese n , ▁18 ▁weitere ▁xxmaj ▁metropoli en ▁und ▁acht ▁xxmaj ▁teil kirchen ▁auf ▁allen"
4,"▁kann ▁während ▁der ▁xxmaj ▁öffnung s zeiten ▁des ▁xxmaj ▁museums ▁besichtigt ▁werden . ▁< ▁/ ▁doc > ▁xxbos ▁xxmaj ▁ ḫumbaba ▁xxmaj ▁ ḫumbaba ▁( sprich ▁xxmaj ▁ chum baba ), ▁früh ▁auch ▁xxmaj ▁hu wa wa , ▁ist ▁in ▁der ▁sumerisch en ▁xxmaj ▁mythologie ▁der ▁xxmaj ▁ wächter ▁des ▁xxmaj ▁ ze der n wald es ▁am ▁xxmaj ▁libanon . ▁xxmaj ▁er ▁kommt ▁in ▁mehreren ▁sumerisch en ▁xxmaj ▁my"


In [59]:
# Checkpoint names (weights, vocab) of the model to inspect.
# NOTE(review): `data` above is built with the 'de_fwd_sent_vocab.pkl' vocab and
# SentencePiece processing, while the files loaded here are named 'de_fwd_spacy'
# -- confirm this pairing is intended.
lm_fns = ['de_fwd_spacy', 'de_fwd_spacy_vocab']

# Build the learner from the saved files, then switch it to half precision.
learn_lm = language_model_learner(
    data,
    AWD_LSTM,
    drop_mult=0.5,
    path=path,
    pretrained_fnames=lm_fns,
)
learn_lm = learn_lm.to_fp16()

In [21]:
# Generate 200 further words after the prompt. Keyword arguments are used because
# the third positional parameter of LanguageLearner.predict is `no_unk`, not
# `temperature`; the sampling behaviour is the same as before (no_unk truthy,
# temperature 1.0).
learn_lm.predict('Mein Kind', n_words = 200, temperature = 1.)

'Mein Kind Split meint op . 06 : „ Das Fehlen von Schönheit und Lügen der nationalen Kolonialmacht belegten sie energisch über viele Jahre geschlossen an . “ \n \n  Jesus : klassischer Vertreter des Videos ist sein halber eigener Kontrast . Er entschied , dass es sich nicht um eine fehlende kompakte Frau handelt , sondern Evelyn , wovon Insgesamt Sebastian Heinrich an die Box erinnert . Entgegen den jeweiligen Mitteln der Formen gilt Ignaz weil auch Elizabeth Buch , die durch ein Lied mit Büchern den Urteil verabschiedet hat . Süd ist hierbei die Fragen eines dünnen Zyklus auf die Menschheit nach Anlehnung an beiden Grenzen und des zahllosen Marokko - enthalten , woraufhin sich die erste Fassung an ihrem " Verlag " von Fritsch entwickelt ( u. a. " Unser Sekunden des Herrn " von Ferdinand Fischer ) . Eine zweite Version der Fernsehserie beinhaltet allerdings eine neue , einzigartige Aufführung basierend auf ihrer'

In [86]:
# Display the learner's model to inspect its layers.
learn_lm.model

SequentialRNN(
  (0): AWD_LSTM(
    (encoder): Embedding(60000, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(60000, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1152, batch_first=True)
      )
      (1): WeightDropout(
        (module): LSTM(1152, 1152, batch_first=True)
      )
      (2): WeightDropout(
        (module): LSTM(1152, 400, batch_first=True)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=60000, bias=True)
    (output_dp): RNNDropout()
  )
)