# German ULMFiT from scratch
The original (backbone of this) notebook is from the fastai-nlp course. Because of memory issues, the wiki text-files (from the get_wiki function) is (manually) split into manageable batches and loaded/learned on individually.

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.text import *

### Language setup

In [2]:
lang = 'de'
name = f'{lang}wiki'

path = Path('data/dewiki')

### Download data

In [3]:
from nlputils import split_wiki,get_wiki

In [4]:
get_wiki(path,lang)

data/dewiki/dewiki already exists; not downloading


In [5]:
#! head -n4 {path}/{name}

In [6]:
dest = split_wiki(path,lang)

data/dewiki/docs already exists; not splitting


### Setup paths, hyperparams and vars

In [10]:
doc_path = path / 'docs'
mdl_path = path/'models'

path.mkdir(exist_ok=True, parents=True)
mdl_path.mkdir(exist_ok=True)

folders = 'A B C D E F G H I J K L M N O P Q R S1 S2 T U V W X Y Z char digit'.split()

lr = 1e-2
bs =  64
lr *= bs/48  # Scale learning rate by batch size

tmp_vocab = None
lm_fns = ['tmp','tmp_vocab']
done = []

In [None]:
for k,folder in enumerate(folders):
    # skip already used folders
    if folder in done:
        continue
        
    current_path = doc_path / folder
    print('Load DataBunch from: ', current_path)
    
    # load the (text) files from the current folder
    data = (TextList.from_folder(current_path, vocab = tmp_vocab)
            .split_by_rand_pct(0.1, seed = 42)
            .label_for_lm()           
            .databunch(bs = bs, num_workers = 0))
    
    # check if pretrained weights exist
    if (mdl_path / (lm_fns[0] + '.pth')).is_file() and (mdl_path / (lm_fns[1] + '.pkl')).is_file():
        print('Create Learner with pretrained weights')
        # create the learner with previously trained weights
        learn = language_model_learner(data, 
                                       AWD_LSTM, 
                                       drop_mult = 0.5,
                                       path = path,
                                       pretrained_fnames = lm_fns).to_fp16()
    else:
        # create the learner for first batch
        learn = language_model_learner(data, 
                                       AWD_LSTM, 
                                       drop_mult = 0.5, 
                                       pretrained = False).to_fp16()
    
    # learn on current batch
    learn.unfreeze()
    learn.fit_one_cycle(1, lr, moms=(0.8,0.7))
    
    # save weights and vocab    
    print('Save LM learner at: ', current_path)
    learn.to_fp32().save(mdl_path.resolve() / lm_fns[0], with_opt=False)   
    
    tmp_vocab = learn.data.vocab
    tmp_vocab.save(mdl_path.resolve() / (lm_fns[1] + '.pkl'))
    
    done.append(folder)
    
    # backup
    if (k % 10) == 0:
        fn = 'backup_at_' + str(folder)
        learn.save(fn)
    
    # release GPU memory
    del(data)
    del(learn)
    torch.cuda.empty_cache()

In [14]:
def gradually_learning(folders, bs = 64, lr = 1e-2, bwd = False, sentence_piece = False):
    
    lr = lr
    bs =  bs
    lr *= bs/48  # Scale learning rate by batch size
    quotemark = ''

    
    tmp_vocab = None
    lm_fns = ['tmp','tmp_vocab']
    done = []
    
    if sentence_piece:
        proc = [OpenFileProcessor(), SPProcessor()] # processor for sentencepiece
        lm_fns = [fn + '_SP' for fn in lm_fns]
    else:
        proc = None
        
    if bwd:
        lm_fns = [fn + '_bwd' for fn in lm_fns]
    
    for k,folder in enumerate(folders):
        # skip already used folders
        if folder in done:
            continue
        folder = 'char'

        current_path = doc_path / folder
        print('Load DataBunch from: ', current_path)

        # load the (text) files from the current folder
        data = (TextList.from_folder(current_path, vocab = tmp_vocab, processor = proc)
                .split_by_rand_pct(0.1, seed = 42)
                .label_for_lm()           
                .databunch(bs = bs, num_workers = 0, backwards = bwd))

        # check if pretrained weights exist
        if (mdl_path / (lm_fns[0] + '.pth')).is_file() and (mdl_path / (lm_fns[1] + '.pkl')).is_file():
            print('Create Learner with pretrained weights')
            # create the learner with previously trained weights
            learn = language_model_learner(data, 
                                           AWD_LSTM, 
                                           drop_mult = 0.5,
                                           path = path,
                                           pretrained_fnames = lm_fns).to_fp16()
        else:
            # create the learner for first batch
            learn = language_model_learner(data, 
                                           AWD_LSTM, 
                                           drop_mult = 0.5, 
                                           pretrained = False).to_fp16()

        # learn on current batch
        print('Start learning')
        learn.unfreeze()
        learn.fit_one_cycle(1, lr, moms=(0.8,0.7))
        
        break
        # save weights and vocab    
        print('Save LM learner')
        learn.to_fp32().save(mdl_path.resolve() / lm_fns[0], with_opt=False)   

        tmp_vocab = learn.data.vocab
        tmp_vocab.save(mdl_path.resolve() / (lm_fns[1] + '.pkl'))

        done.append(folder)

        # backup
        if (k % 10) == 0:
            fn = 'backup_at_' + str(folder)
            learn.save(fn)

        # release GPU memory
        del(data)
        del(learn)
        torch.cuda.empty_cache()

In [16]:
gradually_learning(folders, bs = 32, bwd = True, sentence_piece = True)

Load DataBunch from:  data/dewiki/docs/char


Start learning


epoch,train_loss,valid_loss,accuracy,time
0,4.28254,4.127473,0.398712,03:54
