# Vietnamese ULMFiT from scratch

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.text import *

ModuleNotFoundError: No module named 'fastai'

In [4]:
# Batch size used by the cells below (alternatives kept for reference: 48, 128).
bs = 24

In [3]:
# Make GPU 0 the current CUDA device for this kernel.
torch.cuda.set_device(0)

In [5]:
#data_path = Config.data_path()


This will create a `{lang}wiki` folder (`dewiki` for the `lang = 'de'` set below), containing a `{lang}wiki` text file with the Wikipedia contents. (For other languages, set `lang` to the appropriate code from the [list of Wikipedias](https://meta.wikimedia.org/wiki/List_of_Wikipedias).)

In [6]:
# Wikipedia language code; `name` and `lm_fns` below are derived from it.
lang = 'de'

In [7]:
# Name of the Wikipedia dump for the selected language, e.g. 'dewiki'.
name = f'{lang}wiki'
# Derive the working folder from `name` instead of hard-coding 'data/dewiki',
# so changing `lang` cannot leave `path` pointing at another language's data.
path = Path('data')/name
path.mkdir(exist_ok=True, parents=True)
# File stems for the pretrained language-model weights and its vocab.
lm_fns = [f'{lang}_wt', f'{lang}_wt_vocab']

## Vietnamese wikipedia model

### Download data

In [8]:
from nlputils import split_wiki,get_wiki

In [None]:
# Fetch the Wikipedia text for `lang` into `path` using the nlputils helper.
get_wiki(path,lang)

In [None]:
# Peek at the first four lines of the extracted text file.
! head -n4 {path}/{name}

This function splits the single wikipedia file into a separate file per article. This is often easier to work with.

In [9]:
# Split the single wiki file into one text file per article; `dest` is the folder holding them.
dest = split_wiki(path,lang)

data/dewiki/docs already exists; not splitting


In [17]:
# Sanity check: list the first five per-article files.
dest.ls()[:5]

[PosixPath('data/dewiki/docs/Telebit.txt'),
 PosixPath('data/dewiki/docs/Sakaran Dandai.txt'),
 PosixPath('data/dewiki/docs/Bilateraler Wechselkurs.txt'),
 PosixPath('data/dewiki/docs/Joyce Compton.txt'),
 PosixPath('data/dewiki/docs/David Fabricius.txt')]

927396

In [None]:
# Use this to convert Chinese traditional to simplified characters
# ls *.txt | parallel -I% opencc -i % -o ../zhsdocs/% -c t2s.json

### Create pretrained model

In [22]:
# Build the language-model DataBunch from the per-article files in `dest`
# (the folder returned by split_wiki) rather than a hard-coded path, and use
# the notebook-wide batch size `bs` instead of a one-off literal.
data = (TextList.from_folder(dest)
            .split_by_rand_pct(0.1, seed=42)   # hold out 10% of the articles, reproducibly
            .label_for_lm()
            .databunch(bs=bs, num_workers=1))

# Persist the processed DataBunch: the restart section reloads it with
# load_data(path, f'{lang}_databunch'), so this step must not be skipped.
# An absolute target is passed so the file lands directly under `path`
# regardless of which folder the DataBunch itself is rooted at.
# NOTE(review): relies on fastai v1 DataBunch.save joining `file` onto data.path - confirm.
data.save(path.resolve()/f'{lang}_databunch')
# Vocabulary size and number of training documents, as a quick sanity check.
len(data.vocab.itos),len(data.train_ds)

KeyboardInterrupt: 

## <font color="red">From now on, start from here</font>

In [20]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.text import *

# Batch size for this session (alternatives kept for reference: 48, 128).
bs = 24

# Wikipedia language code; every name below is derived from it.
lang = 'de'
name = f'{lang}wiki'

# Working folder, derived from `name` rather than hard-coded so it cannot
# drift out of sync with `lang`.
path = Path('data')/name
path.mkdir(exist_ok=True, parents=True)

# File stems of the pretrained language-model weights and vocab.
lm_fns = [f'{lang}_wt', f'{lang}_wt_vocab']

In [None]:
# Make GPU 0 the current CUDA device for this kernel.
torch.cuda.set_device(0)

In [None]:
# Reload the saved language-model DataBunch from `path` with batch size `bs`.
data = load_data(path, f'{lang}_databunch', bs=bs)

In [None]:
# AWD-LSTM language model created without pretrained weights (pretrained=False),
# then switched to mixed-precision training.
learn = language_model_learner(data, AWD_LSTM, drop_mult=0.5, pretrained=False)
learn = learn.to_fp16()

In [None]:
# Base rate 1e-2 is quoted for a batch size of 48; scale it linearly to the actual `bs`.
lr = 1e-2 * (bs/48)

In [None]:
# Make every layer trainable, then run 10 one-cycle epochs with maximum rate `lr`.
learn.unfreeze()
learn.fit_one_cycle(10, lr, moms=(0.8,0.7))

Save the pretrained model and vocab:

In [None]:
# Use an absolute models folder. Learner.save joins a path-like `file` onto
# learn.path/learn.model_dir, so with the relative `path` used in this notebook
# a relative target would be nested under the models folder a second time and
# the write would fail. An absolute path is used as-is.
# NOTE(review): based on fastai v1 Learner.save behaviour - confirm against the installed version.
mdl_path = (path/'models').resolve()
mdl_path.mkdir(exist_ok=True, parents=True)
# Convert back to fp32 and save the weights only (no optimizer state) under the
# stem lm_fns[0]; the vocab goes next to it as lm_fns[1] + '.pkl'. These are the
# files later referenced via pretrained_fnames=lm_fns.
learn.to_fp32().save(mdl_path/lm_fns[0], with_opt=False)
learn.data.vocab.save(mdl_path/(lm_fns[1] + '.pkl'))

## Vietnamese sentiment analysis

### Language model

- [Data](https://github.com/ngxbac/aivivn_phanloaisacthaibinhluan/tree/master/data)
- [Competition details](https://www.aivivn.com/contests/1)
- Top 3 f1 scores: 0.900, 0.897, 0.897

In [None]:
# Load the labelled training comments; missing comments become the literal string 'NA'.
train_df = pd.read_csv(path/'train.csv')
train_df['comment'] = train_df['comment'].fillna('NA')
train_df.head()

In [None]:
# Load the test comments; missing comments become the literal string 'NA'.
test_df = pd.read_csv(path/'test.csv')
test_df['comment'] = test_df['comment'].fillna('NA')
test_df.head()

In [None]:
# Stack train and test rows so the language model can be fine-tuned on all comments;
# sort=False leaves the column order unsorted.
df = pd.concat([train_df,test_df], sort=False)

In [None]:
# Language-model DataBunch over the `comment` column of the combined frame:
# 10% random validation split (fixed seed), labelled for language modelling.
data_lm = (TextList.from_df(df, path, cols='comment')
    .split_by_rand_pct(0.1, seed=42)
    .label_for_lm()           
    .databunch(bs=bs, num_workers=1))

In [None]:
# Language-model learner initialised from the pretrained weights/vocab named in `lm_fns`.
learn_lm = language_model_learner(data_lm, AWD_LSTM, pretrained_fnames=lm_fns, drop_mult=1.0)

In [None]:
# Base rate 1e-3 is quoted for a batch size of 48; scale it linearly to the actual `bs`.
lr = 1e-3 * (bs/48)

In [None]:
# Two one-cycle epochs at 10x the base rate before unfreezing.
# NOTE(review): presumably only the last layer group is trainable here - confirm.
learn_lm.fit_one_cycle(2, lr*10, moms=(0.8,0.7))

In [None]:
# Make every layer trainable and fine-tune for 8 one-cycle epochs at the base rate.
learn_lm.unfreeze()
learn_lm.fit_one_cycle(8, lr, moms=(0.8,0.7))

In [None]:
# Save the fine-tuned language model, and separately its encoder, which the
# classifier loads later via load_encoder.
learn_lm.save(f'{lang}fine_tuned')
learn_lm.save_encoder(f'{lang}fine_tuned_enc')

### Classifier

In [None]:
# Classifier DataBunch from the labelled training frame only. It reuses the
# language model's vocab (vocab=data_lm.vocab) and takes targets from `label`.
data_clas = (TextList.from_df(train_df, path, vocab=data_lm.vocab, cols='comment')
    .split_by_rand_pct(0.1, seed=42)
    .label_from_df(cols='label')
    .databunch(bs=bs, num_workers=1))

# Persist it so later cells can reload it with load_data.
data_clas.save(f'{lang}_textlist_class')

In [None]:
# Reload the saved classifier DataBunch from `path`.
data_clas = load_data(path, f'{lang}_textlist_class', bs=bs, num_workers=1)

In [None]:
from sklearn.metrics import f1_score

@np_func
def f1(inp,targ):
    "F1 score of the arg-max class of `inp` (taken over the last axis) against the labels `targ`."
    predicted = np.argmax(inp, axis=-1)
    return f1_score(targ, predicted)

In [None]:
# Mixed-precision AWD-LSTM classifier reporting accuracy and F1. It loads the
# fine-tuned language-model encoder, then freezes before the first training stage.
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
learn_c.load_encoder(f'{lang}fine_tuned_enc')
learn_c.freeze()

In [None]:
# Base rate 2e-2 is quoted for a batch size of 48; scale it linearly to the actual `bs`.
lr = 2e-2 * (bs/48)

In [None]:
# First stage with the learner still frozen: two one-cycle epochs at `lr`.
learn_c.fit_one_cycle(2, lr, moms=(0.8,0.7))

In [None]:
# A second, identical two-epoch cycle with the same frozen setup.
learn_c.fit_one_cycle(2, lr, moms=(0.8,0.7))

In [None]:
# Gradual unfreezing: train the last two layer groups. The slice spreads the
# learning rate across groups, from lr/2.6**4 for the earliest up to lr for the last.
learn_c.freeze_to(-2)
learn_c.fit_one_cycle(2, slice(lr/(2.6**4),lr), moms=(0.8,0.7))

In [None]:
# Train the last three layer groups, with the learning-rate range halved.
learn_c.freeze_to(-3)
learn_c.fit_one_cycle(2, slice(lr/2/(2.6**4),lr/2), moms=(0.8,0.7))

In [None]:
# Final stage: all layers trainable, one epoch at a tenth of the original rate range.
learn_c.unfreeze()
learn_c.fit_one_cycle(1, slice(lr/10/(2.6**4),lr/10), moms=(0.8,0.7))

In [None]:
# Save the trained classifier; the Ensemble section reloads it by this name.
learn_c.save(f'{lang}clas')

Competition top 3 F1 scores: 0.900, 0.897, 0.897. The winner used an ensemble of 4 models: TextCNN, VDCNN, HARNN, and SARNN.

## Ensemble

In [None]:
# Rebuild the classifier from the saved DataBunch and load the saved weights.
data_clas = load_data(path, f'{lang}_textlist_class', bs=bs, num_workers=1)
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
learn_c.load(f'{lang}clas', purge=False);

In [None]:
# Predictions and targets from the forward model (get_preds default dataset,
# presumably the validation set - confirm), followed by its accuracy and F1.
preds,targs = learn_c.get_preds(ordered=True)
accuracy(preds,targs),f1(preds,targs)

In [None]:
# Same setup for the backwards model: the data is loaded with backwards=True.
# NOTE(review): neither the '{lang}_textlist_class_bwd' DataBunch nor the
# '{lang}clas_bwd' weights are created anywhere in this notebook - presumably
# they come from a separate backwards training run; confirm they exist first.
data_clas_bwd = load_data(path, f'{lang}_textlist_class_bwd', bs=bs, num_workers=1, backwards=True)
learn_c_bwd = text_classifier_learner(data_clas_bwd, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
learn_c_bwd.load(f'{lang}clas_bwd', purge=False);

In [None]:
# Predictions and targets from the backwards model, followed by its accuracy and F1.
preds_b,targs_b = learn_c_bwd.get_preds(ordered=True)
accuracy(preds_b,targs_b),f1(preds_b,targs_b)

In [None]:
# Ensemble by averaging the forward and backwards predictions element-wise.
preds_avg = (preds+preds_b)/2

In [None]:
# Accuracy and F1 of the averaged predictions, scored against targs_b.
# NOTE(review): assumes both get_preds(ordered=True) calls return items in the
# same order, so that targs and targs_b are identical - confirm.
accuracy(preds_avg,targs_b),f1(preds_avg,targs_b)