<a href="https://colab.research.google.com/github/NazarioR9/BNBR_Challenge/blob/master/mask_language_modeling/MLM_BertBase_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# References

In [None]:
# How to Fine-Tune BERT for Text Classification? (https://arxiv.org/pdf/1905.05583.pdf)
# https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb

# Installs

In [None]:
# Install Hugging Face transformers (source of the BERT classes, Trainer and collator imported below).
# NOTE(review): the version is not pinned; the Trainer call later in this notebook passes
# `prediction_loss_only`, which may not be accepted by every transformers release — consider pinning.
!pip install transformers --quiet

[K     |████████████████████████████████| 675kB 2.8MB/s 
[K     |████████████████████████████████| 890kB 12.3MB/s 
[K     |████████████████████████████████| 1.1MB 16.6MB/s 
[K     |████████████████████████████████| 3.8MB 26.8MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
# Check that we have a GPU
!nvidia-smi

Fri Jun 26 18:39:22 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Imports

In [None]:
import os, sys, gc

In [None]:
import pandas as pd

In [None]:
import torch
from transformers import BertConfig, BertTokenizerFast, BertForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

In [None]:
from transformers import pipeline

In [None]:
# Put '../bnbr' first on the module search path, presumably so the project-local
# `bnbr` imports in the next cell resolve.
# NOTE(review): the path is relative to the kernel's working directory — confirm the
# notebook is always run from its own folder.
sys.path.insert(0, '../bnbr')

In [None]:
from bnbr.utils import seed_everything
from bnbr.data import MentalHealthDataset

Using TensorFlow backend.


# Envs

In [None]:
seed_everything(2020)

Set seed to 2020.


In [None]:
!mkdir ../mlm_finetuned_models

# Utils

In [None]:
class MLMDataset(MentalHealthDataset):
  """Dataset that yields token-id tensors for masked-language-model fine-tuning.

  Builds on `MentalHealthDataset` (initialised with task='test') and turns the
  text of each item into a tensor produced by the tokenizer's `encode`.

  NOTE(review): `encode` is called without explicit truncation arguments —
  confirm sequence lengths are bounded as intended.
  """

  def __init__(self, df, tokenizer):
    """Initialise the parent dataset over `df` and remember the tokenizer.

    Args:
      df: data handed unchanged to `MentalHealthDataset`.
      tokenizer: object exposing `encode(text)`.
    """
    super().__init__(df, task='test')
    self.tokenizer = tokenizer

  def __getitem__(self, idx):
    """Return the encoded text of item `idx` as a `torch.Tensor`."""
    # The parent item unpacks into three values; only the first (the text) is used.
    text, _second, _third = super().__getitem__(idx)
    token_ids = self.tokenizer.encode(text)
    return torch.tensor(token_ids)

# Read data

In [None]:
path = '../data/'

In [None]:
# Load the train and test splits from the directory held in `path`.
train = pd.read_csv(f'{path}final_train.csv')
test = pd.read_csv(f'{path}final_test.csv')

In [None]:
# Add a constant placeholder 'length' column (all zeros) to both splits; it is
# selected together with 'text' when the dataset is built further down.
# NOTE(review): presumably MentalHealthDataset expects this column — confirm.
train['length'] = 0
test['length'] = 0

In [None]:
# Stack the train and test rows into one corpus for MLM fine-tuning (only the
# 'text' and 'length' columns are used later, so labels are irrelevant here).
# ignore_index=True gives the combined frame a fresh 0..n-1 index, so row labels
# from `train` and `test` do not collide in label-based lookups.
df = pd.concat([train, test], ignore_index=True)

In [None]:
df.head()

Unnamed: 0,ID,text,label,Depression,Alcohol,Suicide,Drugs,length
0,SUAVK39Z,i feel that it was better i die am happy,0.0,1.0,0.0,0.0,0.0,0
1,9JDAGUV3,why do i get hallucinations ?,3.0,0.0,0.0,0.0,1.0,0
2,419WR1LQ,i am stressed due to lack of financial suppor...,0.0,1.0,0.0,0.0,0.0,0
3,6UY7DX6Q,why is life important ?,2.0,0.0,0.0,1.0,0.0,0
4,FYC0FTFB,how could i be helped to go through the depre...,0.0,1.0,0.0,0.0,0.0,0


# Fine-tuning MLM

In [None]:
model_name = 'bert-base-uncased'

In [None]:
# Load the pretrained configuration for `model_name`. In the visible cells it is
# only referenced by the commented-out "from scratch" alternative in the model cell.
config = BertConfig.from_pretrained(model_name)

# Fast tokenizer for the same checkpoint, with its maximum model length set to 50.
tokenizer = BertTokenizerFast.from_pretrained(
    model_name,
    model_max_length=50,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
# Load the masked-LM model from the pretrained checkpoint named by `model_name`.
model = BertForMaskedLM.from_pretrained(model_name)
# Disabled alternative: build the model from `config` alone instead of the checkpoint.
# model = BertForMaskedLM(config=config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [None]:
# Build the MLM dataset from only the 'text' and 'length' columns of the combined
# frame; each item is tokenized by MLMDataset.__getitem__.
dataset = MLMDataset(df[['text', 'length']], tokenizer)

In [None]:
# Batch collator for language modelling, configured for the masked-LM objective
# (mlm=True) with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
# Training hyper-parameters: 3 epochs, batch size 64 per device, a checkpoint
# every 10,000 steps with at most 2 checkpoints kept.
training_args = TrainingArguments(
    # output_dir was missing from this call; it is a required argument in the
    # transformers releases this notebook's Trainer call targets. Use the directory
    # created in the "Envs" section.
    output_dir='../mlm_finetuned_models',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

In [None]:
# Wire the model, training arguments, MLM collator and dataset into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    # NOTE(review): presumably limits evaluation/prediction output to the loss —
    # confirm this keyword is accepted by the installed transformers version.
    prediction_loss_only=True,
)

### Start training

In [None]:
%%time
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=15.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=15.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=15.0, style=ProgressStyle(description_wid…



CPU times: user 8.98 s, sys: 2.9 s, total: 11.9 s
Wall time: 12.2 s


TrainOutput(global_step=45, training_loss=2.5907892730500963)

#### 🎉 Save final model

In [None]:
!mkdir fms

In [None]:
# Write the fine-tuned model into the local "fms/" staging directory.
trainer.save_model("fms/")

In [None]:
# Recursively archive the staging directory into mlm_bert_base_.zip.
!zip -r mlm_bert_base_.zip fms/

  adding: mlm_finetuned/ (stored 0%)
  adding: mlm_finetuned/pytorch_model.bin (deflated 8%)
  adding: mlm_finetuned/config.json (deflated 47%)
  adding: mlm_finetuned/training_args.bin (deflated 36%)


In [None]:
# Remove the uncompressed staging directory; the zip created above is what remains.
!rm -r fms/