# SENTIMENT ANALYSIS USING ROMANIAN BERT

In [None]:
!pip3 install transformers tokenizers pytorch-lightning torch

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from transformers import *
import logging
import os
from functools import lru_cache
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import pytorch_lightning as pl
import pandas as pd
from sklearn.metrics import classification_report
from training_module import TrainingModule
import matplotlib.pyplot as plt
from argparse import Namespace
import matplotlib
# Apply the 'ggplot' style sheet to matplotlib figures drawn in this notebook.
matplotlib.style.use('ggplot')

### BERT MODEL ###

In [2]:
# Identifier of the pretrained Romanian BERT checkpoint to load.
name = "dumitrescustefan/bert-base-romanian-cased-v1"

# Hyperparameters consumed by TrainingModule and by the Lightning trainer
# (the trainer reads `epochs` and `accumulate_grad_batches`).
hparams = Namespace(
    **{
        "batch_size": 16,
        "warmup_steps": 100,
        "epochs": 1,
        "lr": 5e-4,
        "accumulate_grad_batches": 1,
    }
)

# Tokenizer, config and encoder all come from the same checkpoint.
# The config is created with output_hidden_states=True and passed to the model.
tokenizer = AutoTokenizer.from_pretrained(name)
config = BertConfig.from_pretrained(name, output_hidden_states=True)
bert_model = AutoModel.from_pretrained(name, config=config)

INFO:transformers.tokenization_utils:Model name 'dumitrescustefan/bert-base-romanian-cased-v1' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, bert-base-finnish-cased-v1, bert-base-finnish-uncased-v1). Assuming 'dumitrescustefan/bert-base-romanian-cased-v1' is a path or url to a directory containing tokenizer files.
INFO:transformers.tokenization_utils:Didn't find file dumitrescustefan/bert-base-romanian-cased-v1/added_tokens.json. We won't load it.
INFO:transformers.tokenization_utils:Didn't find file dumitrescustefan/bert-base-romanian-cased-v1

### TRAINING MODULE ###

In [3]:
# Bundle the pretrained encoder, its tokenizer and the hyperparameters into the
# project's TrainingModule, which is what the trainer fits below.
module = TrainingModule(
    bert_model=bert_model,
    tokenizer=tokenizer,
    hparams=hparams,
)

### TRAINING ###

The trainer is where the magic happens. We feed the TrainingModule to the trainer's `fit` method, and we can very easily switch from CPU to GPU.

In [4]:
## Training takes roughly 10-15 minutes with a GPU enabled.
## NOTE: gpus=0 below keeps the run on the CPU; raise it to train on a GPU.
trainer = pl.Trainer(
    gpus=0,
    max_epochs=hparams.epochs,
    progress_bar_refresh_rate=10,
    accumulate_grad_batches=hparams.accumulate_grad_batches,
)

# Fit the training module; the data loaders are not passed here, so they
# presumably come from the module itself — confirm in training_module.py.
trainer.fit(module)

INFO:lightning:GPU available: False, used: False
INFO:lightning:
    | Name                                                    | Type              | Params
------------------------------------------------------------------------------------------
0   | model                                                   | SentimentModel    | 125 M 
1   | model.model                                             | BertModel         | 124 M 
2   | model.model.embeddings                                  | BertEmbeddings    | 38 M  
3   | model.model.embeddings.word_embeddings                  | Embedding         | 38 M  
4   | model.model.embeddings.position_embeddings              | Embedding         | 393 K 
5   | model.model.embeddings.token_type_embeddings            | Embedding         | 1 K   
6   | model.model.embeddings.LayerNorm                        | LayerNorm         | 1 K   
7   | model.model.embeddings.dropout                          | Dropout           | 0     
8   | model.model.encoder

HBox(children=(IntProgress(value=1, bar_style='info', description='Validation sanity check', layout=Layout(fle…

{'valid_loss': tensor(0.8596), 'log': {'valid_loss': tensor(0.8596)}, 'progress_bar': {'valid_loss': tensor(0.8596)}}
{'valid_loss': tensor(0.8689), 'log': {'valid_loss': tensor(0.8689)}, 'progress_bar': {'valid_loss': tensor(0.8689)}}


KeyboardInterrupt: 

### TESTING ###

In [None]:
# Run the trainer's test loop. It is called without arguments, so it presumably
# evaluates the module that was passed to trainer.fit — confirm against the
# installed pytorch-lightning version.
trainer.test()

In [65]:
# Score the test set by hand and print a precision/recall/F1 report.
module.eval()  # put the module in evaluation mode before predicting
true_y, pred_y = [], []
with torch.no_grad():  # inference only, no gradient tracking needed
    for X, y in module.test_dataloader():
        # Predicted class = index of the largest score along dim 1
        # (assumes module(X) returns (batch, n_classes) scores — TODO confirm).
        y_pred = torch.argmax(module(X), dim=1)
        # Collect plain Python ints rather than 0-dim tensors so scikit-learn
        # receives ordinary label lists (assumes y is a tensor batch of labels).
        true_y.extend(y.tolist())
        pred_y.extend(y_pred.tolist())

print("\n" + "_" * 80)
# NOTE(review): a single entry in target_names only matches a test set whose
# labels all belong to one class; supply one name per class (or drop the
# argument) once the real label names are known.
print(classification_report(true_y, pred_y, target_names=["Results"], digits=2))


________________________________________________________________________________
              precision    recall  f1-score   support

     Results       1.00      1.00      1.00        10

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



### PLOT RESULTS ###

In [None]:
# Load the TensorBoard notebook extension, then start TensorBoard inline,
# reading the logs found under the lightning_logs/ directory.
%load_ext tensorboard
%tensorboard --logdir lightning_logs/