# Finetuning SMSA
Originally from: https://github.com/indobenchmark/indonlu/blob/master/examples/finetune_smsa.ipynb

*dengan beberapa penyesuaian dan tambahan

In [None]:
# Install the transformers library from Hugging Face
!pip install -q transformers

[K     |████████████████████████████████| 1.1MB 2.7MB/s 
[K     |████████████████████████████████| 3.0MB 8.6MB/s 
[K     |████████████████████████████████| 890kB 29.7MB/s 
[K     |████████████████████████████████| 1.1MB 32.5MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
# Clone the indonlu repository, because several of its utils are imported by this notebook
!git clone https://github.com/indobenchmark/indonlu

Cloning into 'indonlu'...
remote: Enumerating objects: 252, done.[K
remote: Counting objects: 100% (252/252), done.[K
remote: Compressing objects: 100% (158/158), done.[K
remote: Total 252 (delta 104), reused 235 (delta 94), pack-reused 0[K
Receiving objects: 100% (252/252), 4.77 MiB | 2.26 MiB/s, done.
Resolving deltas: 100% (104/104), done.


In [None]:
import os, sys
# sys.path.insert(0,'/home/karissa/indonlu/')

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from nltk.tokenize import TweetTokenizer, word_tokenize

from indonlu.utils.forward_fn import forward_sequence_classification
from indonlu.utils.metrics import document_sentiment_metrics_fn
from indonlu.utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

## Helper functions

In [None]:
# common functions
###
def set_seed(seed):
    """Apply one seed to every random number generator used in this notebook.

    The same ``seed`` is handed, in order, to ``random.seed``,
    ``np.random.seed``, ``torch.manual_seed`` and ``torch.cuda.manual_seed``.
    Nothing is returned.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    
def count_param(module, trainable=False):
    """Return the total number of elements across ``module``'s parameters.

    When ``trainable`` is truthy, only parameters whose ``requires_grad`` flag
    is set are counted; otherwise every parameter is included.
    """
    params = module.parameters()
    if trainable:
        # Keep only the parameters that would receive gradients
        params = (p for p in params if p.requires_grad)
    return sum(map(lambda p: p.numel(), params))
    
def get_lr(optimizer):
    """Return the ``'lr'`` entry of the optimizer's first parameter group.

    Returns ``None`` when ``optimizer.param_groups`` is empty.
    """
    groups = iter(optimizer.param_groups)
    try:
        first_group = next(groups)
    except StopIteration:
        return None
    return first_group['lr']

def metrics_to_string(metric_dict):
    """Format a metrics mapping as space-separated ``name:value`` pairs.

    Values are rendered with two decimal places and pairs appear in the
    mapping's iteration order. An empty mapping yields an empty string.
    """
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

In [None]:
# Set the random seed for the random, NumPy and PyTorch generators (see set_seed)
set_seed(3102020)

## Load model

In [None]:
# Checkpoint name shared by the tokenizer, the config and the model weights
pretrained_name = 'indobenchmark/indobert-base-p1'

# Load the tokenizer and config; the number of output labels is taken from the dataset class
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
config = BertConfig.from_pretrained(pretrained_name)
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate the sequence-classification model from the same checkpoint and the adjusted config
model = BertForSequenceClassification.from_pretrained(pretrained_name, config=config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=229167.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1534.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=497810400.0, style=ProgressStyle(descri…




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenizer round trip on a sample sentence
text = "Budi suka bermain bola."

encoding = tokenizer.encode(text)       # list of token ids
decoding = tokenizer.decode(encoding)   # ids converted back to a string
encoding_input = tokenizer(text)        # dict with input_ids, token_type_ids and attention_mask

# One value per line, in the order: ids, decoded text, full encoding
print(encoding, decoding, encoding_input, sep='\n')

[2, 5103, 1506, 1326, 1522, 30470, 3]
[CLS] budi suka bermain bola. [SEP]
{'input_ids': [2, 5103, 1506, 1326, 1522, 30470, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [None]:
# Display the model configuration, including the num_labels value assigned after loading
config

BertConfig {
  "_num_labels": 5,
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 50000
}

In [None]:
# Display the module structure of the classification model
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# Total number of parameter elements in the model (trainable filter not applied)
count_param(model)

124443651

## Prepare Dataset

In [None]:
# All three SmSA splits live in the same directory of the cloned indonlu repository
smsa_dir = 'indonlu/dataset/smsa_doc-sentiment-prosa'

train_dataset_path = smsa_dir + '/train_preprocess.tsv'
valid_dataset_path = smsa_dir + '/valid_preprocess.tsv'
test_dataset_path = smsa_dir + '/test_preprocess_masked_label.tsv'

In [None]:
# Build the train, validation and test datasets with the same tokenizer and lowercasing
train_dataset, valid_dataset, test_dataset = (
    DocumentSentimentDataset(split_path, tokenizer, lowercase=True)
    for split_path in (train_dataset_path, valid_dataset_path, test_dataset_path)
)

# Shape of the underlying data table of each split, in train / valid / test order
tuple(split.data.shape for split in (train_dataset, valid_dataset, test_dataset))

((11000, 2), (1260, 2), (500, 2))

In [None]:
# Preview the first rows of the training data
train_dataset.data.head()

Unnamed: 0,text,sentiment
0,warung ini dimiliki oleh pengusaha pabrik tahu...,0
1,mohon ulama lurus dan k212 mmbri hujjah partai...,1
2,lokasi strategis di jalan sumatera bandung . t...,0
3,betapa bahagia nya diri ini saat unboxing pake...,0
4,duh . jadi mahasiswa jangan sombong dong . kas...,2


In [None]:
# Mapping from sentiment label name to its integer index
train_dataset.LABEL2INDEX

{'negative': 2, 'neutral': 1, 'positive': 0}

In [None]:
# Number of training examples per value of the sentiment column
train_dataset.data.sentiment.value_counts()

0    6416
2    3436
1    1148
Name: sentiment, dtype: int64

In [None]:
# Number of validation examples per value of the sentiment column
valid_dataset.data.sentiment.value_counts()

0    735
2    394
1    131
Name: sentiment, dtype: int64

In [None]:
# Data loaders: every split shares the same sequence length, batch size and worker count;
# only the training loader shuffles its examples.
loader_kwargs = dict(max_seq_len=512, batch_size=16, num_workers=16)

train_loader = DocumentSentimentDataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, shuffle=False, **loader_kwargs)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, shuffle=False, **loader_kwargs)

In [None]:
# Length of each loader, in train / valid / test order
len(train_loader), len(valid_loader), len(test_loader)

(688, 79, 32)

In [None]:
# Label lookups in both directions; i2w is the one passed to the forward helper further down
w2i = DocumentSentimentDataset.LABEL2INDEX
i2w = DocumentSentimentDataset.INDEX2LABEL
print(w2i, i2w, sep='\n')

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


## Fine Tuning & Evaluation

In [None]:
# Move the model to the GPU first, then build the optimizer from its parameters.
# The torch.optim documentation asks for this order when a model is moved with .cuda().
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=5e-6)

In [None]:
# Train
# Fine-tune for n_epochs passes over train_loader. After each pass, print the
# training metrics and evaluate on valid_loader.
n_epochs = 3
for epoch in range(n_epochs):
    # ---- Training pass ----
    model.train()
    # Switch autograd on explicitly, in case another cell turned it off globally
    torch.set_grad_enabled(True)

    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model (the last element of the batch is not passed to the forward helper)
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # Collect predictions and labels for the epoch-level metrics
        list_hyp += batch_hyp
        list_label += batch_label

        # Progress bar shows the mean loss over the batches seen so far
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # ---- Validation pass ----
    model.eval()

    total_loss = 0
    list_hyp, list_label = [], []

    # Gradients are disabled only inside this block, so no global autograd
    # state is left switched off once the cell finishes.
    with torch.no_grad():
        pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
        for i, batch_data in enumerate(pbar):
            loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

            # Calculate total loss
            total_loss += loss.item()

            # Calculate evaluation metrics over everything seen so far (shown on the progress bar)
            list_hyp += batch_hyp
            list_label += batch_label
            metrics = document_sentiment_metrics_fn(list_hyp, list_label)

            pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))

    # Final validation metrics for the epoch
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

(Epoch 1) TRAIN LOSS:0.2629 LR:0.00000500: 100%|██████████| 688/688 [01:46<00:00,  6.44it/s]
  0%|          | 0/79 [00:00<?, ?it/s]

(Epoch 1) TRAIN LOSS:0.2629 ACC:0.90 F1:0.87 REC:0.86 PRE:0.89 LR:0.00000500


VALID LOSS:0.2072 ACC:0.92 F1:0.89 REC:0.88 PRE:0.91: 100%|██████████| 79/79 [00:05<00:00, 15.14it/s]
  0%|          | 0/688 [00:00<?, ?it/s]

(Epoch 1) VALID LOSS:0.2072 ACC:0.92 F1:0.89 REC:0.88 PRE:0.91


(Epoch 2) TRAIN LOSS:0.1294 LR:0.00000500: 100%|██████████| 688/688 [01:46<00:00,  6.44it/s]
  0%|          | 0/79 [00:00<?, ?it/s]

(Epoch 2) TRAIN LOSS:0.1294 ACC:0.96 F1:0.94 REC:0.94 PRE:0.95 LR:0.00000500


VALID LOSS:0.2184 ACC:0.92 F1:0.89 REC:0.89 PRE:0.91: 100%|██████████| 79/79 [00:05<00:00, 15.15it/s]
  0%|          | 0/688 [00:00<?, ?it/s]

(Epoch 2) VALID LOSS:0.2184 ACC:0.92 F1:0.89 REC:0.89 PRE:0.91


(Epoch 3) TRAIN LOSS:0.0820 LR:0.00000500: 100%|██████████| 688/688 [01:46<00:00,  6.44it/s]
  0%|          | 0/79 [00:00<?, ?it/s]

(Epoch 3) TRAIN LOSS:0.0820 ACC:0.97 F1:0.97 REC:0.96 PRE:0.97 LR:0.00000500


VALID LOSS:0.1856 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92: 100%|██████████| 79/79 [00:05<00:00, 15.25it/s]

(Epoch 3) VALID LOSS:0.1856 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92





In [None]:
# Evaluate on test
# Predict a label for every example in test_loader and write the predictions to prediction.csv.
# No metrics are computed here — presumably because the test labels are masked
# (the file is named test_preprocess_masked_label.tsv); confirm against the dataset.
model.eval()

list_hyp = []

# Inference only: gradients are disabled just for the duration of the loop
with torch.no_grad():
    pbar = tqdm(test_loader, leave=True, total=len(test_loader))
    for batch_data in pbar:
        # Keep only the predicted labels; the loss and label outputs are discarded
        _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        list_hyp += batch_hyp

# Save prediction
# reset_index() adds the row number as an explicit 'index' column next to 'label'
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('prediction.csv', index=False)

100%|██████████| 32/32 [00:01<00:00, 16.64it/s]


In [None]:
# Preview the first rows of the prediction table
df.head()

Unnamed: 0,index,label
0,0,negative
1,1,negative
2,2,negative
3,3,negative
4,4,negative


In [None]:
# Number of test predictions per label
df.label.value_counts()

positive    218
negative    213
neutral      69
Name: label, dtype: int64