# Finetuning Exam Question Classification - IndoBERT


## Preparation

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 4.0 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 45.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 45.6 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml


In [3]:
pip install torch



In [4]:
pip install tqdm



In [5]:
import os, sys
sys.path.append('../')
os.chdir('../')

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

#from tools.forward_fn import forward_sequence_classification
#from tools.metrics import document_sentiment_metrics_fn
#from tools.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

In [6]:
def forward_sequence_classification(model, batch_data, i2w, is_test=False, device='cpu', **kwargs):
    # Unpack batch data
    if len(batch_data) == 3:
        (subword_batch, mask_batch, label_batch) = batch_data
        token_type_batch = None
    elif len(batch_data) == 4:
        (subword_batch, mask_batch, token_type_batch, label_batch) = batch_data
    
    # Prepare input & label
    subword_batch = torch.LongTensor(subword_batch)
    mask_batch = torch.FloatTensor(mask_batch)
    token_type_batch = torch.LongTensor(token_type_batch) if token_type_batch is not None else None
    label_batch = torch.LongTensor(label_batch)
            
    if device == "cuda":
        subword_batch = subword_batch.cuda()
        mask_batch = mask_batch.cuda()
        token_type_batch = token_type_batch.cuda() if token_type_batch is not None else None
        label_batch = label_batch.cuda()

    # Forward model
    outputs = model(subword_batch, attention_mask=mask_batch, token_type_ids=token_type_batch, labels=label_batch)
    loss, logits = outputs[:2]
    
    # generate prediction & label list
    list_hyp = []
    list_label = []
    hyp = torch.topk(logits, 1)[1]
    for j in range(len(hyp)):
        list_hyp.append(i2w[hyp[j].item()])
        list_label.append(i2w[label_batch[j][0].item()])
        
    return loss, list_hyp, list_label

In [7]:
#!/usr/bin/env python

# Python version of the evaluation script from CoNLL'00-

# Intentional differences:
# - accept any space as delimiter by default
# - optional file argument (default STDIN)
# - option to set boundary (-b argument)
# - LaTeX output (-l argument) not supported
# - raw tags (-r argument) not supported

import sys
import re
from collections import defaultdict, namedtuple

Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')

class EvalCounts(object):
    def __init__(self):
        self.correct_chunk = 0    # number of correctly identified chunks
        self.correct_tags = 0     # number of correct chunk tags
        self.found_correct = 0    # number of chunks in corpus
        self.found_guessed = 0    # number of identified chunks
        self.token_counter = 0    # token counter (ignores sentence breaks)

        # counts by type
        self.t_correct_chunk = defaultdict(int)
        self.t_found_correct = defaultdict(int)
        self.t_found_guessed = defaultdict(int)

###
# Evaluate Function
###        
def parse_tag(t):
    m = re.match(r'^([^-]*)-(.*)$', t)
    return m.groups() if m else (t, '')

def start_of_chunk(prev_tag, tag, prev_type, type_):
    # check if a chunk started between the previous and current word
    # arguments: previous and current chunk tags, previous and current types
    chunk_start = False

    if tag == 'B': chunk_start = True
    if tag == 'S': chunk_start = True

    if prev_tag == 'E' and tag == 'E': chunk_start = True
    if prev_tag == 'E' and tag == 'I': chunk_start = True
    if prev_tag == 'S' and tag == 'E': chunk_start = True
    if prev_tag == 'S' and tag == 'I': chunk_start = True
    if prev_tag == 'O' and tag == 'E': chunk_start = True
    if prev_tag == 'O' and tag == 'I': chunk_start = True

    if tag != 'O' and tag != '.' and prev_type != type_:
        chunk_start = True

    # these chunks are assumed to have length 1
    if tag == '[': chunk_start = True
    if tag == ']': chunk_start = True

    return chunk_start

def end_of_chunk(prev_tag, tag, prev_type, type_):
    # check if a chunk ended between the previous and current word
    # arguments: previous and current chunk tags, previous and current types
    chunk_end = False

    if prev_tag == 'E': chunk_end = True
    if prev_tag == 'S': chunk_end = True

    if prev_tag == 'B' and tag == 'B': chunk_end = True
    if prev_tag == 'B' and tag == 'S': chunk_end = True
    if prev_tag == 'B' and tag == 'O': chunk_end = True
    if prev_tag == 'I' and tag == 'B': chunk_end = True
    if prev_tag == 'I' and tag == 'S': chunk_end = True
    if prev_tag == 'I' and tag == 'O': chunk_end = True

    if prev_tag != 'O' and prev_tag != '.' and prev_type != type_:
        chunk_end = True

    # these chunks are assumed to have length 1
    if prev_tag == ']': chunk_end = True
    if prev_tag == '[': chunk_end = True

    return chunk_end

def evaluate_fn(guessed, correct, last_correct, last_correct_type, last_guessed, last_guessed_type, in_correct, counts):
    guessed, guessed_type = parse_tag(guessed)
    correct, correct_type = parse_tag(correct)

    end_correct = end_of_chunk(last_correct, correct,
                               last_correct_type, correct_type)
    end_guessed = end_of_chunk(last_guessed, guessed,
                               last_guessed_type, guessed_type)
    start_correct = start_of_chunk(last_correct, correct,
                                   last_correct_type, correct_type)
    start_guessed = start_of_chunk(last_guessed, guessed,
                                   last_guessed_type, guessed_type)

    if in_correct:
        if (end_correct and end_guessed and
            last_guessed_type == last_correct_type):
            in_correct = False
            counts.correct_chunk += 1
            counts.t_correct_chunk[last_correct_type] += 1
        elif (end_correct != end_guessed or guessed_type != correct_type):
            in_correct = False

    if start_correct and start_guessed and guessed_type == correct_type:
        in_correct = True

    if start_correct:
        counts.found_correct += 1
        counts.t_found_correct[correct_type] += 1
    if start_guessed:
        counts.found_guessed += 1
        counts.t_found_guessed[guessed_type] += 1
    if correct == guessed and guessed_type == correct_type:
        counts.correct_tags += 1
    counts.token_counter += 1

    last_guessed = guessed
    last_correct = correct
    last_guessed_type = guessed_type
    last_correct_type = correct_type
    
    return last_correct, last_correct_type, last_guessed, last_guessed_type, in_correct, counts
    
def evaluate(hyps_list, labels_list):
    counts = EvalCounts()
    num_features = None       # number of features per line
    in_correct = False        # currently processed chunks is correct until now
    last_correct = 'O'        # previous chunk tag in corpus
    last_correct_type = ''    # type of previously identified chunk tag
    last_guessed = 'O'        # previously identified chunk tag
    last_guessed_type = ''    # type of previous chunk tag in corpus

    for hyps, labels in zip(hyps_list, labels_list):
        for hyp, label in zip(hyps, labels):
            step_result = evaluate_fn(hyp, label, last_correct, last_correct_type, last_guessed, last_guessed_type, in_correct, counts)
            last_correct, last_correct_type, last_guessed, last_guessed_type, in_correct, counts = step_result
        # Boundary between sentence
        step_result = evaluate_fn('O', 'O', last_correct, last_correct_type, last_guessed, last_guessed_type, in_correct, counts)
        last_correct, last_correct_type, last_guessed, last_guessed_type, in_correct, counts = step_result
        
    if in_correct:
        counts.correct_chunk += 1
        counts.t_correct_chunk[last_correct_type] += 1

    return counts

###
# Calculate Metrics Function
###
def uniq(iterable):
    seen = set()
    return [i for i in iterable if not (i in seen or seen.add(i))]

def calculate_metrics(correct, guessed, total):
    tp, fp, fn = correct, guessed-correct, total-correct
    p = 0 if tp + fp == 0 else 1.*tp / (tp + fp)
    r = 0 if tp + fn == 0 else 1.*tp / (tp + fn)
    f = 0 if p + r == 0 else (2 * p * r) / (p + r)
    return Metrics(tp, fp, fn, p, r, f)

def metrics(counts):
    c = counts
    overall = calculate_metrics(
        c.correct_chunk, c.found_guessed, c.found_correct
    )
    by_type = {}
    for t in uniq(list(c.t_found_correct.keys()) + list(c.t_found_guessed.keys())):
        by_type[t] = calculate_metrics(
            c.t_correct_chunk[t], c.t_found_guessed[t], c.t_found_correct[t]
        )
    return overall, by_type
    return overall

###
# Main Function
###
def conll_evaluation(hyps_list, labels_list):
    counts = evaluate(hyps_list, labels_list)
    overall, by_type = metrics(counts)

    c = counts
    acc = c.correct_tags / c.token_counter
    pre = overall.prec
    rec = overall.rec
    f1 = overall.fscore
    
    type_macro_pre = 0.0
    type_macro_rec = 0.0
    type_macro_f1 = 0.0
    for k in by_type.keys():
        type_macro_pre += by_type[k].prec
        type_macro_rec += by_type[k].rec
        type_macro_f1 += by_type[k].fscore
        
    type_macro_pre = type_macro_pre / float(len(by_type))
    type_macro_rec = type_macro_rec / float(len(by_type))
    type_macro_f1 = type_macro_f1 / float(len(by_type))
    
    return (acc, pre, rec, f1, type_macro_pre, type_macro_rec, type_macro_f1)


In [8]:
import itertools
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

def emotion_detection_metrics_fn(list_hyp, list_label):
    metrics = {}
    metrics["ACC"] = accuracy_score(list_label, list_hyp)
    metrics["F1"] = f1_score(list_label, list_hyp, average='macro')
    metrics["REC"] = recall_score(list_label, list_hyp, average='macro')
    metrics["PRE"] = precision_score(list_label, list_hyp, average='macro')
    return metrics

def aspect_extraction_metrics_fn(list_hyp, list_label):
    metrics = {}
    acc, pre, rec, f1, tm_pre, tm_rec, tm_f1 = conll_evaluation(list_hyp, list_label)
    metrics["ACC"] = acc
    metrics["F1"] = tm_f1
    metrics["REC"] = tm_rec
    metrics["PRE"] = tm_pre
    return metrics

def ner_metrics_fn(list_hyp, list_label):
    metrics = {}
    acc, pre, rec, f1, tm_pre, tm_rec, tm_f1 = conll_evaluation(list_hyp, list_label)
    metrics["ACC"] = acc
    metrics["F1"] = tm_f1
    metrics["REC"] = tm_rec
    metrics["PRE"] = tm_pre
    return metrics

def pos_tag_metrics_fn(list_hyp, list_label):
    metrics = {}
    acc, pre, rec, f1, tm_pre, tm_rec, tm_f1 = conll_evaluation(list_hyp, list_label)
    metrics["ACC"] = acc
    metrics["F1"] = tm_f1
    metrics["REC"] = tm_rec
    metrics["PRE"] = tm_pre
    return metrics

def entailment_metrics_fn(list_hyp, list_label):
    metrics = {}
    metrics["ACC"] = accuracy_score(list_label, list_hyp)
    metrics["F1"] = f1_score(list_label, list_hyp, average='macro')
    metrics["REC"] = recall_score(list_label, list_hyp, average='macro')
    metrics["PRE"] = precision_score(list_label, list_hyp, average='macro')
    return metrics

def document_sentiment_metrics_fn(list_hyp, list_label):
    metrics = {}
    metrics["ACC"] = accuracy_score(list_label, list_hyp)
    metrics["F1"] = f1_score(list_label, list_hyp, average='macro')
    metrics["REC"] = recall_score(list_label, list_hyp, average='macro')
    metrics["PRE"] = precision_score(list_label, list_hyp, average='macro')
    return metrics

def keyword_extraction_metrics_fn(list_hyp, list_label):
    metrics = {}
    acc, pre, rec, f1, tm_pre, tm_rec, tm_f1 = conll_evaluation(list_hyp, list_label)
    metrics["ACC"] = acc
    metrics["F1"] = tm_f1
    metrics["REC"] = tm_rec
    metrics["PRE"] = tm_pre
    return metrics

def qa_factoid_metrics_fn(list_hyp, list_label):
    metrics = {}
    acc, pre, rec, f1, tm_pre, tm_rec, tm_f1 = conll_evaluation(list_hyp, list_label)
    metrics["ACC"] = acc
    metrics["F1"] = tm_f1
    metrics["REC"] = tm_rec
    metrics["PRE"] = tm_pre
    return metrics

def absa_metrics_fn(list_hyp, list_label):
    # hyp and label are both list (multi label), flatten the list
    list_hyp = list(itertools.chain.from_iterable(list_hyp))
    list_label = list(itertools.chain.from_iterable(list_label))
    
    metrics = {}
    metrics["ACC"] = accuracy_score(list_label, list_hyp)
    metrics["F1"] = f1_score(list_label, list_hyp, average='macro')
    metrics["REC"] = recall_score(list_label, list_hyp, average='macro')
    metrics["PRE"] = precision_score(list_label, list_hyp, average='macro')
    return metrics

def news_categorization_metrics_fn(list_hyp, list_label):
    # hyp and label are both list (multi label), flatten the list
    list_hyp = list(itertools.chain.from_iterable(list_hyp))
    list_label = list(itertools.chain.from_iterable(list_label))
    
    metrics = {}
    metrics["ACC"] = accuracy_score(list_label, list_hyp)
    metrics["F1"] = f1_score(list_label, list_hyp, average='macro')
    metrics["REC"] = recall_score(list_label, list_hyp, average='macro')
    metrics["PRE"] = precision_score(list_label, list_hyp, average='macro')
    return metrics

In [9]:
###
# common functions
###
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
def count_param(module, trainable=False):
    if trainable:
        return sum(p.numel() for p in module.parameters() if p.requires_grad)
    else:
        return sum(p.numel() for p in module.parameters())
    
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def metrics_to_string(metric_dict):
    string_list = []
    for key, value in metric_dict.items():
        string_list.append('{}:{:.2f}'.format(key, value))
    return ' '.join(string_list)

In [10]:
# Set random seed
set_seed(26092020)

## Load Model

In [11]:
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
config.num_labels = 3 #DocumentSentimentDataset.NUM_LABELS

# Instantiate model
model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', config=config)

Downloading:   0%|          | 0.00/224k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/475M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [13]:
count_param(model)

124443651

## Prepare Dataset

In [14]:
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("/content/drive/MyDrive/Thesis [Full Script Python & Dataset]/IndoBERT/IndoBERT - Final/dataset.csv", sep=",", header=0, index_col=False, engine='python')
dataset = dataset.dropna()
train, test = train_test_split(dataset,test_size=0.15, random_state=42)
train2, valid = train_test_split(train, test_size=0.15, random_state=42)

train2['text'] = train2['jenis']+' '+train2['pertanyaan']
test['text'] = test['jenis']+' '+test['pertanyaan']
valid['text'] = valid['jenis']+' '+valid['pertanyaan']

train2 = train2[['text','kategori']]
test = test[['text','kategori']]
valid = valid[['text','kategori']]

train2.to_csv('train.csv', index=False, encoding='utf-8')
test.to_csv('test.csv', index=False, encoding='utf-8')
valid.to_csv('valid.csv', index=False, encoding='utf-8')

print('Dataset Ready !')

Dataset Ready !


In [15]:
class EQClassificationDataset(Dataset):
    # Static constant variable
    LABEL2INDEX = {'Mudah': 0, 'Sedang': 1, 'Sulit': 2}
    INDEX2LABEL = {0: 'Mudah', 1: 'Sedang', 2: 'Sulit'}
    NUM_LABELS = 3
    
    def load_dataset(self, path): 
        df = pd.read_csv(path, sep=',', header=0)
        df.columns = ['text','kategori']
        df['kategori'] = df['kategori'].apply(lambda lab: self.LABEL2INDEX[lab])
        return df
    
    def __init__(self, dataset_path, tokenizer, no_special_token=False, *args, **kwargs):
        self.data = self.load_dataset(dataset_path)
        self.tokenizer = tokenizer
        self.no_special_token = no_special_token
    
    def __getitem__(self, index):
        data = self.data.loc[index,:]
        text, sentiment = data['text'], data['kategori']
        subwords = self.tokenizer.encode(text, add_special_tokens=not self.no_special_token)
        return np.array(subwords), np.array(sentiment), data['text']
    
    def __len__(self):
        return len(self.data)    
        
class EQClassificationDataLoader(DataLoader):
    def __init__(self, max_seq_len=512, *args, **kwargs):
        super(EQClassificationDataLoader, self).__init__(*args, **kwargs)
        self.collate_fn = self._collate_fn
        self.max_seq_len = max_seq_len
        
    def _collate_fn(self, batch):
        batch_size = len(batch)
        max_seq_len = max(map(lambda x: len(x[0]), batch))
        max_seq_len = min(self.max_seq_len, max_seq_len)
        
        subword_batch = np.zeros((batch_size, max_seq_len), dtype=np.int64)
        mask_batch = np.zeros((batch_size, max_seq_len), dtype=np.float32)
        sentiment_batch = np.zeros((batch_size, 1), dtype=np.int64)
        
        seq_list = []
        for i, (subwords, sentiment, raw_seq) in enumerate(batch):
            subwords = subwords[:max_seq_len]
            subword_batch[i,:len(subwords)] = subwords
            mask_batch[i,:len(subwords)] = 1
            sentiment_batch[i,0] = sentiment
            
            seq_list.append(raw_seq)
            
        return subword_batch, mask_batch, sentiment_batch, seq_list

In [16]:
train_dataset_path = 'train.csv' #'./dataset/smsa_doc-sentiment-prosa/train_preprocess.tsv'
valid_dataset_path = 'valid.csv' #'/content/drive/MyDrive/THESIS - INDOBERT/pertanyaan/valid.csv' #'./dataset/smsa_doc-sentiment-prosa/valid_preprocess.tsv'
test_dataset_path = 'test.csv' #'/content/drive/MyDrive/THESIS - INDOBERT/pertanyaan/test.csv' #'./dataset/smsa_doc-sentiment-prosa/test_preprocess_masked_label.tsv'

In [17]:
train_dataset = EQClassificationDataset(train_dataset_path, tokenizer, lowercase=True)
valid_dataset = EQClassificationDataset(valid_dataset_path, tokenizer, lowercase=True)
test_dataset = EQClassificationDataset(test_dataset_path, tokenizer, lowercase=True)

train_loader = EQClassificationDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=True)  
valid_loader = EQClassificationDataLoader(dataset=valid_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=False)  
test_loader = EQClassificationDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=False)

print(train_dataset)

<__main__.EQClassificationDataset object at 0x7f55dd17fbd0>


  cpuset_checked))


In [18]:
w2i, i2w = EQClassificationDataset.LABEL2INDEX, EQClassificationDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'Mudah': 0, 'Sedang': 1, 'Sulit': 2}
{0: 'Mudah', 1: 'Sedang', 2: 'Sulit'}


## Test model on sample sentences

In [19]:
text = 'Susunan yang tepat untuk petunjuk tersebut adalah …'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Susunan yang tepat untuk petunjuk tersebut adalah … | Label : Sedang (36.864%)


In [20]:
text = 'Makna syair tersebut adalah'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Makna syair tersebut adalah | Label : Sedang (36.871%)


In [21]:
text = 'Kalimat utama pada paragraf tersebut adalah'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Kalimat utama pada paragraf tersebut adalah | Label : Mudah (37.017%)


In [22]:
text = 'Watak dari tokoh penyihir tersebut adalah'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Watak dari tokoh penyihir tersebut adalah | Label : Sedang (35.983%)


## Fine Tuning & Evaluation

In [23]:
optimizer = optim.Adam(model.parameters(), lr=3e-6)
model = model.cuda()

In [24]:
# Train
n_epochs = 12
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        list_hyp += batch_hyp
        list_label += batch_label

        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    model.eval()
    torch.set_grad_enabled(False)
    
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        batch_seq = batch_data[-1]        
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        
        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Calculate evaluation metrics
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = document_sentiment_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
        
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

  cpuset_checked))
(Epoch 1) TRAIN LOSS:1.1008 LR:0.00000300: 100%|██████████| 10/10 [00:05<00:00,  1.76it/s]


(Epoch 1) TRAIN LOSS:1.1008 ACC:0.39 F1:0.33 REC:0.38 PRE:0.35 LR:0.00000300


  cpuset_checked))
VALID LOSS:1.0649 ACC:0.52 F1:0.51 REC:0.53 PRE:0.52: 100%|██████████| 2/2 [00:01<00:00,  1.65it/s]


(Epoch 1) VALID LOSS:1.0649 ACC:0.52 F1:0.51 REC:0.53 PRE:0.52


  cpuset_checked))
(Epoch 2) TRAIN LOSS:1.0641 LR:0.00000300: 100%|██████████| 10/10 [00:05<00:00,  1.87it/s]


(Epoch 2) TRAIN LOSS:1.0641 ACC:0.46 F1:0.45 REC:0.45 PRE:0.45 LR:0.00000300


  cpuset_checked))
VALID LOSS:1.0228 ACC:0.52 F1:0.48 REC:0.48 PRE:0.53: 100%|██████████| 2/2 [00:01<00:00,  1.68it/s]


(Epoch 2) VALID LOSS:1.0228 ACC:0.52 F1:0.48 REC:0.48 PRE:0.53


  cpuset_checked))
(Epoch 3) TRAIN LOSS:1.0172 LR:0.00000300: 100%|██████████| 10/10 [00:05<00:00,  1.90it/s]


(Epoch 3) TRAIN LOSS:1.0172 ACC:0.50 F1:0.48 REC:0.49 PRE:0.51 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.9845 ACC:0.54 F1:0.51 REC:0.50 PRE:0.54: 100%|██████████| 2/2 [00:01<00:00,  1.64it/s]


(Epoch 3) VALID LOSS:0.9845 ACC:0.54 F1:0.51 REC:0.50 PRE:0.54


  cpuset_checked))
(Epoch 4) TRAIN LOSS:0.9718 LR:0.00000300: 100%|██████████| 10/10 [00:05<00:00,  1.90it/s]


(Epoch 4) TRAIN LOSS:0.9718 ACC:0.56 F1:0.55 REC:0.56 PRE:0.55 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.9277 ACC:0.61 F1:0.57 REC:0.56 PRE:0.60: 100%|██████████| 2/2 [00:01<00:00,  1.65it/s]


(Epoch 4) VALID LOSS:0.9277 ACC:0.61 F1:0.57 REC:0.56 PRE:0.60


  cpuset_checked))
(Epoch 5) TRAIN LOSS:0.9181 LR:0.00000300: 100%|██████████| 10/10 [00:05<00:00,  1.88it/s]


(Epoch 5) TRAIN LOSS:0.9181 ACC:0.65 F1:0.64 REC:0.65 PRE:0.65 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.8701 ACC:0.67 F1:0.63 REC:0.62 PRE:0.68: 100%|██████████| 2/2 [00:01<00:00,  1.62it/s]


(Epoch 5) VALID LOSS:0.8701 ACC:0.67 F1:0.63 REC:0.62 PRE:0.68


  cpuset_checked))
(Epoch 6) TRAIN LOSS:0.8426 LR:0.00000300: 100%|██████████| 10/10 [00:05<00:00,  1.87it/s]


(Epoch 6) TRAIN LOSS:0.8426 ACC:0.71 F1:0.70 REC:0.70 PRE:0.71 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.8024 ACC:0.70 F1:0.68 REC:0.67 PRE:0.70: 100%|██████████| 2/2 [00:01<00:00,  1.61it/s]


(Epoch 6) VALID LOSS:0.8024 ACC:0.70 F1:0.68 REC:0.67 PRE:0.70


  cpuset_checked))
(Epoch 7) TRAIN LOSS:0.7670 LR:0.00000300: 100%|██████████| 10/10 [00:05<00:00,  1.98it/s]


(Epoch 7) TRAIN LOSS:0.7670 ACC:0.77 F1:0.77 REC:0.77 PRE:0.77 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.7483 ACC:0.74 F1:0.71 REC:0.70 PRE:0.75: 100%|██████████| 2/2 [00:01<00:00,  1.58it/s]


(Epoch 7) VALID LOSS:0.7483 ACC:0.74 F1:0.71 REC:0.70 PRE:0.75


  cpuset_checked))
(Epoch 8) TRAIN LOSS:0.6975 LR:0.00000300: 100%|██████████| 10/10 [00:05<00:00,  1.86it/s]


(Epoch 8) TRAIN LOSS:0.6975 ACC:0.79 F1:0.77 REC:0.78 PRE:0.80 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.6751 ACC:0.80 F1:0.78 REC:0.77 PRE:0.81: 100%|██████████| 2/2 [00:01<00:00,  1.59it/s]


(Epoch 8) VALID LOSS:0.6751 ACC:0.80 F1:0.78 REC:0.77 PRE:0.81


  cpuset_checked))
(Epoch 9) TRAIN LOSS:0.6216 LR:0.00000300: 100%|██████████| 10/10 [00:05<00:00,  1.84it/s]


(Epoch 9) TRAIN LOSS:0.6216 ACC:0.84 F1:0.84 REC:0.84 PRE:0.84 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.6175 ACC:0.83 F1:0.82 REC:0.81 PRE:0.85: 100%|██████████| 2/2 [00:01<00:00,  1.56it/s]


(Epoch 9) VALID LOSS:0.6175 ACC:0.83 F1:0.82 REC:0.81 PRE:0.85


  cpuset_checked))
(Epoch 10) TRAIN LOSS:0.5566 LR:0.00000300: 100%|██████████| 10/10 [00:05<00:00,  1.85it/s]


(Epoch 10) TRAIN LOSS:0.5566 ACC:0.85 F1:0.84 REC:0.84 PRE:0.85 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.5889 ACC:0.80 F1:0.78 REC:0.77 PRE:0.81: 100%|██████████| 2/2 [00:01<00:00,  1.57it/s]


(Epoch 10) VALID LOSS:0.5889 ACC:0.80 F1:0.78 REC:0.77 PRE:0.81


  cpuset_checked))
(Epoch 11) TRAIN LOSS:0.4891 LR:0.00000300: 100%|██████████| 10/10 [00:05<00:00,  1.87it/s]


(Epoch 11) TRAIN LOSS:0.4891 ACC:0.88 F1:0.88 REC:0.88 PRE:0.88 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.5458 ACC:0.83 F1:0.82 REC:0.81 PRE:0.85: 100%|██████████| 2/2 [00:01<00:00,  1.60it/s]


(Epoch 11) VALID LOSS:0.5458 ACC:0.83 F1:0.82 REC:0.81 PRE:0.85


  cpuset_checked))
(Epoch 12) TRAIN LOSS:0.4299 LR:0.00000300: 100%|██████████| 10/10 [00:05<00:00,  1.89it/s]


(Epoch 12) TRAIN LOSS:0.4299 ACC:0.89 F1:0.88 REC:0.88 PRE:0.89 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.5181 ACC:0.85 F1:0.84 REC:0.82 PRE:0.87: 100%|██████████| 2/2 [00:01<00:00,  1.62it/s]

(Epoch 12) VALID LOSS:0.5181 ACC:0.85 F1:0.84 REC:0.82 PRE:0.87





In [25]:
# Evaluate on test
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch_data in enumerate(pbar):
    _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
    list_hyp += batch_hyp

# Save prediction
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('pred.txt', index=False)

print(df)

  cpuset_checked))
100%|██████████| 2/2 [00:01<00:00,  1.52it/s]

    index   label
0       0   Mudah
1       1   Mudah
2       2   Mudah
3       3   Sulit
4       4   Mudah
..    ...     ...
58     58  Sedang
59     59   Mudah
60     60   Mudah
61     61   Mudah
62     62   Mudah

[63 rows x 2 columns]





## Test fine-tuned model on sample sentences

In [26]:
text = 'Prediksi kejadian berdasarkan isi teks Apa yang terjadi jika pola hidup sehat tidak dibiasakan sejak kecil?'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Prediksi kejadian berdasarkan isi teks Apa yang terjadi jika pola hidup sehat tidak dibiasakan sejak kecil? | Label : Sulit (68.850%)


In [27]:
text = 'Antonim/sinonim Antonim kata keras pada kalimat tersebut adalah'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Antonim/sinonim Antonim kata keras pada kalimat tersebut adalah | Label : Mudah (91.437%)


In [31]:
text = 'Unsur instrinsik karya sastra (tokoh, latar, watak tokoh, amanat) Latar tempat pada cerita tersebut adalah?'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Unsur instrinsik karya sastra (tokoh, latar, watak tokoh, amanat) Latar tempat pada cerita tersebut adalah? | Label : Mudah (72.274%)


In [29]:
text = 'Informasi tersurat teks Bagaimana cara memenuhi gizi anak pada usia balita?'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Informasi tersurat teks Bagaimana cara memenuhi gizi anak pada usia balita? | Label : Sedang (89.747%)
