# Segment Input and Tokenizer

In [None]:
# Install a headless JDK (presumably needed to run the VnCoreNLP .jar loaded below) and the Python wrappers.
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!java -version
# NOTE(review): only `vncorenlp` is imported in the visible cells; `py_vncorenlp` looks unused — confirm.
!pip install py_vncorenlp
!pip install vncorenlp

In [None]:
from vncorenlp import VnCoreNLP
from transformers import AutoTokenizer

# Word segmenter backed by the VnCoreNLP jar; only the "wseg" annotator is requested
# and the JVM heap is capped through `max_heap_size`.
rdrsegmenter = VnCoreNLP('/kaggle/input/sentiment-analysis/vncorenlp/vncorenlp/VnCoreNLP-1.1.1.jar', annotators="wseg", max_heap_size='-Xmx500m')
# Tokenizer that belongs to the "vinai/phobert-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

In [None]:
# Quick sanity check: segment one sentence and show the token ids of each piece.
text = "Tôi là sinh viên trường Đại học Bách Khoa Hà Nội"
words = rdrsegmenter.tokenize(text)

# One encoded id list per element returned by the segmenter.
encode = [tokenizer.encode(word) for word in words]

for word, ids in zip(words, encode):
    print(f'Word: {word}, Encode: {ids}\n')

### Preprocessing

In [None]:
import numpy as np
import re

def strip_emoji(text):
    """Return ``text`` with every character from the emoji code-point ranges removed.

    The ranges covered are U+2600-U+27BF, U+1F300-U+1F64F and U+1F680-U+1F6FF.
    """
    emoji_pattern = u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])'
    return re.sub(emoji_pattern, '', text)

def remove_special_char(text):
    """Drop every run of the replacement character (the marker left by undecodable bytes) from ``text``."""
    return re.sub("�+", "", text)

def remove_punctuation(text):
    """Delete the ASCII symbols listed in the character class below from ``text``.

    Characters outside the class (for example ``.``, ``,``, ``:``, ``-``, ``/``
    and quotes) are left untouched.
    """
    return re.sub(r"[!#$%&()*+;<=>?@[\]^_`{|}~]", "", text)

def remove_number(text):
    """Replace every space-prefixed run of digits in ``text`` with a single space.

    Only digits that directly follow a space are affected, so a number at the
    very start of the string or glued to a letter (``"a1"``) is kept.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    return re.sub(r" \d+", " ", text)

def normalize_annotatation(text):
    """Expand common review abbreviations into their full Vietnamese words.

    ``khach san`` / ``ksan`` / ``ks`` become ``khách sạn`` and ``nv`` becomes
    ``nhân viên``. Only whole words are replaced and the surrounding whitespace
    is preserved.

    Args:
        text: The (already lower-cased) review text.

    Returns:
        The text with the abbreviations expanded.
    """
    # Raw strings are required here: in a normal string literal "\b" is the
    # backspace character, so the word-boundary patterns would never match.
    # The trailing \b keeps words that merely start with "ks"/"nv" intact.
    text = re.sub(r"\b(?:khach san|ksan|ks)\b", "khách sạn", text)
    return re.sub(r"\bnv\b", "nhân viên", text)

def clean_text(text):
    """Lower-case ``text["Review"]`` and run it through the cleaning helpers.

    The helpers are applied in this order: ``strip_emoji``,
    ``remove_punctuation``, ``remove_special_char``, ``remove_number``,
    ``normalize_annotatation``.

    Args:
        text: A mapping that holds the raw review under the ``"Review"`` key.

    Returns:
        A dict with the single key ``"Review"`` holding the cleaned string.
    """
    review = text["Review"].lower()
    pipeline = (
        strip_emoji,
        remove_punctuation,
        remove_special_char,
        remove_number,
        normalize_annotatation,
    )
    for step in pipeline:
        review = step(review)
    return {"Review": review}

# Parse a tab-separated annotation file into image/question/answer records.
# NOTE(review): `train_data` is not referenced by any other visible cell — confirm it is still needed.
train_set_path = '/kaggle/input/vqadat/vaq2.0.TrainImages.txt'
train_data = []

with open(train_set_path, "r") as f:
    lines = f.readlines()

for line in lines:
    temp = line.split('\t')
    qa = temp[1].split('?')

    # Three pieces mean the second field held two '?'; the answer is then the last piece.
    answer = (qa[2] if len(qa) == 3 else qa[1]).strip()

    data_sample = {
        'image_path': temp[0][:-2],  # first field without its last two characters
        'question': qa[0] + '?',
        'answer': answer
    }
    train_data.append(data_sample)

class Preprocess():
    """Turn a raw review dataset into tokenized, labelled, torch-formatted data.

    Args:
        tokenizer: Callable applied to the segmented text (called with
            ``truncation=True``).
        rdrsegmenter: Object whose ``tokenize(text)`` returns an iterable of
            token lists.
    """

    def __init__(self, tokenizer, rdrsegmenter):
        self.rdrsegmenter = rdrsegmenter
        self.tokenizer = tokenizer
        # Aspect columns, in the order used for both label vectors.
        self.feature = ['giai_tri', 'luu_tru', 'nha_hang', 'an_uong', 'di_chuyen', 'mua_sam']

    def segment(self, example):
        """Word-segment ``example["Review"]`` and return it as one space-joined string."""
        sentences = self.rdrsegmenter.tokenize(example["Review"])
        joined_sentences = [" ".join(tokens) for tokens in sentences]
        return {"Segment": " ".join(joined_sentences)}

    def tokenize(self, example):
        """Run the tokenizer over ``example["Segment"]`` with truncation enabled."""
        segmented = example["Segment"]
        return self.tokenizer(segmented, truncation=True)

    def label(self, example):
        """Build the two label vectors, one entry per aspect column.

        ``labels_regressor`` holds the raw column values; ``labels_classifier``
        holds 1 where the value is non-zero and 0 otherwise.
        """
        scores = [example[name] for name in self.feature]
        present = [int(score != 0) for score in scores]
        return {
            'labels_regressor': np.array(scores),
            'labels_classifier': np.array(present),
        }

    def run(self, dataset):
        """Apply cleaning, segmentation, tokenization and labelling, then drop the raw columns."""
        dataset = (
            dataset
            .map(clean_text)
            .map(self.segment)
            .map(self.tokenize, batched=True)
            .map(self.label)
        )
        raw_columns = ['Unnamed: 0','Review', 'giai_tri', 'luu_tru', 'nha_hang', 'an_uong', 'di_chuyen', 'mua_sam', 'Segment']
        dataset = dataset.remove_columns(raw_columns)
        dataset.set_format("torch")
        return dataset

# Load Dataset

In [None]:
import pandas as pd
from datasets import load_dataset

# CSV file backing each split; the keys are the split names used to index the dataset later.
data_files = {
    'train': '/kaggle/input/sentiment-analysis/data/data/vi/train_datasets.csv',
    'test': '/kaggle/input/sentiment-analysis/data/data/vi/test_datasets.csv',
}

# Load both splits through the `datasets` CSV loader.
dataset = load_dataset('csv', data_files=data_files)

In [None]:
# Clean, segment, tokenize and label every split (see Preprocess.run).
preprocess = Preprocess(tokenizer, rdrsegmenter)
tokenizer_datasets = preprocess.run(dataset)

In [None]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

# Dataloader
# Batches are assembled by the tokenizer-aware padding collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Only the training loader shuffles; the test loader keeps the dataset order.
train_dataloader = DataLoader(tokenizer_datasets['train'], collate_fn=data_collator, batch_size=32, shuffle=True)
test_dataloader = DataLoader(tokenizer_datasets['test'], collate_fn=data_collator, batch_size=32)

# Model

In [None]:
import torch.nn as nn

class CustomModelSoftmax(nn.Module):
    """Transformer encoder with an aspect-presence head and a per-aspect rating head.

    The first-token vectors of the last four hidden layers are concatenated,
    passed through dropout, and fed to two linear heads:

    * ``classifier``: one sigmoid probability per aspect (6 values).
    * ``regressor``: rating logits reshaped to ``(batch, 6, 5)``.

    Args:
        checkpoint: Hugging Face model name or path used for the config and weights.
        load_pretrained_weights: When True (default) the encoder is initialised
            from the checkpoint weights. Pass False to build the architecture
            only, e.g. right before loading a saved ``state_dict``.
    """

    NUM_ASPECTS = 6
    NUM_RATINGS = 5
    NUM_POOLED_LAYERS = 4

    def __init__(self, checkpoint, load_pretrained_weights=True):
        super().__init__()
        # NOTE(review): attentions are requested but never read in forward();
        # consider dropping output_attentions to save memory.
        config = AutoConfig.from_pretrained(checkpoint, output_attentions=True, output_hidden_states=True)
        if load_pretrained_weights:
            # from_config() only builds the architecture with random weights;
            # from_pretrained() is what actually loads the checkpoint.
            self.model = AutoModel.from_pretrained(checkpoint, config=config)
        else:
            self.model = AutoModel.from_config(config)

        # Derive the head input size from the config instead of hard-coding 768.
        feature_dim = config.hidden_size * self.NUM_POOLED_LAYERS
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(feature_dim, self.NUM_ASPECTS)
        self.regressor = nn.Linear(feature_dim, self.NUM_ASPECTS * self.NUM_RATINGS)

    def forward(self, input_ids=None, attention_mask=None):
        """Return ``(aspect_probabilities, rating_logits)`` for a batch.

        Returns:
            A tuple of a ``(batch, 6)`` tensor of sigmoid probabilities and a
            ``(batch, 6, 5)`` tensor of unnormalised rating logits.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.hidden_states
        # First-token vector of the last four layers, ordered last layer first.
        pooled = torch.cat(
            [hidden_states[-k][:, 0] for k in range(1, self.NUM_POOLED_LAYERS + 1)],
            dim=-1,
        )
        pooled = self.dropout(pooled)

        outputs_classifier = torch.sigmoid(self.classifier(pooled))
        outputs_regressor = self.regressor(pooled).reshape(-1, self.NUM_ASPECTS, self.NUM_RATINGS)

        return outputs_classifier, outputs_regressor

In [None]:
import torch
from transformers import AutoModel, AutoConfig

# Model 
model = CustomModelSoftmax("vinai/phobert-base")

# Use the GPU when one is available, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Training

### Loss

In [None]:
import torch.nn.functional as F
import torch
import torch.nn as nn

def loss_classifier(pred_classifier, labels_classifier):
    """Mean binary cross-entropy between predicted probabilities and float 0/1 targets."""
    return F.binary_cross_entropy(pred_classifier, labels_classifier, reduction="mean")

def loss_regressor(pred_regressor, labels_regressor):
    """Mean squared error over the entries whose label is non-zero.

    Args:
        pred_regressor: Predicted values, same shape as ``labels_regressor``.
        labels_regressor: Target values; entries equal to 0 are ignored.

    Returns:
        A scalar tensor. It is 0 when no entry has a non-zero label.
    """
    mask = labels_regressor != 0
    squared_error = (pred_regressor - labels_regressor) ** 2
    # Clamp the count so a batch with no labelled entry yields 0 instead of NaN (0/0).
    return squared_error[mask].sum() / mask.sum().clamp(min=1)

def loss_softmax(inputs, labels, device):
    """Cross-entropy over the rating logits, averaged over the labelled entries.

    Args:
        inputs: Logits of shape ``(N, aspect, rate)``.
        labels: Ratings of shape ``(N, aspect)``; 0 marks an entry without a
            rating and is excluded, any other value ``r`` maps to class ``r - 1``.
        device: Kept for backward compatibility. All work happens on
            ``inputs.device``, so no extra host/device copies are made.

    Returns:
        A scalar tensor. It is 0 when no entry carries a rating.
    """
    labels = labels.to(inputs.device)
    mask = labels != 0
    # Unlabelled entries get the dummy class 0; they are masked out below.
    targets = (labels - 1).clamp(min=0).long()
    # cross_entropy expects the class dimension second: (N, rate, aspect).
    per_entry = F.cross_entropy(inputs.transpose(1, 2), targets, reduction='none')
    # Clamp the count so a batch without any rating yields 0 instead of NaN (0/0).
    return per_entry[mask].sum() / mask.sum().clamp(min=1)

def sigmoid_focal_loss(
    inputs: torch.Tensor,
    targets: torch.Tensor,
    alpha: float = 0.25,
    gamma: float = 2,
    reduction: str = "none",):
    """Focal loss computed on inputs that are already probabilities.

    No sigmoid is applied here: ``inputs`` is passed straight to
    ``binary_cross_entropy``.

    Args:
        inputs: Predicted probabilities.
        targets: Float 0/1 targets with the same shape as ``inputs``.
        alpha: Weight of the positive class; a negative value disables the
            alpha weighting.
        gamma: Exponent of the modulating factor ``(1 - p_t)``.
        reduction: ``"mean"`` or ``"sum"``; any other value returns the
            unreduced element-wise loss.

    Returns:
        The reduced or element-wise loss tensor.
    """
    bce = F.binary_cross_entropy(inputs, targets, reduction="none")
    # Probability the model assigns to the true class of each element.
    prob_of_truth = inputs * targets + (1 - inputs) * (1 - targets)
    focal = bce * ((1 - prob_of_truth) ** gamma)

    if alpha >= 0:
        class_weight = alpha * targets + (1 - alpha) * (1 - targets)
        focal = class_weight * focal

    if reduction == "mean":
        return focal.mean()
    if reduction == "sum":
        return focal.sum()
    return focal

def bce_loss_weights(inputs, targets, weights):
    """Mean binary cross-entropy with inverse per-class weighting.

    Positive targets are scaled by ``1 / weights`` and negative targets by
    ``1 / (1 - weights)``; ``weights`` is broadcast along the batch dimension.
    """
    per_class = weights.view(1, -1)
    positive_scale = 1 / per_class
    negative_scale = 1 / (1 - per_class)
    elementwise = F.binary_cross_entropy(inputs, targets, reduction="none")
    scale = targets * positive_scale + (1 - targets) * negative_scale
    return (elementwise * scale).mean()


def CB_loss(inputs, targets, samples_positive_per_cls, samples_negative_per_cls, no_of_classes=2,loss_type='sigmoid', beta=0.9999, gamma=2):
    """Binary loss re-weighted by the positive/negative sample counts of each class.

    For every class the weights of its positive and negative side are
    ``(1 - beta) / (1 - beta ** count)``, normalised so the two sum to
    ``no_of_classes``.

    Args:
        inputs: Predicted probabilities of shape ``(N, num_cls)``.
        targets: Float 0/1 targets of the same shape.
        samples_positive_per_cls: Tensor ``(num_cls,)`` with the positive counts.
        samples_negative_per_cls: Tensor ``(num_cls,)`` with the negative counts.
        no_of_classes: Value the two weights of each class are normalised to sum to.
        loss_type: ``"sigmoid"`` for plain BCE or ``"focal"`` for the focal loss.
        beta: Base of the effective-number formula.
        gamma: Focusing exponent forwarded to the focal loss.

    Returns:
        The mean weighted loss as a scalar tensor.

    Raises:
        ValueError: If ``loss_type`` is neither ``"focal"`` nor ``"sigmoid"``.
    """
    # Fail early with a clear message instead of an UnboundLocalError at the return.
    if loss_type not in ("focal", "sigmoid"):
        raise ValueError(f"loss_type must be 'focal' or 'sigmoid', got {loss_type!r}")

    samples_per_cls = torch.stack([samples_positive_per_cls, samples_negative_per_cls], dim=-1)  # num_cls, 2
    effective_num = 1.0 - torch.pow(beta, samples_per_cls)  # num_cls, 2
    weights = (1.0 - beta) / effective_num  # num_cls, 2
    weights = weights / weights.sum(dim=-1, keepdim=True) * no_of_classes  # num_cls, 2
    # Pick the positive or negative weight of each class depending on the target.
    weights = targets * weights[:, 0] + (1 - targets) * weights[:, 1]

    if loss_type == "focal":
        # gamma is forwarded so the argument actually takes effect.
        elementwise = sigmoid_focal_loss(inputs, targets, gamma=gamma)
    else:
        elementwise = F.binary_cross_entropy(inputs, targets, reduction="none")
    return (elementwise * weights).mean()

In [None]:
from transformers import AdamW, get_scheduler

# Optimizer
# NOTE(review): the AdamW exported by `transformers` is reported as deprecated upstream in
# favour of `torch.optim.AdamW` — confirm against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 10
# One scheduler step is taken per training batch, so the schedule spans every batch of every epoch.
num_training_steps = num_epochs*len(train_dataloader)
# Linear schedule without warm-up.
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

### Metrics

In [None]:
import numpy as np

class ScalarMetric():
    """Running arithmetic mean of the scalars passed to ``update``."""

    def __init__(self):
        self.scalar, self.num = 0, 0

    def update(self, scalar):
        """Add one observation and return ``self`` so calls can be chained."""
        self.num += 1
        self.scalar += scalar
        return self

    def compute(self):
        """Return the mean of all observations since the last reset."""
        return self.scalar / self.num

    def reset(self):
        """Forget every observation."""
        self.scalar, self.num = 0, 0

class AccuracyMetric():
    """Element-wise accuracy accumulated over 2-D prediction batches."""

    def __init__(self):
        self.correct, self.num = 0, 0

    def update(self, y_pred, y_true):
        """Count matching elements and the total number of elements in the batch."""
        rows, cols = len(y_pred), y_pred.shape[1]
        self.num += rows*cols
        self.correct += (y_pred == y_true).sum()

    def compute(self):
        """Return the fraction of matching elements seen since the last reset."""
        return self.correct / self.num

    def reset(self):
        """Clear both counters."""
        self.correct, self.num = 0, 0

def precision(y_pred, y_true):
    """Column-wise precision: true positives / (true positives + false positives).

    Inputs are interpreted by truthiness. A column without any positive
    prediction yields NaN (0/0).
    """
    predicted = np.asarray(y_pred, dtype=bool)
    actual = np.asarray(y_true, dtype=bool)
    hits = (predicted & actual).sum(axis=0)
    false_alarms = (predicted & ~actual).sum(axis=0)
    return hits / (hits + false_alarms)

def recall(y_pred, y_true):
    """Column-wise recall: true positives / (true positives + false negatives).

    Inputs are interpreted by truthiness. A column without any positive
    target yields NaN (0/0).
    """
    predicted = np.asarray(y_pred, dtype=bool)
    actual = np.asarray(y_true, dtype=bool)
    hits = (predicted & actual).sum(axis=0)
    misses = (~predicted & actual).sum(axis=0)
    return hits / (hits + misses)

class F1_score():
    """Per-column F1 of "value is non-zero", accumulated over batches."""

    def __init__(self):
        self.y_pred = None
        self.y_true = None

    def update(self, y_pred, y_true):
        """Append a batch of predictions and targets along axis 0."""
        if self.y_pred is None:
            self.y_pred = y_pred
        else:
            self.y_pred = np.concatenate([self.y_pred, y_pred], axis=0)
        if self.y_true is None:
            self.y_true = y_true
        else:
            self.y_true = np.concatenate([self.y_true, y_true], axis=0)

    def compute(self):
        """Print precision and recall, then return the F1 of every column.

        Columns whose precision or recall is 0 or NaN keep an F1 of 0.
        """
        f1_score = np.zeros(self.y_pred.shape[1])
        pred_present = self.y_pred != 0
        true_present = self.y_true != 0
        precision_score = precision(pred_present, true_present)
        recall_score = recall(pred_present, true_present)

        precision_ok = np.logical_and(precision_score != 0, np.logical_not(np.isnan(precision_score)))
        recall_ok = np.logical_and(recall_score != 0, np.logical_not(np.isnan(recall_score)))
        usable = np.logical_and(precision_ok, recall_ok)

        print("Precision:",precision_score)
        print("Recall", recall_score)

        p, r = precision_score[usable], recall_score[usable]
        f1_score[usable] = 2* (p * r) / (p + r)
        return f1_score

class R2_score():
    """Per-column score ``1 - RSS / (16 * n)`` over entries rated in both arrays."""

    def __init__(self):
        self.y_pred = None
        self.y_true = None

    def update(self, y_pred, y_true):
        """Append a batch of predictions and targets along axis 0."""
        if self.y_pred is None:
            self.y_pred = y_pred
        else:
            self.y_pred = np.concatenate([self.y_pred, y_pred], axis=0)
        if self.y_true is None:
            self.y_true = y_true
        else:
            self.y_true = np.concatenate([self.y_true, y_true], axis=0)

    def compute(self):
        """Return the score of every column; columns without a usable entry get 1."""
        # Only entries that are non-zero in both prediction and target contribute.
        both_rated = np.logical_and(self.y_pred != 0, self.y_true != 0)
        squared_error = (self.y_pred - self.y_true) ** 2
        rss = (squared_error * both_rated).sum(axis=0)
        # 16 is presumably (5 - 1)**2, the largest squared error between two 1..5 ratings — TODO confirm.
        k = (both_rated * 16).sum(axis=0)
        scored = k != 0
        r2_score = np.ones(rss.shape[0])
        r2_score[scored] = 1 - rss[scored] / k[scored]
        return r2_score

In [None]:
import numpy as np

def split_train_test(data, test_size):
    """Randomly split ``data`` into ``(train, test)`` using positional indexing.

    Args:
        data: Object supporting ``len`` and ``.iloc`` (e.g. a pandas DataFrame).
        test_size: Fraction of rows assigned to the test part; the row count
            is truncated to an integer.

    Returns:
        A ``(train, test)`` tuple.
    """
    order = np.random.permutation(len(data))
    cut = int(test_size*len(data))
    # The first `cut` shuffled positions form the test part, the rest the train part.
    return data.iloc[order[cut:]], data.iloc[order[:cut]]

def prob_to_label_1(pred):
    """Decode a flat 2-D probability matrix into a ``(rows, 6)`` label array.

    Every entry ``pred[x, y] >= 0.5`` writes ``y % 6`` into column ``y // 6``
    of row ``x``. When several entries hit the same cell, the one with the
    largest ``y`` wins because positions are visited in row-major order.
    """
    result = np.zeros((pred.shape[0], 6))
    for row, col in np.argwhere(pred >= 0.5):
        loc, star = col // 6, col % 6
        result[row][loc] = star
    return result

def prob_to_label_2(pred):
    """Decode grouped probabilities into a ``(rows, 6)`` label array.

    ``pred`` is reshaped to groups of 5 values. For each group the label is
    the 1-based position of the largest value when that value is at least
    0.5, and 0 otherwise.
    """
    grouped = pred.reshape(pred.shape[0], -1, 5)
    best_star = grouped.argmax(axis=-1) + 1
    confident = grouped.max(axis=-1) >= 0.5
    result = np.zeros((pred.shape[0], 6))
    result[confident] = best_star[confident]
    return result

def pred_to_label(outputs_classifier, outputs_regressor):
    """Combine both model heads into one label array.

    An aspect keeps its regressor value only where the classifier output is
    at least 0.5; every other entry is 0.

    Args:
        outputs_classifier (numpy.array): Output of the classifier head.
        outputs_regressor (numpy.array): Output of the regressor head, same
            shape as ``outputs_classifier``.

    Returns:
        numpy.array of shape ``(rows, 6)`` holding the predicted labels.
    """
    confident = outputs_classifier >= 0.5
    labels = np.zeros((outputs_classifier.shape[0], 6))
    labels[confident] = outputs_regressor[confident]
    return labels

In [None]:
from tqdm.auto import tqdm 

# Training
# Progress bars: one tick per training batch and one per evaluation batch, over all epochs.
pb_train = tqdm(range(num_training_steps))
pb_test = tqdm(range(num_epochs*len(test_dataloader)))
# Best `final_score` seen so far; starts below any reachable non-negative score.
best_score = -1

for epoch in range(num_epochs):
    train_loss = 0
    # NOTE: this 0 is rebound to a ScalarMetric in the evaluation section below.
    val_loss = 0
    
    # Train
    model.train()
    for batch in train_dataloader:
        inputs = {'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device)}
        outputs_classifier, outputs_regressor = model(**inputs)
        # Aspect-presence loss: focal loss with alpha weighting disabled (alpha=-1) and gamma=1.
        loss1 = sigmoid_focal_loss(outputs_classifier, batch['labels_classifier'].to(device).float(), alpha=-1, gamma=1,reduction='mean')
        # Rating loss: cross-entropy over the rated aspects only.
        loss2 = loss_softmax(outputs_regressor, batch['labels_regressor'].to(device).float(), device)
        # The presence loss is weighted 10x relative to the rating loss.
        loss = 10*loss1 + loss2
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()       
        # The learning-rate schedule advances once per batch.
        lr_scheduler.step()
        pb_train.update(1)
        pb_train.set_postfix(loss_classifier=loss1.item(),loss_regressor=loss2.item(),loss=loss.item())
        # Accumulates the mean batch loss of the epoch.
        train_loss += loss.item() / len(train_dataloader)
    print("Train Loss:", train_loss)
    
    # Evaluate
    # Fresh metric accumulators for every epoch.
    val_loss = ScalarMetric()
    val_loss_classifier = ScalarMetric()
    val_loss_regressor = ScalarMetric()
    val_acc = AccuracyMetric()
    val_f1_score = F1_score()
    val_r2_score = R2_score()
    # NOTE(review): `num` and `correct` are never read in the visible code — confirm they can go.
    num = 0
    correct = 0
    # Collects the rounded predictions of every evaluation batch.
    result = None
    model.eval()
    for batch in test_dataloader:
        inputs = {'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device)}
        with torch.no_grad():
            outputs_classifier, outputs_regressor = model(**inputs)
            # Evaluation uses plain BCE and an unweighted sum, so this loss is not
            # on the same scale as the training loss above.
            loss1 = loss_classifier(outputs_classifier, batch['labels_classifier'].to(device).float())
            loss2 = loss_softmax(outputs_regressor, batch['labels_regressor'].to(device).float(), device)
            loss = loss1 + loss2
            outputs_classifier = outputs_classifier.cpu().numpy()
            outputs_regressor = outputs_regressor.cpu().numpy()
            # Class index 0..4 -> rating 1..5.
            outputs_regressor = outputs_regressor.argmax(axis=-1) + 1
            y_true = batch['labels_regressor'].numpy()
            # Keep the rating only for aspects the classifier marks as present (>= 0.5).
            outputs = pred_to_label(outputs_classifier, outputs_regressor)
            result = np.concatenate([result, np.round(outputs)], axis=0) if result is not None else np.round(outputs)
            val_loss_classifier.update(loss1.item())
            val_loss_regressor.update(loss2.item())
            val_loss.update(loss.item())
            val_acc.update(np.round(outputs), y_true)
            val_f1_score.update(np.round(outputs), y_true)
            val_r2_score.update(np.round(outputs), y_true)
            pb_test.update(1)
            
    f1_score = val_f1_score.compute()
    r2_score = val_r2_score.compute()
    # Sum of the per-aspect F1 * R2 products divided by 6 (the number of aspects).
    final_score = (f1_score * r2_score).sum()*1/6
    
    # Checkpoint only when the combined score improves.
    if final_score > best_score:
        best_score = final_score
        torch.save(model.state_dict(), "/kaggle/working/model.pt")
        
    print("Test Loss:", val_loss.compute(), "Loss Classifier:", val_loss_classifier.compute(), "Loss Regressor:", val_loss_regressor.compute())
    print("Acc", val_acc.compute())
    print("F1_score", f1_score)
    print("R2_score", r2_score)
    print("Final_score", final_score)
    print("Best_score", best_score)