# Install library

In [None]:
!pip install -u pandas
!pip install -u scikit-learn
!pip install -u transformers



# Import library

In [None]:
import torch
from torch import Tensor, nn
from torch.nn import BCEWithLogitsLoss
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import classification_report, f1_score
from datetime import datetime

# Define hyperparameters

In [None]:
# Training hyperparameters shared by every experiment in this notebook.
epochs, batch_size = 4, 16  # passes over the training set, samples per batch
init_lr = 2e-5              # base learning rate; later cells scale it per layer group

# Sentiment
First, we fine-tune the model to predict sentiment.

## load data
I downloaded the dataset from [Kaggle](https://www.kaggle.com/datasets/toreleon/synthetic-vietnamese-students-feedback-corpus/data).

### Read csv

In [None]:
# Read both splits in one statement. Note that `df_test` is loaded from the
# validation file (synthetic_val.csv) and is the only evaluation split used below.
df_train, df_test = (
    pd.read_csv("/kaggle/input/synthetic-vietnamese-students-feedback-corpus/synthetic_train.csv"),
    pd.read_csv("/kaggle/input/synthetic-vietnamese-students-feedback-corpus/synthetic_val.csv"),
)

### Review data
- The dataset has three useful columns: sentence, sentiment and topic. In this section we use sentence as the input and sentiment as the output.
- The data is balanced because all classes have roughly the same number of samples.
- Then we count the number of words in each sentence. The longest sentence has 48 words (43 in the training set), so we don't need a large input size.

In [None]:
df_train.head()

Unnamed: 0,sentence,sentiment,topic
0,Đội ngũ bảo trì quá thưa thớt dẫn đến không đả...,negative,facility
1,The university's musical and artistic faciliti...,neutral,facility
2,Phương pháp giảng dạy phù hợp với các đối tượn...,neutral,curriculum
3,Chương trình học giúp tôi trở thành một chuyên...,positive,curriculum
4,Tôi nghĩ rằng chương trình đào tạo có thể có t...,neutral,curriculum


In [None]:
df_test.head()

Unnamed: 0,sentence,sentiment,topic
0,Chất lượng vật chất kém.,negative,facility
1,"Phần mềm học tập quá khó sử dụng, khiến sinh v...",negative,facility
2,Trường tôi thiếu những tiện ích cơ bản như máy...,negative,facility
3,Cần tạo thêm các hoạt động gắn kết giữa sinh v...,neutral,curriculum
4,Họ rất khoan dung và lượng giác trong quan điể...,neutral,others


In [None]:
df_train.sentiment.value_counts()

sentiment
neutral     2724
negative    2711
positive    2709
Name: count, dtype: int64

In [None]:
df_test.sentiment.value_counts()

sentiment
negative    686
positive    680
neutral     670
Name: count, dtype: int64

In [None]:
# Add a whitespace-delimited word count per sentence to each split so the
# length distribution can be inspected with describe() below.
df_train['len'] = df_train['sentence'].map(lambda text: len(str(text).split()))
df_test['len'] = df_test['sentence'].map(lambda text: len(str(text).split()))

In [None]:
df_train['len'].describe()

count    8144.000000
mean       15.549730
std         5.018764
min         3.000000
25%        12.000000
50%        15.000000
75%        18.000000
max        43.000000
Name: len, dtype: float64

In [None]:
df_test['len'].describe()

count    2036.000000
mean       15.694990
std         5.185957
min         2.000000
25%        12.000000
50%        15.000000
75%        19.000000
max        48.000000
Name: len, dtype: float64

## Create Dataloader

### Create tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-base')

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

### Create TextDataset
I created TextDataset based on Torch Dataset. This class takes DataFrame df and tokenizer as input.
TextDataset has two main functions: init function (init object) and __getitem__ (function to sample batch data)
- In the init function, I create a label column by converting the sentiment column to a number (positive --> 0, neutral --> 1, negative --> 2).
- In the __getitem__  function:
  - The input of this function is an index (torch Dataset auto input index when training, we don't need to do anything more).
  - we get text input and label of this index row in DataFrame, convert text to input_ids and attention_mask with tokenizer.encode_plus(text, return_tensors='pt', padding='max_length', max_length=128).
  - I set return_tensors = 'pt' to return torch tensor (instead of number or tf format). Setting padding to 'max_length' then tokenizer auto adds padding and max_length = 128 to cut text no longer than 128.

In [None]:
class TextDataset(Dataset):
    """Torch dataset yielding (input_ids, attention_mask, label) for sentiment classification.

    Args:
        df: DataFrame with a 'sentence' column (input text) and a 'sentiment'
            column (class name, mapped to an integer through SENTIMENT_MAPPER).
        tokenizer: tokenizer exposing ``encode_plus``.

    Rows with a missing sentence or sentiment are dropped. The class
    distribution is printed before and after the label mapping.
    """

    def __init__(self, df, tokenizer):
        # Work on an explicit copy so that adding the label column never
        # writes into (or warns about) the caller's DataFrame.
        frame = df.dropna(subset=['sentence', 'sentiment']).copy()
        print(frame['sentiment'].value_counts())
        frame['label'] = frame['sentiment'].map(SENTIMENT_MAPPER)
        print(frame['label'].value_counts())
        self.df = frame
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the flattened token ids, attention mask and label of row ``index``."""
        row = self.df.iloc[index]
        text = str(row['sentence'])
        label = row['label']

        # truncation=True lets the tokenizer itself cut over-long inputs to
        # max_length, so the closing special token is kept; slicing the tensor
        # after the fact would drop it. Together with padding='max_length',
        # every sample comes out exactly 128 tokens long.
        encoded = self.tokenizer.encode_plus(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=128,
        )

        attention_mask = encoded["attention_mask"].reshape(-1)
        input_ids = encoded["input_ids"].reshape(-1)

        return (input_ids, attention_mask, label)

# Sentiment class name -> integer label.
SENTIMENT_MAPPER = {'positive': 0, 'neutral': 1, 'negative': 2}

# Wrap both splits in datasets/loaders; only the training loader is shuffled
# so evaluation always walks the rows in the same order.
train_dataset = TextDataset(df=df_train, tokenizer=tokenizer)
test_dataset = TextDataset(df=df_test, tokenizer=tokenizer)

train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)

sentiment
neutral     2724
negative    2711
positive    2709
Name: count, dtype: int64
label
1    2724
2    2711
0    2709
Name: count, dtype: int64
sentiment
negative    686
positive    680
neutral     670
Name: count, dtype: int64
label
2    686
0    680
1    670
Name: count, dtype: int64


## Create model

### Define model

In [None]:
class MODEL(nn.Module):
    """Pretrained multilingual-e5-base encoder followed by dropout and one linear classifier.

    Args:
        num_classes: number of output logits produced by the classification head.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.model = AutoModel.from_pretrained('intfloat/multilingual-e5-base')
        self.dropout = nn.Dropout(0.3)
        # Read the feature width from the encoder config instead of hard-coding
        # 768, so the head always matches the loaded checkpoint.
        self.linear_layer = nn.Linear(self.model.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class logits for a batch of token ids."""
        # Keyword arguments keep the call independent of the encoder's
        # positional parameter order.
        pooled = self.model(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        logits = self.linear_layer(self.dropout(pooled))
        return logits

# Initialize the model (three sentiment classes)
model = MODEL(3)

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

### Create optimizer
- The first layers need lower learning rates, and the later ones need higher ones. So I divided the layers into 4 groups:
  - layers 0-->3 with learning rate = init_lr
  - layers 4-->7 with learning rate = 1.5 * init_lr
  - layers 8-->11 with learning rate = 2 * init_lr
  - linear or pooler with learning rate = 3 * init_lr.
- For bias and LayerNorm parameters I do not use weight decay, so their weight decay is set to 0.
- To customize the learning rate and weight decay for each layer, push a dictionary {'params': layer's params, 'lr': layer's learning rate, 'weight_decay': layer's weight_decay} to the optimized_parameters list. Then set optimizer = torch.optim.AdamW(optimized_parameters)

In [None]:
# Build one AdamW parameter group per tensor so each can carry its own
# learning rate (by depth) and weight decay (none for bias / LayerNorm).
no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
optimized_parameters = []

for name, param in model.named_parameters():
    # Learning rate grows with depth: base for the lowest layers, x1.5 for
    # layers 4-7, x2 for layers 8-11, x3 for the pooler and classifier head.
    if name in ['linear_layer.weight', 'linear_layer.bias'] or 'pooler' in name or 'LinearTransformation' in name:
        lr = init_lr * 3
    elif any(tag in name for tag in ('layer.11', 'layer.10', 'layer.9', 'layer.8')):
        lr = init_lr * 2
    elif any(tag in name for tag in ('layer.7', 'layer.6', 'layer.5', 'layer.4')):
        lr = init_lr * 1.5
    else:
        lr = init_lr

    weight_decay = 0.01 if not any(nd in name for nd in no_decay) else 0.0

    optimized_parameters.append(dict(params=param, weight_decay=weight_decay, lr=lr))


optimizer = torch.optim.AdamW(optimized_parameters)

### Create criterion and learning_rate scheduler
I use torch.cuda.amp.GradScaler to enable 16 bit training (save memory and time)

In [None]:
# Loss with label smoothing, AMP gradient scaler, and a linear warmup/decay
# learning-rate schedule spanning every optimizer step of the run.
criterion = nn.CrossEntropyLoss(label_smoothing=0.2)
scaler = torch.cuda.amp.GradScaler()

total_steps = len(train_dataloader) * epochs
print(total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=min(500, total_steps // 10),  # warm up for 10% of the run, capped at 500 steps
)

2036


## Training
The training and evaluation loops are standard. To enable 16-bit (mixed-precision) training:
- Use torch.cuda.amp.autocast for the forward pass and the loss computation --> PyTorch runs eligible operations in 16-bit precision (saving time and memory).
- Use scaler.scale(loss) before calling backward(): it multiplies the loss by a scale factor so that small gradients do not underflow to zero in 16-bit precision.
- Use scaler.step(optimizer) and scaler.update(): scaler.step unscales the gradients and skips the optimizer step if they contain inf/NaN, and scaler.update adjusts the scale factor for the next iteration.

In [None]:
# Fine-tune the sentiment classifier with mixed precision and evaluate after
# every epoch, checkpointing the best accuracy and the best macro-F1 weights.
# NOTE(review): checkpoints are selected on the same split (df_test) that is
# used for the reported metrics, so those numbers are optimistic.
max_acc = 0
max_f1 = 0

start_train = datetime.now()
model = model.cuda()

for epoch in range(epochs):
    train_acc = 0
    len_train = 0
    model.train()
    start_epoch = datetime.now()

    for batch_idx, (input_ids, attention_mask, label) in enumerate(train_dataloader):
        optimizer.zero_grad()
        input_ids, attention_mask, label = input_ids.cuda(), attention_mask.cuda(), label.cuda()
        # The criterion receives one-hot (probability) targets.
        label = F.one_hot(label.reshape(-1), 3).float()

        with torch.cuda.amp.autocast():
            predict = model(input_ids, attention_mask)
            loss = criterion(predict, label)

        # Scaled backward pass; scaler.step runs the optimizer on unscaled
        # gradients and scaler.update adapts the scale for the next step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        with torch.no_grad():
            predict = predict.argmax(-1)
            train_acc += (predict == label.argmax(-1)).sum().cpu()
            len_train += len(label)

        if batch_idx % 50 == 0:
            # Log the scalar loss value rather than a tensor that still carries grad_fn.
            print(batch_idx, train_acc / len_train * 100, loss.item())

    train_acc = train_acc / len_train

    test_acc = 0
    len_test = 0
    model.eval()

    test_labels = []
    test_predicts = []

    for batch_idx, (input_ids, attention_mask, label) in enumerate(test_dataloader):
        input_ids, attention_mask, label = input_ids.cuda(), attention_mask.cuda(), label.cuda()
        label = F.one_hot(label.reshape(-1), 3).float()

        with torch.no_grad():
            with torch.cuda.amp.autocast():
                predict = model(input_ids, attention_mask)
                loss = criterion(predict, label)

            predict = predict.argmax(-1)
            test_acc += (predict == label.argmax(-1)).sum().cpu()
            len_test += len(label)

            test_labels += label.argmax(-1).cpu().numpy().tolist()
            test_predicts += predict.cpu().numpy().tolist()

        if batch_idx % 100 == 0:
            print(batch_idx, test_acc / len_test * 100, loss.item())

    test_acc = test_acc / len_test

    # scikit-learn's signature is f1_score(y_true, y_pred, ...).
    f1 = f1_score(test_labels, test_predicts, average='macro')

    if test_acc > max_acc:
        max_acc = test_acc
        checkpoint = {'model': model.state_dict()}
        torch.save(checkpoint, "sentiment_max_acc.pt")

    if f1 > max_f1:
        max_f1 = f1
        checkpoint = {'model': model.state_dict()}
        torch.save(checkpoint, "sentiment_max_f1.pt")

    print(f'Epoch {epoch + 1}, train_acc: {train_acc}, test_acc: {test_acc}, f1: {f1}, epoch_time: {datetime.now() - start_epoch}')

print('total training time: ', datetime.now() - start_train)

0 tensor(56.2500) tensor(1.0808, device='cuda:0', grad_fn=<DivBackward1>)
50 tensor(40.5637) tensor(1.0521, device='cuda:0', grad_fn=<DivBackward1>)
100 tensor(56.0644) tensor(0.7898, device='cuda:0', grad_fn=<DivBackward1>)
150 tensor(63.3692) tensor(0.7432, device='cuda:0', grad_fn=<DivBackward1>)
200 tensor(67.5995) tensor(0.7091, device='cuda:0', grad_fn=<DivBackward1>)
250 tensor(70.5179) tensor(0.7626, device='cuda:0', grad_fn=<DivBackward1>)
300 tensor(72.0723) tensor(0.6368, device='cuda:0', grad_fn=<DivBackward1>)
350 tensor(73.7714) tensor(0.6966, device='cuda:0', grad_fn=<DivBackward1>)
400 tensor(74.7039) tensor(0.7510, device='cuda:0', grad_fn=<DivBackward1>)
450 tensor(75.8731) tensor(0.7449, device='cuda:0', grad_fn=<DivBackward1>)
500 tensor(76.5469) tensor(0.6283, device='cuda:0', grad_fn=<DivBackward1>)
0 tensor(87.5000) tensor(0.5952, device='cuda:0')
100 tensor(86.2624) tensor(0.5576, device='cuda:0')
Epoch 1, train_acc: 0.7660854458808899, test_acc: 0.8610019683837

## Eval
You can see good results with accuracy of 89.54%

In [None]:
# Reload the best macro-F1 sentiment checkpoint and score it on the evaluation split.
test_acc = 0
len_test = 0

model.load_state_dict(torch.load("sentiment_max_f1.pt")['model'])
# Make sure dropout is disabled even if this cell is run without the training
# cell having left the model in eval mode.
model.eval()
test_labels = []
test_predicts = []

for batch_idx, (input_ids, attention_mask, label) in enumerate(test_dataloader):
    input_ids, attention_mask, label = input_ids.cuda(), attention_mask.cuda(), label.cuda()
    label = F.one_hot(label.reshape(-1), 3).float()

    with torch.no_grad():
        with torch.cuda.amp.autocast():
            predict = model(input_ids, attention_mask)
            loss = criterion(predict, label)

        predict = predict.argmax(-1)
        test_acc += (predict == label.argmax(-1)).sum().cpu()
        len_test += len(label)

        test_labels += label.argmax(-1).cpu().numpy().tolist()
        test_predicts += predict.cpu().numpy().tolist()

    if batch_idx % 100 == 0:
        print(batch_idx, test_acc / len_test * 100, loss.item())

test_acc = test_acc / len_test
# scikit-learn's signature is f1_score(y_true, y_pred, ...).
f1 = f1_score(test_labels, test_predicts, average='macro')

print(test_acc, f1)

0 tensor(100.) tensor(0.5072, device='cuda:0')
100 tensor(89.7277) tensor(0.6711, device='cuda:0')
tensor(0.8954) 0.8948048705360745


In [None]:
# Per-class precision / recall / F1 for the run trained with label smoothing 0.2.
print(classification_report(y_true=test_labels, y_pred=test_predicts, digits=4))

              precision    recall  f1-score   support

           0     0.8446    0.8794    0.8617       680
           1     0.8558    0.8239    0.8395       670
           2     0.9854    0.9810    0.9832       686

    accuracy                         0.8954      2036
   macro avg     0.8953    0.8948    0.8948      2036
weighted avg     0.8957    0.8954    0.8953      2036



# topic
We repeat the same steps as in the sentiment training, this time predicting the topic.

## Review data
Dataset is balanced

In [None]:
df_train.topic.value_counts()

topic
others        2059
curriculum    2040
lecturer      2026
facility      2019
Name: count, dtype: int64

In [None]:
df_test.topic.value_counts()

topic
facility      526
lecturer      515
curriculum    507
others        488
Name: count, dtype: int64

## Create TextDataset

In [None]:
class TextDataset(Dataset):
    """Torch dataset yielding (input_ids, attention_mask, label) for topic classification.

    Args:
        df: DataFrame with a 'sentence' column (input text) and a 'topic'
            column (class name, mapped to an integer through TOPIC_MAPPER).
        tokenizer: tokenizer exposing ``encode_plus``.

    Rows with a missing sentence or topic are dropped. The class distribution
    is printed before and after the label mapping.
    """

    def __init__(self, df, tokenizer):
        # Work on an explicit copy so that adding the label column never
        # writes into (or warns about) the caller's DataFrame.
        frame = df.dropna(subset=['sentence', 'topic']).copy()
        print(frame['topic'].value_counts())
        frame['label'] = frame['topic'].map(TOPIC_MAPPER)
        print(frame['label'].value_counts())
        self.df = frame
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the flattened token ids, attention mask and label of row ``index``."""
        row = self.df.iloc[index]
        text = str(row['sentence'])
        label = row['label']

        # truncation=True lets the tokenizer itself cut over-long inputs to
        # max_length, so the closing special token is kept; slicing the tensor
        # after the fact would drop it. Together with padding='max_length',
        # every sample comes out exactly 128 tokens long.
        encoded = self.tokenizer.encode_plus(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=128,
        )

        attention_mask = encoded["attention_mask"].reshape(-1)
        input_ids = encoded["input_ids"].reshape(-1)

        return (input_ids, attention_mask, label)

# Topic class name -> integer label.
TOPIC_MAPPER = {'facility': 0,
               'lecturer': 1,
               'curriculum': 2,
               'others': 3}

# Rebuild the datasets/loaders with topic labels; only the training loader is shuffled.
train_dataset = TextDataset(df=df_train, tokenizer=tokenizer)
test_dataset = TextDataset(df=df_test, tokenizer=tokenizer)

train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)

topic
others        2059
curriculum    2040
lecturer      2026
facility      2019
Name: count, dtype: int64
label
3    2059
2    2040
1    2026
0    2019
Name: count, dtype: int64
topic
facility      526
lecturer      515
curriculum    507
others        488
Name: count, dtype: int64
label
0    526
1    515
2    507
3    488
Name: count, dtype: int64


## Create model, optimizer, scheduler

In [None]:
model = MODEL(4)

In [None]:
# Build one AdamW parameter group per tensor so each can carry its own
# learning rate (by depth) and weight decay (none for bias / LayerNorm).
optimized_parameters = []
no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]

for name, param in model.named_parameters():
    weight_decay = 0.0 if any(nd in name for nd in no_decay) else 0.01

    # Learning rate grows with depth: base for the lowest layers, x1.5 for
    # layers 4-7, x2 for layers 8-11, x3 for the pooler and classifier head.
    if name in ['linear_layer.weight', 'linear_layer.bias'] or 'pooler' in name or 'LinearTransformation' in name:
        lr = init_lr * 3
    elif 'layer.11' in name or 'layer.10' in name or 'layer.9' in name or 'layer.8' in name:
        lr = init_lr * 2
    elif 'layer.7' in name or 'layer.6' in name or 'layer.5' in name or 'layer.4' in name:
        # Decimal point, not a comma: `init_lr * 1,5` builds the tuple
        # (init_lr, 5) and hands a non-numeric learning rate to the optimizer.
        lr = init_lr * 1.5
    else:
        lr = init_lr

    optimized_parameters.append({'params': param,
                                 'weight_decay': weight_decay,
                                 'lr': lr})


optimizer = torch.optim.AdamW(optimized_parameters)

In [None]:
# Fresh loss, AMP gradient scaler and linear warmup/decay schedule for the topic run.
criterion = nn.CrossEntropyLoss(label_smoothing=0.2)
scaler = torch.cuda.amp.GradScaler()

total_steps = len(train_dataloader) * epochs
print(total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=min(500, total_steps // 10),  # warm up for 10% of the run, capped at 500 steps
)

2036


## Training

In [None]:
# Fine-tune the topic classifier with mixed precision and evaluate after
# every epoch, checkpointing the best accuracy and the best macro-F1 weights.
# NOTE(review): checkpoints are selected on the same split (df_test) that is
# used for the reported metrics, so those numbers are optimistic.
max_acc = 0
max_f1 = 0

start_train = datetime.now()
model = model.cuda()

for epoch in range(epochs):
    train_acc = 0
    len_train = 0
    model.train()
    start_epoch = datetime.now()

    for batch_idx, (input_ids, attention_mask, label) in enumerate(train_dataloader):
        optimizer.zero_grad()
        input_ids, attention_mask, label = input_ids.cuda(), attention_mask.cuda(), label.cuda()
        # The criterion receives one-hot (probability) targets.
        label = F.one_hot(label.reshape(-1), 4).float()

        with torch.cuda.amp.autocast():
            predict = model(input_ids, attention_mask)
            loss = criterion(predict, label)

        # Scaled backward pass; scaler.step runs the optimizer on unscaled
        # gradients and scaler.update adapts the scale for the next step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        with torch.no_grad():
            predict = predict.argmax(-1)
            train_acc += (predict == label.argmax(-1)).sum().cpu()
            len_train += len(label)

        if batch_idx % 50 == 0:
            # Log the scalar loss value rather than a tensor that still carries grad_fn.
            print(batch_idx, train_acc / len_train * 100, loss.item())

    train_acc = train_acc / len_train

    test_acc = 0
    len_test = 0
    model.eval()

    test_labels = []
    test_predicts = []

    for batch_idx, (input_ids, attention_mask, label) in enumerate(test_dataloader):
        input_ids, attention_mask, label = input_ids.cuda(), attention_mask.cuda(), label.cuda()
        label = F.one_hot(label.reshape(-1), 4).float()

        with torch.no_grad():
            with torch.cuda.amp.autocast():
                predict = model(input_ids, attention_mask)
                loss = criterion(predict, label)

            predict = predict.argmax(-1)
            test_acc += (predict == label.argmax(-1)).sum().cpu()
            len_test += len(label)

            test_labels += label.argmax(-1).cpu().numpy().tolist()
            test_predicts += predict.cpu().numpy().tolist()

        if batch_idx % 100 == 0:
            print(batch_idx, test_acc / len_test * 100, loss.item())

    test_acc = test_acc / len_test

    # scikit-learn's signature is f1_score(y_true, y_pred, ...).
    f1 = f1_score(test_labels, test_predicts, average='macro')

    if test_acc > max_acc:
        max_acc = test_acc
        checkpoint = {'model': model.state_dict()}
        torch.save(checkpoint, "topic_max_acc.pt")

    if f1 > max_f1:
        max_f1 = f1
        checkpoint = {'model': model.state_dict()}
        torch.save(checkpoint, "topic_max_f1.pt")

    print(f'Epoch {epoch + 1}, train_acc: {train_acc}, test_acc: {test_acc}, f1: {f1}, epoch_time: {datetime.now() - start_epoch}')

print('total training time: ', datetime.now() - start_train)

0 tensor(18.7500) tensor(1.4178, device='cuda:0', grad_fn=<DivBackward1>)
50 tensor(35.9069) tensor(1.3356, device='cuda:0', grad_fn=<DivBackward1>)
100 tensor(57.3639) tensor(0.7345, device='cuda:0', grad_fn=<DivBackward1>)
150 tensor(68.3361) tensor(0.7572, device='cuda:0', grad_fn=<DivBackward1>)
200 tensor(74.2226) tensor(0.6342, device='cuda:0', grad_fn=<DivBackward1>)
250 tensor(77.8635) tensor(0.6891, device='cuda:0', grad_fn=<DivBackward1>)
300 tensor(80.5233) tensor(0.7094, device='cuda:0', grad_fn=<DivBackward1>)
350 tensor(82.3718) tensor(0.7050, device='cuda:0', grad_fn=<DivBackward1>)
400 tensor(83.9931) tensor(0.6653, device='cuda:0', grad_fn=<DivBackward1>)
450 tensor(85.3104) tensor(0.7292, device='cuda:0', grad_fn=<DivBackward1>)
500 tensor(86.0654) tensor(0.5994, device='cuda:0', grad_fn=<DivBackward1>)
0 tensor(81.2500) tensor(0.9483, device='cuda:0')
100 tensor(94.6163) tensor(0.6998, device='cuda:0')
Epoch 1, train_acc: 0.8612475395202637, test_acc: 0.9440078735351

# Eval

In [None]:
# Reload the best macro-F1 topic checkpoint and score it on the evaluation split.
test_acc = 0
len_test = 0

model.load_state_dict(torch.load("topic_max_f1.pt")['model'])
# Make sure dropout is disabled even if this cell is run without the training
# cell having left the model in eval mode.
model.eval()
test_labels = []
test_predicts = []

for batch_idx, (input_ids, attention_mask, label) in enumerate(test_dataloader):
    input_ids, attention_mask, label = input_ids.cuda(), attention_mask.cuda(), label.cuda()
    label = F.one_hot(label.reshape(-1), 4).float()

    with torch.no_grad():
        with torch.cuda.amp.autocast():
            predict = model(input_ids, attention_mask)
            loss = criterion(predict, label)

        predict = predict.argmax(-1)
        test_acc += (predict == label.argmax(-1)).sum().cpu()
        len_test += len(label)

        test_labels += label.argmax(-1).cpu().numpy().tolist()
        test_predicts += predict.cpu().numpy().tolist()

    if batch_idx % 100 == 0:
        print(batch_idx, test_acc / len_test * 100, loss.item())

test_acc = test_acc / len_test
# scikit-learn's signature is f1_score(y_true, y_pred, ...).
f1 = f1_score(test_labels, test_predicts, average='macro')

print(test_acc, f1)

0 tensor(81.2500) tensor(0.8144, device='cuda:0')
100 tensor(96.5347) tensor(0.6849, device='cuda:0')
tensor(0.9646) 0.9648247393940987


In [None]:
# Per-class precision / recall / F1 for the run trained with label smoothing 0.2.
print(classification_report(y_true=test_labels, y_pred=test_predicts, digits=4))

              precision    recall  f1-score   support

           0     0.9497    0.9696    0.9595       526
           1     0.9710    0.9748    0.9729       515
           2     0.9452    0.9191    0.9320       507
           3     0.9939    0.9959    0.9949       488

    accuracy                         0.9646      2036
   macro avg     0.9650    0.9648    0.9648      2036
weighted avg     0.9646    0.9646    0.9645      2036



# Multitask
We train a single model to predict both sentiment and topic. Its performance remains as good as that of the separately trained models.

## Create TextDataset


In [None]:
class TextDataset(Dataset):
    """Torch dataset yielding (input_ids, attention_mask, label_sentiment, label_topic).

    Args:
        df: DataFrame with 'sentence', 'sentiment' and 'topic' columns; the two
            class-name columns are mapped to integers through SENTIMENT_MAPPER
            and TOPIC_MAPPER.
        tokenizer: tokenizer exposing ``encode_plus``.

    Rows with a missing sentence, sentiment or topic are dropped.
    """

    def __init__(self, df, tokenizer):
        # Work on an explicit copy so that adding the label columns never
        # writes into (or warns about) the caller's DataFrame.
        frame = df.dropna(subset=['sentence', 'sentiment', 'topic']).copy()
        frame['label_sentiment'] = frame['sentiment'].map(SENTIMENT_MAPPER)
        frame['label_topic'] = frame['topic'].map(TOPIC_MAPPER)
        self.df = frame
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the flattened token ids, attention mask and both labels of row ``index``."""
        row = self.df.iloc[index]
        text = str(row['sentence'])
        label_sentiment = row['label_sentiment']
        label_topic = row['label_topic']

        # truncation=True lets the tokenizer itself cut over-long inputs to
        # max_length, so the closing special token is kept; slicing the tensor
        # after the fact would drop it. Together with padding='max_length',
        # every sample comes out exactly 128 tokens long.
        encoded = self.tokenizer.encode_plus(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=128,
        )

        attention_mask = encoded["attention_mask"].reshape(-1)
        input_ids = encoded["input_ids"].reshape(-1)

        return (input_ids, attention_mask, label_sentiment, label_topic)

# Class name -> integer label for each task.
SENTIMENT_MAPPER = {'positive': 0, 'neutral': 1, 'negative': 2}
TOPIC_MAPPER = {'facility': 0,
               'lecturer': 1,
               'curriculum': 2,
               'others': 3}

# Rebuild the datasets/loaders with both label columns; only the training loader is shuffled.
train_dataset = TextDataset(df=df_train, tokenizer=tokenizer)
test_dataset = TextDataset(df=df_test, tokenizer=tokenizer)

train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)

## Create model

In [None]:
class MODEL(nn.Module):
    """Shared pretrained encoder with two linear heads: sentiment and topic.

    Args:
        model_name: checkpoint id handed to ``AutoModel.from_pretrained``.
        dropout: dropout probability applied to the pooled representation.
        num_sentiment: number of sentiment classes (width of the sentiment head).
        num_topic: number of topic classes (width of the topic head).
    """

    def __init__(self, model_name='intfloat/multilingual-e5-base', dropout=0.3,
                 num_sentiment=3, num_topic=4):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        # Read the head input width from the encoder config instead of hard-coding
        # 768, so a checkpoint with a different hidden size works unchanged.
        hidden_size = self.model.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.sentiment = nn.Linear(hidden_size, num_sentiment)
        self.topic = nn.Linear(hidden_size, num_topic)

    def forward(self, input_ids, attention_mask):
        """Return ``(sentiment_logits, topic_logits)`` for a batch.

        Both heads read the same dropout-regularised ``pooler_output`` of the encoder.
        """
        # Keyword arguments keep the call correct regardless of the positional
        # order of the encoder's forward() signature.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # NOTE(review): E5 checkpoints are usually used with mean pooling; verify
        # that pooler_output comes from pretrained (not freshly initialised) weights.
        hidden_states = self.dropout(outputs.pooler_output)
        sentiment = self.sentiment(hidden_states)
        topic = self.topic(hidden_states)
        return sentiment, topic

# Instantiate the multitask network; MODEL.__init__ loads the pretrained encoder
# through AutoModel.from_pretrained, so this line performs the checkpoint load.
model = MODEL()

## Create optimizer, scheduler

In [None]:
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
optimized_parameters = []

# One optimizer group per parameter, so each can carry its own learning rate:
#   3x   init_lr for the two task heads, the pooler and 'LinearTransformation' parameters
#   2x   init_lr for names containing layer.8 .. layer.11
#   1.5x init_lr for names containing layer.4 .. layer.7
#   1x   init_lr for everything else
for name, param in model.named_parameters():
    if any(nd in name for nd in no_decay):
        weight_decay = 0.0
    else:
        weight_decay = 0.01

    in_head = name in ('sentiment.weight', 'sentiment.bias', 'topic.weight', 'topic.bias')
    if in_head or 'pooler' in name or 'LinearTransformation' in name:
        lr = init_lr * 3
    elif any(tag in name for tag in ('layer.11', 'layer.10', 'layer.9', 'layer.8')):
        lr = init_lr * 2
    elif any(tag in name for tag in ('layer.7', 'layer.6', 'layer.5', 'layer.4')):
        lr = init_lr * 1.5
    else:
        lr = init_lr

    optimized_parameters.append(dict(params=param, weight_decay=weight_decay, lr=lr))

optimizer = torch.optim.AdamW(optimized_parameters)

In [None]:
# Gradient scaler for the mixed-precision (autocast) training loop.
scaler = torch.cuda.amp.GradScaler()
# Both task heads share this loss; targets are smoothed by 0.2.
criterion = nn.CrossEntropyLoss(label_smoothing=0.2)

# The scheduler is stepped once per batch, so the horizon is batches x epochs.
total_steps = len(train_dataloader) * epochs
print(total_steps)

# Linear warm-up over the first 10% of the steps (capped at 500), then linear decay.
warmup_steps = min(500, total_steps // 10)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

2036


## Training

In [None]:
# Best evaluation scores seen so far; each one gates its own checkpoint file below.
max_acc_sentiment = 0
max_f1_sentiment = 0
max_acc_topic = 0
max_f1_topic = 0


def save_checkpoint(path):
    """Save the current weights of the global `model` to `path` as {'model': state_dict}."""
    torch.save({'model': model.state_dict()}, path)


start_train = datetime.now()
model = model.cuda()

for epoch in range(epochs):
    # ---- training pass -------------------------------------------------
    train_acc_sentiment = 0  # running count of correct predictions, turned into a ratio below
    train_acc_topic = 0
    len_train = 0
    model.train()
    start_epoch = datetime.now()

    for batch_idx, (input_ids, attention_mask, label_sentiment, label_topic) in enumerate(train_dataloader):
        optimizer.zero_grad()
        input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()
        # Keep the integer class ids for the accuracy check; the loss is fed
        # one-hot float targets, exactly as before.
        label_sentiment = label_sentiment.cuda().reshape(-1)
        label_topic = label_topic.cuda().reshape(-1)
        target_sentiment = F.one_hot(label_sentiment, 3).float()
        target_topic = F.one_hot(label_topic, 4).float()

        with torch.cuda.amp.autocast():
            predict_sentiment, predict_topic = model(input_ids, attention_mask)
            # Multitask objective: unweighted sum of the two cross-entropy losses.
            loss = criterion(predict_sentiment, target_sentiment) + criterion(predict_topic, target_topic)

        scale_before = scaler.get_scale()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # GradScaler skips optimizer.step() when it finds inf/NaN gradients and then
        # lowers its scale. Only advance the LR schedule when a step really happened.
        if scaler.get_scale() >= scale_before:
            scheduler.step()

        with torch.no_grad():
            train_acc_sentiment += (predict_sentiment.argmax(-1) == label_sentiment).sum().item()
            train_acc_topic += (predict_topic.argmax(-1) == label_topic).sum().item()
            len_train += len(label_sentiment)

        if batch_idx % 50 == 0:
            # .item() prints a plain number instead of a tensor carrying its grad_fn.
            print(batch_idx, train_acc_sentiment / len_train * 100, train_acc_topic / len_train * 100, loss.item())

    train_acc_sentiment = train_acc_sentiment / len_train
    train_acc_topic = train_acc_topic / len_train

    # ---- evaluation pass -----------------------------------------------
    test_acc_sentiment = 0
    test_acc_topic = 0
    len_test = 0
    model.eval()

    test_labels_sentiment = []
    test_predicts_sentiment = []
    test_labels_topic = []
    test_predicts_topic = []

    for batch_idx, (input_ids, attention_mask, label_sentiment, label_topic) in enumerate(test_dataloader):
        input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()
        label_sentiment = label_sentiment.cuda().reshape(-1)
        label_topic = label_topic.cuda().reshape(-1)

        with torch.no_grad(), torch.cuda.amp.autocast():
            predict_sentiment, predict_topic = model(input_ids, attention_mask)
            loss = (criterion(predict_sentiment, F.one_hot(label_sentiment, 3).float())
                    + criterion(predict_topic, F.one_hot(label_topic, 4).float()))

        predict_sentiment = predict_sentiment.argmax(-1)
        predict_topic = predict_topic.argmax(-1)

        test_acc_sentiment += (predict_sentiment == label_sentiment).sum().item()
        test_acc_topic += (predict_topic == label_topic).sum().item()
        len_test += len(label_sentiment)

        test_labels_sentiment += label_sentiment.tolist()
        test_predicts_sentiment += predict_sentiment.tolist()
        test_labels_topic += label_topic.tolist()
        test_predicts_topic += predict_topic.tolist()

        if batch_idx % 100 == 0:
            print(batch_idx, test_acc_sentiment / len_test * 100, test_acc_topic / len_test * 100, loss.item())

    test_acc_sentiment = test_acc_sentiment / len_test
    test_acc_topic = test_acc_topic / len_test

    # scikit-learn's signature is f1_score(y_true, y_pred).
    f1_sentiment = f1_score(test_labels_sentiment, test_predicts_sentiment, average='macro')
    f1_topic = f1_score(test_labels_topic, test_predicts_topic, average='macro')

    # Keep one checkpoint per tracked metric, overwritten whenever that metric improves.
    if test_acc_sentiment > max_acc_sentiment:
        max_acc_sentiment = test_acc_sentiment
        save_checkpoint("topic_max_acc_sentiment.pt")

    if f1_sentiment > max_f1_sentiment:
        max_f1_sentiment = f1_sentiment
        save_checkpoint("topic_max_f1_sentiment.pt")

    if test_acc_topic > max_acc_topic:
        max_acc_topic = test_acc_topic
        save_checkpoint("topic_max_acc_topic.pt")

    if f1_topic > max_f1_topic:
        max_f1_topic = f1_topic
        save_checkpoint("topic_max_f1_topic.pt")

    print(f'Epoch {epoch + 1}, train_acc_sentiment: {train_acc_sentiment}, train_acc_topic: {train_acc_topic}, test_acc_sentiment: {test_acc_sentiment}, f1_sentiment: {f1_sentiment}, test_acc_topic: {test_acc_topic}, f1_topic: {f1_topic}, epoch_time: {datetime.now() - start_epoch}')

print('total training time: ', datetime.now() - start_train)

0 tensor(50.) tensor(18.7500) tensor(2.4943, device='cuda:0', grad_fn=<AddBackward0>)
50 tensor(42.2794) tensor(29.9020) tensor(2.4250, device='cuda:0', grad_fn=<AddBackward0>)
100 tensor(55.6312) tensor(48.8861) tensor(1.7111, device='cuda:0', grad_fn=<AddBackward0>)
150 tensor(62.1689) tensor(63.1209) tensor(1.5175, device='cuda:0', grad_fn=<AddBackward0>)
200 tensor(66.1070) tensor(70.5224) tensor(1.2617, device='cuda:0', grad_fn=<AddBackward0>)
250 tensor(68.3267) tensor(75.0249) tensor(1.4567, device='cuda:0', grad_fn=<AddBackward0>)
300 tensor(70.6395) tensor(78.1561) tensor(1.3091, device='cuda:0', grad_fn=<AddBackward0>)
350 tensor(71.9551) tensor(80.0570) tensor(1.3128, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(73.4570) tensor(81.8111) tensor(1.5250, device='cuda:0', grad_fn=<AddBackward0>)
450 tensor(74.4041) tensor(83.1901) tensor(1.2635, device='cuda:0', grad_fn=<AddBackward0>)
500 tensor(75.2745) tensor(84.2565) tensor(1.4648, device='cuda:0', grad_fn=<AddBackwar

## Eval

In [None]:
# Evaluate the sentiment head of the checkpoint with the best sentiment macro-F1.
test_acc = 0  # running count of correct predictions, turned into a ratio below
len_test = 0

model.load_state_dict(torch.load("topic_max_f1_sentiment.pt")['model'])
# Switch dropout off explicitly instead of relying on the mode the training cell left behind.
model.eval()
test_labels = []
test_predicts = []

for batch_idx, (input_ids, attention_mask, label, _unused_topic) in enumerate(test_dataloader):
    input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()
    label = label.cuda().reshape(-1)

    with torch.no_grad(), torch.cuda.amp.autocast():
        predict, _topic_logits = model(input_ids, attention_mask)
        loss = criterion(predict, F.one_hot(label, 3).float())

    predict = predict.argmax(-1)
    test_acc += (predict == label).sum().item()
    len_test += len(label)

    test_labels += label.tolist()
    test_predicts += predict.tolist()

    if batch_idx % 100 == 0:
        print(batch_idx, test_acc / len_test * 100, loss.item())

test_acc = test_acc / len_test
# scikit-learn's signature is f1_score(y_true, y_pred).
f1 = f1_score(test_labels, test_predicts, average='macro')

print(test_acc, f1)

0 tensor(100.) tensor(0.5031, device='cuda:0')
100 tensor(89.6658) tensor(0.5620, device='cuda:0')
tensor(0.8959) 0.895150525123625


In [None]:
# Per-class precision / recall / F1 of the sentiment head (model trained with label smoothing 0.2).
report = classification_report(test_labels, test_predicts, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.8502    0.8765    0.8631       680
           1     0.8576    0.8269    0.8419       670
           2     0.9782    0.9825    0.9804       686

    accuracy                         0.8959      2036
   macro avg     0.8953    0.8953    0.8952      2036
weighted avg     0.8958    0.8959    0.8957      2036



In [None]:
# Evaluate the topic head of the checkpoint with the best topic macro-F1.
test_acc = 0  # running count of correct predictions, turned into a ratio below
len_test = 0

model.load_state_dict(torch.load("topic_max_f1_topic.pt")['model'])
# Switch dropout off explicitly instead of relying on the mode the training cell left behind.
model.eval()
test_labels = []
test_predicts = []

for batch_idx, (input_ids, attention_mask, _unused_sentiment, label) in enumerate(test_dataloader):
    input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()
    label = label.cuda().reshape(-1)

    with torch.no_grad(), torch.cuda.amp.autocast():
        _sentiment_logits, predict = model(input_ids, attention_mask)
        loss = criterion(predict, F.one_hot(label, 4).float())

    predict = predict.argmax(-1)
    test_acc += (predict == label).sum().item()
    len_test += len(label)

    test_labels += label.tolist()
    test_predicts += predict.tolist()

    if batch_idx % 100 == 0:
        print(batch_idx, test_acc / len_test * 100, loss.item())

test_acc = test_acc / len_test
# scikit-learn's signature is f1_score(y_true, y_pred).
f1 = f1_score(test_labels, test_predicts, average='macro')

print(test_acc, f1)

0 tensor(81.2500) tensor(0.7928, device='cuda:0')
100 tensor(96.6584) tensor(0.6034, device='cuda:0')
tensor(0.9646) 0.9648880657937953


In [None]:
# Per-class precision / recall / F1 of the topic head (model trained with label smoothing 0.2).
report = classification_report(test_labels, test_predicts, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.9478    0.9658    0.9567       526
           1     0.9766    0.9728    0.9747       515
           2     0.9438    0.9270    0.9353       507
           3     0.9918    0.9939    0.9928       488

    accuracy                         0.9646      2036
   macro avg     0.9650    0.9649    0.9649      2036
weighted avg     0.9646    0.9646    0.9646      2036

