## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np


## Load data

In [89]:
df = pd.read_csv('fireant_data/cleaned_posts/all_posts.csv', nrows=200000)
df.head()

Unnamed: 0,postID,originalContent,date,sentiment,taggedSymbols
0,3118126,V√†o NKG ng√†y mai ·ªïn kh√¥ng c√°c b√°c? D√†i h·∫°n v√†o...,2021-09-30,0.0,['NKG']
1,3118104,C√≥ m·ªôt ƒëi·ªÅu th·∫•y bu·ªìn c∆∞·ªùi nh·∫•t l√† nh·ªØng ng∆∞·ªùi...,2021-09-30,0.0,['ART']
2,3118086,L·∫°i l√† ƒêTC,2021-09-30,0.0,['DHA']
3,3118058,"Ch√†o anh em, m√¨nh g·ª≠i anh ch·ªã em chi·∫øn l∆∞·ª£c ƒë√°...",2021-09-30,0.0,[]
4,3118053,CTC v√†o 8.3 c√≥ d√≠nh b√¥ kh√¥ng m·ªçi ng∆∞·ªùi?,2021-09-30,0.0,['CTC']


## Keep posts with negative and positive sentiment only

In [90]:
sentiment_df = df[df['sentiment'].isin([-1, 1])]
sentiment_df.head()

Unnamed: 0,postID,originalContent,date,sentiment,taggedSymbols
11,3117847,N√äN MUA G√å CHO B√ÅO C√ÅO QU√ù 3?\n\nTh√°ng 10 l√† g...,2021-09-30,1.0,"['DCM', 'DPM', 'GMD', 'HPG', 'KSB', 'NTL', 'PV..."
29,3117591,D·ª± √°n Akari City v·ªõi quy m√¥ 5.000 cƒÉn h·ªô n·∫±m t...,2021-09-30,1.0,['NLG']
31,3117571,"PNJ - H√ÄNH TR√åNH M·ªöI üî•üî•\n\nPNJ + 5,79% üî•\n\nT·∫°...",2021-09-30,1.0,"['MSN', 'MWG', 'PC1', 'PNJ']"
48,3117327,Ti·∫øc qu√° kh√¥ng CE\nNay ƒÉn ƒë∆∞·ª£c PNJ em vui qu√° ...,2021-09-30,1.0,['PNJ']
56,3117230,Gi√° kh√≠ ƒë·ªët l·∫°i tƒÉng d·ª±ng ƒë·ª©ng mai l·∫°i tr·∫ßn ti...,2021-09-30,1.0,['ASP']


## Data Cleaning

In [91]:
import re

def remove_links(text):
    # X√≥a link d·∫°ng http(s)://... ho·∫∑c www....
    text = re.sub(r"http\S+", "", text)     # remove http:// ho·∫∑c https://
    text = re.sub(r"www\.\S+", "", text)    # remove www...
    text = re.sub(r"\S+\.com\S*", "", text) # remove .com/.net/.vn...
    return text.strip()

def clean_text(text):
    text = str(text).lower()                          # lowercase
    text = re.sub(r"\n+", ". ", text)                # replace new line with period
    text = remove_links(text)                         # remove links
    text = re.sub(r"@\w+", "", text)                  # remove mentions (@abc)
    text = re.sub(r"#\w+", "", text)                  # remove hashtags
    text = re.sub(
        r"[^0-9a-zA-Z√°√†·∫£√£·∫°ƒÉ·∫Ø·∫±·∫≥·∫µ·∫∑√¢·∫•·∫ß·∫©·∫´·∫≠√©√®·∫ª·∫Ω·∫π√™·∫ø·ªÅ·ªÉ·ªÖ·ªá"
        r"√≠√¨·ªâƒ©·ªã√≥√≤·ªè√µ·ªç√¥·ªë·ªì·ªï·ªó·ªô∆°·ªõ·ªù·ªü·ª°·ª£√∫√π·ªß≈©·ª•∆∞·ª©·ª´·ª≠·ªØ·ª±"
        r"√Ω·ª≥·ª∑·ªπ·ªµƒë\s!?,.]+", 
        " ", 
        text
    )            # keep only letters, numbers and some punctuation
    text = re.sub(r"\s+", " ", text).strip()          # remove extra spaces
    return text


### Remove links in posts content

In [92]:
sentiment_df['originalContent'] = sentiment_df['originalContent'].apply(clean_text)
sentiment_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentiment_df['originalContent'] = sentiment_df['originalContent'].apply(clean_text)


Unnamed: 0,postID,originalContent,date,sentiment,taggedSymbols
11,3117847,n√™n mua g√¨ cho b√°o c√°o qu√Ω 3?. th√°ng 10 l√† gia...,2021-09-30,1.0,"['DCM', 'DPM', 'GMD', 'HPG', 'KSB', 'NTL', 'PV..."
29,3117591,d·ª± √°n akari city v·ªõi quy m√¥ 5.000 cƒÉn h·ªô n·∫±m t...,2021-09-30,1.0,['NLG']
31,3117571,"pnj h√†nh tr√¨nh m·ªõi . pnj 5,79 . t·∫°i sao n√≥i h√†...",2021-09-30,1.0,"['MSN', 'MWG', 'PC1', 'PNJ']"
48,3117327,ti·∫øc qu√° kh√¥ng ce. nay ƒÉn ƒë∆∞·ª£c pnj em vui qu√° ...,2021-09-30,1.0,['PNJ']
56,3117230,gi√° kh√≠ ƒë·ªët l·∫°i tƒÉng d·ª±ng ƒë·ª©ng mai l·∫°i tr·∫ßn ti...,2021-09-30,1.0,['ASP']


## Data Preparation

In [93]:
from transformers import AutoTokenizer 
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader


In [94]:
import torch
class SentimentDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=100, mode="train",test_size=0.2, random_state=42):
        # keep only necessary columns
        df = df[['originalContent', 'sentiment']]
        
        # train/test split
        train_df, test_df = train_test_split(
            df, 
            test_size=test_size,
            stratify=df['sentiment'],
            random_state=random_state
        )
        
        self.df = train_df if mode == "train" else test_df 
        
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        row = self.df.iloc[index]
        text = row['originalContent']
        label = 1 if row['sentiment'] == 1 else 0
        
        tokens = self.tokenizer(
            text, 
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors='pt'
        )
        
        return {
            "input_ids": tokens['input_ids'].squeeze(0),
            "attention_mask": tokens['attention_mask'].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
            }

In [95]:
from torch.utils.data import WeightedRandomSampler
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained("5CD-AI/Vietnamese-Sentiment-visobert")

# Create dataset
train_dataset = SentimentDataset(sentiment_df, tokenizer, mode='train')
test_dataset = SentimentDataset(sentiment_df, tokenizer, mode='test')

# compute class weights
labels = train_dataset.df['sentiment'].map({-1:0, 1:1}).values
class_counts = [sum(labels==0), sum(labels==1)] # [neg_count, pos_count]
class_weights = [1.0 / count for count in class_counts]

# assign weight to each sample
sample_weights = [class_weights[label] for label in labels]

# create WeightedRandomSampler
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True # allow sampling the same example multiple times
)

# create DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, sampler=sampler)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# check a batch
batch = next(iter(train_loader))
print(batch['input_ids'].shape)  # (batch_size, seq_len)
print(batch['labels'])

torch.Size([32, 100])
tensor([0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
        0, 0, 1, 0, 1, 1, 0, 0])


In [96]:
# check a batch
batch = next(iter(train_loader))
print(batch['input_ids'].shape)  # (batch_size, seq_len)
print(batch['labels'])
print(batch['labels'].bincount()) 

torch.Size([32, 100])
tensor([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0,
        0, 1, 1, 0, 0, 0, 0, 1])
tensor([14, 18])


## Load pretrained model

In [97]:
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers.models.xlm_roberta.modeling_xlm_roberta import XLMRobertaClassificationHead
from transformers import AutoModelForSequenceClassification, AutoConfig
model_name = "5CD-AI/Vietnamese-Sentiment-visobert"

tokenizer = AutoTokenizer.from_pretrained(model_name)


# 1. Load config
config = AutoConfig.from_pretrained("5CD-AI/Vietnamese-Sentiment-visobert")
config.num_labels = 2  # important

# 2. Load model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config,
    ignore_mismatched_sizes=True   # avoid shape errors
)



Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at 5CD-AI/Vietnamese-Sentiment-visobert and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [98]:
def freeze_model_layers(model, unfreeze_last_n=2):
    """
    Freeze all but the last `n` transformer layers of the encoder,
    plus keep the classifier trainable.
    """
    # 1. Freeze embeddings
    for param in model.roberta.embeddings.parameters():
        param.requires_grad = False

    # 2. Freeze all encoder layers except the last `unfreeze_last_n`
    for i, layer in enumerate(model.roberta.encoder.layer):
        if i < len(model.roberta.encoder.layer) - unfreeze_last_n:
            for param in layer.parameters():
                param.requires_grad = False
        else:
            for param in layer.parameters():
                param.requires_grad = True

    # 3. Always train classifier head
    for param in model.classifier.parameters():
        param.requires_grad = True

    # Print summary
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Trainable params: {trainable_params/1e6:.2f}M / {total_params/1e6:.2f}M total "
          f"({100 * trainable_params/total_params:.1f}%)")

    return model


In [99]:
freeze_model_layers(model)

Trainable params: 14.77M / 97.57M total (15.1%)


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(15004, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=7

## Training and Evaluating Functions

In [100]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.amp import autocast, GradScaler
from torch.nn.utils import clip_grad_norm_
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Evaluation function
def evaluate_model(model, data_loader, device='cuda'):
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            with autocast('cuda'):  # Add this for mixed precision
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits 
            
            batch_preds = torch.argmax(logits, dim=1)
            preds.extend(batch_preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
            
    acc = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds)
    precision = precision_score(true_labels, preds)
    recall = recall_score(true_labels, preds)

    print(f"Eval | Acc: {acc:.4f} | F1: {f1:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}")
    return acc, f1, precision, recall

# Training function
def train_model(model, train_loader, val_loader=None,
                epochs=3, lr=2e-5,weight_decay=0.01, 
                warmup_ratio=0.1, max_grad_norm=1.0, device='cuda'):
    
    model.to(device)
    
    # optimizer + weight decay
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    
    # LR Scheduler with warmup
    total_steps = len(train_loader) * epochs
    warmup_steps = int(total_steps * warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    scaler = GradScaler('cuda')  # mixed precision scaler
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            with autocast('cuda'):
                outputs = model(input_ids=input_ids,
                                attention_mask=attention_mask,
                                labels=labels)
                loss = outputs.loss

            scaler.scale(loss).backward()

            # Gradient clipping
            scaler.unscale_(optimizer)
            clip_grad_norm_(model.parameters(), max_grad_norm)

            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} | Train loss: {avg_loss:.4f}")

        # Evaluate after each epoch
        if val_loader is not None:
            evaluate_model(model, val_loader, device=device)

    print("Training complete!")


In [None]:
train_model(model, train_loader, val_loader=test_loader, epochs=5)

Training Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 732/732 [20:32<00:00,  1.68s/it]


Epoch 1 | Train loss: 0.5516


                                                             

Eval | Acc: 0.7789 | F1: 0.8518 | Precision: 0.9086 | Recall: 0.8017


Training Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 732/732 [16:06<00:00,  1.32s/it]


Epoch 2 | Train loss: 0.4712


                                                             

Eval | Acc: 0.7567 | F1: 0.8303 | Precision: 0.9288 | Recall: 0.7506


Training Epoch 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 732/732 [15:07<00:00,  1.24s/it]


Epoch 3 | Train loss: 0.4277


                                                             

Eval | Acc: 0.7497 | F1: 0.8231 | Precision: 0.9360 | Recall: 0.7345


Training Epoch 4:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 674/732 [13:12<01:34,  1.63s/it]