## Import necessary libraries

In [None]:
import pandas as pd
import numpy as np


## Load data

In [None]:
# Read at most the first one million rows of the cleaned posts export.
df = pd.read_csv(
    'fireant_data/cleaned_posts/all_posts.csv',
    nrows=1_000_000,
)
df.head()

## Keep posts with negative and positive sentiment only

In [None]:
# Keep only rows labelled negative (-1) or positive (1).
# `.copy()` makes the result an independent frame, so later column
# assignments on it do not raise SettingWithCopyWarning.
sentiment_df = df[df['sentiment'].isin([-1, 1])].copy()
sentiment_df.head()

## Data Cleaning

In [None]:
import re

def remove_links(text):
    """Strip URL-like tokens from ``text`` and trim surrounding whitespace.

    Three patterns are removed, in this order:

    1. anything starting with ``http`` up to the next whitespace
       (covers ``http://`` and ``https://``),
    2. tokens starting with ``www.``,
    3. any whitespace-delimited token that contains ``.com``.

    Note that other top-level domains (``.net``, ``.vn``, ...) are only
    removed when they are part of a match for rule 1 or 2.

    Args:
        text: The string to clean.

    Returns:
        The text without the matched tokens, with leading/trailing
        whitespace stripped. Inner whitespace is left untouched.
    """
    url_patterns = (
        r"http\S+",       # http:// or https:// links
        r"www\.\S+",      # bare www. addresses
        r"\S+\.com\S*",   # any token containing ".com"
    )
    for pattern in url_patterns:
        text = re.sub(pattern, "", text)
    return text.strip()

def clean_text(text):
    """Normalise a raw post into lowercase text ready for tokenisation.

    Steps, in order:

    1. convert to ``str`` and normalise to Unicode NFC,
    2. lowercase,
    3. replace runs of newlines with ``". "``,
    4. remove links via ``remove_links``,
    5. remove ``@mentions`` and ``#hashtags``,
    6. replace every run of characters outside the whitelist (digits,
       ASCII letters, Vietnamese letters, whitespace and ``! ? , .``)
       with a single space,
    7. collapse whitespace and strip.

    Args:
        text: The raw post content. Non-string values are converted with
            ``str()`` (so a missing value such as NaN becomes ``"nan"``).

    Returns:
        The cleaned string (possibly empty).
    """
    # Local import so the function does not depend on a notebook-level import.
    import unicodedata

    # The whitelist below only contains *precomposed* Vietnamese letters.
    # Text that arrives in decomposed form (base letter + combining tone
    # mark) would otherwise have its diacritics stripped, so compose first.
    text = unicodedata.normalize("NFC", str(text))
    text = text.lower()                               # lowercase
    text = re.sub(r"\n+", ". ", text)                # replace new line with period
    text = remove_links(text)                         # remove links
    text = re.sub(r"@\w+", "", text)                  # remove mentions (@abc)
    text = re.sub(r"#\w+", "", text)                  # remove hashtags
    text = re.sub(
        r"[^0-9a-zA-Záàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệ"
        r"íìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữự"
        r"ýỳỷỹỵđ\s!?,.]+",
        " ",
        text
    )            # keep only letters, numbers and some punctuation
    text = re.sub(r"\s+", " ", text).strip()          # remove extra spaces
    return text


### Clean the post content (links, mentions, hashtags, special characters)

In [None]:
# Build a new frame with the cleaned text instead of assigning a column on a
# frame that was produced by slicing `df` (avoids SettingWithCopyWarning).
sentiment_df = sentiment_df.assign(
    originalContent=sentiment_df['originalContent'].apply(clean_text)
)
sentiment_df.head()


## Data Preparation

In [None]:
from transformers import AutoTokenizer 
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader


In [None]:
import torch
class SentimentDataset(Dataset):
    """Torch dataset serving tokenised posts with binary sentiment labels.

    On construction the frame is reduced to the ``originalContent`` and
    ``sentiment`` columns and split with a stratified ``train_test_split``.
    The instance keeps the train part when ``mode == "train"`` and the test
    part for any other ``mode`` value.

    Args:
        df: Frame containing ``originalContent`` and ``sentiment`` columns.
        tokenizer: Callable tokenizer invoked once per item.
        max_len: Length every item is padded/truncated to.
        mode: ``"train"`` selects the train split, anything else the test split.
        test_size: Fraction (or count) forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``; two instances
            built with the same seed and data see the same partition.
    """

    def __init__(self, df, tokenizer, max_len=100, mode="train",test_size=0.2, random_state=42):
        # Only the text and label columns are needed downstream.
        frame = df[['originalContent', 'sentiment']]

        # Stratified split so both parts keep the label proportions.
        train_part, test_part = train_test_split(
            frame,
            test_size=test_size,
            stratify=frame['sentiment'],
            random_state=random_state
        )

        if mode == "train":
            self.df = train_part
        else:
            self.df = test_part

        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the selected split."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenise the row at position ``index``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors (the
            leading batch dimension added by ``return_tensors='pt'`` is
            squeezed away) and a scalar ``labels`` tensor: 1 when the row's
            sentiment equals 1, otherwise 0.
        """
        record = self.df.iloc[index]

        encoded = self.tokenizer(
            record['originalContent'],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors='pt'
        )

        is_positive = record['sentiment'] == 1

        return {
            "input_ids": encoded['input_ids'].squeeze(0),
            "attention_mask": encoded['attention_mask'].squeeze(0),
            "labels": torch.tensor(int(is_positive), dtype=torch.long),
        }

In [None]:
from torch.utils.data import WeightedRandomSampler
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained("5CD-AI/Vietnamese-Sentiment-visobert")

# Create dataset (both instances use the same default random_state,
# so they select the two halves of the same split)
train_dataset = SentimentDataset(sentiment_df, tokenizer, mode='train')
test_dataset = SentimentDataset(sentiment_df, tokenizer, mode='test')

# compute class weights (inverse class frequency), vectorised with NumPy
labels = train_dataset.df['sentiment'].map({-1: 0, 1: 1}).to_numpy().astype(int)
class_counts = np.bincount(labels, minlength=2)  # [neg_count, pos_count]
if (class_counts == 0).any():
    # An empty class would yield an infinite weight and break the sampler.
    raise ValueError(f"Every class needs at least one training sample, got counts {class_counts.tolist()}")
class_weights = 1.0 / class_counts

# assign weight to each sample by indexing the per-class weights with the labels
sample_weights = class_weights[labels]

# create WeightedRandomSampler
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True # allow sampling the same example multiple times
)

# create DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, sampler=sampler)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# check a batch
batch = next(iter(train_loader))
print(batch['input_ids'].shape)  # (batch_size, seq_len)
print(batch['labels'])

In [None]:
# Draw one batch from the weighted loader and show how many samples of
# each class it contains.
batch = next(iter(train_loader))
batch_labels = batch['labels']
print(batch['input_ids'].shape)  # (batch_size, seq_len)
print(batch_labels)
print(torch.bincount(batch_labels))

## Load pretrained model

In [None]:
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers.models.xlm_roberta.modeling_xlm_roberta import XLMRobertaClassificationHead
from transformers import AutoModelForSequenceClassification, AutoConfig
model_name = "5CD-AI/Vietnamese-Sentiment-visobert"

# Tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Start from the checkpoint's own config and switch it to two output labels.
config = AutoConfig.from_pretrained(model_name)
config.num_labels = 2  # important

# Load the weights under the modified config. `ignore_mismatched_sizes`
# avoids shape errors for checkpoint tensors whose size differs from what
# this config expects.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, config=config, ignore_mismatched_sizes=True
)


In [None]:
def freeze_model_layers(model, unfreeze_last_n=4):
    """Restrict training to the top encoder layers and the classifier head.

    The embeddings under ``model.roberta.embeddings`` are frozen, every
    encoder layer except the final ``unfreeze_last_n`` is frozen, the final
    ``unfreeze_last_n`` layers and ``model.classifier`` are made trainable.
    A one-line summary of trainable vs. total parameters is printed.

    Args:
        model: Module exposing ``roberta.embeddings``,
            ``roberta.encoder.layer`` and ``classifier``.
        unfreeze_last_n: Number of encoder layers, counted from the end,
            that stay trainable.

    Returns:
        The same ``model`` object, modified in place.
    """
    encoder_layers = model.roberta.encoder.layer
    first_trainable = len(encoder_layers) - unfreeze_last_n

    # Embeddings never train.
    model.roberta.embeddings.requires_grad_(False)

    # Layers at or beyond `first_trainable` train, the ones below are frozen.
    for position, block in enumerate(encoder_layers):
        block.requires_grad_(position >= first_trainable)

    # The classification head always trains.
    model.classifier.requires_grad_(True)

    # Report how much of the model is still trainable.
    n_total = 0
    n_trainable = 0
    for weight in model.parameters():
        n_total += weight.numel()
        if weight.requires_grad:
            n_trainable += weight.numel()
    print(f"Trainable params: {n_trainable/1e6:.2f}M / {n_total/1e6:.2f}M total "
          f"({100 * n_trainable/n_total:.1f}%)")

    return model


In [None]:
# Assign the result (the same model object) so the cell shows only the
# printed parameter summary instead of echoing the full model repr.
model = freeze_model_layers(model)

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.amp import autocast, GradScaler
from torch.nn.utils import clip_grad_norm_
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Evaluation function
def evaluate_model(model, data_loader, device='cuda'):
    """Run the model over ``data_loader`` and report classification metrics.

    The model is switched to eval mode and gradients are disabled. Mixed
    precision is used only when ``device`` is a CUDA device; on any other
    device the forward pass runs in full precision.

    Args:
        model: Model whose output exposes ``.logits``.
        data_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device: Device (string or ``torch.device``) the batches are moved to.
            The model is expected to already live on that device.

    Returns:
        Tuple ``(accuracy, f1, precision, recall)`` computed with the
        scikit-learn metric functions at their default settings.
    """
    # Enable autocast only for CUDA; requesting CUDA autocast on another
    # device would just emit a warning on every batch.
    use_amp = torch.device(device).type == 'cuda'

    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            with autocast('cuda', enabled=use_amp):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            batch_preds = torch.argmax(logits, dim=1)
            preds.extend(batch_preds.cpu().tolist())
            # Labels are only compared on the host, so they never need to
            # be moved to the device.
            true_labels.extend(batch['labels'].tolist())

    acc = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds)
    precision = precision_score(true_labels, preds)
    recall = recall_score(true_labels, preds)

    print(f"Eval | Acc: {acc:.4f} | F1: {f1:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}")
    return acc, f1, precision, recall

def train_model(model, train_loader, val_loader=None,
                epochs=3, lr=2e-5, weight_decay=0.01,
                warmup_ratio=0.1, max_grad_norm=1.0, device='cuda',
                save_dir="./checkpoints", save_every_n_epochs=2,
                tokenizer=None):
    """Fine-tune ``model`` with AdamW, linear warmup/decay and gradient clipping.

    Args:
        model: HuggingFace-style model returning ``.loss`` when called with
            ``labels`` and supporting ``save_pretrained``.
        train_loader: Loader yielding dicts with ``input_ids``,
            ``attention_mask`` and ``labels``.
        val_loader: Optional loader evaluated with ``evaluate_model`` after
            every epoch.
        epochs: Number of passes over ``train_loader``.
        lr: Peak learning rate.
        weight_decay: AdamW weight decay.
        warmup_ratio: Fraction of all steps used for linear warmup.
        max_grad_norm: Gradient-norm clipping threshold.
        device: Device (string or ``torch.device``) to train on. Mixed
            precision and loss scaling are enabled only for CUDA devices.
        save_dir: Directory receiving periodic and final checkpoints.
        save_every_n_epochs: Checkpoint interval in epochs; a falsy value
            disables periodic checkpoints (the final model is still saved).
        tokenizer: Tokenizer saved next to each checkpoint. When omitted,
            a module-level variable named ``tokenizer`` is used if one
            exists; otherwise only the model is saved.
    """
    # Earlier versions silently relied on a notebook-global `tokenizer`;
    # keep that behaviour as a fallback so existing calls still work.
    if tokenizer is None:
        tokenizer = globals().get('tokenizer')

    def _save_checkpoint(path):
        # Save the model and, when available, the tokenizer to `path`.
        model.save_pretrained(path)
        if tokenizer is not None:
            tokenizer.save_pretrained(path)

    os.makedirs(save_dir, exist_ok=True)
    model.to(device)

    # AMP and loss scaling are CUDA features here; disable both elsewhere.
    use_amp = torch.device(device).type == 'cuda'

    # Frozen parameters never receive gradients, so leave them out.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.AdamW(trainable_params, lr=lr, weight_decay=weight_decay)

    total_steps = len(train_loader) * epochs
    warmup_steps = int(total_steps * warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    scaler = GradScaler(enabled=use_amp)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            with autocast(device_type='cuda', enabled=use_amp):
                outputs = model(input_ids=input_ids,
                                attention_mask=attention_mask,
                                labels=labels)
                loss = outputs.loss

            scaler.scale(loss).backward()
            # Unscale before clipping so the threshold applies to true gradients.
            scaler.unscale_(optimizer)
            clip_grad_norm_(trainable_params, max_grad_norm)

            # The scaler skips optimizer.step() when it finds inf/NaN
            # gradients and then lowers its scale. Only advance the LR
            # schedule when the optimizer actually stepped.
            scale_before = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            if scaler.get_scale() >= scale_before:
                scheduler.step()

            total_loss += loss.item()

        avg_loss = total_loss / max(len(train_loader), 1)
        print(f"Epoch {epoch+1} | Train loss: {avg_loss:.4f}")

        # Evaluate
        if val_loader is not None:
            evaluate_model(model, val_loader, device=device)

        # === SAVE EVERY `save_every_n_epochs` EPOCHS ===
        if save_every_n_epochs and (epoch + 1) % save_every_n_epochs == 0:
            save_path = os.path.join(save_dir, f"model_epoch_{epoch+1}")
            _save_checkpoint(save_path)
            print(f"Checkpoint saved: {save_path}")

    # Save final model
    final_path = os.path.join(save_dir, "model_final")
    _save_checkpoint(final_path)
    print(f"Final model saved: {final_path}")

    print("Training complete!")

In [None]:
# Train for 10 epochs; the held-out test split doubles as the validation set.
train_model(
    model,
    train_loader,
    val_loader=test_loader,
    epochs=10,
)

## Apply the model to neutral-sentiment posts

In [None]:
import torch
from torch.amp import autocast
from tqdm import tqdm
import pandas as pd

@torch.no_grad()
def analyze_zero_sentiment_posts(
    df,
    model,
    tokenizer,
    text_column='originalContent',
    sentiment_column='sentiment',
    batch_size=32,
    device='cuda',
    label_map={0: -1, 1: 1},  # 0 in logits → negative, 1 → positive
    max_length=256,
):
    """
    Predict sentiment for rows where `sentiment == 0` (unknown/neutral).

    The input frame is never modified; a copy with one extra column is
    returned. Texts are tokenised and moved to the device one batch at a
    time, so memory use does not grow with the number of posts.

    Args:
        df (pd.DataFrame): Input dataframe
        model: Trained HuggingFace model (moved to `device` by this function)
        tokenizer: Corresponding tokenizer
        text_column: Name of column with cleaned text
        sentiment_column: Name of column with labels (0, 1, -1)
        batch_size: Inference batch size
        device: 'cuda', 'cpu', or a `torch.device`
        label_map: How to map logit index → actual label
        max_length: Maximum token length; longer texts are truncated

    Returns:
        df_with_pred: Copy of `df` with a `predicted_sentiment` column that
        holds the mapped prediction for rows whose sentiment is 0 and NaN
        for every other row (all NaN when no such row exists).
    """
    # Positional boolean mask: independent of the index, so duplicate or
    # non-default indices cannot misalign predictions.
    zero_mask = (df[sentiment_column] == 0).fillna(False).to_numpy(dtype=bool)

    # Always work on a copy so the caller's frame is left untouched.
    df_with_pred = df.copy()

    if not zero_mask.any():
        print("No rows with sentiment == 0 found.")
        df_with_pred['predicted_sentiment'] = float('nan')
        return df_with_pred

    texts = df.loc[zero_mask, text_column].astype(str).tolist()
    print(f"Analyzing {len(texts)} posts with sentiment == 0...")

    model.eval()
    model.to(device)

    # Same autocast choice as before for 'cuda'/'cpu' strings, but also
    # valid when `device` is a torch.device object.
    amp_device = 'cuda' if torch.device(device).type == 'cuda' else 'cpu'

    # Batch inference: tokenise and transfer one batch at a time.
    predictions = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Predicting", leave=False):
        encodings = tokenizer(
            texts[start:start + batch_size],
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors='pt'
        )
        with autocast(device_type=amp_device):
            outputs = model(
                input_ids=encodings['input_ids'].to(device),
                attention_mask=encodings['attention_mask'].to(device),
            )
        predictions.extend(torch.argmax(outputs.logits, dim=-1).tolist())

    # Map predictions: 0 → -1, 1 → 1
    pred_labels = [label_map[p] for p in predictions]

    # Place each prediction at the position of its source row; every other
    # position becomes NaN through the reindex.
    positions = zero_mask.nonzero()[0]
    full_column = pd.Series(pred_labels, index=positions).reindex(range(len(df)))
    df_with_pred['predicted_sentiment'] = full_column.to_numpy()

    # Summary
    pos = sum(p == 1 for p in pred_labels)
    neg = sum(p == -1 for p in pred_labels)
    print(f"\nResults for sentiment == 0 posts:")
    print(f"  Positive: {pos} ({pos/len(pred_labels)*100:.1f}%)")
    print(f"  Negative: {neg} ({neg/len(pred_labels)*100:.1f}%)")

    return df_with_pred

In [None]:
# Clean the text of every post (all sentiment values) on a fresh frame,
# leaving `df` itself untouched.
new_df = df.assign(originalContent=df['originalContent'].apply(clean_text))
new_df.head()

In [None]:
# Predict a polarity for the posts labelled 0, using the cleaned frame,
# the fine-tuned model and the tokenizer that belongs to it.
new_df = analyze_zero_sentiment_posts(
    new_df,
    model,
    tokenizer,
    text_column='originalContent',
    sentiment_column='sentiment',
    batch_size=32,
    device='cuda',
)

new_df.head()

In [None]:
# Write the posts together with their predicted sentiment; the index is not written.
new_df.to_csv(path_or_buf='posts_sentiment.csv', index=False)