In [1]:
# Cài thư viện cần (dựa trên code trong repo)
!pip install transformers torch datasets accelerate



In [2]:
# Kiểm tra GPU có sẵn không
import torch
print("GPU available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))

# Cài lại các thư viện cần (nếu chưa chạy trước đó)
!pip install transformers torch datasets accelerate pandas scikit-learn tqdm


GPU available: True
GPU name: Tesla P100-PCIE-16GB


In [3]:
# Đường dẫn đến file trong folder WevMining (thay tên file nếu khác)
file_path = "/kaggle/input/dataset/dataset (1).csv"  # hoặc .parquet nếu là Parquet

import pandas as pd
import re

# Load file
df = pd.read_csv(file_path)

print("Cột gốc trong file:", df.columns.tolist())

# Rename cột
df = df.rename(columns={
    'text': 'body',
    'A': 'agr',
    'O': 'ope',
    'C': 'con',
    'E': 'ext',
    'N': 'neu'
})

# LIGHT CLEAN - chỉ xóa URL và normalize space, giữ emoji, punctuation, caps
def light_clean(text):
    if not isinstance(text, str):
        text = str(text)
    # Thay URL bằng placeholder (RoBERTa-large hiểu tốt [URL])
    text = re.sub(r'http\S+|www\S+', '[URL]', text)
    # Normalize space, loại bỏ khoảng trắng thừa
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['body'] = df['body'].apply(light_clean)

# Sample nhỏ để test nhanh (tăng lên 0.1 hoặc full khi ổn)
df = df.sample(frac=0.1, random_state=42)  # ~34k mẫu

# Normalize labels về 0-1
df[['ext', 'neu', 'agr', 'con', 'ope']] = df[['ext', 'neu', 'agr', 'con', 'ope']]/100

print("\nCột sau khi rename & light clean:", df.columns.tolist())
print("\nSample 3 dòng đầu (giữ nguyên emoji/punctuation):")
print(df.head(3))

Cột gốc trong file: ['ope', 'con', 'ext', 'agr', 'neu', 'ptype', 'body', '__index_level_0__']

Cột sau khi rename & light clean: ['ope', 'con', 'ext', 'agr', 'neu', 'ptype', 'body', '__index_level_0__']

Sample 3 dòng đầu (giữ nguyên emoji/punctuation):
          ope   con   ext   agr   neu  ptype  \
258181   0.02  0.38  0.87  0.28  0.01      4   
768973   0.81  0.80  0.30  0.24  0.75     25   
1732080  0.29  0.40  0.06  0.07  0.98      1   

                                                      body  __index_level_0__  
258181   People who know they're close to zero balance ...            1386091  
768973                        Aww You are most welcome. =]            2044374  
1732080            Oh, didn't know it blocked her ult. TIL            1512060  


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from torch.optim import AdamW
from tqdm import tqdm
import os
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# max_length hợp lý cho RoBERTa-large
max_length = 96

def tokenize_text(text):
    return tokenizer.encode_plus(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors=None
    )

df['body'] = df['body'].apply(tokenize_text)

print("Tokenization hoàn tất. Shape:", df.shape)

2026-01-26 10:59:53.567729: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769425193.758682      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769425193.813588      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769425194.265248      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769425194.265304      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769425194.265307      55 computation_placer.cc:177] computation placer alr

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Tokenization hoàn tất. Shape: (192420, 8)


In [5]:
# Split train/val/test
df_train, df_temp = train_test_split(df, test_size=0.3, random_state=42)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)

# Custom Dataset
class PersonalityDataset(Dataset):
    def __init__(self, tokenized_texts, labels):
        self.tokenized_texts = tokenized_texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = self.tokenized_texts[idx]
        return {
            'input_ids': torch.tensor(item['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(item['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(self.labels[idx], dtype=torch.float)
        }

# Tham số batch
batch_size = 16
accumulation_steps = 4

train_dataset = PersonalityDataset(df_train['body'].tolist(), df_train[['ext', 'neu', 'agr', 'con', 'ope']].values)
val_dataset   = PersonalityDataset(df_val['body'].tolist(),   df_val[['ext', 'neu', 'agr', 'con', 'ope']].values)
test_dataset  = PersonalityDataset(df_test['body'].tolist(),  df_test[['ext', 'neu', 'agr', 'con', 'ope']].values)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False, num_workers=2)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False, num_workers=2)

print(f"Dataset sizes → Train: {len(train_dataset)} | Val: {len(val_dataset)} | Test: {len(test_dataset)}")

Dataset sizes → Train: 134694 | Val: 28863 | Test: 28863


In [6]:
from transformers import RobertaModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
import torch

class RoBERTaForPersonalityTraits(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-large')
       
        # Freeze bottom layers
        for param in self.roberta.embeddings.parameters():
            param.requires_grad = False
        for i in range(12):
            for param in self.roberta.encoder.layer[i].parameters():
                param.requires_grad = False
       
        self.dropout = torch.nn.Dropout(0.1)
        self.head = torch.nn.Sequential(
            torch.nn.Linear(1024, 512),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(512, 5)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.pooler_output
        output = self.dropout(pooled)
        return self.head(output)

model = RoBERTaForPersonalityTraits()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=1e-5,
    weight_decay=0.01
)

loss_fn = torch.nn.MSELoss()

# Thêm scheduler
epochs = 12  # Đồng bộ với Cell 7
total_steps = len(train_loader) * epochs
warmup_steps = int(0.1 * total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

print("Model loaded. Trainable params:", sum(p.numel() for p in model.parameters() if p.requires_grad))
print(f"Scheduler ready → Total steps: {total_steps:,} | Warmup: {warmup_steps:,}")

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded. Trainable params: 152731653
Scheduler ready → Total steps: 101,028 | Warmup: 10,102


In [7]:
# ────────────────────────────────────────────────────────────────
# Cell 7: Training Loop với Mixed Precision (FP16) + Early Stopping
# ────────────────────────────────────────────────────────────────

from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
import torch

# Tham số training
epochs = 12               # Số epochs (có thể tăng nếu cần)
patience = 5              # Số epochs chờ nếu val loss không cải thiện
best_val_loss = float('inf')
patience_counter = 0

scaler = GradScaler()     # Mixed precision scaler

print(f"Bắt đầu training với {epochs} epochs, batch size hiệu quả = {batch_size * accumulation_steps}")

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    optimizer.zero_grad()
    
    loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch+1}/{epochs}")
    for i, batch in enumerate(loop):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)
        
        with autocast():
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, targets) / accumulation_steps
        
        scaler.scale(loss).backward()
        
        if (i + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            # Nếu bạn đã thêm scheduler ở Cell 6, gọi ở đây:
            scheduler.step()   # Bỏ comment nếu có scheduler
            
        total_train_loss += loss.item() * accumulation_steps
        
        # Cập nhật progress bar
        loop.set_postfix({'batch_loss': loss.item() * accumulation_steps})
    
    avg_train_loss = total_train_loss / len(train_loader)
    
    # ────────────────────────────────────────────────────────────────
    # Validation
    # ────────────────────────────────────────────────────────────────
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)
            
            with autocast():
                outputs = model(input_ids, attention_mask)
                loss = loss_fn(outputs, targets)
            
            total_val_loss += loss.item()
    
    avg_val_loss = total_val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{epochs} → Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
    
    # Early stopping & save best model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_roberta.pth')
        print("→ Saved best model (val loss improved)")
    else:
        patience_counter += 1
        print(f"Val loss không cải thiện ({patience_counter}/{patience})")
        if patience_counter >= patience:
            print("Early stopping triggered!")
            break

print("\nTraining hoàn tất.")
print(f"Best validation loss: {best_val_loss:.4f}")

  scaler = GradScaler()     # Mixed precision scaler


Bắt đầu training với 12 epochs, batch size hiệu quả = 64


  with autocast():
Epoch 1/12: 100%|██████████| 8419/8419 [42:43<00:00,  3.28it/s, batch_loss=0.081] 
  with autocast():


Epoch 1/12 → Train Loss: 0.1194 | Val Loss: 0.0878
→ Saved best model (val loss improved)


Epoch 2/12: 100%|██████████| 8419/8419 [42:38<00:00,  3.29it/s, batch_loss=0.112] 


Epoch 2/12 → Train Loss: 0.0877 | Val Loss: 0.0843
→ Saved best model (val loss improved)


Epoch 3/12: 100%|██████████| 8419/8419 [42:38<00:00,  3.29it/s, batch_loss=0.11]  


Epoch 3/12 → Train Loss: 0.0849 | Val Loss: 0.0824
→ Saved best model (val loss improved)


Epoch 4/12: 100%|██████████| 8419/8419 [42:27<00:00,  3.31it/s, batch_loss=0.0785]


Epoch 4/12 → Train Loss: 0.0823 | Val Loss: 0.0810
→ Saved best model (val loss improved)


Epoch 5/12: 100%|██████████| 8419/8419 [42:33<00:00,  3.30it/s, batch_loss=0.0755]


Epoch 5/12 → Train Loss: 0.0797 | Val Loss: 0.0806
→ Saved best model (val loss improved)


Epoch 6/12: 100%|██████████| 8419/8419 [42:34<00:00,  3.30it/s, batch_loss=0.121] 


Epoch 6/12 → Train Loss: 0.0766 | Val Loss: 0.0797
→ Saved best model (val loss improved)


Epoch 7/12: 100%|██████████| 8419/8419 [42:34<00:00,  3.30it/s, batch_loss=0.0612]


Epoch 7/12 → Train Loss: 0.0730 | Val Loss: 0.0797
→ Saved best model (val loss improved)


Epoch 8/12: 100%|██████████| 8419/8419 [42:29<00:00,  3.30it/s, batch_loss=0.0547]


Epoch 8/12 → Train Loss: 0.0691 | Val Loss: 0.0810
Val loss không cải thiện (1/5)


Epoch 9/12: 100%|██████████| 8419/8419 [42:25<00:00,  3.31it/s, batch_loss=0.0651]


Epoch 9/12 → Train Loss: 0.0651 | Val Loss: 0.0812
Val loss không cải thiện (2/5)


Epoch 10/12: 100%|██████████| 8419/8419 [42:26<00:00,  3.31it/s, batch_loss=0.0432]


Epoch 10/12 → Train Loss: 0.0613 | Val Loss: 0.0831
Val loss không cải thiện (3/5)


Epoch 11/12:  26%|██▌       | 2159/8419 [10:53<31:34,  3.30it/s, batch_loss=0.053] 


KeyboardInterrupt: 

In [8]:
model.load_state_dict(torch.load('best_roberta.pth'))
model.eval()

all_preds, all_targets = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].cpu().numpy()
        
        outputs = model(input_ids, attention_mask).cpu().numpy()
        all_preds.append(outputs)
        all_targets.append(targets)

preds = np.concatenate(all_preds)
targets = np.concatenate(all_targets)

traits = ['Ext', 'Neu', 'Agr', 'Con', 'Ope']
for i, trait in enumerate(traits):
    rmse = np.sqrt(mean_squared_error(targets[:, i], preds[:, i]))
    mae = mean_absolute_error(targets[:, i], preds[:, i])
    r2 = r2_score(targets[:, i], preds[:, i])
    print(f"{trait}: RMSE = {rmse:.4f} | MAE = {mae:.4f} | R² = {r2:.4f}")

overall_rmse = np.sqrt(mean_squared_error(targets, preds))
overall_r2 = r2_score(targets, preds, multioutput='uniform_average')
print(f"\nTổng thể: RMSE = {overall_rmse:.4f} | R² = {overall_r2:.4f}")

Ext: RMSE = 0.2815 | MAE = 0.2284 | R² = 0.0941
Neu: RMSE = 0.2958 | MAE = 0.2454 | R² = 0.0933
Agr: RMSE = 0.2801 | MAE = 0.2281 | R² = 0.1306
Con: RMSE = 0.2512 | MAE = 0.2027 | R² = 0.1003
Ope: RMSE = 0.2965 | MAE = 0.2487 | R² = 0.0950

Tổng thể: RMSE = 0.2815 | R² = 0.1027
