text,label
"Almost done! just reviewing :)",2
"That is unacceptable, fix it now!",0
"Ok",1
...


In [None]:
# Full advanced code (BERT fine-tune with custom loss option & metrics)
"""
Advanced BERT fine-tuning for WhatsApp sentiment classification.
Supports:
 - Hugging Face Trainer-based training
 - Weighted loss or Focal Loss for class imbalance
 - Stratified train/val split
 - Evaluation metrics (precision, recall, f1)
 - Inference helper
"""

import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
import torch
from torch import nn
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from datasets import Dataset, DatasetDict, load_metric

# -----------------------
# Config
# -----------------------
# Checkpoint to fine-tune and where the result is written.
# MODEL_NAME can be swapped for "bert-base-uncased" or "roberta-base".
MODEL_NAME = "distilbert-base-uncased"
OUTPUT_DIR = "./whatsapp_bert_finetuned"

# Task shape: three sentiment classes (negative, neutral, positive) and the
# maximum number of tokens kept per message.
NUM_LABELS = 3
MAX_LEN = 128

# Optimisation hyper-parameters.
EPOCHS = 4
BATCH_SIZE = 16
LR = 2e-5
SEED = 42

# Set to True to train with focal loss (helpful for class imbalance).
USE_FOCAL_LOSS = False

# Prefer the GPU whenever PyTorch can see one.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# -----------------------
# Utilities
# -----------------------
def seed_everything(seed=SEED):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator. Defaults to SEED.

    The CUDA generators are seeded as well, but only when a CUDA device is
    available.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# Seed once at import time so everything below is reproducible.
seed_everything()

# -----------------------
# Load CSV and prep Dataset
# -----------------------
df = pd.read_csv("whatsapp_sentiment.csv")  # expects columns text,label

# Drop incomplete rows BEFORE the string cast below: astype(str) would turn a
# missing text into the literal string "nan", and a NaN label would break the
# stratified split.
df = df.dropna(subset=["text", "label"])

# Labels must be integer class ids for cross-entropy; pandas reads a column
# that contained NaN as float, so cast back explicitly.
df["label"] = df["label"].astype(int)

# Quick cleaning (expand as needed): collapse whitespace runs to a single
# space and trim both ends.
df["text"] = df["text"].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()

# Stratify on the label so both splits keep the same class proportions.
train_df, val_df = train_test_split(df, test_size=0.12, stratify=df["label"], random_state=SEED)

# Reset the index so the shuffled pandas index is not carried into the datasets.
hf_train = Dataset.from_pandas(train_df.reset_index(drop=True))
hf_val = Dataset.from_pandas(val_df.reset_index(drop=True))

dataset = DatasetDict({"train": hf_train, "validation": hf_val})

# -----------------------
# Tokenizer & Data Collator
# -----------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)


def preprocess(batch):
    """Tokenize the "text" column of a batch, truncating to MAX_LEN tokens.

    No padding is applied here; the data collator created below pads each
    batch at collation time.
    """
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LEN)


# Remove only the non-model columns that are actually present. After
# reset_index(drop=True) the frames carry a plain RangeIndex, which
# Dataset.from_pandas does not materialise as "__index_level_0__"; asking
# map() to remove a column that does not exist raises ValueError.
_columns_to_drop = [
    column
    for column in ("text", "__index_level_0__")
    if column in dataset["train"].column_names
]
dataset = dataset.map(preprocess, batched=True, remove_columns=_columns_to_drop, num_proc=1)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# -----------------------
# Compute class weights (for WeightedLoss)
# -----------------------
# Count training examples per class id, aligned to 0..NUM_LABELS-1. Reindexing
# keeps position i == class id i even when a class is missing from the training
# split; without it the weight vector would be shorter than the model's output
# and misaligned with the label ids.
label_counts = (
    train_df["label"]
    .value_counts()
    .reindex(range(NUM_LABELS), fill_value=0)
    .values
)
total = label_counts.sum()
# Inverse-frequency ("balanced") weights: total / (n_classes * count).
# A class with no examples gets weight 0.0 instead of dividing by zero.
class_weights = [total / (len(label_counts) * c) if c > 0 else 0.0 for c in label_counts]
# Force float32 so the weights match the model's logits; the dtype inferred
# from NumPy scalars is otherwise not guaranteed to be float32.
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(DEVICE)
print("Label counts:", label_counts, "Class weights:", class_weights.cpu().numpy())

# -----------------------
# Model
# -----------------------
# Build the sequence-classification model from the MODEL_NAME checkpoint with
# NUM_LABELS output classes, then place it on DEVICE.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)
model = model.to(DEVICE)

# -----------------------
# Custom loss wrapper (if using weighted or focal loss)
# -----------------------
class CustomTrainer(Trainer):
    """Trainer with a selectable classification loss.

    The loss is one of:
      * focal loss (optionally class-weighted) when ``use_focal`` is True,
      * class-weighted cross-entropy when ``class_weights`` is given,
      * plain cross-entropy otherwise.

    Args:
        use_focal: Use focal loss instead of cross-entropy.
        focal_gamma: Focusing exponent of the focal loss.
        class_weights: Optional 1-D tensor holding one weight per class id.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, use_focal=False, focal_gamma=2.0, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.use_focal = use_focal
        self.focal_gamma = focal_gamma
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the configured loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict; must contain "labels" with integer class ids.
            return_outputs: When True, return ``(loss, outputs)``.
            num_items_in_batch: Accepted because newer Trainer versions pass
                it to ``compute_loss``; unused, the loss is a per-batch mean.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Flatten to (N, C) / (N,) and compute the loss in float32 on the
        # device the logits live on, rather than on a global device string.
        num_classes = logits.size(-1)
        logits = logits.view(-1, num_classes).float()
        labels = labels.view(-1).to(logits.device)

        weights = self.class_weights
        if weights is not None:
            weights = weights.to(device=logits.device, dtype=logits.dtype)

        if self.use_focal:
            # p_t must come from the UNWEIGHTED cross-entropy (ce = -log p_t);
            # deriving it from a weighted ce would distort the modulating
            # factor. Class weights are applied afterwards as the alpha term.
            ce = nn.functional.cross_entropy(logits, labels, reduction="none")
            p_t = torch.exp(-ce)
            focal = ((1 - p_t) ** self.focal_gamma) * ce
            if weights is not None:
                # clamp keeps ignore-index labels (negative) indexable; their
                # ce is already 0, so the weight picked for them is irrelevant.
                focal = focal * weights[labels.clamp_min(0)]
            loss = focal.mean()
        else:
            # weight=None yields plain cross-entropy, so one call covers both
            # the weighted and the unweighted case.
            loss = nn.functional.cross_entropy(logits, labels, weight=weights)
        return (loss, outputs) if return_outputs else loss

# -----------------------
# Metrics
# -----------------------
def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` evaluation pair.

    Predictions are the argmax over the last axis of the logits.

    Returns:
        A dict with "accuracy", support-weighted "precision", "recall" and
        "f1", plus the full per-class classification report (as a nested
        dict) under "report".
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)

    results = {"accuracy": accuracy_score(references, predicted)}
    precision, recall, f_score, _support = precision_recall_fscore_support(
        references,
        predicted,
        average='weighted',
        zero_division=0,
    )
    results["precision"] = precision
    results["recall"] = recall
    results["f1"] = f_score
    results["report"] = classification_report(
        references,
        predicted,
        zero_division=0,
        output_dict=True,
    )
    return results

# -----------------------
# TrainingArguments
# -----------------------
# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old name is no longer
# accepted by the latest ones. Pick whichever this installation declares.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and checkpoint on the same step schedule, so every evaluation
    # has a matching checkpoint for load_best_model_at_end to pick from.
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    logging_steps=50,
    learning_rate=LR,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=1,
    save_total_limit=3,
    # Hand the configured seed to the Trainer so its own seeding follows SEED
    # instead of its built-in default.
    seed=SEED,
    **{_eval_strategy_key: "steps"},
)

# -----------------------
# Instantiate CustomTrainer
# -----------------------
# Loss options come first, followed by the standard Trainer arguments.
# Class weights are handed over only on the cross-entropy path; with focal
# loss enabled the trainer receives no weights.
trainer = CustomTrainer(
    use_focal=USE_FOCAL_LOSS,
    focal_gamma=2.0,
    class_weights=None if USE_FOCAL_LOSS else class_weights,
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# -----------------------
# Train
# -----------------------
# Run fine-tuning with the model, datasets and arguments given to the trainer.
trainer.train()

# -----------------------
# Evaluate on validation set
# -----------------------
# No dataset is passed here, so the trainer's own eval_dataset (the
# "validation" split) is what gets scored.
metrics = trainer.evaluate()
print("Validation metrics:", metrics)

# -----------------------
# Save final model + tokenizer
# -----------------------
# Model and tokenizer are written to the same directory so that
# predict_texts() can reload both from a single path.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# -----------------------
# Inference helper
# -----------------------
def predict_texts(texts, model_path=OUTPUT_DIR, batch_size=32):
    """Classify texts with a saved model and return labels and probabilities.

    Args:
        texts: A single string or an iterable of strings.
        model_path: Directory holding the saved model and tokenizer.
        batch_size: Number of texts sent through the model per forward pass,
            so large inputs do not have to fit in memory as one batch.

    Returns:
        ``(preds, probs)``: ``preds`` is an array of predicted class ids with
        one entry per text; ``probs`` is an array of softmax probabilities
        with one row per text and one column per class. Both are empty
        (zero rows) when ``texts`` is empty.
    """
    # Normalise the input: a bare string is one text, not a sequence of chars.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    tok = AutoTokenizer.from_pretrained(model_path)
    mod = AutoModelForSequenceClassification.from_pretrained(model_path).to(DEVICE)
    mod.eval()  # make inference mode explicit (disables dropout)

    if not texts:
        # Nothing to tokenize; return correctly shaped empty results.
        empty_probs = np.empty((0, mod.config.num_labels), dtype=np.float32)
        return np.empty((0,), dtype=np.int64), empty_probs

    prob_chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            enc = tok(chunk, truncation=True, padding=True, return_tensors="pt", max_length=MAX_LEN).to(DEVICE)
            logits = mod(**enc).logits
            prob_chunks.append(torch.softmax(logits, dim=-1).cpu().numpy())

    probs = np.concatenate(prob_chunks, axis=0)
    preds = np.argmax(probs, axis=-1)
    return preds, probs

# Example inference:
# One sample per expected sentiment. The emoji in the first message had been
# corrupted by a wrong text decoding; it is restored to the intended character
# so the example exercises real emoji input.
samples = ["Thanks, that helped a lot 😄", "This is unacceptable, fix it now!", "Okay."]
preds, probs = predict_texts(samples)
# Show each text next to its predicted class id and class probabilities.
print(list(zip(samples, preds, probs)))
