In [None]:
!pip install transformers scikit-learn pandas nltk tqdm torch torchvision tabulate --quiet

import torch, pandas as pd, numpy as np, nltk
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels
from tabulate import tabulate

# Fetch the NLTK "punkt" resource as part of the notebook setup.
nltk.download("punkt")

# Run on the GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Using device:", DEVICE)

# ============================================================
# 1. PREPROCESSING (title + description ONLY)
# ============================================================

DATA_FILE = "/content/polygon_news_1.csv"
df = pd.read_csv(DATA_FILE)

# Fail early with a readable message when the CSV lacks an expected column.
# NOTE(review): "keywords" is required here but is not used by the visible code.
required_cols = ["title", "description","keywords", "sentiment"]
for col in required_cols:
    if col not in df.columns:
        raise ValueError(f"Column '{col}' missing. Available: {df.columns.tolist()}")

# Rows without a sentiment have no target and cannot be used.
df = df.dropna(subset=["sentiment"])

# Replace missing title/description with "" BEFORE casting to str; casting
# first would turn NaN into the literal text "nan", which a later dropna()
# cannot detect.
title = df["title"].fillna("").astype(str)
description = df["description"].fillna("").astype(str)
text = (title + " " + description).str.strip()

df = pd.DataFrame({"text": text, "label": df["sentiment"]})
# Discard rows where both title and description were missing/blank.
df = df[df["text"] != ""].copy()

# Encode labels
le = LabelEncoder()
df["label_id"] = le.fit_transform(df["label"])
LABELS = list(le.classes_)
print("Initial Classes:", LABELS)
print(df["label"].value_counts())

# ============================================================
# 2. REMOVE RARE CLASSES (<2 samples)
# ============================================================

# Attach to every row the number of samples its class has, then keep only
# the rows whose class occurs at least twice.
class_sizes = df["label_id"].map(df["label_id"].value_counts())
df = df.loc[class_sizes >= 2].reset_index(drop=True)

# Refit the encoder so the ids are contiguous again and LABELS lists only
# the classes that survived the filter.
le = LabelEncoder()
le.fit(df["label"])
df["label_id"] = le.transform(df["label"])
LABELS = list(le.classes_)

print("\nAfter removing rare classes:")
print("Classes:", LABELS)
label_distribution = df["label"].value_counts()
print(label_distribution)

# ============================================================
# 3. SAFE 80/10/10 SPLIT
# ============================================================

# First cut: 80% train, 20% held out, preserving the class proportions.
train_df, temp_df = train_test_split(
    df, test_size=0.20, stratify=df["label_id"], random_state=42
)

# Second cut: halve the held-out part into validation and test.
# Stratifying requires at least 2 members of every class present in temp_df;
# the ">= 2 samples overall" filter does not guarantee that (a class with two
# samples can land one in train and one in temp_df), in which case sklearn
# raises ValueError. Fall back to an unstratified split only in that case so
# the behaviour is unchanged whenever stratification is possible.
temp_class_counts = temp_df["label_id"].value_counts()
if temp_class_counts.min() >= 2:
    temp_stratify = temp_df["label_id"]
else:
    temp_stratify = None
    print("Warning: a class has a single sample in the held-out set; "
          "splitting val/test without stratification.")

val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_stratify, random_state=42
)

print(f"\nTrain={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")

# ============================================================
# 4. LOAD ROBERTA
# ============================================================

ROBERTA_MODEL = "roberta-base"

# The marker is written as an escape (U+1F539, small blue diamond) so the
# message survives the file being re-saved in a non-UTF-8 encoding; the
# literal had been corrupted into mojibake.
print("\n\U0001F539 Loading RoBERTa...")
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_MODEL)
roberta = AutoModel.from_pretrained(ROBERTA_MODEL).to(DEVICE)
# The encoder is only used as a feature extractor, so keep it in eval mode.
roberta.eval()

# Width of the encoder's hidden states; used as the classifier input size.
EMBED_DIM = roberta.config.hidden_size  # usually 768
print("RoBERTa Embedding Dim:", EMBED_DIM)

# ============================================================
# 5. Dataset + Collate
# ============================================================

class TextDataset(Dataset):
    """Dataset yielding ``(text, label_id)`` pairs taken from a DataFrame.

    The ``"text"`` and ``"label_id"`` columns of ``df`` are copied into plain
    lists, so items are addressed by position rather than by the frame's index.
    """

    def __init__(self, df):
        self.texts = df["text"].to_list()
        self.labels = df["label_id"].to_list()

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(text, label_id)`` pair stored at position ``idx``."""
        sample = (self.texts[idx], self.labels[idx])
        return sample

def collate_fn(batch):
    """Turn a batch of ``(text, label_id)`` pairs into RoBERTa embeddings.

    The texts are tokenised together (padded to the longest text in the
    batch, truncated to 128 tokens), passed through the module-level
    ``roberta`` encoder without gradients, and pooled into one vector per
    text by averaging the token states.

    Padding positions are excluded from the average by weighting with the
    attention mask. A plain ``mean(dim=1)`` would also average the padded
    positions, making a text's embedding depend on how long the other texts
    in its batch happen to be.

    Args:
        batch: sequence of ``(text, label_id)`` pairs.

    Returns:
        ``(emb, labels)``: the pooled embeddings (presumably of shape
        ``(batch, hidden_size)`` -- confirm against the encoder output) and
        a ``torch.long`` tensor of label ids, both on ``DEVICE``.
    """
    texts, labels = zip(*batch)

    enc = tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    input_ids = enc["input_ids"].to(DEVICE)
    attn_mask = enc["attention_mask"].to(DEVICE)

    with torch.no_grad():
        outputs = roberta(input_ids, attention_mask=attn_mask)
        hidden = outputs.last_hidden_state

        # 1 for real tokens, 0 for padding; the trailing axis lets the mask
        # broadcast across the hidden dimension.
        token_weights = attn_mask.unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * token_weights).sum(dim=1)
        # clamp guards against a division by zero for an all-padding row.
        token_count = token_weights.sum(dim=1).clamp(min=1.0)
        emb = token_sum / token_count

    labels = torch.tensor(labels, dtype=torch.long).to(DEVICE)
    return emb, labels

# One loader per split, all with batch size 16 and the RoBERTa-embedding
# collate function. Only the training split is shuffled.
train_loader = DataLoader(
    TextDataset(train_df),
    batch_size=16,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    TextDataset(val_df),
    batch_size=16,
    shuffle=False,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    TextDataset(test_df),
    batch_size=16,
    shuffle=False,
    collate_fn=collate_fn,
)

# ============================================================
# 6. SIMPLE LINEAR CLASSIFIER (NO LSTM)
# ============================================================

class RobertaClassifier(nn.Module):
    """Feed-forward head mapping an embedding vector to class logits.

    Layer stack: dropout -> linear(input_dim, 256) -> ReLU -> dropout ->
    linear(256, output_dim). The same dropout probability is used twice.

    Args:
        input_dim: size of each input vector.
        output_dim: number of logits produced per input.
        dropout: dropout probability for both dropout layers.
    """

    def __init__(self, input_dim, output_dim, dropout=0.3):
        super().__init__()
        hidden_dim = 256
        layers = [
            nn.Dropout(dropout),
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the unnormalised class scores for ``x``."""
        logits = self.model(x)
        return logits

# Classifier head sized to the encoder width and the number of classes.
model = RobertaClassifier(input_dim=EMBED_DIM, output_dim=len(LABELS))
model = model.to(DEVICE)

# Cross-entropy over the logits, optimised with Adam at lr=1e-3.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# ============================================================
# 7. EVALUATION FUNCTION
# ============================================================

def evaluate(loader):
    """Score the module-level ``model`` on every batch produced by ``loader``.

    The model is switched to eval mode and run without gradients.

    Args:
        loader: iterable of ``(features, labels)`` batches that also
            supports ``len()``.

    Returns:
        A 5-tuple ``(mean_loss, accuracy, macro_f1, targets, predictions)``:
        the sum of per-batch losses divided by the number of batches, the
        accuracy, the macro-averaged F1, and the collected true and predicted
        label ids.
    """
    model.eval()
    loss_sum = 0
    targets, predictions = [], []

    with torch.no_grad():
        for features, batch_labels in loader:
            logits = model(features)
            loss_sum += criterion(logits, batch_labels).item()

            batch_pred = logits.argmax(dim=1)
            targets.extend(batch_labels.cpu().numpy())
            predictions.extend(batch_pred.cpu().numpy())

    mean_loss = loss_sum / len(loader)
    accuracy = accuracy_score(targets, predictions)
    macro_f1 = f1_score(targets, predictions, average="macro")
    return mean_loss, accuracy, macro_f1, targets, predictions

# ============================================================
# 8. TRAINING LOOP
# ============================================================

EPOCHS = 6
for epoch_num in range(1, EPOCHS + 1):
    # Re-enable dropout; evaluate() leaves the model in eval mode.
    model.train()
    epoch_loss = 0

    progress = tqdm(train_loader, desc=f"Epoch {epoch_num}/{EPOCHS}")
    for features, targets in progress:
        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        optimizer.step()
        epoch_loss += batch_loss.item()

    # Validation metrics after every epoch; label/prediction lists are unused here.
    val_loss, val_acc, val_f1, _, _ = evaluate(val_loader)
    mean_train_loss = epoch_loss / len(train_loader)

    print(f"\nEpoch {epoch_num}: "
          f"TrainLoss={mean_train_loss:.4f}  "
          f"ValLoss={val_loss:.4f}  ValAcc={val_acc:.4f}  ValF1={val_f1:.4f}")

# ============================================================
# 9. FINAL TEST RESULTS
# ============================================================

# Score the held-out test split once, after training has finished.
final_metrics = evaluate(test_loader)
test_loss, test_acc, test_f1, y_true, y_pred = final_metrics

print("\n--- FINAL TEST RESULTS ---")
print(f"Loss={test_loss:.4f}  Accuracy={test_acc:.4f}  F1={test_f1:.4f}")

# ============================================================
# 10. SAFE CLASSIFICATION REPORT
# ============================================================

# Restrict the report to label ids that occur in the true labels or in the
# predictions, so target_names always lines up with the labels passed in.
valid_labels = unique_labels(y_true, y_pred)
valid_label_names = [LABELS[i] for i in valid_labels]

print("\nPredicted Classes:", valid_label_names)

report = classification_report(
    y_true,
    y_pred,
    labels=valid_labels,
    target_names=valid_label_names,
    output_dict=True
)

df_report = pd.DataFrame(report).transpose()
df_report = df_report[["precision", "recall", "f1-score", "support"]]

# In the dict output "accuracy" is a single float, which pandas broadcasts
# into every column -- including "support", where it would be truncated to 0
# by the int cast below. Report the number of evaluated samples instead.
if "accuracy" in df_report.index:
    df_report.loc["accuracy", "support"] = len(y_true)

df_report[["precision", "recall", "f1-score"]] = df_report[["precision", "recall", "f1-score"]].round(4)
df_report["support"] = df_report["support"].astype(int)

print("\nClassification Report:")
print(tabulate(df_report, headers="keys", tablefmt="github"))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using device: cuda
Initial Classes: ['mixed', 'negative', 'neutral', 'neutral/positive', 'positive']
label
positive            3626
neutral             1270
negative             649
mixed                  2
neutral/positive       1
Name: count, dtype: int64

After removing rare classes:
Classes: ['mixed', 'negative', 'neutral', 'positive']
label
positive    3626
neutral     1270
negative     649
mixed          2
Name: count, dtype: int64

Train=4437, Val=555, Test=555

🔹 Loading RoBERTa...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa Embedding Dim: 768


Epoch 1/6: 100%|██████████| 278/278 [00:28<00:00,  9.70it/s]



Epoch 1: TrainLoss=0.7293  ValLoss=0.5902  ValAcc=0.7604  ValF1=0.6083


Epoch 2/6: 100%|██████████| 278/278 [00:25<00:00, 10.73it/s]



Epoch 2: TrainLoss=0.5898  ValLoss=0.5168  ValAcc=0.7820  ValF1=0.6543


Epoch 3/6: 100%|██████████| 278/278 [00:25<00:00, 11.08it/s]



Epoch 3: TrainLoss=0.5429  ValLoss=0.4731  ValAcc=0.8036  ValF1=0.7118


Epoch 4/6: 100%|██████████| 278/278 [00:25<00:00, 10.88it/s]



Epoch 4: TrainLoss=0.5431  ValLoss=0.4707  ValAcc=0.8000  ValF1=0.7000


Epoch 5/6: 100%|██████████| 278/278 [00:25<00:00, 10.69it/s]



Epoch 5: TrainLoss=0.5194  ValLoss=0.4921  ValAcc=0.7856  ValF1=0.6415


Epoch 6/6: 100%|██████████| 278/278 [00:25<00:00, 11.01it/s]



Epoch 6: TrainLoss=0.5000  ValLoss=0.4710  ValAcc=0.7892  ValF1=0.6697

--- FINAL TEST RESULTS ---
Loss=0.4610  Accuracy=0.7946  F1=0.6913

Predicted Classes: ['negative', 'neutral', 'positive']

Classification Report:
|              |   precision |   recall |   f1-score |   support |
|--------------|-------------|----------|------------|-----------|
| negative     |      0.6579 |   0.7692 |     0.7092 |        65 |
| neutral      |      0.6866 |   0.3622 |     0.4742 |       127 |
| positive     |      0.8374 |   0.9504 |     0.8903 |       363 |
| accuracy     |      0.7946 |   0.7946 |     0.7946 |         0 |
| macro avg    |      0.7273 |   0.6939 |     0.6913 |       555 |
| weighted avg |      0.7818 |   0.7946 |     0.7739 |       555 |
