In [None]:

!pip install transformers scikit-learn pandas nltk tqdm torch torchvision tabulate --quiet

import torch, pandas as pd, numpy as np, nltk
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels
from tabulate import tabulate

nltk.download("punkt")

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)

# ============================================================
# 1. PREPROCESSING 
# ============================================================

DATA_FILE = "/content/polygon_news_1.csv"
df = pd.read_csv(DATA_FILE)

required_cols = ["title", "description", "sentiment"]
for col in required_cols:
    if col not in df.columns:
        raise ValueError(f"Column '{col}' missing. Available: {df.columns.tolist()}")

df["text"] = df["title"].astype(str) + " " + df["description"].astype(str)
df = df[["text", "sentiment"]].dropna()
df = df.rename(columns={"sentiment": "label"})

# Encode labels
le = LabelEncoder()
df["label_id"] = le.fit_transform(df["label"])
LABELS = list(le.classes_)
print("Initial Classes:", LABELS)
print(df["label"].value_counts())

# ============================================================
# 2. REMOVE RARE CLASSES (<2 samples) â€” required for stratify
# ============================================================

counts = df["label_id"].value_counts()
valid_ids = counts[counts >= 2].index.tolist()
df = df[df["label_id"].isin(valid_ids)].reset_index(drop=True)

# Refit encoder
le = LabelEncoder()
df["label_id"] = le.fit_transform(df["label"])
LABELS = list(le.classes_)

print("\nAfter removing rare classes:")
print("Classes:", LABELS)
print(df["label"].value_counts())

# ============================================================
# 3. SAFE 80/10/10 SPLIT
# ============================================================

train_df, temp_df = train_test_split(
    df, test_size=0.20, stratify=df["label_id"], random_state=42
)

val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df["label_id"], random_state=42
)

print(f"\nTrain={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")

# ============================================================
# 4. LOAD FINBERT
# ============================================================

FINBERT_MODEL = "ProsusAI/finbert"
print("\nðŸ”¹ Loading FinBERT...")

tokenizer = AutoTokenizer.from_pretrained(FINBERT_MODEL)
finbert = AutoModel.from_pretrained(FINBERT_MODEL).to(DEVICE)
finbert.eval()

EMBED_DIM = finbert.config.hidden_size
print("FinBERT Embedding Dim:", EMBED_DIM)

# ============================================================
# 5. Dataset + Collate
# ============================================================

class TextDataset(Dataset):
    def __init__(self, df):
        self.texts = df["text"].tolist()
        self.labels = df["label_id"].tolist()
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx): return self.texts[idx], self.labels[idx]

def collate_fn(batch):
    texts, labels = zip(*batch)

    enc = tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )

    input_ids = enc["input_ids"].to(DEVICE)
    attn_mask = enc["attention_mask"].to(DEVICE)

    with torch.no_grad():
        outputs = finbert(input_ids, attention_mask=attn_mask)
        emb = outputs.last_hidden_state.mean(dim=1)   # (batch, 768)

    labels = torch.tensor(labels, dtype=torch.long).to(DEVICE)
    return emb, labels

train_loader = DataLoader(TextDataset(train_df), batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(TextDataset(val_df),   batch_size=16, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(TextDataset(test_df),  batch_size=16, shuffle=False, collate_fn=collate_fn)

# ============================================================
# 6. SIMPLE LINEAR CLASSIFIER (NO LSTM)
# ============================================================

class FinBertClassifier(nn.Module):
    def __init__(self, input_dim, output_dim, dropout=0.3):
        super().__init__()
        self.model = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, output_dim)
        )
    def forward(self, x):
        return self.model(x)

model = FinBertClassifier(EMBED_DIM, len(LABELS)).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# ============================================================
# 7. EVALUATION FUNCTION
# ============================================================

def evaluate(loader):
    model.eval()
    total_loss, all_y, all_pred = 0, [], []

    with torch.no_grad():
        for X, y in loader:
            out = model(X)
            loss = criterion(out, y)
            total_loss += loss.item()

            pred = out.argmax(dim=1)
            all_y.extend(y.cpu().numpy())
            all_pred.extend(pred.cpu().numpy())

    return (
        total_loss / len(loader),
        accuracy_score(all_y, all_pred),
        f1_score(all_y, all_pred, average="macro"),
        all_y,
        all_pred
    )

# ============================================================
# 8. TRAINING LOOP
# ============================================================

EPOCHS = 6
for ep in range(EPOCHS):
    model.train()
    running = 0

    for X, y in tqdm(train_loader, desc=f"Epoch {ep+1}/{EPOCHS}"):
        optimizer.zero_grad()
        out = model(X)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        running += loss.item()

    val_loss, val_acc, val_f1, _, _ = evaluate(val_loader)

    print(f"\nEpoch {ep+1}: "
          f"TrainLoss={running/len(train_loader):.4f}  "
          f"ValLoss={val_loss:.4f}  ValAcc={val_acc:.4f}  ValF1={val_f1:.4f}")

# ============================================================
# 9. FINAL TEST RESULTS
# ============================================================

test_loss, test_acc, test_f1, y_true, y_pred = evaluate(test_loader)

print("\n--- FINAL TEST RESULTS ---")
print(f"Loss={test_loss:.4f}  Accuracy={test_acc:.4f}  F1={test_f1:.4f}")

# ============================================================
# 10. SAFE CLASSIFICATION REPORT
# ============================================================

valid_labels = unique_labels(y_true, y_pred)
valid_label_names = [LABELS[i] for i in valid_labels]

print("\nPredicted Classes:", valid_label_names)

report = classification_report(
    y_true,
    y_pred,
    labels=valid_labels,
    target_names=valid_label_names,
    output_dict=True
)

df_report = pd.DataFrame(report).transpose()
df_report = df_report[["precision", "recall", "f1-score", "support"]]
df_report[["precision", "recall", "f1-score"]] = df_report[["precision", "recall", "f1-score"]].round(4)
df_report["support"] = df_report["support"].astype(int)

print("\nClassification Report:")
print(tabulate(df_report, headers="keys", tablefmt="github"))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using device: cuda
Initial Classes: ['mixed', 'negative', 'neutral', 'neutral/positive', 'positive']
label
positive            3626
neutral             1270
negative             649
mixed                  2
neutral/positive       1
Name: count, dtype: int64

After removing rare classes:
Classes: ['mixed', 'negative', 'neutral', 'positive']
label
positive    3626
neutral     1270
negative     649
mixed          2
Name: count, dtype: int64

Train=4437, Val=555, Test=555

ðŸ”¹ Loading FinBERT...


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

FinBERT Embedding Dim: 768


Epoch 1/6:   1%|          | 2/278 [00:00<00:39,  6.99it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Epoch 1/6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 278/278 [00:29<00:00,  9.49it/s]



Epoch 1: TrainLoss=0.5571  ValLoss=0.5196  ValAcc=0.7928  ValF1=0.6998


Epoch 2/6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 278/278 [00:26<00:00, 10.36it/s]



Epoch 2: TrainLoss=0.5087  ValLoss=0.5031  ValAcc=0.7928  ValF1=0.6792


Epoch 3/6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 278/278 [00:26<00:00, 10.45it/s]



Epoch 3: TrainLoss=0.4825  ValLoss=0.5207  ValAcc=0.7694  ValF1=0.6378


Epoch 4/6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 278/278 [00:27<00:00, 10.04it/s]



Epoch 4: TrainLoss=0.4784  ValLoss=0.4744  ValAcc=0.7874  ValF1=0.6988


Epoch 5/6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 278/278 [00:26<00:00, 10.37it/s]



Epoch 5: TrainLoss=0.4712  ValLoss=0.4808  ValAcc=0.7964  ValF1=0.6814


Epoch 6/6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 278/278 [00:26<00:00, 10.38it/s]



Epoch 6: TrainLoss=0.4608  ValLoss=0.4823  ValAcc=0.7946  ValF1=0.6769

--- FINAL TEST RESULTS ---
Loss=0.4755  Accuracy=0.7928  F1=0.7005

Predicted Classes: ['negative', 'neutral', 'positive']

Classification Report:
|              |   precision |   recall |   f1-score |   support |
|--------------|-------------|----------|------------|-----------|
| negative     |      0.7018 |   0.6154 |     0.6557 |        65 |
| neutral      |      0.6337 |   0.5039 |     0.5614 |       127 |
| positive     |      0.8463 |   0.9256 |     0.8842 |       363 |
| accuracy     |      0.7928 |   0.7928 |     0.7928 |         0 |
| macro avg    |      0.7273 |   0.6816 |     0.7005 |       555 |
| weighted avg |      0.7807 |   0.7928 |     0.7836 |       555 |
