In [None]:
# ============================================================
# Multi-Transformer Embeddings + LSTM (PyTorch 2.x)
# ============================================================

# IPython/Colab shell escape (the leading "!"): installs the packages this notebook
# imports. This line is not valid syntax in a plain .py script.
!pip install transformers scikit-learn pandas nltk tqdm torch torchvision tabulate --quiet

import torch, numpy as np, pandas as pd, nltk
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tabulate import tabulate

# Fetch the NLTK 'punkt' tokenizer data.
# NOTE(review): nothing in the visible notebook uses nltk after this call —
# confirm it is still needed before keeping the download.
nltk.download('punkt')

# Prefer the GPU when one is available; the encoders, the classifier and every
# batch below are moved to DEVICE.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("✅ Using device:", DEVICE)

# ============================================================
# 1. Load and clean dataset
# ============================================================

DATA_FILE = "/content/polygon_news_1.csv"  # <-- Update this path if needed
df = pd.read_csv(DATA_FILE)

# Combine all text columns except sentiment
# NOTE(review): this joins *every* non-label column (stringified, so NaN becomes
# the text "nan"). Ids, dates or any sentiment-derived column would end up in the
# model input — confirm the CSV schema does not leak the label.
text_columns = [col for col in df.columns if col != 'sentiment']
df['text'] = df[text_columns].astype(str).apply(lambda x: ' '.join(x), axis=1)

# Keep only text and sentiment columns; rows without a sentiment are dropped
df = df[['text', 'sentiment']].dropna()
df = df.rename(columns={'sentiment': 'label'})

# === Map 5 → 3 sentiment categories ===
mapping = {
    'mixed': 'neutral',
    'neutral': 'neutral',
    'neutral/positive': 'positive',
    'positive': 'positive',
    'negative': 'negative'
}
# Normalise case/whitespace first so a variant such as "Positive " is not silently
# sent to the fallback; anything still unmapped falls back to 'neutral'.
df['label'] = (
    df['label'].astype(str).str.strip().str.lower().map(mapping).fillna('neutral')
)

# Remove rare labels: stratified splitting needs at least 2 rows per class.
# This happens BEFORE encoding so that LABELS and the label ids only cover
# classes that actually remain in the data.
label_counts = df['label'].value_counts()
df = df[df['label'].isin(label_counts[label_counts > 1].index)].copy()

# Encode numeric labels (ids are 0..n_classes-1 in sorted class-name order)
le = LabelEncoder()
df['label_id'] = le.fit_transform(df['label'])
LABELS = list(le.classes_)

print("✅ Label mapping used:", mapping)
print("Classes:", LABELS)
print(df['label'].value_counts())

# Train/Val/Test split: 85% train, then the held-out 15% is halved into val/test.
# Both splits are stratified on the label id and use a fixed seed.
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42, stratify=df['label_id'])
val_df, test_df  = train_test_split(test_df, test_size=0.5, random_state=42, stratify=test_df['label_id'])

print(f"Train={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")

# ============================================================
# 2. Load MULTIPLE Transformer Models
# ============================================================

# Short key -> Hugging Face checkpoint name for every encoder to combine
model_names = {
    "bert": "bert-base-uncased",
    "roberta": "roberta-base",
    "distilbert": "distilbert-base-uncased"
}

# key -> (tokenizer, encoder). The encoders act as frozen feature extractors.
models = {}
for key, name in model_names.items():
    print(f"🔹 Loading {name} ...")
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModel.from_pretrained(name).to(DEVICE)
    model.eval()                 # inference mode: disables dropout in the encoder
    model.requires_grad_(False)  # encoders are never trained; skip grad bookkeeping
    models[key] = (tokenizer, model)

# Calculate total embedding dimension (sum of each encoder's hidden size, since
# their pooled vectors are concatenated feature-wise)
total_embed_dim = sum(encoder.config.hidden_size for _, encoder in models.values())
print("✅ Total concatenated embedding size:", total_embed_dim)

# ============================================================
# 3. Dataset + Multi-Transformer Collate Function
# ============================================================

class TransformerDataset(Dataset):
    """Expose a DataFrame's 'text' / 'label_id' columns as (text, label) pairs."""

    def __init__(self, df):
        # Materialise both columns as plain Python lists once, up front.
        text_column = df['text'].astype(str)
        label_column = df['label_id']
        self.texts = text_column.to_list()
        self.labels = label_column.to_list()

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text and its label id at position ``idx``."""
        text, label = self.texts[idx], self.labels[idx]
        return text, label

def masked_mean_pool(hidden, mask):
    """Average token vectors over the sequence axis, ignoring padded positions.

    Args:
        hidden: float tensor of shape (batch, seq_len, dim).
        mask: tensor of shape (batch, seq_len); non-zero marks a real token.

    Returns:
        Tensor of shape (batch, dim). Rows whose mask is all zero yield zeros
        rather than NaN.
    """
    weights = mask.unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * weights).sum(dim=1)
    # clamp guards against division by zero for an all-padding row
    counts = weights.sum(dim=1).clamp(min=1.0)
    return summed / counts


def collate_batch_multi(batch):
    """Turn a list of (text, label) pairs into (embeddings, labels) tensors.

    Every encoder in the module-level ``models`` dict tokenizes the batch
    (padded, truncated to 128 tokens), runs without gradients, and is mean-pooled
    over real tokens only. The pooled vectors are concatenated feature-wise.

    Args:
        batch: sequence of (text, label_id) tuples.

    Returns:
        (combined_emb, labels): a (batch, sum of hidden sizes) float tensor and
        a (batch,) long tensor, both on DEVICE.
    """
    texts, labels = zip(*batch)
    all_embeds = []

    for key, (tokenizer, model) in models.items():
        enc = tokenizer(list(texts), return_tensors='pt', padding=True, truncation=True, max_length=128)
        input_ids = enc['input_ids'].to(DEVICE)
        attn_mask = enc['attention_mask'].to(DEVICE)
        with torch.no_grad():
            out = model(input_ids, attention_mask=attn_mask)
            # Mask-aware mean: a plain .mean(dim=1) would also average the padding
            # positions, making a text's embedding depend on the longest text
            # that happens to share its batch.
            emb = masked_mean_pool(out.last_hidden_state, attn_mask)
        all_embeds.append(emb)

    # Concatenate embeddings from all transformer models
    combined_emb = torch.cat(all_embeds, dim=1)
    labels = torch.tensor(labels, dtype=torch.long).to(DEVICE)
    return combined_emb, labels

# Datasets: one thin wrapper per split
train_ds = TransformerDataset(train_df)
val_ds = TransformerDataset(val_df)
test_ds = TransformerDataset(test_df)

# DataLoaders: batches of 16; only the training split is shuffled. The collate
# function converts each batch of raw texts into concatenated embeddings.
train_loader = DataLoader(dataset=train_ds, batch_size=16, shuffle=True,
                          collate_fn=collate_batch_multi)
val_loader = DataLoader(dataset=val_ds, batch_size=16, shuffle=False,
                        collate_fn=collate_batch_multi)
test_loader = DataLoader(dataset=test_ds, batch_size=16, shuffle=False,
                         collate_fn=collate_batch_multi)

# ============================================================
# 4. LSTM Model Definition
# ============================================================

class LSTMClassifier(nn.Module):
    """(Bi)LSTM encoder followed by a two-layer MLP classification head.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of classes (logits returned).
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability used in the head, and between LSTM layers
            when ``num_layers > 1``.
        bidirectional: whether the LSTM runs in both directions.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1, dropout=0.3, bidirectional=True):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, bidirectional=bidirectional,
                            # nn.LSTM only applies dropout between stacked layers
                            dropout=dropout if num_layers > 1 else 0.0)
        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * (2 if bidirectional else 1), 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, output_dim)
        )

    def forward(self, x):
        """Return class logits of shape (batch, output_dim).

        ``x`` may be (batch, input_dim) — treated as a length-1 sequence, which is
        how this notebook feeds pooled embeddings — or (batch, seq_len, input_dim).
        """
        if x.dim() == 2:
            x = x.unsqueeze(1)  # (batch, seq_len=1, input_dim)
        _, (h_n, _) = self.lstm(x)
        # Use the last layer's final hidden state(s). For a length-1 sequence this
        # equals output[:, -1, :]; for longer sequences it also gives the backward
        # direction its full pass instead of a single step.
        if self.lstm.bidirectional:
            pooled = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            pooled = h_n[-1]
        return self.fc(pooled)

# Classifier over the concatenated encoder embeddings: one logit per sentiment class
model = LSTMClassifier(input_dim=total_embed_dim, hidden_dim=256, output_dim=len(LABELS))
model = model.to(DEVICE)

# Cross-entropy on raw logits, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# ============================================================
# 5. Training & Evaluation
# ============================================================

def evaluate(loader):
    """Score the module-level ``model`` on every batch of ``loader``.

    Args:
        loader: iterable yielding (features, labels) tensor pairs.

    Returns:
        (avg_loss, acc, f1, all_labels, all_preds): the per-sample mean loss,
        accuracy, macro-averaged F1, and the flat lists of true and predicted
        label ids.
    """
    model.eval()
    all_labels, all_preds = [], []
    total_loss, n_samples = 0.0, 0
    with torch.no_grad():
        for x, y in loader:
            out = model(x)
            loss = criterion(out, y)
            # `criterion` is nn.CrossEntropyLoss() with its default mean reduction,
            # so scale by the batch size: otherwise a short final batch counts as
            # much as a full one in the average.
            batch_size = y.size(0)
            total_loss += loss.item() * batch_size
            n_samples += batch_size
            preds = torch.argmax(out, dim=1)
            all_labels.extend(y.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
    avg_loss = total_loss / max(1, n_samples)  # max() avoids 0/0 on an empty loader
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return avg_loss, acc, f1, all_labels, all_preds

# --- Training Loop ---
EPOCHS = 8
for epoch in range(EPOCHS):
    # One optimisation pass over the training split
    model.train()
    batch_losses = []
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    for features, targets in progress:
        optimizer.zero_grad()
        loss = criterion(model(features), targets)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # Mean of the per-batch training losses for this epoch
    running_loss = sum(batch_losses)
    train_loss = running_loss / len(train_loader)

    # Validation metrics after every epoch
    val_loss, val_acc, val_f1, _, _ = evaluate(val_loader)
    print(f"\nEpoch {epoch+1}: TrainLoss={train_loss:.4f} "
          f"ValLoss={val_loss:.4f}  ValAcc={val_acc:.4f}  ValF1={val_f1:.4f}")

# ============================================================
# 6. Final Test Results
# ============================================================

# Evaluate once on the held-out test split
test_loss, test_acc, test_f1, y_true, y_pred = evaluate(test_loader)
print("\n--- 🧾 Final Test Results ---")
print(f"Loss={test_loss:.4f}  Accuracy={test_acc:.4f}  F1={test_f1:.4f}")

# Passing `labels` ties each entry of LABELS to its encoded id, so the report
# still lines up if some class is missing from this split.
report = classification_report(
    y_true,
    y_pred,
    labels=list(range(len(LABELS))),
    target_names=LABELS,
    output_dict=True,
)
df_report = pd.DataFrame(report).transpose()
df_report = df_report[['precision', 'recall', 'f1-score', 'support']]
df_report[['precision', 'recall', 'f1-score']] = df_report[['precision', 'recall', 'f1-score']].round(4)
# The 'accuracy' entry is a single scalar, so after the transpose its 'support'
# cell holds the accuracy value and would be truncated to 0 by the int cast.
# Replace it with the real number of evaluated samples.
if 'accuracy' in df_report.index:
    df_report.loc['accuracy', 'support'] = len(y_true)
df_report['support'] = df_report['support'].astype(int)

print("\n📊 Classification Report:")
print(tabulate(df_report, headers='keys', tablefmt='github'))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✅ Using device: cuda
✅ Label mapping used: {'mixed': 'neutral', 'neutral': 'neutral', 'neutral/positive': 'positive', 'positive': 'positive', 'negative': 'negative'}
Classes: ['negative', 'neutral', 'positive']
label
positive    3627
neutral     1272
negative     649
Name: count, dtype: int64
Train=4715, Val=416, Test=417
🔹 Loading bert-base-uncased ...
🔹 Loading roberta-base ...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔹 Loading distilbert-base-uncased ...
✅ Total concatenated embedding size: 2304


Epoch 1/8: 100%|██████████| 295/295 [01:21<00:00,  3.64it/s]



Epoch 1: TrainLoss=0.6092 ValLoss=0.4267  ValAcc=0.8197  ValF1=0.7274


Epoch 2/8: 100%|██████████| 295/295 [01:20<00:00,  3.65it/s]



Epoch 2: TrainLoss=0.4361 ValLoss=0.4739  ValAcc=0.7861  ValF1=0.7361


Epoch 3/8: 100%|██████████| 295/295 [01:20<00:00,  3.64it/s]



Epoch 3: TrainLoss=0.3795 ValLoss=0.3722  ValAcc=0.8173  ValF1=0.7337


Epoch 4/8: 100%|██████████| 295/295 [01:20<00:00,  3.65it/s]



Epoch 4: TrainLoss=0.3491 ValLoss=0.3124  ValAcc=0.8582  ValF1=0.7865


Epoch 5/8: 100%|██████████| 295/295 [01:20<00:00,  3.65it/s]



Epoch 5: TrainLoss=0.3139 ValLoss=0.2977  ValAcc=0.8750  ValF1=0.8179


Epoch 6/8: 100%|██████████| 295/295 [01:20<00:00,  3.66it/s]



Epoch 6: TrainLoss=0.2908 ValLoss=0.3373  ValAcc=0.8534  ValF1=0.7683


Epoch 7/8: 100%|██████████| 295/295 [01:20<00:00,  3.66it/s]



Epoch 7: TrainLoss=0.2776 ValLoss=0.2871  ValAcc=0.8846  ValF1=0.8131


Epoch 8/8: 100%|██████████| 295/295 [01:20<00:00,  3.65it/s]



Epoch 8: TrainLoss=0.2765 ValLoss=0.2622  ValAcc=0.8846  ValF1=0.8325

--- 🧾 Final Test Results ---
Loss=0.2009  Accuracy=0.9305  F1=0.8832

📊 Classification Report:
|              |   precision |   recall |   f1-score |   support |
|--------------|-------------|----------|------------|-----------|
| negative     |      0.8974 |   0.7292 |     0.8046 |        48 |
| neutral      |      0.8241 |   0.9271 |     0.8725 |        96 |
| positive     |      0.9778 |   0.967  |     0.9724 |       273 |
| accuracy     |      0.9305 |   0.9305 |     0.9305 |         0 |
| macro avg    |      0.8998 |   0.8744 |     0.8832 |       417 |
| weighted avg |      0.9331 |   0.9305 |     0.9301 |       417 |
