In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.utils.class_weight import compute_class_weight


# Sentiment vocabulary: class name -> integer class id.
LABELS = dict(negative=0, neutral=1, positive=2)
# Class names listed in ascending id order.
LABEL_NAMES = sorted(LABELS, key=LABELS.get)


# Tokenizer for the cardiffnlp Twitter RoBERTa sentiment checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    'cardiffnlp/twitter-roberta-base-sentiment-latest'
)


# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")


class YouTubeCommentDataset(Dataset):
    """Map-style dataset pairing each comment with its integer label.

    Every item is tokenized on access: the comment is converted with
    ``str()``, then padded/truncated to ``max_len`` tokens.

    Args:
        comments: Indexable collection of comments.
        labels: Indexable collection of integer class ids, parallel to
            ``comments``.
        tokenizer: Callable invoked as ``tokenizer(text, **options)``; it must
            return a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, comments, labels, tokenizer, max_len=128):
        self.comments, self.labels = comments, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of comments."""
        return len(self.comments)

    def __getitem__(self, idx):
        """Return the encoded comment and label at position ``idx``.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and
            a scalar ``torch.long`` ``'labels'`` tensor.
        """
        text, target = self.comments[idx], self.labels[idx]
        options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'padding': "max_length",
            'truncation': True,
            'return_attention_mask': True,
            'return_tensors': "pt",
        }
        encoded = self.tokenizer(str(text), **options)
        # The tokenizer output is flattened to 1-D so the DataLoader can stack
        # items into a batch (presumably it carries a leading batch axis of 1).
        item = {key: encoded[key].reshape(-1) for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item


class LabelSmoothingCrossEntropy(nn.Module):
    """KL-divergence loss against label-smoothed one-hot targets.

    The true class receives ``1 - smoothing + smoothing / n_class`` and every
    other class ``smoothing / n_class``; the loss is the batch-mean KL
    divergence between that distribution and ``softmax(pred)``.

    Args:
        smoothing: Probability mass spread uniformly over all classes.
    """

    def __init__(self, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing

    def forward(self, pred, target):
        """Return the scalar loss.

        Args:
            pred: Logits; dimension 1 is the class dimension.
            target: Integer class indices, one per row of ``pred``.
        """
        eps = self.smoothing
        # Hard one-hot targets with the same shape/dtype/device as the logits.
        hard_targets = torch.zeros_like(pred)
        hard_targets.scatter_(1, target[:, None], 1)
        soft_targets = hard_targets * (1 - eps) + eps / pred.size(1)
        log_probs = F.log_softmax(pred, 1)
        return F.kl_div(log_probs, soft_targets, reduction='batchmean')


class BERTMultiKernelCNN(nn.Module):
    """Pretrained transformer encoder with a multi-kernel 1-D CNN head.

    Token embeddings from the encoder are convolved with several kernel
    widths, max-pooled over the sequence, concatenated and passed through a
    two-layer classifier.

    Args:
        n_classes: Number of output logits.
        dropout: Drop probability used in the classifier head.
        model_name: Hugging Face checkpoint loaded as the encoder.
    """

    def __init__(self, n_classes=3, dropout=0.4,
                 model_name='cardiffnlp/twitter-roberta-base-sentiment-latest'):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.bert.gradient_checkpointing_enable()

        self.kernel_sizes = [2, 3, 4, 5]
        self.num_filters = 128
        # Read the embedding width from the encoder instead of hard-coding 768
        # so a checkpoint with a different hidden size still works.
        hidden_size = self.bert.config.hidden_size

        # One Conv -> BatchNorm -> ReLU -> global-max-pool branch per kernel width.
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(in_channels=hidden_size, out_channels=self.num_filters, kernel_size=k),
                nn.BatchNorm1d(self.num_filters),
                nn.ReLU(),
                nn.AdaptiveMaxPool1d(1)
            ) for k in self.kernel_sizes
        ])

        self.dropout = nn.Dropout(dropout)
        total_filters = len(self.kernel_sizes) * self.num_filters
        self.fc1 = nn.Linear(total_filters, 128)
        self.fc2 = nn.Linear(128, n_classes)

        self._init_weights()

    def _init_weights(self):
        """Xavier-initialise the classifier weights and zero their biases."""
        for module in [self.fc1, self.fc2]:
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                nn.init.constant_(module.bias, 0)

    def forward(self, input_ids, attention_mask):
        """Return class logits for a batch of tokenized inputs.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.

        Returns:
            Tensor of ``n_classes`` logits per input row.
        """
        x = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # Conv1d expects channels before the sequence axis.
        x = x.permute(0, 2, 1)
        conv_outputs = [conv(x) for conv in self.convs]
        # Each branch pools to length 1; concatenate channels and drop that axis.
        x = torch.cat(conv_outputs, dim=1).squeeze(2)
        # Dropout is applied once here. Applying the same layer twice in a row
        # would compound to an effective drop rate of 1 - (1 - p) ** 2.
        x = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(x)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np


# Location of the labelled comment CSV.
CSV_PATH = "Initial_DatasetB.csv"

# Load the whole file with the pure-Python parser engine.
df = pd.read_csv(filepath_or_buffer=CSV_PATH, engine="python", encoding="utf-8")

# Column names tried, in priority order, for the comment text.
text_candidates = [
    "message", "comment", "comments", "text", "content",
    "body", "clean_text", "Comment", "Text",
]
# Column names tried, in priority order, for the sentiment label.
label_candidates = [
    "sentiment", "label", "labels", "polarity", "sentimentlabel",
    "Sentiment", "Label", "Polarity",
]

def pick_col(cands, cols):
    """Return the first column in ``cols`` matching one of ``cands``.

    Matching ignores case and surrounding whitespace, and column labels that
    are not strings (for example integer headers) are compared by their
    ``str()`` form instead of raising.

    Args:
        cands: Candidate names, in priority order.
        cols: Available column labels.

    Returns:
        The matching entry of ``cols``, exactly as it appears there.

    Raises:
        ValueError: If no candidate matches any column.
    """
    # When two columns differ only by case/whitespace the later one wins.
    by_key = {str(col).strip().lower(): col for col in cols}
    for cand in cands:
        key = str(cand).strip().lower()
        if key in by_key:
            return by_key[key]
    raise ValueError(f"Could not find any of {cands} in columns: {list(cols)}")

# Resolve the actual text/label column names present in the CSV.
TEXT_COL, LABEL_COL = (
    pick_col(candidates, df.columns)
    for candidates in (text_candidates, label_candidates)
)


# Keep only rows whose text is present and not blank after stripping.
_text = df[TEXT_COL]
_has_text = _text.notnull() & _text.astype(str).str.strip().ne("")
df = df[_has_text]


# Raw label column, normalised below.
raw = df[LABEL_COL]

# Exact (stripped, lower-cased) label spellings and the class id each maps to.
_TEXT_LABEL_MAP = {
    "negative": 0, "neg": 0, "-1": 0, "0 (negative)": 0,
    "neutral":  1, "neu": 1, "0": 1, "neutral/other": 1,
    "positive": 2, "pos": 2, "1": 2, "2": 2, "0 (positive)": 2,
}

# Recognised numeric encodings: when the set of numeric codes found in the
# column equals the first element, codes are translated with the second.
_NUMERIC_SCHEMES = (
    ({-1, 0, 1}, {-1: 0, 0: 1, 1: 2}),
    ({0, 1, 2}, {0: 0, 1: 1, 2: 2}),
    ({0, 1}, {0: 0, 1: 2}),
)

# Substring fallback for free-form labels, checked in this order.
_LABEL_FRAGMENTS = (("neg", 0), ("pos", 2), ("neu", 1))


def _encode_label(text, number, scheme):
    """Return the class id for one label, or ``pd.NA`` if it is unrecognised."""
    if pd.notna(number):
        if scheme is not None:
            return scheme[number]
        if float(number).is_integer():
            # Canonicalise e.g. 1.0 -> "1" so ints and floats hit the same key.
            text = str(int(number))
    if text in _TEXT_LABEL_MAP:
        return _TEXT_LABEL_MAP[text]
    for fragment, class_id in _LABEL_FRAGMENTS:
        if fragment in text:
            return class_id
    return pd.NA


def normalize_label_series(s):
    """Convert a raw label column to class ids (0=neg, 1=neu, 2=pos).

    Resolution order for each entry:

    1. If the numeric entries of the column form exactly one of the known
       encodings ({-1, 0, 1}, {0, 1, 2} or {0, 1}), numeric entries are
       translated with that encoding. This is decided from the column's
       values, so integer and float columns are treated alike.
    2. Otherwise the stripped, lower-cased text is looked up in the exact
       spelling table (whole-number values are looked up as "-1", "0", ...).
    3. Otherwise the text is searched for "neg", "pos", then "neu".

    Entries that match nothing become ``<NA>``.

    Args:
        s: pandas Series of raw labels (strings and/or numbers).

    Returns:
        Series of nullable ``Int64`` class ids with the index and name of ``s``.
    """
    s_str = s.astype(str).str.strip().str.lower()
    s_num = pd.to_numeric(s, errors="coerce")

    codes = set(s_num.dropna().unique().tolist())
    scheme = next(
        (mapping for known, mapping in _NUMERIC_SCHEMES if codes == known),
        None,
    )

    encoded = [
        _encode_label(text, number, scheme)
        for text, number in zip(s_str, s_num)
    ]
    return pd.Series(encoded, index=s.index, name=s.name, dtype="Int64")

# Attach normalised class ids, drop rows whose label could not be resolved,
# and store the ids as plain integers.
df["label"] = normalize_label_series(raw)
df = df.dropna(subset=["label"]).copy()
df["label"] = df["label"].astype(int)


# Keep only the two columns used downstream, under fixed names.
df = df.rename(columns={TEXT_COL: "message"}).loc[:, ["message", "label"]]


present = sorted(df["label"].unique().tolist())
print("Classes present (0=neg, 1=neu, 2=pos):", present)


# 80/10/10 split; stratified by label whenever at least two classes exist.
_stratify = len(present) >= 2
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'] if _stratify else None,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['label'] if _stratify else None,
)

print("Data split complete (YouTube dataset):")
_splits = {"train": train_df, "val": val_df, "test": test_df}
for name, d in _splits.items():
    counts = d['label'].value_counts().sort_index()
    print(name, counts.to_dict())


In [None]:
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from copy import deepcopy
from torch.cuda.amp import autocast, GradScaler
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch


# Training hyper-parameters.
EPOCHS = 30
BATCH  = 64
MAXLEN = 128


# Class ids are fixed by the label normalisation: 0=negative, 1=neutral,
# 2=positive.
TARGET_NAMES_FULL = ["negative", "neutral", "positive"]

_train_classes = sorted(train_df["label"].unique().tolist())
_unknown_classes = [c for c in _train_classes if c not in range(len(TARGET_NAMES_FULL))]
if _unknown_classes:
    raise ValueError(f"Unexpected class ids in training labels: {_unknown_classes}")
if len(_train_classes) < 2:
    raise ValueError(f"Unexpected number of classes: {len(_train_classes)}")

# The model always emits one logit per class id. Counting distinct labels
# instead would size the head at 2 for binary data encoded as {0, 2}, and
# label 2 would then be out of range for the loss.
NUM_CLASSES = len(TARGET_NAMES_FULL)
# Display names of the classes that actually occur in the training split.
TARGET_NAMES_USED = [TARGET_NAMES_FULL[c] for c in _train_classes]


def _build_dataset(frame):
    """Wrap one split's 'message'/'label' columns in a YouTubeCommentDataset."""
    return YouTubeCommentDataset(
        comments=frame['message'].tolist(),
        labels=frame['label'].tolist(),
        tokenizer=tokenizer,
        max_len=MAXLEN,
    )


train_dataset, val_dataset, test_dataset = (
    _build_dataset(frame) for frame in (train_df, val_df, test_df)
)


# Only the training loader shuffles; all loaders load in the main process.
_loader_options = dict(batch_size=BATCH, num_workers=0)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_options)


# Labels were already cast to int when the frame was prepared, so the splits
# are not re-assigned here (writing a column on a split slice is chained
# assignment and triggers pandas' SettingWithCopyWarning).
present_classes = sorted(train_df['label'].unique().tolist())

# Fail early with a readable message: a label id outside [0, NUM_CLASSES)
# would otherwise only surface as an out-of-bounds target inside the loss.
_out_of_range = [c for c in present_classes if not 0 <= c < NUM_CLASSES]
if _out_of_range:
    raise ValueError(
        f"Training labels {_out_of_range} are outside the model's "
        f"{NUM_CLASSES} output classes"
    )


model = BERTMultiKernelCNN(n_classes=NUM_CLASSES, dropout=0.4).to(device)

print("Train label counts:", train_df['label'].value_counts().sort_index().to_dict())

# 'balanced' weights are inversely proportional to class frequency.
present_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array(present_classes),
    y=train_df['label'].values
)

# Classes absent from the training split keep a neutral weight of 1.
full_weights = np.ones(NUM_CLASSES, dtype=np.float32)
full_weights[present_classes] = present_weights
class_weights = torch.tensor(full_weights, dtype=torch.float, device=device)
print("Class weights (final):", class_weights)

criterion = torch.nn.CrossEntropyLoss(weight=class_weights)


# Two parameter groups: the pretrained encoder (stored under the `bert`
# attribute) gets a smaller learning rate than the freshly initialised head.
bert_params, other_params = [], []
for name, param in model.named_parameters():
    (bert_params if name.startswith('bert.') else other_params).append(param)

optimizer = torch.optim.Adam([
    {'params': bert_params,  'lr': 1e-5},
    {'params': other_params, 'lr': 1e-4}
])


# Loss scaling is only meaningful for CUDA mixed precision; disable it on CPU.
scaler = GradScaler(enabled=(device.type == 'cuda'))

early_stop_patience = 3
# ReduceLROnPlateau lowers the LR only after MORE than `patience` epochs
# without improvement. It must therefore be smaller than the early-stopping
# patience, otherwise training stops before the LR is ever reduced.
lr_patience = 1
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', patience=lr_patience, factor=0.5
)

# Bookkeeping for model selection and the accuracy curves.
best_val_acc = 0.0
epochs_no_improve = 0
best_model_state = None
train_acc_list, val_acc_list = [], []

print("Beginning training...")
# Mixed precision is only used on CUDA; on CPU autocast is explicitly off.
use_amp = device.type == 'cuda'
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")

    # ---- training pass ----
    model.train()
    total_loss = 0.0
    all_preds, all_labels = [], []

    for batch in tqdm(train_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad(set_to_none=True)
        with autocast(enabled=use_amp):
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        # Unscale first so the clipping threshold applies to the true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        all_preds.extend(outputs.argmax(dim=1).detach().cpu().numpy())
        all_labels.extend(labels.detach().cpu().numpy())

    train_acc = accuracy_score(all_labels, all_preds)
    train_acc_list.append(train_acc)
    # Report the mean batch loss so the figure does not scale with the number
    # of batches in an epoch.
    mean_loss = total_loss / max(len(train_loader), 1)
    print(f"Train Loss: {mean_loss:.4f} | Accuracy: {train_acc:.4f}")

    # ---- validation pass ----
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            val_preds.extend(outputs.argmax(dim=1).cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    val_acc = accuracy_score(val_labels, val_preds)
    val_acc_list.append(val_acc)
    print(f"\nValidation Accuracy: {val_acc:.4f}")

    # Report only the classes that occur in the validation labels.
    present_val = sorted(np.unique(val_labels).tolist())
    target_names_val = [TARGET_NAMES_FULL[i] for i in present_val]
    print(classification_report(
        val_labels, val_preds,
        labels=present_val,
        target_names=target_names_val,
        digits=4,
        zero_division=0
    ))

    # ---- LR schedule, model selection, early stopping ----
    scheduler.step(val_acc)
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        epochs_no_improve = 0
        # Snapshot the weights on the CPU so the copy does not occupy
        # accelerator memory for the rest of training.
        best_model_state = {
            key: value.detach().cpu().clone()
            for key, value in model.state_dict().items()
        }
        print("New best model saved.")
    else:
        epochs_no_improve += 1
        print(f"No improvement for {epochs_no_improve} epochs.")
        if epochs_no_improve >= early_stop_patience:
            print("Early stopping triggered.")
            break


# Restore the best validation checkpoint, if one was recorded.
if best_model_state:
    model.load_state_dict(best_model_state)
    print("Best model loaded for final evaluation.")


# Collect predictions and ground truth over the held-out test split.
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        logits = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
        )
        predicted = logits.argmax(dim=1)
        test_preds.extend(predicted.cpu().numpy())
        test_labels.extend(batch['labels'].to(device).cpu().numpy())

print("\nFinal Test Set Evaluation:")
# Report only the classes that occur in the test labels.
present_test = sorted(np.unique(test_labels).tolist())
target_names_test = [TARGET_NAMES_FULL[class_id] for class_id in present_test]
test_report = classification_report(
    test_labels,
    test_preds,
    labels=present_test,
    target_names=target_names_test,
    digits=4,
    zero_division=0,
)
print(test_report)


# Confusion-matrix axes. With all three classes the full id range is shown.
# Otherwise the axes come from the class ids that actually occur, because a
# binary run is encoded as {0, 2} and a fixed [0, 1] would mislabel or drop
# the positive class.
if NUM_CLASSES == 3:
    labels_for_cm = list(range(len(TARGET_NAMES_FULL)))
else:
    labels_for_cm = sorted(
        {int(c) for c in test_labels} | {int(c) for c in test_preds}
    )
names_for_cm = [TARGET_NAMES_FULL[c] for c in labels_for_cm]
cm = confusion_matrix(test_labels, test_preds, labels=labels_for_cm)
plt.figure(figsize=(7, 6))
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=names_for_cm, yticklabels=names_for_cm)
plt.title("Confusion Matrix (Test)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()


# Per-epoch accuracy curves; skipped when no epoch completed.
if train_acc_list:
    plt.figure(figsize=(9, 5))
    plt.plot(train_acc_list, label='Train Acc', marker='o')
    plt.plot(val_acc_list, label='Val Acc', marker='s')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Training vs Validation Accuracy')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

print(f"Training complete. Best Validation Accuracy: {best_val_acc:.4f}")
