In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel, AutoConfig
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
from tqdm import tqdm
import re
import emoji
!pip install contractions
import contractions
import torch.nn.functional as F
import numpy as np



Explanation:

* PyTorch: Core library for building and training neural networks.

* Transformers: Provides pre-trained models and tokenizers from Hugging Face.

* scikit-learn: Offers utilities for evaluation metrics and handling class imbalance.

* pandas: Facilitates data manipulation and analysis.

* tqdm: Displays progress bars for loops.

* re & emoji: Used for text preprocessing tasks.

* contractions: Expands English contractions (e.g., "can't" to "cannot").



In [None]:
# Run on CUDA when a GPU is visible to PyTorch; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Hugging Face checkpoint identifier; the tokenizer below is loaded from it.
model_name = "bert-base-uncased"

# Tokenizer that belongs to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)


Explanation:

* Device Setup: Utilizes GPU if available; otherwise, defaults to CPU.

* Tokenizer: Loads the BERT tokenizer to convert text into tokens suitable for the model.



In [None]:
#Custom Dataset class for sentiment analysis data
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text sample per index.

    Args:
        texts: Indexable collection of raw text samples.
        labels: Optional indexable collection of integer labels, aligned
            with ``texts``. When omitted, items carry no ``'Label'`` entry.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=...,
            return_tensors='pt')`` that returns a mapping containing
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length passed to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``labels`` is given but its length differs from
            the length of ``texts``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=128):
        # Fail at construction time instead of with an IndexError mid-epoch.
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} texts and {len(labels)} labels)"
            )
        self.texts = texts              #List of raw text samples
        self.labels = labels            #Optional list of labels
        self.tokenizer = tokenizer      #Tokenizer to encode the texts
        self.max_len = max_len          #Maximum sequence length for padding/truncation

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    @staticmethod
    def _as_text(value):
        """Coerce a raw sample to ``str``; missing values become an empty string."""
        if isinstance(value, str):
            return value
        # ``value != value`` is only true for NaN, which is how pandas
        # represents an empty CSV cell in an object column.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def __getitem__(self, idx):
        """Tokenize the sample at ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (leading
            batch dimension squeezed away) and, when labels were provided,
            ``'Label'`` as a ``torch.long`` tensor.
        """
        #Tokenize and encode the text at the given index
        encoding = self.tokenizer(
            self._as_text(self.texts[idx]),
            padding='max_length',       #Pad to max_len
            truncation=True,            #Truncate if longer than max_len
            max_length=self.max_len,
            return_tensors='pt'         #Return PyTorch tensors
        )

        #Prepare the dictionary of inputs for the model, squeezing batch dimension
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }

        #Add label tensor if labels are provided
        if self.labels is not None:
            item['Label'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item


Explanation:

* SentimentDataset: Custom dataset class that:

* Tokenizes the raw text (padding/truncating to `max_len`); no text cleaning is performed inside the class.

* Prepares inputs for the BERT model.

* Handles optional labels for supervised learning.



In [None]:
# Read the three splits from the Kaggle input directory, then expose the
# original columns under the lower-case aliases used by later cells.
train_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/train_dataset.csv")
train_df = train_df.assign(raw_text=train_df["Text"], label=train_df["Label"])

val_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/val_dataset.csv")
val_df = val_df.assign(raw_text=val_df["Text"], label=val_df["Label"])

# Only the text column is aliased for the test split.
test_df = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3/test_dataset.csv")
test_df = test_df.assign(raw_text=test_df["Text"])


Explanation:

* Data Loading: Reads training, validation, and test datasets.

* Preprocessing: Copies the `Text` and `Label` columns into `raw_text` and `label` for later use; no text cleaning is applied in this cell.

In [None]:
# Label arrays kept for the class-distribution plot further down.
y_train = train_df["label"].values
y_val = val_df["label"].values

# Wrap each split in a SentimentDataset; every split shares the same tokenizer.
train_dataset = SentimentDataset(
    train_df["Text"].tolist(),
    train_df["Label"].tolist(),
    tokenizer,
)
val_dataset = SentimentDataset(
    val_df["Text"].tolist(),
    val_df["Label"].tolist(),
    tokenizer,
)
# No labels are passed for the test split.
test_dataset = SentimentDataset(test_df["Text"].tolist(), tokenizer=tokenizer)

# Batch the datasets. Only the training loader shuffles; validation and test
# keep file order so predictions line up with the source rows.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


Explanation:

* Datasets: Creates instances of the custom dataset for training, validation, and testing.

* DataLoaders: Facilitates batch processing and shuffling for model training and evaluation.

In [None]:
#Custom binary classifier using a pretrained BERT model as the base
class BertBinaryClassifier(nn.Module):
    """Encoder + dropout + linear head operating on the first-token embedding.

    Args:
        base_model: Encoder module. It must expose ``config.hidden_size`` and,
            when called with ``input_ids`` and ``attention_mask`` keyword
            arguments, return an object with a ``last_hidden_state`` tensor.
        dropout: Dropout probability applied to the first-token embedding.
            The default keeps the value previously hard-coded here.
        num_classes: Number of output logits. Defaults to 2 (binary).
    """

    def __init__(self, base_model, dropout=0.4162992069057314, num_classes=2):
        super().__init__()
        self.bert = base_model                     #Pretrained BERT model
        self.dropout = nn.Dropout(dropout)         #Dropout layer for regularization
        #Linear layer mapping the encoder's hidden state to the output classes
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits, one row per input sequence."""
        #Forward pass through the encoder
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        #Take the representation at sequence position 0 (the [CLS] token for BERT)
        cls_output = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_output)                #Apply dropout (identity in eval mode)
        return self.classifier(x)                   #Logits, one column per class


Explanation:

* BertBinaryClassifier: Custom model that:

* Utilizes a pre-trained BERT model.

* Applies dropout for regularization.

* Adds a linear layer for binary classification based on the [CLS] token representation.

In [None]:
# Fetch the checkpoint's configuration, then the pretrained encoder built from it.
config = AutoConfig.from_pretrained(model_name)
base_model = AutoModel.from_pretrained(model_name, config=config)

# Attach the classification head and place the whole model on the target device.
model = BertBinaryClassifier(base_model)
model = model.to(device)


Explanation:

* Model Setup: Loads the pre-trained BERT model and initializes the custom classifier, moving it to the appropriate device (GPU or CPU).

In [None]:
#Compute balanced class weights to handle class imbalance in training data
#NOTE: `classes` is passed as a NumPy array; recent scikit-learn releases
#validate this parameter and reject a plain Python list.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array([0, 1]),     #Classes for binary classification
    y=train_df["Label"]           #Labels from training data
)

#Convert class weights to a PyTorch tensor and move to the current device (CPU or GPU)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

#Define the loss function with class weights to penalize underrepresented classes more
criterion = nn.CrossEntropyLoss(weight=class_weights)

#Set up the AdamW optimizer with a specified learning rate
optimizer = optim.AdamW(model.parameters(), lr=1.2298191651702114e-05)


Explanation:

* Class Weights: Computes weights to handle class imbalance in the dataset.

* Loss Function: Uses weighted cross-entropy loss for classification.

* Optimizer: Employs AdamW optimizer with a specified learning rate for training.

In [None]:
for epoch in range(3):
    # ---------------- Training pass ----------------
    model.train()  # training mode (dropout active)
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in loop:
        # Transfer the three batch tensors to the compute device
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ("input_ids", "attention_mask", "Label")
        )

        # One optimisation step: reset grads -> forward -> loss -> backward -> update
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Surface the latest batch loss on the progress bar
        loop.set_postfix(loss=loss.item())

    # ---------------- Validation pass ----------------
    model.eval()  # evaluation mode (dropout disabled)
    # The accumulators are rebuilt every epoch, so after the loop they hold
    # the results of the final epoch only.
    preds, true_labels, val_targets, val_probs = [], [], [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = (
                batch[key].to(device) for key in ("input_ids", "attention_mask", "Label")
            )

            outputs = model(input_ids, attention_mask)
            pred = outputs.argmax(dim=1)              # predicted class per sample
            probs = outputs.softmax(dim=1)[:, 1]      # probability of class 1

            # Hard predictions and targets for the accuracy / report below
            preds.extend(pred.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

            # Targets and positive-class probabilities kept for later cells
            val_targets.extend(labels.cpu().numpy())
            val_probs.extend(probs.cpu().numpy())

    acc = accuracy_score(true_labels, preds)
    print(f"\nValidation Accuracy: {acc:.4f}")
    print(classification_report(true_labels, preds))


Explanation:

* Training Loop: Iterates over epochs, performing forward and backward passes, and updates model parameters.

* Evaluation: After each epoch, evaluates the model on the validation set and prints accuracy and classification metrics.

In [None]:
# === Generate Submission ===
# Predict a label for every test sample, write submission.csv, and keep the
# encoder's first-token embeddings (with predicted labels) of the first
# NUM_SAMPLES_FOR_TSNE samples for the t-SNE plot.
model.eval()
# --- Setup for submission and t-SNE ---
submission_preds = []
bert_embeddings_list = []
bert_pred_labels_list = []
NUM_SAMPLES_FOR_TSNE = 2000 # Limit samples for faster plotting

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting and Generating Embeddings"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # --- Part 1: Get predictions (for submission) ---
        # Call the full model to get the final logits
        logits = model(input_ids, attention_mask)
        predictions = torch.argmax(logits, dim=1)
        submission_preds.extend(predictions.cpu().numpy())

        # --- Part 2: Get embeddings (for t-SNE) ---
        # Only take as many rows as are still missing, so the collected set
        # never exceeds NUM_SAMPLES_FOR_TSNE (previously the cap was checked
        # per batch and could overshoot by almost a full batch).
        remaining = NUM_SAMPLES_FOR_TSNE - len(bert_pred_labels_list)
        if remaining > 0:
            # Call the internal .bert module directly; it returns the object
            # that has the .last_hidden_state attribute.
            bert_output = model.bert(
                input_ids=input_ids[:remaining],
                attention_mask=attention_mask[:remaining],
            )
            cls_embeddings = bert_output.last_hidden_state[:, 0, :] # Extract [CLS] token

            bert_embeddings_list.append(cls_embeddings.cpu())
            # Use predicted labels for color; slice to stay aligned with the embeddings
            bert_pred_labels_list.extend(predictions[:remaining].cpu().numpy())

# --- Create Submission File ---
test_df["Label"] = submission_preds
test_df[["ID", "Label"]].to_csv("submission.csv", index=False)
print("✅ Submission saved using BERT")

# --- Prepare data for t-SNE plot ---
bert_embeddings = torch.cat(bert_embeddings_list, dim=0).numpy()
bert_labels = np.array(bert_pred_labels_list)



In [None]:
# 📦 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from textblob import TextBlob
import emoji
from collections import Counter
from sklearn.metrics import (
    accuracy_score, classification_report, roc_curve, auc, confusion_matrix
)
from sklearn.manifold import TSNE


In [None]:
# 🌥️ Generate Word Cloud
def plot_wordcloud(text_series, filename):
    """Build a word cloud from a pandas Series of texts and save it to ``filename``.

    Missing entries are dropped and the remaining values are cast to ``str``
    before being joined with single spaces. The figure is saved and closed
    without being shown.
    """
    corpus = ' '.join(text_series.dropna().astype(str))
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud.generate(corpus)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    fig.tight_layout()
    fig.savefig(filename, bbox_inches='tight')
    plt.close(fig)


In [None]:
# 📏 Compare token counts before and after cleaning
def plot_length_distribution(raw_texts, clean_texts, filename='text_len_dist.png'):
    """Overlay histograms of whitespace-token counts for raw and cleaned texts.

    Both inputs are pandas Series; missing entries are dropped and values are
    cast to ``str`` before splitting on whitespace. The figure is written to
    ``filename``, shown, and then closed.
    """
    def _token_counts(series):
        # Number of whitespace-separated tokens per text
        return series.dropna().astype(str).str.split().map(len)

    fig, ax = plt.subplots(figsize=(10, 5))
    for counts, name, colour in (
        (_token_counts(raw_texts), 'Raw', 'red'),
        (_token_counts(clean_texts), 'Clean', 'green'),
    ):
        ax.hist(counts, bins=50, alpha=0.6, label=name, color=colour)

    ax.set_xlabel("Text Length (Tokens)")
    ax.set_ylabel("Frequency")
    ax.set_title("Text Length Distribution: Raw vs Clean")
    ax.legend()
    fig.tight_layout()
    fig.savefig(filename)
    plt.show()
    plt.close(fig)


In [None]:
# 😂 Extract emojis from a string
def extract_emojis(text):
    """Return the emojis found in ``text``, in order of appearance.

    Uses ``emoji.emoji_list`` so that an emoji made of several code points
    (flags, ZWJ sequences, skin-tone variants) is returned as one item rather
    than being split into its individual characters.
    """
    return [match['emoji'] for match in emoji.emoji_list(text)]

# 📊 Plot most common emojis
def plot_emoji_freq(texts, top_n=10, filename='emoji_freq.png'):
    """Bar-plot the ``top_n`` most frequent emojis in a pandas Series of texts.

    Missing entries are dropped and values are cast to ``str``. When no emoji
    is found, a message is printed and nothing is plotted. Otherwise the
    figure is written to ``filename``, shown, and then closed.
    """
    tally = Counter()
    for text in texts.dropna().astype(str):
        tally.update(extract_emojis(text))

    top_emojis = tally.most_common(top_n)
    if len(top_emojis) == 0:
        print("No emojis found.")
        return

    # Split the (emoji, count) pairs into parallel sequences for plt.bar
    symbols = tuple(symbol for symbol, _count in top_emojis)
    counts = tuple(count for _symbol, count in top_emojis)

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.bar(symbols, counts, color='orange')
    ax.set_title("Top Emojis Before Conversion")
    fig.tight_layout()
    fig.savefig(filename)
    plt.show()
    plt.close(fig)


In [None]:
# 🧭 Visualize sentiment polarity from TextBlob
def plot_sentiment_distribution(texts, filename='sent_pol.png'):
    """Histogram of TextBlob polarity scores for a pandas Series of texts.

    Missing entries are dropped and values are cast to ``str`` before scoring.
    The figure is written to ``filename``, shown, and then closed.
    """
    polarities = []
    for text in texts.dropna().astype(str):
        polarities.append(TextBlob(text).sentiment.polarity)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.hist(polarities, bins=50, color='blue', alpha=0.7)
    ax.set_xlabel("Sentiment Polarity")
    ax.set_ylabel("Frequency")
    ax.set_title("Sentiment Polarity Distribution (TextBlob)")
    fig.tight_layout()
    fig.savefig(filename)
    plt.show()
    plt.close(fig)


In [None]:
# 🧮 Visualize class balance in train/val sets
def plot_class_distribution(y_train, y_val, filename='train+val_dist.png'):
    """Bar chart of how many samples carry label 0 and label 1 in each split.

    ``y_train`` and ``y_val`` must support element-wise ``== value`` followed
    by ``.sum()`` (for example NumPy arrays). The figure is written to
    ``filename``, shown, and then closed.
    """
    splits = (('Train', y_train), ('Val', y_val))
    # Bars are ordered Train 0, Train 1, Val 0, Val 1
    labels = [f"{name} {cls}" for name, _ in splits for cls in (0, 1)]
    values = [(targets == cls).sum() for _, targets in splits for cls in (0, 1)]

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.bar(labels, values, color=['red', 'blue', 'red', 'blue'])
    ax.set_title("Class Distribution in Train and Validation Sets")
    fig.tight_layout()
    fig.savefig(filename)
    plt.show()
    plt.close(fig)


In [None]:
# ✅ Run all plots (ensure your data is defined: train_df, clean_text, y_train, y_val, etc.)
# Word cloud of the training texts, saved to raw_wc.png
plot_wordcloud(text_series=train_df['Text'], filename='raw_wc.png')

# Most frequent emojis in the training texts
plot_emoji_freq(texts=train_df['raw_text'])

# Label counts for the training and validation splits
plot_class_distribution(y_train=y_train, y_val=y_val)


In [None]:
# 📈 Plot ROC Curve
# ROC curve from the validation targets and positive-class probabilities
# collected by the training cell.
fpr, tpr, _ = roc_curve(val_targets, val_probs)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.4f})")
# Dashed diagonal as the chance-level reference
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Validation ROC Curve")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# 🔍 t-SNE Plot of BERT Embeddings
# Project the collected first-token embeddings to 2-D with t-SNE.
# - The iteration count is left at the library default (1000, the value that
#   was previously passed explicitly), because the keyword for it was renamed
#   from `n_iter` to `max_iter` in newer scikit-learn releases.
# - t-SNE requires perplexity < number of samples, so cap it for small inputs.
tsne = TSNE(
    n_components=2,
    perplexity=min(30, max(1, len(bert_embeddings) - 1)),
    learning_rate=200,
    random_state=42,
)
bert_tsne_embeds = tsne.fit_transform(bert_embeddings)

plt.figure(figsize=(10, 6))
# Colour each point by the model's predicted label (0 -> red, otherwise green)
colors = ['red' if l == 0 else 'green' for l in bert_labels]
plt.scatter(bert_tsne_embeds[:, 0], bert_tsne_embeds[:, 1], c=colors, alpha=0.6)
plt.title("t-SNE Visualization of BERT [CLS] Embeddings")
plt.xlabel("t-SNE Dim 1")
plt.ylabel("t-SNE Dim 2")
# Proxy handles are needed because the scatter call carries no per-class labels
plt.legend(handles=[
    plt.Line2D([0], [0], marker='o', color='w', label='Negative', markerfacecolor='red', markersize=8),
    plt.Line2D([0], [0], marker='o', color='w', label='Positive', markerfacecolor='green', markersize=8)
])
plt.tight_layout()
plt.show()


In [None]:
# 📉 Confusion Matrix
# Threshold the positive-class probabilities at 0.5 to obtain hard predictions
val_probs = np.array(val_probs)
val_preds = np.where(val_probs >= 0.5, 1, 0)
cm = confusion_matrix(val_targets, val_preds)
labels = ['Negative', 'Positive']

# Rows are true labels, columns are predicted labels
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix on Validation Set')
fig.tight_layout()
plt.show()
