In [None]:
# Basic packages
import pandas as pd
import numpy as np
import re
import gensim
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random
import os

# Text processing libraries
!pip install -q contractions textblob emoji
import contractions
import emoji
from textblob import TextBlob
import nltk
nltk.download('punkt')

# Seed shared by every random number generator used in this notebook.
SEED = 42


def set_seed(seed=SEED):
    """Seed the Python, NumPy and PyTorch RNGs and set the cuDNN determinism flags.

    Args:
        seed: Integer seed applied to every generator; defaults to ``SEED``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Every seeding entry point takes the same single integer argument.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # Disable cuDNN autotuning and ask for deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed()


This block imports all necessary libraries for data handling, preprocessing, training, and evaluation.  
It also installs a few external libraries for text normalization (like `contractions`, `emoji`, `textblob`)  
and sets a global seed for reproducibility of results.


In [None]:
# Read the train, validation and test CSV files into DataFrames (in that order).
train_df, val_df, test_df = map(
    pd.read_csv,
    (
        "/kaggle/input/ai-2-dl-for-nlp-2025-homework-2/train_dataset.csv",
        "/kaggle/input/ai-2-dl-for-nlp-2025-homework-2/val_dataset.csv",
        "/kaggle/input/ai-2-dl-for-nlp-2025-homework-2/test_dataset.csv",
    ),
)


This block loads the training, validation, and test datasets provided by the assignment into Pandas DataFrames.  
These datasets contain tweets and their corresponding sentiment labels.


In [None]:
# Collapse runs of three or more identical characters down to two (e.g. "soooo" -> "soo").
def reduce_repeated_chars(word):
    """Return *word* with every run of 3+ identical characters shortened to 2.

    Runs of one or two characters are left untouched.
    """
    long_run = re.compile(r'(.)\1{2,}')
    return long_run.sub(r'\1\1', word)

#Add negation prefix "NOT_" to words following a negation cue
def handle_negation(tokens):
    """Mark tokens that fall inside the scope of a negation cue.

    A negation scope opens at a cue word ("not", "no", "never", "n't",
    "cannot") and closes at the next sentence-ending punctuation token
    (".", "!" or "?"). Every non-cue token inside the scope is prefixed with
    "NOT_". Cue words and the closing punctuation are emitted unchanged.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list of tokens with the "NOT_" prefix applied.
    """
    negation_cues = {"not", "no", "never", "n't", "cannot"}
    scope_enders = {".", "!", "?"}

    result = []
    negate = False
    for word in tokens:
        if word in scope_enders:
            # Punctuation ends the scope and must stay intact; prefixing it
            # would create tokens such as "NOT_." that never match ".".
            negate = False
            result.append(word)
        elif word in negation_cues:
            negate = True
            result.append(word)
        elif negate:
            result.append("NOT_" + word)
        else:
            result.append(word)
    return result

#Full preprocessing pipeline
def preprocess_text(text):
    """Turn a raw text into a list of normalized, negation-marked tokens.

    Steps, in order: emoji -> text, contraction expansion, lowercasing,
    HTML-tag removal, URL and mention placeholders, hashtag removal,
    punctuation spacing, character whitelist, whitespace tokenization,
    repeated-character reduction and negation marking.

    Args:
        text: Raw text. Non-string values (e.g. NaN from an empty CSV cell)
            produce an empty token list.

    Returns:
        List of string tokens.
    """
    if not isinstance(text, str):
        return []

    text = emoji.demojize(text)                        #Convert emojis to text(e.g.😂 ->:face_with_tears_of_joy:)
    text = contractions.fix(text)                      #Expand contractions(e.g.can't ->cannot)
    text = text.lower()                                #Lowercase

    # Tag and hashtag removal must run BEFORE the character whitelist below:
    # the whitelist strips "<", ">" and "#", after which these patterns can
    # never match. The tag pattern requires a letter after "<" so that
    # emoticons such as "<3" are not mistaken for markup.
    text = re.sub(r"</?[a-z][^<>]*>", "", text)        #Remove HTML tags
    text = re.sub(r"http\S+", "<URL>", text)           #Replace URLs
    text = re.sub(r"@\w+", "<MENTION>", text)          #Replace mentions
    text = re.sub(r"#\S+", "", text)                   #Remove hashtags

    text = re.sub(r"([!?.])", r" \1 ", text)           #Add spacing around punctuation
    # The whitelist also drops digits and the brackets around the
    # placeholders, so no separate number-removal pass is needed.
    text = re.sub(r"[^a-zA-Z!?.,\s]", "", text)        #Remove special characters

    words = [reduce_repeated_chars(w) for w in text.split()]
    return handle_negation(words)


This block defines the text preprocessing pipeline:

- Reduces character repetitions (e.g., "soooo" → "soo")
- Handles negations by prefixing following words with "NOT_"
- Converts emojis to text
- Expands contractions (e.g., "can't" → "cannot")
- Replaces URLs and mentions with placeholder tokens and removes noise such as hashtags, numbers, and special characters
- Spaces out punctuation for better tokenization

The result is a clean list of tokens preserving sentiment-relevant features.


In [None]:
# Tokenize the text column of every split with the shared preprocessing pipeline.
X_train_tokens, X_val_tokens, X_test_tokens = (
    frame['Text'].apply(preprocess_text).tolist()
    for frame in (train_df, val_df, test_df)
)

# Fit the Word2Vec model on the training tokens only.
w2v_model = Word2Vec(
    sentences=X_train_tokens,
    vector_size=400,
    window=7,
    min_count=2,
    workers=1,
    sg=1,
    epochs=10,
    seed=SEED,
)

#Convert each sentence to an average Word2Vec embedding
def average_embeddings(token_lists, model):
    """Average the word vectors of each token list into one vector per text.

    Args:
        token_lists: Iterable of token lists, one per text.
        model: Object exposing ``wv`` (supports ``token in wv`` and
            ``wv[token]``) and an integer ``vector_size``.

    Returns:
        A float32 tensor of shape ``(number of texts, model.vector_size)``.
        Texts with no in-vocabulary token get an all-zero row.
    """
    token_lists = list(token_lists)
    # Fill one preallocated array: converting a Python list of ndarrays with
    # torch.tensor() is very slow, and it also yields shape (0,) instead of
    # (0, vector_size) when there are no texts at all.
    averaged = np.zeros((len(token_lists), model.vector_size), dtype=np.float32)
    for row, tokens in enumerate(token_lists):
        vecs = [model.wv[w] for w in tokens if w in model.wv]
        if vecs:
            averaged[row] = np.mean(vecs, axis=0)
    return torch.from_numpy(averaged)

# One averaged-embedding matrix per split, all built with the same Word2Vec model.
X_train_avg, X_val_avg, X_test_avg = (
    average_embeddings(split_tokens, w2v_model)
    for split_tokens in (X_train_tokens, X_val_tokens, X_test_tokens)
)


This block:

1. Applies the preprocessing pipeline to all tweets (train/val/test).
2. Trains a Word2Vec model (skip-gram) using Gensim on the tokenized training tweets.
3. Converts each tweet into a single fixed-length vector by averaging its word embeddings.


In [None]:
# The final feature matrices are the averaged embeddings themselves (same objects, no copy).
X_train_final, X_val_final, X_test_final = X_train_avg, X_val_avg, X_test_avg


This block prepares the final feature matrices for training and evaluation.  
In this version, only the averaged Word2Vec embeddings are used as input.  
If future improvements involve metadata, they can be concatenated here.


In [None]:
# Minimal map-style Dataset pairing a feature container with a label container.
class SimpleDataset(Dataset):
    """Expose ``(X[idx], y[idx])`` pairs through the Dataset interface."""

    def __init__(self, X, y):
        """Store the indexable feature container ``X`` and label container ``y``."""
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, taken from the label container."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label

#Labels as tensors
y_train = torch.tensor(train_df["Label"].values, dtype=torch.long)
y_val = torch.tensor(val_df["Label"].values, dtype=torch.long)

#DataLoaders
# Training batches are reshuffled every epoch so the optimizer does not see
# the samples in the same fixed file order each time. The dedicated seeded
# generator keeps the shuffling order reproducible across runs.
train_loader = DataLoader(
    SimpleDataset(X_train_final, y_train),
    batch_size=32,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
)
# Validation order does not affect the metrics, so it stays unshuffled.
val_loader = DataLoader(SimpleDataset(X_val_final, y_val), batch_size=32, shuffle=False)


This block defines a custom PyTorch `Dataset` class that wraps the features and labels,  
and constructs `DataLoader` objects for batching the training and validation sets.


In [None]:
#Simple FFNN with 3 linear layers + dropout and batch norm
class FeedForwardClassifier(nn.Module):
    """Feed-forward classifier: two hidden blocks followed by a linear output layer.

    Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout. The second
    hidden layer is half as wide as the first.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of the first hidden layer.
        dropout: Dropout probability applied after each hidden block.
        num_classes: Number of output logits. Defaults to 2, the previously
            hard-coded value.
    """

    def __init__(self, input_dim, hidden_dim=512, dropout=0.5, num_classes=2):
        super().__init__()
        half_dim = hidden_dim // 2
        # Layer order is kept unchanged so that state_dict keys
        # ("model.0.weight", ...) stay the same.
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, half_dim),
            nn.BatchNorm1d(half_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(half_dim, num_classes),
        )

    def forward(self, x):
        """Return the raw class logits for the input batch ``x``."""
        return self.model(x)


This block defines the Feedforward Neural Network (FFNN) classifier.  
It uses three fully connected layers with Batch Normalization, ReLU activations,  
and Dropout for regularization. The final layer outputs logits for the two sentiment classes.


In [None]:
input_dim = X_train_final.shape[1]
ffnn = FeedForwardClassifier(input_dim)

#Optimizer & scheduler
optimizer = optim.AdamW(ffnn.parameters(), lr=0.0001, weight_decay=1e-4)
# The scheduler is stepped with validation ACCURACY, which should increase.
# With the default mode="min" every improvement would count as a bad epoch
# and the learning rate would be halved while the model is still improving.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=3)

#Handle class imbalance
from sklearn.utils.class_weight import compute_class_weight
# Both arguments are passed as NumPy arrays rather than mixing in a torch tensor.
class_weights = compute_class_weight('balanced', classes=np.unique(y_train.numpy()), y=y_train.numpy())
criterion = nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float32))

# Best validation accuracy seen so far and the matching weights.
best_val_acc = 0.0
best_state = None

#Training metrics
# One entry is appended per epoch to each list.
history = {
    "train_loss": [],
    "val_loss": [],
    "val_acc": [],
    "lr": []
}

#Training loop
import copy  # needed only to snapshot the best weights below

for epoch in range(25):
    # ---- one optimisation pass over the training batches ----
    ffnn.train()
    epoch_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        logits = ffnn(X_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Mean of the per-batch losses.
    avg_train_loss = epoch_loss / len(train_loader)
    history["train_loss"].append(avg_train_loss)

    #Validation
    ffnn.eval()
    val_loss = 0
    y_true, y_pred = [], []
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = ffnn(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            y_true.extend(y_batch.numpy())
            y_pred.extend(preds.numpy())

    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    avg_val_loss = val_loss / len(val_loader)

    history["val_loss"].append(avg_val_loss)
    history["val_acc"].append(acc)
    history["lr"].append(optimizer.param_groups[0]["lr"])

    print(f"[FFNN] Epoch {epoch+1}: "
          f"Train Loss = {avg_train_loss:.4f} | "
          f"Val Loss = {avg_val_loss:.4f} | "
          f"Val Acc = {acc:.4f} | "
          f"Precision = {precision:.4f} | "
          f"Recall = {recall:.4f} | "
          f"F1 = {f1:.4f}")

    # The scheduler monitors validation accuracy.
    scheduler.step(acc)
    if acc > best_val_acc:
        best_val_acc = acc
        # state_dict() returns references to the live parameter tensors, so
        # later optimizer steps would silently overwrite a plain reference.
        # A deep copy freezes the weights of the best epoch.
        best_state = copy.deepcopy(ffnn.state_dict())


This block performs the training and validation of the FFNN model:

- Uses AdamW optimizer and learning rate scheduler to adapt during training
- Applies class-weighted CrossEntropyLoss to handle class imbalance
- Tracks training and validation loss and metrics (accuracy, precision, recall, F1)
- Saves the best model weights based on validation accuracy


In [None]:
# Restore the best checkpoint and predict on the test features in inference mode.
ffnn.load_state_dict(best_state)
ffnn.eval()
with torch.no_grad():
    # Class probabilities first, then the most probable class per row.
    probs = ffnn(X_test_final).softmax(dim=1)
    preds = probs.argmax(dim=1)


This block restores the best model weights (selected by validation accuracy) and predicts a label for every test sample.

In [None]:
# Attach the predicted labels and write only the ID/Label columns to the submission file.
test_df["Label"] = preds.numpy()
test_df.to_csv("submission.csv", columns=["ID", "Label"], index=False)
print("✅ Submission saved to submission.csv")


This block adds the predicted labels to the test DataFrame and saves the `ID` and `Label` columns to `submission.csv`.

In [None]:
# ===============================
# 📊 CHART GENERATION CODE
# ===============================
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from textblob import TextBlob
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay, 
                             roc_curve, auc, RocCurveDisplay)
import seaborn as sns
# Helper returning the texts of a DataFrame, either raw or preprocessed.
def process_texts(df, preprocess=False):
    """Return the ``Text`` column of *df* as a list of strings.

    Args:
        df: DataFrame with a ``Text`` column.
        preprocess: When true, each text is passed through ``preprocess_text``
            and its tokens are re-joined with single spaces.
    """
    if not preprocess:
        return df['Text'].tolist()
    return [' '.join(preprocess_text(raw)) for raw in df['Text']]

# ======================
# 1.Word Clouds
# ======================
def generate_wordcloud(texts, title, filename):
    """Render a word cloud of *texts*, save it to *filename* and show it.

    Args:
        texts: Iterable of strings; they are joined with spaces into one corpus.
        title: Title drawn above the image.
        filename: Path the figure is saved to.
    """
    corpus = ' '.join(texts)
    cloud = WordCloud(width=800, height=400, background_color='white').generate(corpus)

    plt.figure(figsize=(12, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title, size=16)
    plt.axis('off')
    plt.savefig(filename, bbox_inches='tight')
    plt.show()


# Word cloud of the untouched training texts.
raw_texts = process_texts(train_df, preprocess=False)
generate_wordcloud(texts=raw_texts, title='Raw Text Word Cloud', filename='wordcloud_raw.png')

# Word cloud of the same texts after the preprocessing pipeline.
processed_texts = process_texts(train_df, preprocess=True)
generate_wordcloud(texts=processed_texts, title='Processed Text Word Cloud', filename='wordcloud_clean.png')

# ======================
# 2.Text Length Distribution
# ======================
def plot_length_distribution(raw, processed, filename):
    """Overlay histograms of token counts before and after preprocessing.

    Args:
        raw: Token counts of the raw texts.
        processed: Token counts of the preprocessed texts.
        filename: Path the figure is saved to.
    """
    plt.figure(figsize=(12, 6))
    # (data, number of bins, colour, legend label) for each histogram.
    for lengths, n_bins, shade, legend_name in (
        (raw, 50, 'skyblue', 'Raw Texts'),
        (processed, 30, 'salmon', 'Processed Texts'),
    ):
        sns.histplot(lengths, bins=n_bins, color=shade, label=legend_name, kde=True)
    plt.title('Text Length Distribution Before/After Preprocessing')
    plt.xlabel('Number of Tokens')
    plt.ylabel('Frequency')
    plt.legend()
    plt.savefig(filename)
    plt.show()


# Whitespace-token counts of the raw texts vs. pipeline-token counts.
raw_lengths = [len(raw_text.split()) for raw_text in train_df['Text']]
processed_lengths = [len(tokens) for tokens in map(preprocess_text, train_df['Text'])]
plot_length_distribution(raw_lengths, processed_lengths, 'text_lengths.png')

# ======================
# 3.Sentiment Dynamics
# ======================
def plot_sentiment_distribution(df, filename):
    """Plot a histogram of TextBlob polarity scores for the ``Text`` column of *df*.

    Args:
        df: DataFrame with a ``Text`` column.
        filename: Path the figure is saved to.
    """
    polarity_scores = []
    for tweet in df['Text']:
        polarity_scores.append(TextBlob(tweet).sentiment.polarity)

    plt.figure(figsize=(10, 6))
    sns.histplot(polarity_scores, bins=50, kde=True, color='purple')
    plt.title('Sentiment Polarity Distribution')
    plt.xlabel('Sentiment Polarity')
    plt.ylabel('Count')
    plt.savefig(filename)
    plt.show()


plot_sentiment_distribution(df=train_df, filename='sentiment_dist.png')

# ======================
# 4.Class Distribution
# ======================
def plot_class_distribution(train_labels, val_labels, filename):
    """Draw side-by-side bar charts of class counts for the train and validation sets.

    Args:
        train_labels: Labels of the training set.
        val_labels: Labels of the validation set.
        filename: Path the figure is saved to.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # value_counts() orders classes by frequency, so without sort_index() the
    # colour given to a class could differ between the two panels whenever
    # the majority class differs. Sorting by label keeps the mapping fixed.
    #Training set
    train_counts = pd.Series(train_labels).value_counts().sort_index()
    ax1.bar(train_counts.index, train_counts.values, color=['salmon', 'skyblue'])
    ax1.set_title('Training Set Class Distribution')
    ax1.set_xticks([0, 1])
    ax1.set_ylabel('Count')

    #Validation set
    val_counts = pd.Series(val_labels).value_counts().sort_index()
    ax2.bar(val_counts.index, val_counts.values, color=['salmon', 'skyblue'])
    ax2.set_title('Validation Set Class Distribution')
    ax2.set_xticks([0, 1])

    plt.tight_layout()
    plt.savefig(filename)
    plt.show()

plot_class_distribution(train_df['Label'], val_df['Label'], 'class_balance.png')


# Global plot style for the figures below: seaborn theme first, then matplotlib overrides.
sns.set(style="whitegrid", font_scale=1.2)
plt.rcParams.update({
    "figure.figsize": (12, 8),
    "figure.dpi": 300,
})

def plot_feature_impact(X_train_avg, X_train_final, y_train):
    """Show t-SNE projections of two feature matrices side by side.

    At most 2000 rows are sampled without replacement, and the same rows are
    used for both matrices. Points are coloured by label. The figure is saved
    to "feature_impact.png".

    Args:
        X_train_avg: Tensor of baseline features (left panel).
        X_train_final: Tensor of final features (right panel).
        y_train: Tensor of labels, one per row.
    """
    # t-SNE is expensive, so only a random subset is projected.
    n_points = len(y_train)
    sample_size = min(2000, n_points)
    indices = np.random.choice(n_points, sample_size, replace=False)
    labels = y_train.numpy()[indices]

    # Perplexity is capped below the sample size for small datasets.
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, sample_size - 1))

    # (panel title, feature tensor), projected in left-to-right order.
    panels = (
        ("Word2Vec Only", X_train_avg),
        ("Word2Vec + Meta Features", X_train_final),
    )
    projections = [tsne.fit_transform(feats.numpy()[indices]) for _, feats in panels]

    fig, axes = plt.subplots(1, 2, figsize=(18, 8))
    for axis, (panel_title, _), coords in zip(axes, panels, projections):
        scatter = axis.scatter(coords[:, 0], coords[:, 1], c=labels, cmap="coolwarm", alpha=0.6)
        axis.set_title(panel_title, fontsize=14)
        axis.set_xlabel("t-SNE 1")
    axes[0].set_ylabel("t-SNE 2")

    # The colorbar is attached to the right-hand panel (the last scatter drawn).
    cbar = fig.colorbar(scatter, ax=axes[1], ticks=[0, 1])
    cbar.set_label("Sentiment Class")
    cbar.set_ticklabels(["Negative", "Positive"])

    plt.suptitle("t-SNE Visualization of Feature Spaces (Sampled)", fontsize=16)
    plt.savefig("feature_impact.png", bbox_inches="tight")
    plt.show()


def plot_lr_strategies():
    """Plot hard-coded validation-accuracy curves for three learning-rate strategies.

    The values are fixed inside the function and nothing is read from the
    training run. The figure is saved to "lr_strategies.png".
    """
    # (legend label, extra line-style kwargs, per-epoch validation accuracy)
    curves = [
        (
            "ReduceLROnPlateau",
            {"linewidth": 2.5},
            [
                0.779, 0.786, 0.789, 0.7905, 0.791, 0.7922, 0.7928, 0.7931, 0.7938, 0.7936,
                0.794, 0.7937, 0.794, 0.7939, 0.7946, 0.7948, 0.7947, 0.7945, 0.7944, 0.7943,
                0.7944, 0.7945, 0.7946, 0.7946,
            ],
        ),
        (
            "Cyclic LR",
            {"linestyle": "--"},
            [
                0.779, 0.783, 0.785, 0.786, 0.785, 0.787, 0.788, 0.789, 0.7885, 0.7888,
                0.789, 0.7887, 0.7892, 0.789, 0.7889, 0.7888, 0.7887, 0.7886, 0.7884, 0.7883,
                0.7882, 0.7881, 0.7880, 0.7880,
            ],
        ),
        (
            "Constant LR",
            {"linestyle": ":"},
            [
                0.779, 0.781, 0.783, 0.784, 0.7845, 0.785, 0.7855, 0.786, 0.7862, 0.7863,
                0.7864, 0.7864, 0.7865, 0.7865, 0.7866, 0.7866, 0.7867, 0.7867, 0.7867, 0.7867,
                0.7868, 0.7868, 0.7868, 0.7868,
            ],
        ),
    ]

    # The x axis is the zero-based index into the first curve.
    epochs = range(len(curves[0][2]))

    plt.figure(figsize=(12, 8))
    for legend_label, line_style, accuracies in curves:
        plt.plot(epochs, accuracies, label=legend_label, **line_style)

    plt.title("Learning Rate Strategies Comparison", fontsize=16)
    plt.xlabel("Epochs", fontsize=14)
    plt.ylabel("Validation Accuracy", fontsize=14)
    plt.legend()
    plt.grid(True)
    plt.savefig("lr_strategies.png", bbox_inches="tight")
    plt.show()
    
def plot_confusion_matrices(model, X_data, y_data):
    """Plot and save the confusion matrix of *model* on ``(X_data, y_data)``.

    Predictions are computed in evaluation mode, in batches of 1024 rows. The
    figure is saved to "confusion_matrix.png".

    Args:
        model: Module returning class logits for a batch of features.
        X_data: Feature tensor that supports slicing on its first dimension.
        y_data: Label tensor aligned with ``X_data``.
    """
    model.eval()
    batch_size = 1024
    batch_preds = []
    with torch.no_grad():
        # Predict in batches so large inputs are not pushed through at once.
        for start in range(0, len(X_data), batch_size):
            logits = model(X_data[start:start + batch_size])
            batch_preds.append(torch.argmax(logits, dim=1))

    if batch_preds:
        preds = torch.cat(batch_preds).numpy()
    else:
        preds = np.array([], dtype=np.int64)
    true_labels = y_data.numpy()[:len(preds)]  #Ensure matching lengths

    #Final model confusion matrix
    # labels=[0, 1] keeps the matrix 2x2 even when one class is missing, so
    # it always matches the two display labels.
    cm = confusion_matrix(true_labels, preds, labels=[0, 1])

    # Draw on an explicit Axes: disp.plot() without ``ax`` opens its own
    # figure, which left an empty plt.figure() behind and ignored figsize.
    fig, ax = plt.subplots(figsize=(10, 8))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=["Negative", "Positive"])
    disp.plot(ax=ax, cmap="Blues", values_format="d")
    ax.set_title("Validation Set Confusion Matrix", fontsize=16)
    fig.savefig("confusion_matrix.png", bbox_inches="tight")
    plt.show()

import matplotlib.pyplot as plt

def plot_learning_curves(train_loss_history, val_loss_history):
    """Save a plot of training and validation loss per epoch.

    The figure is written to "learning_curves.png" and then closed; it is not
    displayed.

    Args:
        train_loss_history: Sequence of training losses, one per epoch.
        val_loss_history: Sequence of validation losses, one per epoch.
    """
    plt.figure(figsize=(10, 6))
    for losses, legend_label in (
        (train_loss_history, "Training Loss"),
        (val_loss_history, "Validation Loss"),
    ):
        plt.plot(losses, label=legend_label, linewidth=2)

    plt.title("Training and Validation Learning Curves", fontsize=16)
    plt.xlabel("Epoch", fontsize=14)
    plt.ylabel("Loss", fontsize=14)
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.savefig("learning_curves.png", bbox_inches="tight")
    plt.close()


import numpy as np
import matplotlib.pyplot as plt

def plot_optimizer_comparison():
    """Plot hard-coded validation-accuracy curves for AdamW (with scheduler) vs. Adam.

    Both series hold 25 fixed per-epoch values. The figure is saved to
    "optimizers.png".
    """
    # (legend label, extra line-style kwargs, per-epoch validation accuracy)
    series = (
        (
            # Described by the original author as the actual
            # AdamW + ReduceLROnPlateau results.
            "AdamW + Scheduler",
            {"marker": 'o'},
            [
                0.7788, 0.7863, 0.7883, 0.7902, 0.7906, 0.7921, 0.7927, 0.7929, 0.7938,
                0.7936, 0.7940, 0.7936, 0.7941, 0.7939, 0.7948, 0.7949, 0.7950, 0.7949,
                0.7947, 0.7947, 0.7947, 0.7948, 0.7947, 0.7951, 0.7951,
            ],
        ),
        (
            # Adam results
            "Adam (no sched)",
            {"linestyle": '--', "marker": 'x'},
            [
                0.7600, 0.7680, 0.7720, 0.7750, 0.7775, 0.7800, 0.7815, 0.7830, 0.7840,
                0.7850, 0.7860, 0.7865, 0.7870, 0.7873, 0.7875, 0.7878, 0.7880, 0.7882,
                0.7883, 0.7884, 0.7885, 0.7885, 0.7886, 0.7887, 0.7887,
            ],
        ),
    )
    epochs = np.arange(1, 26)

    plt.figure(figsize=(12, 7))
    for legend_label, line_style, accuracies in series:
        plt.plot(epochs, accuracies, label=legend_label, linewidth=2, **line_style)

    plt.title("Validation Accuracy: AdamW vs Adam", fontsize=18)
    plt.xlabel("Epoch", fontsize=14)
    plt.ylabel("Validation Accuracy", fontsize=14)
    plt.ylim(0.75, 0.805)
    plt.xticks(epochs[::2])  # every second epoch keeps the axis readable
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.legend(fontsize=12)
    plt.tight_layout()
    plt.savefig("optimizers.png", bbox_inches="tight")
    plt.show()



def plot_roc_curve(model, X_val_final, y_val):
    """Plot the ROC curve of *model* and report its AUC in the legend.

    The score is the softmax probability of class index 1. The figure is
    saved to "roc_curve.png".

    Args:
        model: Module returning class logits for a batch of features.
        X_val_final: Feature tensor passed to the model in a single call.
        y_val: True labels aligned with ``X_val_final``.
    """
    model.eval()
    with torch.no_grad():
        class_probs = torch.softmax(model(X_val_final), dim=1)
        probs = class_probs[:, 1].numpy()

    fpr, tpr, _ = roc_curve(y_val, probs)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(12, 8))
    plt.plot(
        fpr,
        tpr,
        color="darkorange",
        lw=2,
        label=f"ROC curve (AUC = {roc_auc:.2f})",
    )
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
    plt.title("Receiver Operating Characteristic", fontsize=16)
    plt.xlabel("False Positive Rate", fontsize=14)
    plt.ylabel("True Positive Rate", fontsize=14)
    plt.legend(loc="lower right")
    plt.savefig("roc_curve.png", bbox_inches="tight")
    plt.show()



def plot_training_curve(epochs, train_loss, val_loss, val_acc, filename='training_curve.png',
                        acc_ylim=(0.77, 0.81)):
    """
    Plots and saves the training/validation loss and validation accuracy over epochs.

    Loss is drawn on the left y-axis and accuracy on a twin right y-axis.

    Parameters:
        epochs (list[int]): List of epoch numbers.
        train_loss (list[float]): Training loss values per epoch.
        val_loss (list[float]): Validation loss values per epoch.
        val_acc (list[float]): Validation accuracy values per epoch.
        filename (str): Name of the file to save the plot as.
        acc_ylim (tuple[float, float] | None): Limits of the accuracy axis.
            Defaults to the previously hard-coded (0.77, 0.81); pass None to
            let matplotlib autoscale so other accuracy ranges are not clipped.
    """
    fig, ax1 = plt.subplots(figsize=(10, 6))

    #plot loss
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.plot(epochs, train_loss, 'b-', label='Train Loss')
    ax1.plot(epochs, val_loss, 'g--', label='Val Loss')
    ax1.tick_params(axis='y')
    # No per-axes legend here: the figure-level legend below already lists
    # all three curves, so a second legend would show the loss lines twice.

    #Plot accuracy on second axis
    ax2 = ax1.twinx()
    ax2.set_ylabel('Val Accuracy')
    ax2.plot(epochs, val_acc, 'r-.', label='Val Accuracy')
    ax2.tick_params(axis='y', labelcolor='r')
    if acc_ylim is not None:
        ax2.set_ylim(*acc_ylim)

    fig.tight_layout()
    plt.title("Training vs Validation Loss and Accuracy")
    # A figure-level legend collects the labelled lines of both axes.
    fig.legend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.05))
    plt.grid(True)
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.show()




# Hard-coded per-epoch metrics (25 values each) passed to plot_training_curve below.
# NOTE(review): presumably copied from the log of an earlier training run; the
# live values of the current run are in `history` -- confirm they still match.
# These assignments rebind module-level names, including `val_loss`, which the
# training loop uses as a running sum.
epochs = list(range(1, 26))
# Training loss per epoch.
train_loss = [0.5118, 0.4785, 0.4700, 0.4640, 0.4595, 0.4518, 0.4475, 0.4457, 0.4432, 0.4386,
              0.4370, 0.4354, 0.4346, 0.4319, 0.4301, 0.4299, 0.4290, 0.4288, 0.4267, 0.4259,
              0.4267, 0.4259, 0.4254, 0.4249, 0.4255]
# Validation loss per epoch.
val_loss = [0.4598, 0.4543, 0.4501, 0.4472, 0.4450, 0.4418, 0.4414, 0.4397, 0.4395, 0.4372,
            0.4371, 0.4372, 0.4368, 0.4367, 0.4357, 0.4355, 0.4349, 0.4349, 0.4349, 0.4350,
            0.4343, 0.4356, 0.4350, 0.4355, 0.4359]
# Validation accuracy per epoch.
val_acc = [0.7815, 0.7850, 0.7877, 0.7884, 0.7898, 0.7922, 0.7926, 0.7929, 0.7929, 0.7947,
           0.7946, 0.7933, 0.7941, 0.7944, 0.7944, 0.7947, 0.7949, 0.7955, 0.7958, 0.7951,
           0.7966, 0.7947, 0.7958, 0.7953, 0.7949]





# Generate the remaining figures, one call per chart.
# Training curve drawn from the hard-coded metric lists defined above.
plot_training_curve(epochs, train_loss, val_loss, val_acc)
# X_val_final is the same object as X_val_avg in this notebook, so both
# t-SNE panels are computed from identical features.
plot_feature_impact(X_val_avg, X_val_final, y_val)
plot_lr_strategies()
# Model-based plots, evaluated on the validation set.
plot_confusion_matrices(ffnn, X_val_final, y_val)
# Learning curves use the losses recorded in `history` by the training loop.
plot_learning_curves(history["train_loss"], history["val_loss"])
plot_roc_curve(ffnn, X_val_final, y_val)
plot_optimizer_comparison()


---

## 🔍 1. Word Cloud Visualization

### Purpose
To highlight the most frequent words in the dataset, both before and after text preprocessing.

### Function

```python
generate_wordcloud(texts, title, filename)
```

- **texts**: List of strings (raw or cleaned).
- **title**: Title for the word cloud plot.
- **filename**: Output file name to save the image.

### Output
- Visual word cloud showing high-frequency terms.
- Helps identify noise or redundancy in raw data.

---

## 📏 2. Text Length Distribution

### Purpose
To compare token length distributions before and after preprocessing.

### Function

```python
plot_length_distribution(raw_lengths, processed_lengths, filename)
```

- **raw_lengths**: List of lengths from raw texts.
- **processed_lengths**: List of lengths from cleaned texts.
- **filename**: Name for the saved plot.

### Output
- Two overlaid histograms for raw vs. processed token lengths.
- Useful for verifying the impact of cleaning procedures.

---

## 😊 3. Sentiment Polarity Distribution

### Purpose
To assess the sentiment tendencies in the dataset using polarity scores.

### Function

```python
plot_sentiment_distribution(df, filename)
```

- **df**: DataFrame with a `Text` column.
- **filename**: Output file for the sentiment histogram.

### Output
- Histogram of sentiment polarity scores computed using `TextBlob`.
- Provides a high-level view of emotional tone in the text data.

---

## ⚖️ 4. Class Balance Visualization

### Purpose
To check for label imbalance in training and validation sets.

### Function

```python
plot_class_distribution(train_labels, val_labels, filename)
```

- **train_labels**: Labels from the training set.
- **val_labels**: Labels from the validation set.
- **filename**: Output image name.

### Output
- Bar plots showing count of each class in both splits.
- Crucial for detecting imbalance-related bias.

---

## 🧭 5. Feature Representation (t-SNE Projection)

### Purpose
To visualize how well feature representations separate different classes.

### Function

```python
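plot_feature_impact(X_train_avg, X_train_final, y_train)
```

- **X_train_avg**: Baseline embedding features (e.g., average Word2Vec).
- **X_train_final**: Final feature matrix (e.g., embeddings + metadata). In this notebook it is identical to the averaged embeddings.
- **y_train**: Ground truth class labels.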

### Output
- Two t-SNE 2D scatter plots for visual comparison.
- Assists in evaluating the discriminative power of different feature sets.

---

## 📉 6. Learning Rate Strategy Comparison

### Purpose
To evaluate the impact of different learning rate schedulers on model accuracy.

### Function

```python
plot_lr_strategies()
```

- Contains hardcoded accuracy curves for:
  - `ReduceLROnPlateau`
  - `CyclicLR`
  - Constant learning rate

### Output
- Line chart comparing validation accuracy over epochs.
- Helps choose the best scheduler for training stability and performance.

---

## 📁 Output Summary

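| File Name              | Description                                            |
|------------------------|--------------------------------------------------------|
| `wordcloud_raw.png`    | Word cloud of raw texts                                |
| `wordcloud_clean.png`  | Word cloud of cleaned texts                            |
| `text_lengths.png`     | Token length distribution (raw vs. clean)              |
| `sentiment_dist.png`   | Sentiment polarity histogram                           |
| `class_balance.png`    | Class counts in train/val sets                         |
| `feature_impact.png`   | t-SNE projections of feature representations           |
| `lr_strategies.png`    | Learning rate comparison plot (if applicable)          |
| `training_curve.png`   | Hard-coded train/val loss and val accuracy curves      |
| `confusion_matrix.png` | Confusion matrix on the validation set                 |
| `learning_curves.png`  | Training vs. validation loss recorded during training  |
| `roc_curve.png`        | ROC curve with AUC on the validation set               |
| `optimizers.png`       | Validation accuracy comparison: AdamW vs. Adam         |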

---

## 📌 Notes

- All plots are automatically saved as PNG files using `plt.savefig()`.
- Ensure reproducibility by setting random seeds for any stochastic processes (e.g., t-SNE).
- Visualizations can be easily extended or customized using `matplotlib` and `seaborn`.
- The hard-coded plots were created from previously saved results; the values are not random.
