In [None]:
!pip install contractions
!pip install emoji
!pip install datasets
!pip install lime

In [None]:
import os
import numpy as np
import pandas as pd
import re
import joblib
import matplotlib.pyplot as plt
import time
from copy import deepcopy

import spacy
import emoji
from contractions import fix as fix_contractions

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score, precision_score, recall_score, accuracy_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Input, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from transformers import BertTokenizer, TFBertForSequenceClassification, RobertaTokenizer, TFRobertaForSequenceClassification, create_optimizer
from datasets import Dataset

from wordcloud import WordCloud
from lime.lime_text import LimeTextExplainer

from google.colab import drive

In [None]:
drive.mount('/content/drive')

os.chdir('/content/drive/MyDrive/Ahmed_Anas_20023579_CreativePiece/')

In [None]:
twitter_df = pd.read_csv('twitter.csv', encoding = 'latin-1', header=None)

In [None]:
reddit_df = pd.read_csv('reddit-train-balanced-sarcasm.csv')

In [None]:
reddit_df.info()
reddit_df.head()

In [None]:
# Keep only the text and label columns, then discard incomplete rows.
# Rebinding (instead of inplace=True) together with errors='ignore' makes this
# cell safe to re-run: the original raised KeyError on a second execution
# because the columns were already gone.
reddit_df = reddit_df.drop(
    columns=['author', 'subreddit', 'score', 'ups', 'downs', 'date', 'created_utc', 'parent_comment'],
    errors='ignore',
).dropna()
reddit_df.head()

In [None]:
reddit_df['label'].value_counts()

In [None]:
twitter_df.info()
twitter_df.head()

In [None]:
# The CSV is read with header=None, so the column names are assigned here.
twitter_df.columns = ['label', 'id', 'date', 'query', 'user_id', 'comment']
twitter_df.drop(columns=['id', 'date', 'query', 'user_id'], inplace=True)
# Coerce labels to numbers and remap 0 -> 0, 4 -> 1; anything else becomes NaN.
twitter_df['label'] = pd.to_numeric(twitter_df['label'], errors='coerce').map({0: 0, 4: 1})
# Drop incomplete rows only AFTER the label mapping, so rows whose label could
# not be parsed or mapped are removed as well (previously they survived as NaN).
twitter_df.dropna(inplace=True)
twitter_df['label'] = twitter_df['label'].astype(int)
twitter_df.head()

In [None]:
twitter_df.label.value_counts()

In [None]:
# Load spaCy model and stopwords
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words

# Text Cleaning
def handle_negation(text):
  """Fold negation cues into the token that follows them.

  The text is split on whitespace. Every cue ("not", "n't", "no") is removed
  from the output, and the next non-cue token receives a "_NEG" suffix unless
  it already ends with "_NEG". Consecutive cues collapse into a single
  negation, and a cue with nothing after it is simply dropped.

  Args:
    text: Whitespace-separated string.

  Returns:
    The rewritten tokens joined by single spaces.
  """
  cues = ("not", "n't", "no")
  marked = []
  pending = False  # True while a cue is waiting for a token to attach to
  for token in text.split():
    if token in cues:
      pending = True
      continue
    if pending and not token.endswith('_NEG'):
      token = token + '_NEG'
    marked.append(token)
    pending = False
  return " ".join(marked)

def clean_text(text):
  """Normalise one raw comment into a lemmatised, negation-marked string.

  Steps, in order: emoji -> text names, lower-casing, removal of URLs,
  @mentions and #hashtags, contraction expansion, punctuation stripping,
  whitespace collapsing, spaCy lemmatisation with stop-word removal, and
  finally negation marking via handle_negation().

  Two fixes compared with the previous version:
    * Contractions are expanded BEFORE punctuation is stripped. Previously
      the apostrophe was already gone ("i'll" -> "ill", "we're" -> "were"),
      so ambiguous forms could no longer be expanded.
    * Negation cues are exempt from stop-word removal. spaCy's English
      stop-word list contains "not" and "no", so they were filtered out
      before handle_negation() ever saw them and no token was ever marked.

  Args:
    text: Raw comment string.

  Returns:
    The cleaned, space-joined token string.
  """
  # Same cues that handle_negation() looks for.
  negation_cues = ("not", "n't", "no")

  text = emoji.demojize(text, delimiters=("", ""))
  text = text.lower()
  text = re.sub(r"http\S+|www\S+|https\S+", '', text)
  text = re.sub(r"@\w+|#\w+", '', text)
  # Expand while apostrophes are still present; lower-case again in case the
  # expansion introduced capitals, so the stop-word lookup below still matches.
  text = fix_contractions(text).lower()
  text = re.sub(r"[^\w\s]", '', text)
  text = re.sub(r"\s+", ' ', text).strip()
  doc = nlp(text)
  tokens = [
      token.lemma_
      for token in doc
      if not token.is_space and (token.text in negation_cues or token.text not in stopwords)
  ]
  processed_text = " ".join(tokens)
  processed_text = handle_negation(processed_text)
  return processed_text

def preprocess_dataframe(df, text_column):
  """Return a copy of `df` whose `text_column` has been passed through clean_text.

  The input frame is left untouched. The previous version overwrote the
  caller's column in place, so re-running the calling cell cleaned text that
  had already been cleaned.

  Args:
    df: DataFrame containing `text_column`.
    text_column: Name of the column holding the raw text.

  Returns:
    A new DataFrame with the cleaned text column; values are cast to str first.
  """
  cleaned = df.copy()
  cleaned[text_column] = cleaned[text_column].astype(str).apply(clean_text)
  return cleaned

In [None]:
# Sarcasm Detection Model (Logistic Regression)
def train_sarcasm_model(df, model_out_path, vectorizer_out_path):
  """Train, evaluate and persist a TF-IDF + logistic-regression sarcasm classifier.

  Uses df['comment'] as text and df['label'] as target, with a stratified
  80/20 split (seed 42). The regularisation strength C is chosen by 5-fold
  grid search on F1. Metrics for the held-out 20% are printed.

  Args:
    df: DataFrame with 'comment' and 'label' columns.
    model_out_path: Where the tuned classifier is written with joblib.
    vectorizer_out_path: Where the fitted TF-IDF vectorizer is written.

  Returns:
    (tuned classifier, fitted vectorizer, X_test, y_test, y_pred).
  """
  texts, labels = df['comment'], df['label']
  X_train, X_test, y_train, y_test = train_test_split(
      texts, labels, test_size=0.2, random_state=42, stratify=labels)

  tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
  train_matrix = tfidf.fit_transform(X_train)
  test_matrix = tfidf.transform(X_test)

  grid_search = GridSearchCV(
      LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced'),
      {'C': [0.001, 0.01, 0.1, 1, 10, 100]},
      cv=5,
      scoring='f1',
  )
  grid_search.fit(train_matrix, y_train)
  clf_tuned = grid_search.best_estimator_
  y_pred = clf_tuned.predict(test_matrix)
  positive_scores = clf_tuned.predict_proba(test_matrix)[:, 1]

  print("Sarcasm Detection Report (Logistic Regression):")
  print(classification_report(y_test, y_pred))
  print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
  # Each metric is computed right before it is printed, in the listed order.
  scalar_metrics = [
      ("F1-Score:", f1_score, y_pred),
      ("AUC:", roc_auc_score, positive_scores),
      ("Precision:", precision_score, y_pred),
      ("Recall:", recall_score, y_pred),
  ]
  for heading, metric, estimate in scalar_metrics:
    print(heading, metric(y_test, estimate))
  print("Best parameters:", grid_search.best_params_)

  for artefact, destination in ((clf_tuned, model_out_path), (tfidf, vectorizer_out_path)):
    joblib.dump(artefact, destination)
  return clf_tuned, tfidf, X_test, y_test, y_pred

# Sarcasm Detection Model (Naive Bayes)
def train_sarcasm_model_nb(df, model_out_path="sarcasm_model_nb.pkl", vectorizer_out_path="sarcasm_vectorizer_nb.pkl"):
  """Train, evaluate and persist a bag-of-words + multinomial naive Bayes sarcasm baseline.

  Uses df['comment'] as text and df['label'] as target, with a stratified
  80/20 split (seed 42). Metrics for the held-out 20% are printed.

  Args:
    df: DataFrame with 'comment' and 'label' columns.
    model_out_path: Where the classifier is written with joblib.
    vectorizer_out_path: Where the fitted CountVectorizer is written.

  Returns:
    (classifier, fitted vectorizer, X_test, y_test, y_pred).
  """
  texts, labels = df['comment'], df['label']
  X_train, X_test, y_train, y_test = train_test_split(
      texts, labels, test_size=0.2, random_state=42, stratify=labels)

  vectorizer = CountVectorizer()
  train_counts = vectorizer.fit_transform(X_train)
  test_counts = vectorizer.transform(X_test)

  clf = MultinomialNB().fit(train_counts, y_train)
  y_pred = clf.predict(test_counts)
  positive_scores = clf.predict_proba(test_counts)[:, 1]

  print("Sarcasm Detection Report (Naive Bayes Baseline):")
  print(classification_report(y_test, y_pred))
  print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
  # Each metric is computed right before it is printed, in the listed order.
  scalar_metrics = [
      ("F1-Score:", f1_score, y_pred),
      ("AUC:", roc_auc_score, positive_scores),
      ("Precision:", precision_score, y_pred),
      ("Recall:", recall_score, y_pred),
  ]
  for heading, metric, estimate in scalar_metrics:
    print(heading, metric(y_test, estimate))

  for artefact, destination in ((clf, model_out_path), (vectorizer, vectorizer_out_path)):
    joblib.dump(artefact, destination)
  return clf, vectorizer, X_test, y_test, y_pred

In [None]:
# Sentiment Analysis Model (Logistic Regression)
def train_sentiment_model(df, sarcasm_model_path=None, vectorizer_path=None, model_out_path="sentiment_model.pkl"):
  """Train, evaluate and persist a TF-IDF + logistic-regression sentiment classifier.

  Uses df['comment'] as text and df['label'] as target, with a stratified
  80/20 split (seed 42). When both sarcasm paths are given, the saved sarcasm
  model labels every comment and a marker token ("sarcasm_0" / "sarcasm_1")
  is prepended to the text so the sentiment model can use it as a feature.

  The marker is deliberately longer than one character. The previous version
  prepended a bare "0"/"1", which the vectorizer's default token pattern
  (tokens of two or more word characters) discards, so the "with sarcasm"
  model received exactly the same features as the baseline.

  Args:
    df: DataFrame with 'comment' and 'label' columns.
    sarcasm_model_path: Optional joblib file of a fitted sarcasm classifier.
    vectorizer_path: Optional joblib file of the matching sarcasm vectorizer.
    model_out_path: Where the tuned classifier is written; the vectorizer goes
      to the same path with ".pkl" replaced by "_vectorizer.pkl".

  Returns:
    (tuned classifier, fitted vectorizer, X_test, y_test, y_pred). X_test
    includes the sarcasm marker when sarcasm augmentation was applied.
  """
  X, y = df['comment'], df['label']
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

  if sarcasm_model_path and vectorizer_path:
    sarcasm_model = joblib.load(sarcasm_model_path)
    sarcasm_vectorizer = joblib.load(vectorizer_path)

    def add_sarcasm_marker(texts):
      # Build the flags as an index-aligned Series so the concatenation is a
      # plain pandas string operation (ndarray + str is not supported on
      # every NumPy version).
      preds = sarcasm_model.predict(sarcasm_vectorizer.transform(texts))
      flags = pd.Series(preds, index=texts.index).astype(str)
      return "sarcasm_" + flags + " " + texts

    X_train = add_sarcasm_marker(X_train)
    X_test = add_sarcasm_marker(X_test)

  tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
  X_train_vec = tfidf.fit_transform(X_train)
  X_test_vec = tfidf.transform(X_test)

  clf = LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced')
  param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
  grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='f1')
  grid_search.fit(X_train_vec, y_train)
  clf_tuned = grid_search.best_estimator_
  y_pred = clf_tuned.predict(X_test_vec)
  y_pred_proba = clf_tuned.predict_proba(X_test_vec)[:, 1]

  print("Sentiment Analysis Report (Logistic Regression):")
  print(classification_report(y_test, y_pred))
  print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
  print("F1-Score:", f1_score(y_test, y_pred))
  print("AUC:", roc_auc_score(y_test, y_pred_proba))
  print("Precision:", precision_score(y_test, y_pred))
  print("Recall:", recall_score(y_test, y_pred))
  print("Best parameters:", grid_search.best_params_)

  joblib.dump(clf_tuned, model_out_path)
  joblib.dump(tfidf, model_out_path.replace(".pkl", "_vectorizer.pkl"))
  return clf_tuned, tfidf, X_test, y_test, y_pred


# Sentiment Analysis Model (Naive Bayes)
def train_sentiment_model_nb(df, sarcasm_model_path=None, vectorizer_path=None, model_out_path="sentiment_model_nb.pkl"):
  """Train, evaluate and persist a bag-of-words + multinomial naive Bayes sentiment classifier.

  Uses df['comment'] as text and df['label'] as target, with a stratified
  80/20 split (seed 42). When both sarcasm paths are given, the saved sarcasm
  model labels every comment and a marker token ("sarcasm_0" / "sarcasm_1")
  is prepended to the text.

  The marker is deliberately longer than one character. The previous version
  prepended a bare "0"/"1", which CountVectorizer's default token pattern
  (tokens of two or more word characters) discards, so the sarcasm feature
  never reached the classifier.

  Args:
    df: DataFrame with 'comment' and 'label' columns.
    sarcasm_model_path: Optional joblib file of a fitted sarcasm classifier.
    vectorizer_path: Optional joblib file of the matching sarcasm vectorizer.
    model_out_path: Where the classifier is written; the vectorizer goes to
      the same path with ".pkl" replaced by "_vectorizer.pkl".

  Returns:
    (classifier, fitted vectorizer, X_test, y_test, y_pred). X_test includes
    the sarcasm marker when sarcasm augmentation was applied.
  """
  X, y = df['comment'], df['label']
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

  # If sarcasm model and vectorizer provided, use sarcasm predictions as features
  if sarcasm_model_path and vectorizer_path:
    sarcasm_model = joblib.load(sarcasm_model_path)
    sarcasm_vectorizer = joblib.load(vectorizer_path)

    def add_sarcasm_marker(texts):
      # Index-aligned Series keeps the concatenation a plain pandas string
      # operation (ndarray + str is not supported on every NumPy version).
      preds = sarcasm_model.predict(sarcasm_vectorizer.transform(texts))
      flags = pd.Series(preds, index=texts.index).astype(str)
      return "sarcasm_" + flags + " " + texts

    X_train = add_sarcasm_marker(X_train)
    X_test = add_sarcasm_marker(X_test)

  # Vectorization and model training
  vectorizer = CountVectorizer()
  X_train_vec = vectorizer.fit_transform(X_train)
  X_test_vec = vectorizer.transform(X_test)

  clf = MultinomialNB()
  clf.fit(X_train_vec, y_train)
  y_pred = clf.predict(X_test_vec)
  y_pred_proba = clf.predict_proba(X_test_vec)[:, 1]

  print("Sentiment Analysis Report (Naive Bayes with Sarcasm Feature):")
  print(classification_report(y_test, y_pred))
  print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
  print("F1-Score:", f1_score(y_test, y_pred))
  print("AUC:", roc_auc_score(y_test, y_pred_proba))
  print("Precision:", precision_score(y_test, y_pred))
  print("Recall:", recall_score(y_test, y_pred))

  joblib.dump(clf, model_out_path)
  joblib.dump(vectorizer, model_out_path.replace(".pkl", "_vectorizer.pkl"))
  return clf, vectorizer, X_test, y_test, y_pred


In [None]:
# Sentiment Analysis Model (BERT)
def train_bert_sentiment_model(df, model_name="bert-base-uncased", model_out_path="bert_sentiment_model", epochs=3, batch_size=16, max_length=128):
  """Fine-tune a BERT sequence classifier for binary sentiment and report test metrics.

  Uses df['comment'] as text and df['label'] (cast to int) as target, with a
  stratified 80/20 split (seed 42). The held-out 20% is used both as Keras
  validation data during training and for the final printed report.

  Args:
    df: DataFrame with 'comment' and 'label' columns.
    model_name: Pretrained checkpoint passed to from_pretrained().
    model_out_path: Directory where the model and tokenizer are saved.
    epochs: Number of training epochs.
    batch_size: Batch size for training and evaluation.
    max_length: Maximum token length; longer inputs are truncated.

  Returns:
    (model, tokenizer, Keras history, X_test, y_test, y_pred).
  """
  X, y = df['comment'], df['label']
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

  y_train = y_train.astype(int)
  y_test = y_test.astype(int)

  tokenizer = BertTokenizer.from_pretrained(model_name)
  train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=max_length, return_tensors='tf')
  test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=max_length, return_tensors='tf')

  train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).shuffle(1000).batch(batch_size)
  test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(batch_size)

  model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=2)
  # Total optimizer steps drive the learning-rate schedule.
  steps_per_epoch = len(train_dataset)
  num_train_steps = steps_per_epoch * epochs
  optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=num_train_steps)

  # The model outputs raw logits, hence from_logits=True.
  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

  history = model.fit(train_dataset, epochs=epochs, validation_data=test_dataset)
  model.save_pretrained(model_out_path)
  tokenizer.save_pretrained(model_out_path)

  y_pred_logits = model.predict(test_dataset).logits
  y_pred = np.argmax(y_pred_logits, axis=1)
  # AUC must rank samples by P(class 1). The raw class-1 logit alone ignores
  # the class-0 logit and can order samples differently, so convert first.
  y_pred_proba = tf.nn.softmax(y_pred_logits, axis=1).numpy()[:, 1]

  print("BERT Sentiment Analysis Report:")
  print(classification_report(y_test, y_pred))
  print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
  print("F1-Score:", f1_score(y_test, y_pred))
  print("AUC:", roc_auc_score(y_test, y_pred_proba))
  print("Precision:", precision_score(y_test, y_pred))
  print("Recall:", recall_score(y_test, y_pred))

  return model, tokenizer, history, X_test, y_test, y_pred


#Sentiment Analysis Model (roBERTa)
def train_roberta_sentiment_model(df, model_name="roberta-base", model_out_path="roberta_sentiment_model", epochs=3, batch_size=32, max_length=128):
  """Fine-tune a RoBERTa sequence classifier for binary sentiment and report test metrics.

  Uses df['comment'] as text and df['label'] (cast to int) as target, with a
  stratified 80/20 split (seed 42). The held-out 20% is used both as Keras
  validation data during training and for the final printed report.

  Args:
    df: DataFrame with 'comment' and 'label' columns.
    model_name: Pretrained checkpoint passed to from_pretrained().
    model_out_path: Directory where the model and tokenizer are saved.
    epochs: Number of training epochs.
    batch_size: Batch size for training and evaluation.
    max_length: Maximum token length; longer inputs are truncated.

  Returns:
    (model, tokenizer, Keras history, X_test, y_test, y_pred).
  """
  # Split data
  X, y = df['comment'], df['label'].astype(int)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

  # Tokenization
  tokenizer = RobertaTokenizer.from_pretrained(model_name)
  train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=max_length, return_tensors='tf')
  test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=max_length, return_tensors='tf')

  # Prepare datasets
  train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).shuffle(1000).batch(batch_size)
  test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(batch_size)

  # Load model
  model = TFRobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

  # Optimizer setup with learning rate schedule
  steps_per_epoch = len(train_dataset)
  num_train_steps = steps_per_epoch * epochs
  optimizer, lr_schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=num_train_steps)

  # Compile model (the model outputs raw logits, hence from_logits=True)
  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

  # Train the model
  history = model.fit(train_dataset, validation_data=test_dataset, epochs=epochs)

  # Save model & tokenizer
  model.save_pretrained(model_out_path)
  tokenizer.save_pretrained(model_out_path)

  # Predict and evaluate
  y_pred_logits = model.predict(test_dataset).logits
  y_pred = np.argmax(y_pred_logits, axis=1)
  # AUC must rank samples by P(class 1). The raw class-1 logit alone ignores
  # the class-0 logit and can order samples differently, so convert first.
  y_pred_proba = tf.nn.softmax(y_pred_logits, axis=1).numpy()[:, 1]

  print("RoBERTa Sentiment Analysis Report:")
  print(classification_report(y_test, y_pred))
  print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
  print("F1 Score:", f1_score(y_test, y_pred))
  print("AUC:", roc_auc_score(y_test, y_pred_proba))
  print("Precision:", precision_score(y_test, y_pred))
  print("Recall:", recall_score(y_test, y_pred))

  return model, tokenizer, history, X_test, y_test, y_pred

In [None]:
# Sentiment Analysis (LSTM)
def train_lstm_model(df, use_sarcasm=False, sarcasm_model_path=None, vectorizer_path=None, model_out_path = "lstm_sentiment_model.keras", max_len=100, num_words=10000, embedding_dim=100, lstm_units=128, dropout_rate=0.5, epochs=10, batch_size=32, learning_rate=0.001):
  """Train, evaluate and persist a bidirectional-LSTM sentiment classifier.

  Uses df['comment'] as text and df['label'] as target, with a stratified
  80/20 split (seed 42). When `use_sarcasm` is set and both sarcasm paths are
  given, the saved sarcasm model's prediction ("0"/"1") is prepended to each
  comment as an extra token. The held-out 20% serves both as validation data
  for early stopping and for the printed report.

  The tokenizer is pickled next to the model as "<model path without
  extension>_tokenizer.keras". The previous version derived that name with
  str.replace(".keras", ...), which returned the model path unchanged for any
  other extension (e.g. ".h5"), so the tokenizer dump overwrote the model file.

  Args:
    df: DataFrame with 'comment' and 'label' columns.
    use_sarcasm: Whether to prepend the sarcasm prediction to each comment.
    sarcasm_model_path: joblib file of a fitted sarcasm classifier.
    vectorizer_path: joblib file of the matching sarcasm vectorizer.
    model_out_path: Destination of the saved Keras model.
    max_len: Sequences are padded/truncated (at the end) to this length.
    num_words: Vocabulary size for the tokenizer and embedding layer.
    embedding_dim: Embedding output dimension.
    lstm_units: Units per LSTM direction.
    dropout_rate: Dropout applied after the LSTM.
    epochs: Maximum number of training epochs.
    batch_size: Training batch size.
    learning_rate: Adam learning rate.

  Returns:
    (model, tokenizer, Keras history, X_test, y_test, y_pred), where y_pred is
    the rounded output of model.predict().
  """
  X, y = df['comment'], df['label']
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

  if use_sarcasm and sarcasm_model_path and vectorizer_path:
    sarcasm_model = joblib.load(sarcasm_model_path)
    sarcasm_vectorizer = joblib.load(vectorizer_path)

    def add_sarcasm_flag(texts):
      # Index-aligned Series keeps the concatenation a plain pandas string
      # operation (ndarray + str is not supported on every NumPy version).
      preds = sarcasm_model.predict(sarcasm_vectorizer.transform(texts))
      flags = pd.Series(preds, index=texts.index).astype(str)
      return flags + " " + texts

    X_train = add_sarcasm_flag(X_train)
    X_test = add_sarcasm_flag(X_test)

  tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
  tokenizer.fit_on_texts(X_train)
  X_train_seq = tokenizer.texts_to_sequences(X_train)
  X_test_seq = tokenizer.texts_to_sequences(X_test)

  X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
  X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

  model = Sequential()
  model.add(Embedding(input_dim=num_words, output_dim=embedding_dim, input_length=max_len))
  model.add(Bidirectional(LSTM(lstm_units, return_sequences=False)))
  model.add(Dropout(dropout_rate))
  model.add(Dense(1, activation='sigmoid'))

  optimizer = Adam(learning_rate=learning_rate)
  model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy', 'precision', 'recall'])
  model.summary()

  early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

  history = model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=epochs, batch_size=batch_size, callbacks=[early_stopping])

  model.save(model_out_path)
  # Derive the tokenizer path from the extension-less model path so it can
  # never coincide with the model file, whatever extension was requested.
  tokenizer_path = os.path.splitext(model_out_path)[0] + "_tokenizer.keras"
  joblib.dump(tokenizer, tokenizer_path)

  y_pred_proba = model.predict(X_test_pad)
  y_pred = np.round(y_pred_proba)

  print("LSTM Learning Model Evaluation Report:")
  print(classification_report(y_test, y_pred))
  print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
  print("F1-Score:", f1_score(y_test, y_pred))
  print("AUC:", roc_auc_score(y_test, y_pred_proba))


  return model, tokenizer, history, X_test, y_test, y_pred

In [None]:
# Qualitative Analysis
def perform_qualitative_analysis(X_test, y_test, y_pred, n=10):
  """Print up to `n` random correctly and incorrectly classified examples.

  Labels and predictions are flattened to 1-D before comparison. Without
  this, a column-vector prediction of shape (n_samples, 1) - as produced by a
  Keras sigmoid head - broadcasts against the 1-D labels into an
  n_samples x n_samples matrix, and the derived "correct"/"incorrect" indices
  and counts are meaningless.

  Args:
    X_test: Texts; a pandas Series (indexed positionally) or any sequence.
    y_test: True labels, one per text.
    y_pred: Predicted labels, one per text; shape (n,) or (n, 1).
    n: Maximum number of examples to print per group.

  Raises:
    ValueError: If y_test and y_pred do not contain the same number of items.
  """
  print("\nQualitative Analysis (Examples of Correct and Incorrect Predictions)")

  y_test_array = np.asarray(y_test).ravel()
  y_pred_array = np.asarray(y_pred).ravel()
  if y_test_array.shape != y_pred_array.shape:
    raise ValueError(
        f"y_test has {y_test_array.size} items but y_pred has {y_pred_array.size}")

  is_correct = y_test_array == y_pred_array
  correct_indices = np.flatnonzero(is_correct)
  incorrect_indices = np.flatnonzero(~is_correct)

  # Positional access: the Series keeps its pre-split index, so use .iloc.
  get_text = X_test.iloc if isinstance(X_test, pd.Series) else X_test

  for heading, indices in (("Correct", correct_indices), ("Incorrect", incorrect_indices)):
    count = min(n, len(indices))
    print(f"\n {count} {heading} Predictions")
    if count == 0:
      continue
    for i in np.random.choice(indices, count, replace=False):
      print(f"Actual: {y_test_array[i]}, Predicted: {y_pred_array[i]}, Text: {get_text[i]}")

def explain_with_lime(model, model_type, X_text, index=0, tokenizer=None, vectorizer=None, max_len=100, batch_size=64):
  """Show a LIME explanation for one text instance in the notebook.

  Args:
    model: Fitted classifier matching `model_type`.
    model_type: One of 'logistic_regression', 'naive_bayes', 'bert',
      'roberta' or 'lstm'.
    X_text: Collection of texts; a pandas object is indexed positionally.
    index: Position of the instance to explain.
    tokenizer: Required for 'bert', 'roberta' and 'lstm'.
    vectorizer: Required for 'logistic_regression' and 'naive_bayes'.
    max_len: Padding/truncation length used for the 'lstm' branch.
    batch_size: Number of perturbed texts sent through a transformer model per
      forward pass. LIME generates thousands of perturbations per explanation;
      pushing them through the model in a single call can exhaust memory.

  Raises:
    ValueError: If `model_type` is not supported, or the tokenizer/vectorizer
      that the chosen model type needs was not supplied.
  """
  # Validate up front so a bad call fails before any work is done.
  classical_types = ('logistic_regression', 'naive_bayes')
  transformer_types = ('bert', 'roberta')
  if model_type not in classical_types + transformer_types + ('lstm',):
    raise ValueError(f"Unsupported model_type: {model_type}")
  if model_type in classical_types and vectorizer is None:
    raise ValueError(f"model_type '{model_type}' requires a vectorizer")
  if model_type not in classical_types and tokenizer is None:
    raise ValueError(f"model_type '{model_type}' requires a tokenizer")

  class_names = ['Negative', 'Positive']
  explainer = LimeTextExplainer(class_names=class_names)

  text_instance = X_text.iloc[index] if hasattr(X_text, 'iloc') else X_text[index]

  # Define prediction function based on model type; each returns an
  # (n_texts, 2) array of class probabilities.
  if model_type in classical_types:
    def predict_fn(texts):
      X_vect = vectorizer.transform(texts)
      return model.predict_proba(X_vect)

  elif model_type in transformer_types:
    step = max(1, int(batch_size))

    def predict_fn(texts):
      texts = list(texts)
      chunks = []
      for start in range(0, len(texts), step):
        encodings = tokenizer(texts[start:start + step], padding=True, truncation=True, return_tensors='tf')
        outputs = model(encodings)
        chunks.append(tf.nn.softmax(outputs.logits, axis=1).numpy())
      return np.concatenate(chunks, axis=0)

  else:  # 'lstm'
    def predict_fn(texts):
      sequences = tokenizer.texts_to_sequences(texts)
      padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
      probs = model.predict(padded)
      # The sigmoid head yields P(positive); build the two-column layout LIME expects.
      return np.hstack([1 - probs, probs])

  # Generate explanation
  explanation = explainer.explain_instance(text_instance, predict_fn, num_features=10)
  explanation.show_in_notebook(text=True)

# WordCloud
def show_wordcloud(data, title='Word Cloud', mask=None, color_map='viridis', max_words=200):
  """Render a word cloud for a collection of texts.

  Args:
    data: Iterable of strings; they are joined with spaces into one corpus.
    title: Figure title.
    mask: Optional mask forwarded to WordCloud.
    color_map: Matplotlib colormap name forwarded to WordCloud.
    max_words: Maximum number of words drawn.
  """
  cloud_options = dict(
      width=800,
      height=400,
      background_color='white',
      stopwords=stopwords,  # module-level spaCy stop-word set
      mask=mask,
      colormap=color_map,
      max_words=max_words,
  )
  corpus = ' '.join(data)
  cloud = WordCloud(**cloud_options).generate(corpus)

  plt.figure(figsize=(15, 7.5))
  plt.imshow(cloud, interpolation='bilinear')
  plt.axis('off')
  plt.title(title, fontsize=20)
  plt.show()

def generate_sentiment_wordclouds(df, sentiment_column='label', text_column='comment'):
  """Draw one word cloud per distinct value of `sentiment_column`.

  Values are visited in order of first appearance in the column.
  """
  labels = df[sentiment_column]
  for value in labels.unique():
    show_wordcloud(df.loc[labels == value, text_column],
                   title=f"Word Cloud for Sentiment: {value}")

def generate_sarcasm_wordclouds(df, sarcasm_column='label', text_column='comment'):
  """Draw one word cloud per distinct value of `sarcasm_column`.

  Values are visited in order of first appearance in the column.
  """
  labels = df[sarcasm_column]
  for value in labels.unique():
    show_wordcloud(df.loc[labels == value, text_column],
                   title=f"Word Cloud for Sarcasm Label: {value}")

In [None]:
def perform_ablation_studies_lstm(df, max_len=100, num_words=10000, embedding_dim=100, lstm_units=128, epochs=5, batch_size=32, sarcasm_model_path=None, sarcasm_vectorizer_path=None):
  """Train several LSTM variants, with and without the sarcasm feature, and compare them.

  Every variant is trained on the same stratified 80/20 split (seed 42) of
  df['comment'] / df['label']. Accuracy, AUC and training time are printed
  per run and plotted as bar charts at the end.

  The "(with sarcasm)" runs are executed only when both sarcasm paths are
  supplied. Previously, calling the function without them still produced rows
  labelled "(with sarcasm)" that had in fact been trained on the plain text.

  Args:
    df: DataFrame with 'comment' and 'label' columns.
    max_len: Sequences are padded/truncated (at the end) to this length.
    num_words: Vocabulary size for the tokenizer and embedding layers.
    embedding_dim: Embedding output dimension.
    lstm_units: Units of the LSTM layers (one variant uses a fixed 64).
    epochs: Training epochs per run.
    batch_size: Training batch size.
    sarcasm_model_path: Optional joblib file of a fitted sarcasm classifier.
    sarcasm_vectorizer_path: Optional joblib file of the matching vectorizer.

  Returns:
    DataFrame indexed by model label with 'Accuracy', 'AUC' and 'Time (s)'.
  """
  results = []

  # Load the sarcasm artefacts once instead of on every preprocessing call.
  sarcasm_available = bool(sarcasm_model_path and sarcasm_vectorizer_path)
  if sarcasm_available:
    sarcasm_model = joblib.load(sarcasm_model_path)
    sarcasm_vectorizer = joblib.load(sarcasm_vectorizer_path)
  else:
    print("Sarcasm model/vectorizer path not provided; skipping the 'with sarcasm' variants.")

  # The split is seeded, so it is identical for every run; compute it once.
  X_all, y_all = df['comment'], df['label']
  X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42, stratify=y_all)

  def preprocess_text(X_raw, add_sarcasm=False):
    """Optionally prepend the sarcasm prediction ("0"/"1") to every text."""
    if not (add_sarcasm and sarcasm_available):
      return X_raw.copy()
    sarcasm_preds = sarcasm_model.predict(sarcasm_vectorizer.transform(X_raw))
    # Index-aligned Series keeps the concatenation a plain pandas string
    # operation (ndarray + str is not supported on every NumPy version).
    flags = pd.Series(sarcasm_preds, index=X_raw.index).astype(str)
    return flags + " " + X_raw

  def build_and_evaluate(model_fn, label, use_sarcasm):
    """Train one model variant and append its metrics to `results`."""
    # Preprocess
    X_train = preprocess_text(X_train_raw, add_sarcasm=use_sarcasm)
    X_test = preprocess_text(X_test_raw, add_sarcasm=use_sarcasm)

    # Tokenize
    tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train)
    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)
    X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
    X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

    # Train (only the fit call is timed)
    model = model_fn()
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    start = time.time()
    model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=epochs, batch_size=batch_size, verbose=0)
    elapsed = time.time() - start

    # Evaluate
    y_probs = model.predict(X_test_pad).flatten()
    y_pred = np.round(y_probs)
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_probs)

    suffix = " (with sarcasm)" if use_sarcasm else " (baseline)"
    print(f"\n{label + suffix}")
    print(f"Time: {elapsed:.2f}s | Accuracy: {acc:.4f} | AUC: {auc:.4f}")
    print(classification_report(y_test, y_pred, digits=4))

    results.append({
        "Model": label + suffix,
        "Accuracy": acc,
        "AUC": auc,
        "Time (s)": elapsed
        })

  # Define models (factories, so every run starts from fresh weights)
  model_variants = [
      ("Baseline LSTM", lambda: Sequential([
          Embedding(num_words, embedding_dim, input_length=max_len),
          LSTM(lstm_units),
          Dense(1, activation='sigmoid')
      ])),
      ("LSTM with Dropout", lambda: Sequential([
          Embedding(num_words, embedding_dim, input_length=max_len),
          LSTM(lstm_units, dropout=0.3, recurrent_dropout=0.3),
          Dense(1, activation='sigmoid')
        ])),
      ("Bidirectional LSTM", lambda: Sequential([
          Embedding(num_words, embedding_dim, input_length=max_len),
          Bidirectional(LSTM(lstm_units)),
          Dense(1, activation='sigmoid')
      ])),
      ("LSTM without Embedding", lambda: Sequential([
          Input(shape=(max_len,)),
          Reshape((max_len, 1)),
          LSTM(lstm_units),
          Dense(1, activation='sigmoid')
      ])),
      ("LSTM with Smaller Units", lambda: Sequential([
          Embedding(num_words, embedding_dim, input_length=max_len),
          LSTM(64),
          Dense(1, activation='sigmoid')
      ]))
  ]

  # Evaluate each model without sarcasm and, when available, with it
  for label, model_fn in model_variants:
    build_and_evaluate(model_fn, label, use_sarcasm=False)
    if sarcasm_available:
      build_and_evaluate(model_fn, label, use_sarcasm=True)

  # Plot results
  results_df = pd.DataFrame(results).set_index("Model")
  results_df[["Accuracy", "AUC"]].plot(kind="bar", figsize=(12, 6), ylim=(0, 1), colormap="viridis")
  plt.title("Model Comparison: Accuracy and AUC")
  plt.ylabel("Score")
  plt.xticks(rotation=45, ha='right')
  plt.grid(True)
  plt.tight_layout()
  plt.show()

  results_df["Time (s)"].plot(kind="bar", figsize=(10, 4), color="salmon")
  plt.title("Training Time per Model")
  plt.ylabel("Seconds")
  plt.xticks(rotation=45, ha='right')
  plt.grid(True)
  plt.tight_layout()
  plt.show()

  return results_df

In [None]:
# Preprocessing models
# Clean the 'comment' column of the Reddit (sarcasm) dataset and cache the result as CSV;
# the next cell reloads these files.
reddit_df_preprocessed = preprocess_dataframe(reddit_df, 'comment')
reddit_df_preprocessed.to_csv('reddit_df_preprocessed.csv', index=False)

# Same for the Twitter (sentiment) dataset.
twitter_df_preprocessed = preprocess_dataframe(twitter_df, 'comment')
twitter_df_preprocessed.to_csv('twitter_df_preprocessed.csv', index=False)

In [None]:
reddit_df_preprocessed = pd.read_csv('reddit_df_preprocessed.csv')
twitter_df_preprocessed = pd.read_csv('twitter_df_preprocessed.csv')

In [None]:
# Comments that cleaning reduced to an empty string come back from the CSV as
# NaN, so drop them before shuffling. Both shuffles are seeded so that every
# later train/test split is reproducible (the Twitter shuffle used to be unseeded).
reddit_df_preprocessed = reddit_df_preprocessed.dropna().sample(frac=1, random_state=42).reset_index(drop=True)

twitter_df_preprocessed = twitter_df_preprocessed.dropna().sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
#Sarcasm model (Logistic Regression/TF-IDF)
sarcasm_clf, sarcasm_tfidf, X_test_sarcasm, y_test_sarcasm, y_pred_sarcasm_lr = train_sarcasm_model(reddit_df_preprocessed, 'sarcasm_model.pkl', 'sarcasm_vectorizer.pkl')
perform_qualitative_analysis(X_test_sarcasm, y_test_sarcasm, y_pred_sarcasm_lr, n=5)

In [None]:
#Sarcasm model (Naive Bayes)
sarcasm_clf_nb, sarcasm_vectorizer_nb, X_test_sarcasm_nb, y_test_sarcasm_nb, y_pred_sarcasm_nb = train_sarcasm_model_nb(reddit_df_preprocessed, 'sarcasm_model_nb.pkl', 'sarcasm_vectorizer_nb.pkl')
perform_qualitative_analysis(X_test_sarcasm_nb, y_test_sarcasm_nb, y_pred_sarcasm_nb, n=5)

In [None]:
#Sentiment model (Logistic Regression / TF-IDF)
clf_lr, vectorizer_lr, X_test_lr, y_test_lr, y_pred_lr = train_sentiment_model(twitter_df_preprocessed, sarcasm_model_path=None, vectorizer_path=None, model_out_path="sentiment_model_lr.pkl")
perform_qualitative_analysis(X_test_lr, y_test_lr, y_pred_lr, n=5)

In [None]:
#Sentiment model with sarcasm (Logistic Regression / TF-IDF)
clf_lr_sarc, vectorizer_lr_sarc, X_test_lr_sarc, y_test_lr_sarc, y_pred_lr_sarc = train_sentiment_model(twitter_df_preprocessed, sarcasm_model_path="sarcasm_model.pkl", vectorizer_path="sarcasm_vectorizer.pkl", model_out_path="sentiment_model_lr_sarcasm.pkl")
perform_qualitative_analysis(X_test_lr_sarc, y_test_lr_sarc, y_pred_lr_sarc, n=5)

In [None]:
# Sentiment model (Naive Bayes)
clf_nb, vectorizer_nb, X_test_nb, y_test_nb, y_pred_nb = train_sentiment_model_nb(twitter_df_preprocessed, model_out_path="sentiment_model_nb.pkl")
perform_qualitative_analysis(X_test_nb, y_test_nb, y_pred_nb, n=5)

In [None]:
#Sentiment model with sarcasm (Naive Bayes)
clf_nb_sarc, vectorizer_nb_sarc, X_test_nb_sarc, y_test_nb_sarc, y_pred_nb_sarc = train_sentiment_model_nb(twitter_df_preprocessed, sarcasm_model_path="sarcasm_model_nb.pkl", vectorizer_path="sarcasm_vectorizer_nb.pkl", model_out_path="sentiment_nb_sarcasm.pkl")
perform_qualitative_analysis(X_test_nb_sarc, y_test_nb_sarc, y_pred_nb_sarc, n=5)

In [None]:
# Keep a seeded 5% random sample of the Twitter data for the cells below
# (presumably to keep deep-learning training time manageable).
# NOTE(review): the original note claimed this yields 40,000 rows - confirm against the actual row count.
# NOTE: this rebinds the same name, so re-running the cell shrinks the data again.
twitter_df_preprocessed = twitter_df_preprocessed.sample(frac=0.05, random_state=42).reset_index(drop=True)

In [None]:
# Deep Learning sentiment model (BERT)
with tf.device('/GPU:0'):
  bert_model, bert_tokenizer, bert_history, X_test_bert, y_test_bert, y_pred_bert = train_bert_sentiment_model(twitter_df_preprocessed, model_out_path='bert_sentiment_model')
  perform_qualitative_analysis(X_test_bert, y_test_bert, y_pred_bert, n=5)

In [None]:
# Deep Learning sentiment model (roBERTa)
with tf.device('/GPU:0'):
  roberta_model, roberta_tokenizer, roberta_history, X_test_roberta, y_test_roberta, y_pred_roberta = train_roberta_sentiment_model(twitter_df_preprocessed, model_name="roberta-base", model_out_path="roberta_sentiment_model", epochs=3, batch_size=16, max_length=128)
  perform_qualitative_analysis(X_test_roberta, y_test_roberta, y_pred_roberta, n=5)

In [None]:
# Deep Learning sentiment model (LSTM)
with tf.device('GPU:0'):
  lstm_model, lstm_tokenizer, lstm_history, X_test_lstm, y_test_lstm, y_pred_lstm = train_lstm_model(twitter_df_preprocessed, model_out_path = "lstm_sentiment_model.keras")
  perform_qualitative_analysis(X_test_lstm, y_test_lstm, y_pred_lstm, n=5)

In [None]:
# Deep Learning sentiment model with sarcasm (LSTM)
# The output path uses the ".keras" extension: train_lstm_model derives the
# tokenizer file name from it, and with ".h5" the tokenizer dump ended up at the
# very same path and overwrote the saved model.
with tf.device('GPU:0'):
  lstm_model_sarc, lstm_tokenizer_sarc, lstm_history_sarc, X_test_lstm_sarc, y_test_lstm_sarc, y_pred_lstm_sarc = train_lstm_model(twitter_df_preprocessed, use_sarcasm=True, sarcasm_model_path='sarcasm_model.pkl', vectorizer_path='sarcasm_vectorizer.pkl', model_out_path='lstm_sentiment_sarc_model.keras')
  perform_qualitative_analysis(X_test_lstm_sarc, y_test_lstm_sarc, y_pred_lstm_sarc, n=5)

In [None]:
# Pass the saved sarcasm artefacts explicitly: without them the function has no
# sarcasm model to apply, so its "with sarcasm" runs would not differ from the baseline.
with tf.device('GPU:0'):
  ablation_results = perform_ablation_studies_lstm(
      twitter_df_preprocessed,
      sarcasm_model_path='sarcasm_model.pkl',
      sarcasm_vectorizer_path='sarcasm_vectorizer.pkl',
  )

In [None]:
# Word Clouds
generate_sentiment_wordclouds(twitter_df_preprocessed, sentiment_column='label', text_column='comment')

In [None]:
generate_sarcasm_wordclouds(reddit_df_preprocessed, sarcasm_column='label', text_column='comment')

In [None]:
# Reload the persisted baseline (no sarcasm feature) sentiment models from disk.
# Logistic regression and its TF-IDF vectorizer, as written by train_sentiment_model.
clf_lr = joblib.load("sentiment_model_lr.pkl")
vectorizer_lr = joblib.load("sentiment_model_lr_vectorizer.pkl")

# LSTM model plus its tokenizer. Despite the ".keras" suffix the tokenizer file
# is a joblib pickle (see train_lstm_model), hence joblib.load rather than load_model.
lstm_model = load_model("lstm_sentiment_model.keras")
lstm_tokenizer = joblib.load("lstm_sentiment_model_tokenizer.keras")

# BERT model and tokenizer from the directory written by save_pretrained.
bert_model = TFBertForSequenceClassification.from_pretrained("bert_sentiment_model")
bert_tokenizer = BertTokenizer.from_pretrained("bert_sentiment_model")

In [None]:
# Logistic Regression
explain_with_lime(model=clf_lr,model_type='logistic_regression', X_text=X_test_lr, index=3, vectorizer=vectorizer_lr)

# BERT
explain_with_lime(model=bert_model, model_type='bert', X_text=X_test_bert, index=1, tokenizer=bert_tokenizer)

# LSTM
explain_with_lime(model=lstm_model, model_type='lstm', X_text=X_test_lstm, index=1, tokenizer=lstm_tokenizer, max_len=100)

In [None]:
raw_test_comments = ["I love this product!", "Oh wow, another delay. Just what I needed."]
# Every model above was trained on clean_text() output, so new text has to go
# through the same preprocessing before it is vectorized/tokenized.
test_comments = [clean_text(comment) for comment in raw_test_comments]

In [None]:
X_vec = vectorizer_lr.transform(test_comments)
lr_preds = clf_lr.predict(X_vec)
print("Logistic Regression predictions:", lr_preds)

In [None]:
inputs_bert = bert_tokenizer(test_comments, padding=True, truncation=True, max_length=128, return_tensors='tf')
outputs_bert = bert_model(inputs_bert)
bert_preds = tf.argmax(outputs_bert.logits, axis=1).numpy()
print("BERT predictions:", bert_preds)

In [None]:
seqs = lstm_tokenizer.texts_to_sequences(test_comments)
padded = pad_sequences(seqs, maxlen=100, padding='post', truncating='post')

# Predict
lstm_probs = lstm_model.predict(padded)
lstm_preds = (lstm_probs > 0.5).astype(int).flatten()
print("LSTM predictions:", lstm_preds)