In [1]:
import pandas as pd
import json
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pickle

data = []
for line in open("Sarcasm_Headlines_Dataset.json", "r"):
    data.append(json.loads(line))

df = pd.DataFrame(data)
df = df[['headline', 'is_sarcastic']]
df.rename(columns={'headline':'text','is_sarcastic':'label'}, inplace=True)

def normalize(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    return text

df['text_norm'] = df['text'].apply(normalize)

# Marker feature
markers = ["oh great","yeah right","love that","just what i needed","sure"]
df['has_marker'] = df['text'].str.lower().apply(lambda t: int(any(m in t for m in markers)))


In [2]:
X = df['text_norm']
y = df['label']
X_train, X_test, y_train, y_test, df_train, df_test = train_test_split(
    X, y, df, test_size=0.2, stratify=y, random_state=42
)

tfidf_word = TfidfVectorizer(max_features=5500, ngram_range=(1,2), stop_words='english')
tfidf_char = TfidfVectorizer(analyzer='char', ngram_range=(3,5), max_features=3000)

Xw_train = tfidf_word.fit_transform(X_train)
Xw_test  = tfidf_word.transform(X_test)
Xc_train = tfidf_char.fit_transform(X_train)
Xc_test  = tfidf_char.transform(X_test)


In [3]:
train_marker = np.array(df_train['has_marker']).reshape(-1,1)
test_marker  = np.array(df_test['has_marker']).reshape(-1,1)

X_train_final = hstack([Xw_train, Xc_train, csr_matrix(train_marker)])
X_test_final  = hstack([Xw_test, Xc_test, csr_matrix(test_marker)])

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
models = {
    'LogisticRegression': LogisticRegression(class_weight='balanced', max_iter=2000),
    'LinearSVC': LinearSVC(class_weight='balanced', max_iter=2000),
    'MultinomialNB': MultinomialNB(),
    'BernoulliNB': BernoulliNB(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(n_estimators=200),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=200),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=200),
    'AdaBoost': AdaBoostClassifier(n_estimators=200),
    'KNeighbors': KNeighborsClassifier(n_neighbors=5)
}

In [5]:
results = []

for name, model in models.items():
    print(f"\n{name}:")
    model.fit(X_train_final, y_train)
    preds = model.predict(X_test_final)
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    print(f"Accuracy: {acc:.4f}, F1: {f1:.4f}")
    results.append([name, acc, f1])





LogisticRegression:
Accuracy: 0.8469, F1: 0.8294

LinearSVC:
Accuracy: 0.8413, F1: 0.8218

MultinomialNB:
Accuracy: 0.8310, F1: 0.8053

BernoulliNB:
Accuracy: 0.7965, F1: 0.7782

DecisionTree:
Accuracy: 0.6997, F1: 0.6594

RandomForest:
Accuracy: 0.8016, F1: 0.7692

ExtraTrees:
Accuracy: 0.8149, F1: 0.7843

GradientBoosting:
Accuracy: 0.7917, F1: 0.7682

AdaBoost:
Accuracy: 0.7486, F1: 0.7165

KNeighbors:
Accuracy: 0.6853, F1: 0.5393


In [6]:
results_df = pd.DataFrame(results, columns=['Model','Accuracy','F1'])
results_df = results_df.sort_values(by='F1', ascending=False).reset_index(drop=True)
print("\nSummary of 10 Models:\n", results_df)


Summary of 10 Models:
                 Model  Accuracy        F1
0  LogisticRegression  0.846874  0.829441
1           LinearSVC  0.841258  0.821849
2       MultinomialNB  0.830962  0.805262
3          ExtraTrees  0.814863  0.784297
4         BernoulliNB  0.796518  0.778209
5        RandomForest  0.801572  0.769164
6    GradientBoosting  0.791651  0.768173
7            AdaBoost  0.748596  0.716487
8        DecisionTree  0.699738  0.659448
9          KNeighbors  0.685324  0.539326


In [7]:
   # Find best model
best_idx = results_df['F1'].idxmax()
best_model_name = results_df.loc[best_idx, 'Model']
best_model_f1 = results_df.loc[best_idx, 'F1']
best_model_acc = results_df.loc[best_idx, 'Accuracy']

print(f"Best Model: {best_model_name}")
print(f"Accuracy: {best_model_acc:.4f}")
print(f"F1 Score: {best_model_f1:.4f}")

# best model
best_model_object = models[best_model_name]

import pickle
with open('best_sarcasm_model.pkl','wb') as f:
    pickle.dump(best_model_object, f)

# Save  vectorizers used
with open('tfidf_word.pkl','wb') as f:
    pickle.dump(tfidf_word, f)

with open('tfidf_char.pkl','wb') as f:
    pickle.dump(tfidf_char, f)


Best Model: LogisticRegression
Accuracy: 0.8469
F1 Score: 0.8294
