In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# Reaching this line means every import above resolved without raising.
print("Libraries imported successfully.")

In [None]:
# Read the two headline dumps. Each file stores one JSON object per line,
# which is why `lines=True` is required.
source_files = ['Sarcasm_Headlines_Dataset.json', 'Sarcasm_Headlines_Dataset_v2.json']
df1, df2 = (pd.read_json(path, lines=True) for path in source_files)

# Stack both frames, keep only the feature ('headline') and the label
# ('is_sarcastic'), then discard rows that repeat the same headline/label
# pair and renumber the index from zero.
df = (
    pd.concat([df1, df2], ignore_index=True)
    .loc[:, ['headline', 'is_sarcastic']]
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f"Total headlines loaded: {len(df)}")
df.head()

In [None]:
# Hand-crafted numeric signals derived from the raw headline text.
# Each entry maps the new column name to a function of one headline string.
all_caps_pattern = re.compile(r"\b[A-Z]{2,}\b")

feature_builders = {
    # Punctuation: number of '!' and '?' characters.
    'exclamation_count': lambda text: text.count('!'),
    'question_count': lambda text: text.count('?'),
    # Exaggeration: standalone runs of two or more ASCII capital letters.
    'all_caps_count': lambda text: len(all_caps_pattern.findall(text)),
    # Length: number of whitespace-separated tokens.
    'word_count': lambda text: len(text.split()),
}

headlines = df['headline']
for column_name, builder in feature_builders.items():
    df[column_name] = headlines.apply(builder)

print("Feature engineering complete.")
df.head()

In [None]:
# Column roles used by the rest of the notebook.
text_feature = 'headline'
meta_features = ['exclamation_count', 'question_count', 'all_caps_count', 'word_count']
target = 'is_sarcastic'

# Label vector and feature frame (everything except the label column).
y = df[target]
X = df.drop(columns=target)

# Hold out 20% of the rows, stratified on the label so both splits keep the
# same class balance. The held-out part is only used at the very end to
# produce Predictions.csv.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

for split_label, split_frame in (("Training", X_train), ("Test", X_test)):
    print(f"{split_label} set size: {len(split_frame)}")

In [None]:
# TF-IDF over unigrams and bigrams of the headline, English stop words
# removed, vocabulary capped at the 5000 top-ranked terms.
text_transformer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=5000,
)

# Standardise the numeric meta-features.
meta_transformer = StandardScaler()

# Route each column group to its own transformer.
# `text_feature` is a plain string (not a list) so the vectoriser receives a
# 1-D column of documents; `meta_features` is a list, so the scaler receives
# a 2-D block. Any column not named here is dropped.
column_steps = [
    ('tfidf', text_transformer, text_feature),
    ('meta', meta_transformer, meta_features),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='drop')

In [None]:
# Build two candidate pipelines on top of the same preprocessing step,
# score both with 5-fold cross-validation on the training split, and keep
# whichever one actually scores higher as `final_model`.

# Pipeline 1: Logistic Regression
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42))
])

# Pipeline 2: Support Vector Machine (LinearSVC)
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC(random_state=42))
])
# NOTE: both pipelines reference the same `preprocessor` object.
# cross_val_score works on clones, so nothing is fitted in place here, but
# fitting both pipelines directly would make them share fitted state.

# --- Evaluate using 5-Fold Cross-Validation on the TRAINING data ---
print("--- Model Evaluation (Cross-Validation) ---")

# Evaluate Logistic Regression
lr_scores = cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring='accuracy')
lr_mean_accuracy = lr_scores.mean()
print(f"Logistic Regression Mean Accuracy: {lr_mean_accuracy:.4f}")

# Evaluate SVM
svm_scores = cross_val_score(svm_pipeline, X_train, y_train, cv=5, scoring='accuracy')
svm_mean_accuracy = svm_scores.mean()
print(f"Linear SVM Mean Accuracy: {svm_mean_accuracy:.4f}")

# --- Decision ---
# Pick the winner from the measured scores instead of assuming one.
# A tie keeps the SVM, which was the notebook's previous unconditional choice.
if svm_mean_accuracy >= lr_mean_accuracy:
    print("\nSVM performed better. Selecting it as the final model.")
    final_model = svm_pipeline
else:
    print("\nLogistic Regression performed better. Selecting it as the final model.")
    final_model = lr_pipeline

In [None]:
# Fit the selected pipeline on the whole training split.
final_model.fit(X_train, y_train)

# Score it once on the held-out rows it has never seen.
test_predictions = final_model.predict(X_test)
final_accuracy = accuracy_score(y_test, test_predictions)
print(f"Final Model Accuracy on Test Set: {final_accuracy:.4f}")

# Pair every held-out headline with the model's 0/1 prediction.
# Starting from the X_test slice keeps its row index, exactly as building the
# frame from the 'headline' Series would.
predictions_df = X_test[['headline']].assign(prediction=test_predictions)

# Persist the result without the index column.
predictions_df.to_csv('Predictions.csv', index=False)

print("\n'Predictions.csv' file has been created successfully!")
predictions_df.head()