In [2]:
# ================================================
# 💬 Smart Sarcasm Detector - Model Training (with Confidence Scores)
# ================================================

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# -----------------------------
# 1. Load dataset
# -----------------------------
df = pd.read_csv("sarcasm.csv")
df.rename(columns={"tweet": "text", "is_sarcastic": "label"}, inplace=True)

# -----------------------------
# 2. Clean text
# -----------------------------
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|[^a-z\s]", "", text)
    return text.strip()

df["clean_tweet"] = df["text"].apply(clean_text)

# -----------------------------
# 3. Feature extraction
# -----------------------------
def extract_features(text):
    cues = [
        "yeah right", "sure", "obviously", "totally", "great", "love", "perfect",
        "amazing", "as if", "just what i needed", "oh wonderful", "thanks a lot"
    ]
    cue_present = int(any(cue in text for cue in cues))
    contrast = int(any(w in text for w in ["not", "but", "though", "however", "although"]))
    return [cue_present, contrast]

def irony_features(text):
    pos_words = ['love', 'great', 'amazing', 'wonderful', 'happy', 'perfect', 'nice', 'enjoy']
    neg_words = ['boring', 'bad', 'hate', 'terrible', 'worst', 'ugly', 'awful', 'sad']
    polite = ['ok', 'fine', 'thanks', 'good']
    rude = ['stupid', 'bad', 'dumb', 'ugly', 'taste']

    pos_neg_mix = int(any(p in text for p in pos_words) and any(n in text for n in neg_words))
    polite_rude_mix = int(any(p in text for p in polite) and any(r in text for r in rude))
    self_neg = int('i' in text and any(w in text for w in ['not','don’t','never','no']))

    words = text.split()
    flip = 0
    for i in range(1,len(words)):
        if any(w in words[i-1] for w in pos_words) and any(w in words[i] for w in neg_words): 
            flip = 1
            break
        if any(w in words[i-1] for w in neg_words) and any(w in words[i] for w in pos_words): 
            flip = 1
            break

    return [pos_neg_mix, polite_rude_mix, self_neg, flip]

# Compute handcrafted features
extra_features = np.array([extract_features(t) + irony_features(t) for t in df["clean_tweet"]])

# -----------------------------
# 4. Train-test split
# -----------------------------
X_train_text, X_test_text, y_train, y_test, X_train_extra, X_test_extra = train_test_split(
    df["clean_tweet"], df["label"], extra_features, test_size=0.3, random_state=42
)

vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

X_train_combined = np.hstack([X_train_tfidf.toarray(), X_train_extra])
X_test_combined = np.hstack([X_test_tfidf.toarray(), X_test_extra])

# -----------------------------
# 5. Train model
# -----------------------------
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train_combined, y_train)

# -----------------------------
# 6. Evaluate with confidence
# -----------------------------
y_pred = model.predict(X_test_combined)
y_prob = model.predict_proba(X_test_combined)

# Confidence score for sarcastic class (class 1)
confidences = (y_prob[:, 1] * 100).round(2)

print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("✅ Accuracy:", round(accuracy_score(y_test, y_pred) * 100, 2), "%")

# Example of confidence preview
sample_results = pd.DataFrame({
    "Text": X_test_text.sample(10, random_state=42).values,
    "True Label": y_test.sample(10, random_state=42).values,
    "Predicted": y_pred[:10],
    "Sarcasm Confidence (%)": confidences[:10]
})
print("\n🧠 Sample Confidence Predictions:\n", sample_results)

# -----------------------------
# 7. Save model and vectorizer
# -----------------------------
joblib.dump(model, "model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")
print("\n✅ Model and vectorizer saved successfully as 'model.pkl' and 'vectorizer.pkl'!")



📊 Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99        80
           1       1.00      0.99      1.00       102

    accuracy                           0.99       182
   macro avg       0.99      1.00      0.99       182
weighted avg       0.99      0.99      0.99       182

✅ Accuracy: 99.45 %

🧠 Sample Confidence Predictions:
                                                 Text  True Label  Predicted  \
0  perfect it started raining right after i washe...           1          1   
1           looking forward to my vacation next week           0          0   
2           i had a great time with my friends today           0          0   
3                 the concert last night was amazing           0          1   
4           i had a great time with my friends today           0          1   
5  yeah because nothing says fun like doing taxes...           1          1   
6           i had a great time with my