In [5]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import joblib
import os

# -----------------------------
# 1. Load dataset
# -----------------------------
# Read the raw dataset from the current working directory.
df = pd.read_csv("sarcasm2.csv")

# Normalise every header (trim surrounding whitespace, lowercase) so the
# column lookups below do not depend on how the CSV spelled them.
df = df.rename(columns=lambda header: header.strip().lower())

# Use the conventional 'text' / 'label' columns when they exist; otherwise
# fall back to the first column for text and the last column for labels.
available_columns = df.columns
text_col = 'text' if 'text' in available_columns else available_columns[0]
label_col = 'label' if 'label' in available_columns else available_columns[-1]

# -----------------------------
# 2. Clean text
# -----------------------------
def clean_text(text):
    """Return a normalised copy of *text* for vectorisation.

    Steps, in order:
      1. remove every run that starts with ``http`` and continues up to the
         next whitespace character;
      2. replace each character that is not an ASCII letter, ``!`` or ``?``
         with a single space (runs of spaces are not collapsed);
      3. lowercase the result and trim leading/trailing whitespace.

    ``text`` must be a ``str``; the caller is expected to coerce beforehand.
    """
    without_urls = re.sub(r"http\S+", "", text)
    letters_and_marks = re.sub(r"[^a-zA-Z!?]", " ", without_urls)
    return letters_and_marks.lower().strip()

# Drop rows whose text is missing before coercing to str: astype(str) can
# turn a missing value into the literal string "nan", which would then be
# cleaned and learned as if it were a real word. The .copy() makes the
# filtered frame independent so the column assignment below is unambiguous.
df = df.dropna(subset=[text_col]).copy()
df[text_col] = df[text_col].astype(str).apply(clean_text)

# -----------------------------
# 3. Encode labels
# -----------------------------
# Replace the raw label values with consecutive integer ids.
le = LabelEncoder()
df[label_col] = le.fit_transform(df[label_col])

# The position of each entry in le.classes_ is the id it was encoded to, so
# the mapping can be read off directly instead of calling transform() again.
# .tolist() converts NumPy scalars to plain Python values, which keeps the
# printed mapping free of np.int64(...) wrappers under NumPy 2.
label_mapping = {
    original: encoded for encoded, original in enumerate(le.classes_.tolist())
}
print("Label mapping:", label_mapping)

# -----------------------------
# 4. Remove rare labels
# -----------------------------
# Keep only rows whose label occurs more than once in the dataset.
# NOTE(review): presumably this protects the stratified split later in the
# script, which cannot place a single-member class in both partitions.
label_frequency = df[label_col].value_counts()
occurrences_per_row = df[label_col].map(label_frequency)
df = df[occurrences_per_row > 1]

# -----------------------------
# 5. Split data
# -----------------------------
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both partitions; the fixed seed makes the
# split reproducible.
features = df[text_col]
targets = df[label_col]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    stratify=targets,
    random_state=42,
)

# -----------------------------
# 6. Vectorize text
# -----------------------------
# TF-IDF over unigrams and bigrams, with English stop words removed and the
# vocabulary capped at 8000 features.
tfidf_settings = {
    "ngram_range": (1, 2),
    "stop_words": 'english',
    "max_features": 8000,
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary and IDF weights from the training split only, then
# reuse them unchanged to transform the held-out split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# -----------------------------
# 7. Handle imbalance
# -----------------------------
# Oversample the training matrix only; the test split is not resampled.
smote = SMOTE(random_state=42)
resampled_training_set = smote.fit_resample(X_train_vec, y_train)
X_train_vec_res, y_train_res = resampled_training_set

# -----------------------------
# 8. Train RandomForest
# -----------------------------
# 200-tree random forest with a fixed seed, trained on the resampled data.
# NOTE(review): class_weight='balanced' is applied on top of the SMOTE
# resampling done earlier in the script; likely redundant, confirm intent.
forest_settings = {
    "n_estimators": 200,
    "class_weight": 'balanced',
    "random_state": 42,
}
model = RandomForestClassifier(**forest_settings)
model.fit(X_train_vec_res, y_train_res)

# -----------------------------
# 9. Evaluate
# -----------------------------
# Score the model on the held-out split and print overall accuracy (rounded
# to three decimals) followed by the per-class report.
preds = model.predict(X_test_vec)
test_accuracy = round(accuracy_score(y_test, preds), 3)
per_class_report = classification_report(y_test, preds)
print("\n✅ Accuracy:", test_accuracy)
print("\nClassification Report:\n", per_class_report)

# -----------------------------
# 10. Save model, vectorizer, label encoder
# -----------------------------
# NOTE(review): the "model" directory is created here, but every artifact
# below is written to the current working directory, so the directory stays
# empty. Paths are left unchanged because external loaders may rely on them;
# confirm which location is intended.
os.makedirs("model", exist_ok=True)

# Persist everything needed to reproduce predictions: the classifier, the
# fitted vectorizer, and the label encoder.
artifacts = (
    (model, "sarcasm_model.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
    (le, "label_encoder.pkl"),
)
for artifact, filename in artifacts:
    joblib.dump(artifact, filename)

print("\n🎉 Model, vectorizer, and label encoder saved successfully!")


Label mapping: {np.int64(0): np.int64(0), np.int64(1): np.int64(1)}

✅ Accuracy: 0.977

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       115
           1       1.00      0.95      0.98       107

    accuracy                           0.98       222
   macro avg       0.98      0.98      0.98       222
weighted avg       0.98      0.98      0.98       222


🎉 Model, vectorizer, and label encoder saved successfully!
