In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load dataset
# The CSV is expected to have exactly two columns; they are renamed so the
# rest of the script can refer to them as 'tweets' and 'class'.
df = pd.read_csv('train.csv')
df.columns = ['tweets', 'class']

# Binary label: sarcasm = 1, all others = 0
# The comparison is case-insensitive. Casting to str first means a missing or
# non-string class value falls into the "all others" bucket instead of raising
# AttributeError from calling .lower() on a float NaN.
df['label'] = (
    df['class'].astype(str).str.lower().eq('sarcasm').astype('int64')
)

# Text cleaning
def clean_text(text):
    """Normalise a raw tweet into lowercase, whitespace-collapsed plain text.

    Steps, in order: lowercase; remove URLs; remove @mentions and #hashtags
    (the tag word itself is dropped too); remove every character other than
    a-z, 0-9, whitespace and apostrophes; collapse runs of whitespace and
    trim both ends.

    Args:
        text: The raw tweet. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as empty text.

    Returns:
        The cleaned string, which may be empty.
    """
    # Missing values would otherwise be stringified into the literal tokens
    # "none" / "nan" and end up in the vocabulary as if they were words.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # The "www" branch requires a word boundary and a literal dot so that
    # stretched words such as "awwww" are not mistaken for a URL.
    text = re.sub(r"http\S+|\bwww\.\S+", "", text)
    text = re.sub(r"@\w+|#\w+", "", text)
    # Apostrophes are kept so contractions ("don't") survive as one token.
    text = re.sub(r"[^a-z0-9\s']", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Run every raw tweet through clean_text and keep the result in its own column.
df['cleaned_tweet'] = df['tweets'].apply(clean_text)

# Train-test split
# 80/20 split with a fixed seed; stratifying on the label keeps the class
# proportions the same in both partitions.
texts = df['cleaned_tweet']
labels = df['label']
split_options = {
    'test_size': 0.2,
    'random_state': 42,
    'stratify': labels,
}
X_train, X_test, y_train, y_test = train_test_split(texts, labels, **split_options)

# TF-IDF vectorizer with n-grams
# Unigrams and bigrams, vocabulary capped at 20000 features, sublinear tf.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=20000,
    sublinear_tf=True,
)
# The vectorizer is fitted on the training split only; the test split is
# transformed with the already-fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train logistic regression with class weights
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    C=0.75,
).fit(X_train_vec, y_train)

# Evaluate
# Predict on the held-out split, then print the per-class metrics followed by
# the confusion matrix.
y_pred = model.predict(X_test_vec)

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", matrix)


# Predict sarcasm on a custom input
# Interactive loop: read a sentence, clean it with clean_text, vectorize it
# with the fitted vectorizer and print the model's prediction together with
# both class probabilities. Typing 'exit' (any case, surrounding whitespace
# ignored) ends the loop.
while True:
    try:
        user_input = input("\nEnter a sentence (or type 'exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the user interrupted the prompt: leave the loop
        # quietly instead of ending the session with a traceback.
        print()
        break

    user_input = user_input.strip()
    if user_input.lower() == 'exit':
        break

    cleaned_input = clean_text(user_input)
    if not cleaned_input:
        # Nothing survived cleaning (blank line, or only URLs/mentions/
        # hashtags/punctuation). Scoring an empty string would only report
        # the model's bias, so ask for real text instead.
        print("Nothing to classify after cleaning - please enter some words.")
        continue

    input_vec = vectorizer.transform([cleaned_input])
    prediction = model.predict(input_vec)[0]
    # Probabilities are indexed as [0] = Regular, [1] = Sarcastic, matching
    # the 0/1 labels the model was trained on.
    proba = model.predict_proba(input_vec)[0]

    if prediction == 1:
        print("Prediction: Sarcastic 😏")
    else:
        print("Prediction: Regular 🙂")

    print(f"Confidence - Regular: {proba[0]*100:.2f}% | Sarcastic: {proba[1]*100:.2f}%")



Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.72      0.79     12146
           1       0.45      0.67      0.54      4136

    accuracy                           0.71     16282
   macro avg       0.66      0.69      0.66     16282
weighted avg       0.76      0.71      0.72     16282

Confusion Matrix:
 [[8741 3405]
 [1373 2763]]
