In [5]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, roc_auc_score

df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])

df['label'] = df['label'].map({'ham': 0, 'spam': 1})  
df['message'] = df['message'].str.lower()  
df['message'] = df['message'].str.replace(r'\d+', '', regex=True) 
df['message'] = df['message'].str.replace(r'[^\w\s]', '', regex=True) 
df['message'] = df['message'].str.strip() 


Logistic Regression
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.75      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

ROC AUC: 0.9836522294940737

Naive Bayes
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.73      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.97      0.96      0.96      1115

ROC AUC: 0.9800047243875665

Support Vector Machine
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.97      0.89      0.93       1



In [None]:
X = df['message']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42

In [9]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=2)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [11]:
#Training
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Support Vector Machine': LinearSVC()
}

# Evaluate
for name, model in models.items():
    print(f"\n{name}")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    
    # For models that support probability or decision_function
    if hasattr(model, 'decision_function'):
        y_scores = model.decision_function(X_test_tfidf)
        roc_auc = roc_auc_score(y_test, y_scores)
    elif hasattr(model, 'predict_proba'):
        y_scores = model.predict_proba(X_test_tfidf)[:, 1]
        roc_auc = roc_auc_score(y_test, y_scores)
    else:
        roc_auc = "N/A"

    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("ROC AUC:", roc_auc)


Logistic Regression
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.75      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

ROC AUC: 0.9836522294940737

Naive Bayes
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.73      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.97      0.96      0.96      1115

ROC AUC: 0.9800047243875665

Support Vector Machine
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.97      0.89      0.93       1

