In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [18]:
# Read the raw spam dataset from the working directory. The file is decoded as
# latin1; NOTE(review): presumably because it is not valid UTF-8 — confirm.
data = pd.read_csv(
    'spam.csv',
    encoding='latin1',
)

In [19]:
# Keep just the two columns the pipeline uses and give them descriptive
# names: v1 becomes the class label, v2 becomes the message text.
data = data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})


In [20]:
# Normalise the label text, keep only rows whose label is a known class, then
# encode the classes numerically (ham -> 0, spam -> 1).
label_codes = {'ham': 0, 'spam': 1}
data = data.assign(label=data['label'].str.strip())       # trim surrounding whitespace
data = data[data['label'].isin(list(label_codes))]        # discard unrecognised labels
data = data.assign(label=data['label'].map(label_codes))  # text label -> integer code

In [21]:
# Discard every row that lacks either a label or a message.
data = data.dropna(
    subset=['label', 'message'],
    how='any',
)


In [22]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the partition, repeatable across runs.
messages, labels = data['message'], data['label']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Fit the TF-IDF vectorizer on the training messages only, then reuse that
# fitted vectorizer to transform the test messages, so the test split does
# not influence the fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = vectorizer.transform(raw_documents=X_test)

In [24]:
# Multinomial Naive Bayes: fit on the training TF-IDF matrix, then predict
# labels for the held-out messages. fit() returns the estimator itself, so
# construction and training are chained.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
nb_predictions = nb_model.predict(X_test_tfidf)

In [25]:
# Logistic Regression: max_iter is set to 1000 iterations before fitting on
# the training TF-IDF matrix; predictions are made on the held-out messages.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
lr_predictions = lr_model.predict(X_test_tfidf)

In [26]:
# Support Vector Machine with a linear kernel and regularisation strength
# C=1.0, trained on the TF-IDF features and applied to the held-out messages.
svm_model = SVC(kernel='linear', C=1.0).fit(X_train_tfidf, y_train)
svm_predictions = svm_model.predict(X_test_tfidf)

In [27]:
# Evaluate models: print the per-class classification report and the overall
# accuracy for each classifier against the same held-out labels.
# Headings after the first start with a newline so a blank line separates
# the sections in the printed output.
evaluated = (
    ("Naive Bayes Results:", nb_predictions),
    ("\nLogistic Regression Results:", lr_predictions),
    ("\nSVM Results:", svm_predictions),
)
for heading, predicted in evaluated:
    print(heading)
    print(classification_report(y_test, predicted))
    print("Accuracy:", accuracy_score(y_test, predicted))

Naive Bayes Results:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.72      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115

Accuracy: 0.9623318385650225

Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       0.99      0.77      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

Accuracy: 0.967713004484305

SVM Results:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.93       150

    accuracy                           0.98      1115
   mac