In [2]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Step 2: Upload dataset
from google.colab import files
uploaded = files.upload()  # Upload your dataset here (e.g., 'spam.csv')

# Load the dataset (Assume it's named 'spam.csv')
df = pd.read_csv("spam.csv", encoding="latin-1")

# Step 3: Preprocess the data
# Keeping only the required columns
df = df.rename(columns={"v1": "label", "v2": "message"})[['label', 'message']]

# Map labels to binary (spam = 1, ham = 0)
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

# Step 4: Split the data into training and testing sets
X = df['message']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Step 6: Train Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_vect, y_train)
nb_preds = nb_model.predict(X_test_vect)

# Evaluate Naive Bayes model
print("Naive Bayes Classifier Performance:")
print(f"Accuracy: {accuracy_score(y_test, nb_preds):.2f}")
print(classification_report(y_test, nb_preds))

# Step 7: Train Support Vector Machine (SVM) classifier
svm_model = SVC(kernel='linear', probability=True)
svm_model.fit(X_train_vect, y_train)
svm_preds = svm_model.predict(X_test_vect)

# Evaluate SVM model
print("Support Vector Machine Classifier Performance:")
print(f"Accuracy: {accuracy_score(y_test, svm_preds):.2f}")
print(classification_report(y_test, svm_preds))

# Step 8: Test with a sample email
sample_email = ["Congratulations! You've won a free vacation to the Bahamas! Click here to claim."]
sample_vect = vectorizer.transform(sample_email)

print("Prediction for Naive Bayes:", "Spam" if nb_model.predict(sample_vect)[0] == 1 else "Ham")
print("Prediction for SVM:", "Spam" if svm_model.predict(sample_vect)[0] == 1 else "Ham")


Saving spam.csv to spam.csv
Naive Bayes Classifier Performance:
Accuracy: 0.98
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1453
           1       1.00      0.81      0.90       219

    accuracy                           0.98      1672
   macro avg       0.99      0.91      0.94      1672
weighted avg       0.98      0.98      0.97      1672

Support Vector Machine Classifier Performance:
Accuracy: 0.98
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1453
           1       0.98      0.86      0.92       219

    accuracy                           0.98      1672
   macro avg       0.98      0.93      0.95      1672
weighted avg       0.98      0.98      0.98      1672

Prediction for Naive Bayes: Spam
Prediction for SVM: Spam
