In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load: read the raw CSV from the working directory, decoding it as latin-1.
data = pd.read_csv("./spam.csv", encoding="latin-1")

# Tidy: keep only columns v1 and v2 and give them descriptive names
# (v1 -> label, v2 -> message).
data = data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

print("Dataset shape:", data.shape)
print(data.head())

# Split: hold out 30% of the rows for testing; the fixed seed keeps the
# partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.3,
    random_state=42,
)

# Vectorize: the TF-IDF vocabulary (capped at 3000 features, English stop
# words removed) is learned from the training messages only, then reused
# unchanged to encode the test messages.
vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train: fit() returns the estimator itself, so construction and fitting
# can be chained into a single expression.
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict: label every held-out message.
y_pred = model.predict(X_test_tfidf)

# Evaluate: pair each report heading with its metric, then print them in
# order (accuracy, confusion matrix, per-class report).
evaluation = (
    ("\nAccuracy:", accuracy_score(y_test, y_pred)),
    ("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("\nClassification Report:\n", classification_report(y_test, y_pred)),
)
for heading, result in evaluation:
    print(heading, result)

# Spot check: run a few hand-written messages through the fitted
# vectorizer and model.
test_emails = [
    "Congratulations! You won a lottery of $5000. Claim now.",
    "Are you coming to the meeting tomorrow?",
    "Lowest interest loans available. Apply now!",
]

test_features = vectorizer.transform(test_emails)
predictions = model.predict(test_features)

print("\nTest Predictions:")
for email, label in zip(test_emails, predictions):
    print(f"Email: {email} -> Prediction: {label}")


Dataset shape: (5572, 2)
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

Accuracy: 0.9754784688995215

Confusion Matrix:
 [[1453    0]
 [  41  178]]

Classification Report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.99      1453
        spam       1.00      0.81      0.90       219

    accuracy                           0.98      1672
   macro avg       0.99      0.91      0.94      1672
weighted avg       0.98      0.98      0.97      1672


Test Predictions:
Email: Congratulations! You won a lottery of $5000. Claim now. -> Prediction: spam
Email: Are you coming to the meeting tomorrow? -> Prediction: ham
Email: Lowest interest l