In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [3]:
# Load the raw dataset. latin-1 maps every byte to a character, so the read
# cannot fail on stray non-UTF-8 bytes in the message text.
raw_data = pd.read_csv("spam_ham_dataset.csv", encoding='latin-1')

# Keep only the two columns used downstream and rename 'text' to 'message'.
# The explicit .copy() makes `data` independent of `raw_data`, so the label
# assignment below cannot trigger pandas' SettingWithCopyWarning.
data = raw_data[['label', 'text']].rename(columns={'text': 'message'}).copy()

# Encode the string labels as integers: ham -> 0, spam -> 1.
LABEL_CODES = {'ham': 0, 'spam': 1}
encoded_labels = data['label'].map(LABEL_CODES)

# Series.map yields NaN for any value missing from the mapping. Fail here with
# a clear message instead of passing NaN labels on to the later cells.
unmapped_labels = data.loc[encoded_labels.isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'label' column: {list(unmapped_labels)!r}; "
        f"expected one of {sorted(LABEL_CODES)!r}"
    )

data['label'] = encoded_labels

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

# Turn the messages into TF-IDF features. The vocabulary (capped at 3000 terms,
# English stop words removed) is learned from the training split only and then
# reused unchanged for the test split.
vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes classifier; fit() returns the estimator itself.
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Score the held-out messages and report overall accuracy plus per-class
# precision / recall / F1.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

Accuracy: 0.95
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       742
           1       0.89      0.93      0.91       293

    accuracy                           0.95      1035
   macro avg       0.93      0.94      0.93      1035
weighted avg       0.95      0.95      0.95      1035



In [None]:
# User input for classification
def classify_email(message=None):
    """Classify one email message and print the verdict.

    Args:
        message: Text of the email to classify. When omitted (``None``), the
            text is read interactively with ``input``.

    Returns:
        None. The verdict is printed: "Spam" when the module-level ``model``
        predicts class 1 for the message as transformed by the module-level
        ``vectorizer``, otherwise "Not Spam". A blank message is reported
        as such and is not scored.
    """
    user_input = input("Enter an email message: ") if message is None else message

    # A blank message has no content to score; say so instead of printing a
    # verdict that was not derived from any message text.
    if not user_input.strip():
        print("No message entered; nothing to classify.")
        return

    # transform() expects an iterable of documents, hence the one-item list.
    user_input_tfidf = vectorizer.transform([user_input])
    prediction = model.predict(user_input_tfidf)[0]
    print("Spam" if prediction == 1 else "Not Spam")

# Prompt once for an email message and print whether the trained model
# classifies it as spam.
# NOTE(review): this call waits on input(), so a non-interactive "Run All"
# (e.g. automated execution) may stall or fail here — confirm before automating.
classify_email()