In [13]:
# Importing required libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import resample

# Step 1: Load the dataset
df = pd.read_csv('spam.csv', encoding='latin-1')

# Step 2: Data Preprocessing
# Keep only necessary columns and rename them
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

# Convert labels to numeric format (ham = 0, spam = 1)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Check class distribution
print("Class distribution before balancing:")
print(df['label'].value_counts())

# Balance the dataset if needed
spam = df[df['label'] == 1]
ham = df[df['label'] == 0]

if len(spam) < len(ham):
    spam_upsampled = resample(spam, replace=True, n_samples=len(ham), random_state=42)
    df = pd.concat([ham, spam_upsampled])
elif len(spam) > len(ham):
    ham_upsampled = resample(ham, replace=True, n_samples=len(spam), random_state=42)
    df = pd.concat([spam, ham_upsampled])

print("Class distribution after balancing:")
print(df['label'].value_counts())

# Text preprocessing: clean the messages
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df['message'] = df['message'].apply(clean_text)

# Step 3: Train-Test Split
X = df['message']  # Features: email/text message
y = df['label']  # Labels: 0 (ham) or 1 (spam)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 4: Text Vectorization
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_features=5000)
X_train_cv = vectorizer.fit_transform(X_train)
X_test_cv = vectorizer.transform(X_test)

# Step 5: Train the Naive Bayes Model
model = MultinomialNB()
model.fit(X_train_cv, y_train)

# Step 6: Make Predictions on the Test Data
y_pred = model.predict(X_test_cv)

# Step 7: Evaluate the Model
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['ham', 'spam']))

# Print accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Step 8: Function to classify user input (spam or ham)
def classify_email(text):
    # Preprocess the input text
    cleaned_text = clean_text(text)

    # Vectorize the input text
    text_cv = vectorizer.transform([cleaned_text])

    # Predict using the trained model
    prediction = model.predict(text_cv)

    # Convert the numeric prediction back to 'ham' or 'spam'
    return 'spam' if prediction[0] == 1 else 'ham'

# Step 9: Accept user input for classification
user_input = input("Enter the text to classify as spam or ham: ")
result = classify_email(user_input)
print(f"The given text is: {result}")


Class distribution before balancing:
label
0    4825
1     747
Name: count, dtype: int64
Class distribution after balancing:
label
0    4825
1    4825
Name: count, dtype: int64
Classification Report:
              precision    recall  f1-score   support

         ham       0.95      0.98      0.96       965
        spam       0.98      0.95      0.96       965

    accuracy                           0.96      1930
   macro avg       0.96      0.96      0.96      1930
weighted avg       0.96      0.96      0.96      1930

Accuracy: 0.96
Enter the text to classify as spam or ham: Hurry up! Get a FREE iPhone 14 today! Limited stock available. Click here to claim yours: http://freeiphone.com
The given text is: spam
