In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)
import re
import string
from collections import Counter

print("✓ All libraries imported successfully!\n")

✓ All libraries imported successfully!



In [2]:
df = pd.read_csv(r'C:\Users\USER\Documents\email_data.csv')

df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Map 'ham' to 0 and 'spam' to 1
df['Label'] = df['Category'].map({'ham': 0, 'spam': 1})

# Define features (X) and target (y)
X = df['Message']
y = df['Label']

In [10]:
# Text cleaning function
def clean_text(text):
    """
    Clean and preprocess text data
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove phone numbers
    text = re.sub(r'\d{5,}', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text

# Apply text cleaning
print("\nCleaning text data...")
df['Clean_Message'] = df['Message'].apply(clean_text)
print("✓ Text cleaning complete!")
print(f"\nExample - Original vs Cleaned:")
print(f"Original: {df['Message'].iloc[3]}")
print(f"Cleaned: {df['Clean_Message'].iloc[3]}")


Cleaning text data...
✓ Text cleaning complete!

Example - Original vs Cleaned:
Original: U dun say so early hor... U c already then say...
Cleaned: u dun say so early hor u c already then say


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Transform the text data
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [14]:
# Initialize and train the classifier
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

In [15]:
# Make predictions
y_pred = clf.predict(X_test_tfidf)

# Print accuracy and detailed report
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9767

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       966
           1       1.00      0.83      0.90       149

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [16]:
def predict_spam(text):
    text_tfidf = vectorizer.transform([text])
    prediction = clf.predict(text_tfidf)
    return "Spam" if prediction[0] == 1 else "Ham"

# Example tests
print(f"Result 1: {predict_spam('Congratulations! You won a $1,000 Walmart gift card. Click here to claim.')}")
print(f"Result 2: {predict_spam('Hey, are we still meeting for lunch at 12?')}")

Result 1: Spam
Result 2: Ham


In [17]:
# 1. Load the new emails
new_data =pd.read_csv(r'C:\Users\USER\Documents\email_data.csv')

# 2. Transform the text using the ALREADY fitted vectorizer
# Important: Use .transform(), NOT .fit_transform()
new_emails_tfidf = vectorizer.transform(new_data['Message'])

# 3. Predict the categories
predictions = clf.predict(new_emails_tfidf)

# 4. Add the results back to your dataframe
new_data['Prediction'] = predictions
new_data['Label'] = new_data['Prediction'].map({0: 'ham', 1: 'spam'})

# 5. Save or view the results
print(new_data.head())
new_data.to_csv('classified_emails.csv', index=False)

  Category                                            Message  Prediction  \
0      ham  Go until jurong point, crazy.. Available only ...           0   
1      ham                      Ok lar... Joking wif u oni...           0   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...           1   
3      ham  U dun say so early hor... U c already then say...           0   
4      ham  Nah I don't think he goes to usf, he lives aro...           0   

  Label  
0   ham  
1   ham  
2  spam  
3   ham  
4   ham  
