In [51]:
# Importing important libs
import numpy as np  # Used for numerical operations
import pandas as pd  # Used for handling and processing data in tabular format
import re  # Regular expressions for text cleaning

# Import NLP tools from NLTK
from nltk.corpus import stopwords  # To remove common stopwords (e.g., "is", "the", "and")
from nltk.stem import WordNetLemmatizer  # To lemmatize words (e.g., "running" → "run")

# Import machine learning and text processing libraries
from sklearn.model_selection import train_test_split  # To split the dataset into training and testing sets
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
# CountVectorizer: Converts text into word frequency counts
# TfidfTransformer: Converts word counts to TF-IDF scores
# TfidfVectorizer: Directly applies TF-IDF transformation in one step

from sklearn.linear_model import LogisticRegression  # Logistic Regression model for classification
from sklearn.metrics import accuracy_score, classification_report  # Evaluation metrics

# Import SMOTE for handling imbalanced datasets
from imblearn.over_sampling import SMOTE  # Used to balance dataset by generating synthetic samples for minority class

In [52]:
# Load the  dataset
df = pd.read_csv("mail_data.csv")

# Handle missing values (replace NaN with an empty string)
df = df.where(pd.notnull(df), '')

# Convert Category column to binary class where spam = 1, ham = 0
df['Category'] = df['Category'].map({'spam': 1, 'ham': 0})

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')) - {'you', 'for', 'it'}  # Keep all  key stopwords

# Function to clean text
def clean_text(text):
    text = text.lower()  # Converts all text  to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters, but keep numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Removes extra spaces from the text
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply text cleaning to the  messages
df['clean_msg'] = df['Message'].apply(clean_text)

# Convert text into numerical features using TF-IDF (with bigrams)
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
X_tfidf = vectorizer.fit_transform(df['clean_msg'])

# Target labels (Spam = 1, Ham = 0)
y = df['Category']

# Balance dataset using SMOTE (fix spam underrepresentation)
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train model using Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate model performance
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("Training Accuracy:", accuracy_score(y_train, y_train_pred))
print("Testing Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))


Training Accuracy: 0.9955958549222798
Testing Accuracy: 0.9932642487046632

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       990
           1       0.99      0.99      0.99       940

    accuracy                           0.99      1930
   macro avg       0.99      0.99      0.99      1930
weighted avg       0.99      0.99      0.99      1930



In [53]:
# Function to predict a new email
def predict_email(email):
    email_cleaned = clean_text(email)  # Apply same preprocessing
    email_tfidf = vectorizer.transform([email_cleaned])  # Convert to TF-IDF
    prediction = model.predict(email_tfidf)  # Get prediction
    return "Spam" if prediction[0] == 1 else "Not Spam"

# Test with a user input email
input_mail = input("Enter an email to classify: ")
print("Prediction:", predict_email(input_mail))


Enter an email to classify: You Have a New Friend Request!Someone is eager to connect with you.It's Time to Connect! Click below to either accept or ignore the friend request from this user. The choice is yours! Accept Request Ignore Request If you didn't make this request feel free to disregard this email
Prediction: Spam
