In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import string
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:
# Load Dataset

df = pd.read_csv("spam.csv", encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'message']
df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Data Cleaning

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = " ".join([word for word in text.split() if word not in stopwords.words('english')])
    return text

df['cleaned'] = df['message'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\janga\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [4]:
# Encode Labels

df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

In [5]:
# Text Vectorization

vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df['cleaned']).toarray()
y = df['label_num']


In [6]:
# Split Data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Train Model

model = MultinomialNB()
model.fit(X_train, y_train)


In [8]:
# Evaluate Model

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9775784753363229
[[965   0]
 [ 25 125]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [9]:
# Test with Custom Input

def predict_spam(text):
    cleaned = clean_text(text)
    vectorized = vectorizer.transform([cleaned])
    pred = model.predict(vectorized)[0]
    return "Spam" if pred == 1 else "Not Spam"

print(predict_spam("Congratulations! You've won a $1000 Walmart gift card. Click here to claim!"))
print(predict_spam("Hey Addy, are we still meeting tomorrow?"))


Spam
Not Spam


In [10]:
# GUI using Tkinter

import tkinter as tk

def detect():
    msg = entry.get("1.0",'end-1c')
    result = predict_spam(msg)
    output_label.config(text=result, fg="red" if result=="Spam" else "green")

root = tk.Tk()
root.title("Spam Mail Detector")

tk.Label(root, text="Enter Email/Text:").pack()
entry = tk.Text(root, height=5, width=50)
entry.pack()
tk.Button(root, text="Detect", command=detect).pack()
output_label = tk.Label(root, text="", font=('Helvetica', 14))
output_label.pack()
root.mainloop()


In [2]:
#1Ô∏è‚É£ spam_detector.py ‚Äî Train and Save Model

import pandas as pd
import numpy as np
import string
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

nltk.download('stopwords')

# -----------------------
# Load Dataset
# -----------------------
df = pd.read_csv("spam.csv", encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

# -----------------------
# Clean Text
# -----------------------
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = " ".join([word for word in text.split() if word not in stopwords.words('english')])
    return text

df['cleaned'] = df['message'].apply(clean_text)
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# -----------------------
# Vectorization
# -----------------------
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df['cleaned']).toarray()
y = df['label_num']

# -----------------------
# Train-Test Split
# -----------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -----------------------
# Train Model
# -----------------------
model = MultinomialNB()
model.fit(X_train, y_train)

# -----------------------
# Evaluate
# -----------------------
y_pred = model.predict(X_test)
print("‚úÖ Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# -----------------------
# Save Model and Vectorizer
# -----------------------
import os
#os.makedirs('Spam_Email_Detection_Project', exist_ok=True)
pickle.dump(model, open(r"C:\Users\janga\ML_Projects\Spam_Email_Detection_Project\spam_model.pkl", 'wb'))
pickle.dump(vectorizer, open(r"C:\Users\janga\ML_Projects\Spam_Email_Detection_Project\vectorizer.pkl", 'wb'))

print("\nüéâ Model and vectorizer saved successfully!")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\janga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


‚úÖ Accuracy: 0.9775784753363229

Confusion Matrix:
 [[965   0]
 [ 25 125]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115


üéâ Model and vectorizer saved successfully!


In [2]:
# 2Ô∏è‚É£ app.py ‚Äî Streamlit Web App

import streamlit as st
import pickle
import re
import string
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# -----------------------
# Load Model and Vectorizer
# -----------------------
model = pickle.load(open('model/spam_model.pkl', 'rb'))
vectorizer = pickle.load(open('model/vectorizer.pkl', 'rb'))

# -----------------------
# Text Cleaning Function
# -----------------------
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = " ".join([word for word in text.split() if word not in stopwords.words('english')])
    return text

# -----------------------
# Streamlit UI
# -----------------------
st.set_page_config(page_title="Spam Mail Detector", page_icon="üìß", layout="centered")

st.title("üì© Spam Mail Detection System")
st.write("Enter an email or SMS message below to detect whether it‚Äôs Spam or Not Spam:")

user_input = st.text_area("Enter your message:", height=150)

if st.button("Detect Spam"):
    if user_input.strip() == "":
        st.warning("‚ö†Ô∏è Please enter a message first!")
    else:
        cleaned = clean_text(user_input)
        vectorized = vectorizer.transform([cleaned])
        prediction = model.predict(vectorized)[0]

        if prediction == 1:
            st.error("üö´ This message is **SPAM**!")
        else:
            st.success("‚úÖ This message is **NOT SPAM**!")

st.markdown("---")
st.caption("Built with ‚ù§Ô∏è using Python, NLP, and Streamlit")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\janga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2025-11-03 16:47:47.202 
  command:

    streamlit run C:\Users\janga\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-11-03 16:47:47.209 Session state does not function when running a script without `streamlit run`


DeltaGenerator()