In [1]:
# Import required libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import tkinter as tk
from tkinter import messagebox


In [2]:
# Download stopwords (only need to run once)
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# Load the email dataset
df = pd.read_csv("/Email.csv", encoding="latin-1")[["text", "spam"]]
df.columns = ["message", "label"]  # Rename columns for better understanding
df.head()


Unnamed: 0,message,label
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [5]:
# Set up stopwords and stemmer
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))


In [6]:
# Preprocessing function
def preprocess_text(text):
    text = re.sub(r"\W", " ", text)          # Remove non-word characters
    text = text.lower()                      # Lowercase
    words = text.split()                     # Tokenize
    words = [stemmer.stem(word) for word in words if word not in stop_words]  # Remove stopwords and stem
    return " ".join(words)


In [7]:
# Apply preprocessing to messages
df["cleaned_message"] = df["message"].apply(preprocess_text)
df.head()


Unnamed: 0,message,label,cleaned_message
0,Subject: naturally irresistible your corporate...,1,subject natur irresist corpor ident lt realli ...
1,Subject: the stock trading gunslinger fanny i...,1,subject stock trade gunsling fanni merril muzo...
2,Subject: unbelievable new homes made easy im ...,1,subject unbeliev new home made easi im want sh...
3,Subject: 4 color printing special request add...,1,subject 4 color print special request addit in...
4,"Subject: do not have money , get software cds ...",1,subject money get softwar cd softwar compat gr...


In [8]:
# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df["cleaned_message"])

# Target variable
y = df["label"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)


In [9]:
# Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)


In [10]:
# Predict and evaluate the model
y_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print(classification_report(y_test, y_pred))


Accuracy: 98.43%
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       856
           1       1.00      0.94      0.97       290

    accuracy                           0.98      1146
   macro avg       0.99      0.97      0.98      1146
weighted avg       0.98      0.98      0.98      1146



In [11]:
# Function to predict whether a new email is spam or ham
def predict_email(email_text):
    processed_data = preprocess_text(email_text)
    vectorized_text = vectorizer.transform([processed_data])
    prediction = model.predict(vectorized_text)
    return "Spam" if prediction[0] == 1 else "Ham - Not Spam"


In [14]:
# Install ipywidgets if not already installed
!pip install ipywidgets

import ipywidgets as widgets
from IPython.display import display

# Create input text area
email_input = widgets.Textarea(
    placeholder='Enter email content here...',
    description='Email:',
    layout=widgets.Layout(width='600px', height='150px')
)

# Create a button
predict_button = widgets.Button(
    description='Predict Spam',
    button_style='success'
)

# Create output area
output = widgets.Output()

# Function to predict and display result
def on_button_click(b):
    with output:
        output.clear_output()
        email_text = email_input.value
        if email_text.strip() == "":
            print("⚠️ Please enter an email text to classify.")
        else:
            prediction = predict_email(email_text)
            print(f"📨 Prediction: {prediction}")

# Link button to function
predict_button.on_click(on_button_click)

# Display everything
display(email_input, predict_button, output)


Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


Textarea(value='', description='Email:', layout=Layout(height='150px', width='600px'), placeholder='Enter emai…

Button(button_style='success', description='Predict Spam', style=ButtonStyle())

Output()

In [13]:
# Launch the Spam Classifier GUI
create_gui()


TclError: no display name and no $DISPLAY environment variable