In [1]:
import pandas as pd
import urllib.request
import zipfile
import os

# Download and extract the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
zip_path = "smsspamcollection.zip"
data_dir = "sms_data"

# Download if not already present
if not os.path.exists(zip_path):
    urllib.request.urlretrieve(url, zip_path)

# Extract
if not os.path.exists(data_dir):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(data_dir)

# Load dataset
df = pd.read_csv(os.path.join(data_dir, 'SMSSpamCollection'), sep='\t', names=['label', 'message'])
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

df.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert labels spam= 1, ham=0
df['label_num'] = df['label'].map({'ham': 0,  'spam': 1})

X = df['message']
y = df['label_num']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                                                   
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)           

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
      

[[966   0]
 [ 34 115]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [4]:
def predict_message(msg):
    vec_msg = vectorizer.transform([msg])
    prediction = model.predict(vec_msg)[0]
    return "SPAM" if prediction ==  1 else "NOT SPAM"

print(predict_message("Congratulations! You've been selected for a $1000 gift card! Click here to claim now."))
print(predict_message("Are we still meeting at 5 PM?"))
print(predict_message("Hey, are we still on for lunch today?"))


NOT SPAM
NOT SPAM
NOT SPAM


In [5]:
def predict_with_confidence(msg):
    vec = vectorizer.transform([msg]) 

    # Ensure scalar outputs
    prediction_array = model.predict(vec)
    prediction = int(prediction_array[0])
    
    prob_array = model.predict_proba(vec)[0]
    confidence = float(prob_array[prediction])

    label = 'SPAM' if prediction == 1 else "NOT SPAM"
    return label, round(confidence * 100, 2)

label, confidence = predict_with_confidence("Free tickets now! Click the link.")
print(f"{label} (confidence: {confidence}%)")

NOT SPAM (confidence: 71.52%)


In [None]:
import tkinter as tk
from tkinter import messagebox

def run_gui():
    def on_predict():
        message = input_text.get("1.0", tk.END).strip()
        if not message:
            messagebox.showwarning("Input Needed", "Please enter a message to classify.")
            return
        label, confidence = predict_with_confidence(message)
        result_label.config(text=f"{label} ({confidence}%)", fg="green" if label == "NOT SPAM" else "red")

    # Create window
    root = tk.Tk()
    root.title("SMS Spam Classifier")
    root.geometry("500x300")

    # Input box
    tk.Label(root, text="Enter your message:", font=("Helvetica", 12)).pack(pady=10)
    input_text = tk.Text(root, height=5, width=50)
    input_text.pack()

    # Predict button
    tk.Button(root, text="Predict", command=on_predict, font=("Helvetica", 12), bg="blue", fg="white").pack(pady=10)

    # Output label
    result_label = tk.Label(root, text="", font=("Helvetica", 14, "bold"))
    result_label.pack(pady=10)

    root.mainloop()

run_gui()