In [2]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [5]:
# Read the raw CSV and give its columns the short names used by the rest of the
# notebook. The file is expected to have exactly two columns; a different
# column count raises a ValueError.
df = pd.read_csv('mail_data.csv').set_axis(['label', 'message'], axis=1)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Download the NLTK stop-word corpus so that `stopwords.words` can read it.
nltk.download('stopwords')

# Keep the English stop words in a set: `preprocess` tests membership once per
# token, and set lookups are constant-time.
stop_words = {*stopwords.words('english')}

# Translation table that deletes every ASCII punctuation character; built once
# so it is not recomputed for each message.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Normalise a raw message for TF-IDF vectorisation.

    Steps: lowercase, delete ASCII punctuation (``string.punctuation``), split
    on whitespace, drop tokens found in the module-level ``stop_words`` set,
    and re-join the remaining tokens with single spaces.

    Args:
        text: The raw message. Missing values (``None``/NaN) are treated as an
            empty message; any other non-string value is converted with
            ``str()`` first.

    Returns:
        str: The cleaned, space-separated tokens (possibly an empty string).
    """
    # Empty CSV cells arrive as NaN (a float); calling .lower() on them would
    # raise AttributeError, so coerce non-strings before doing any text work.
    if not isinstance(text, str):
        text = '' if text is None or pd.isna(text) else str(text)

    # Punctuation is deleted, not replaced by a space, so "don't" -> "dont".
    lowered = text.lower().translate(_PUNCT_TABLE)
    return ' '.join(word for word in lowered.split() if word not in stop_words)

# Run every raw message through `preprocess` and keep the result in a new
# column, leaving the original `message` text untouched.
df['cleaned'] = df['message'].map(preprocess)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\athun\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [7]:
# Encode the target as integers (ham -> 0, spam -> 1).
label_to_id = {'ham': 0, 'spam': 1}

# Already-encoded values pass through unchanged, so re-running this cell does
# not turn the whole column into NaN the way a plain `.map(dict)` would.
df['label'] = df['label'].map(lambda value: label_to_id.get(value, value))

# Fail loudly on anything that is not a known class instead of letting NaN or
# stray labels reach the classifiers.
unexpected_labels = set(df['label'].unique()) - set(label_to_id.values())
if unexpected_labels:
    raise ValueError(f"Unexpected values in 'label' column: {unexpected_labels}")


In [8]:
# Features are the cleaned text; the target is the label column.
X, y = df['cleaned'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [10]:
# Fit the TF-IDF vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Candidate classifiers keyed by the display name used when reporting results.
candidates = [
    ('Naive Bayes', MultinomialNB()),
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('SVM', LinearSVC()),
]
models = dict(candidates)

In [14]:
import pickle

# Train every candidate on the TF-IDF features, print its metrics on the test
# split, and remember the one with the highest test accuracy.
best_accuracy = 0
best_model = None
best_model_name = ""

for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    acc = accuracy_score(y_test, y_pred)

    print(f"\n📊 Model: {name}")
    print(f"Accuracy: {acc:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # Track the best model so far. Checking `best_model is None` lets the
    # first candidate be selected even if its accuracy is 0.0.
    if best_model is None or acc > best_accuracy:
        best_accuracy = acc
        best_model = model
        best_model_name = name

# NOTE(review): the winner is picked on the same test split whose accuracy is
# reported, so the printed figure is an optimistic estimate of real-world
# performance. Consider selecting on a validation split or cross-validation.

# Save the best model and the fitted TF-IDF vectorizer. Both files are needed
# at inference time, together with the same `preprocess` function.
# Compare against None explicitly rather than relying on estimator truthiness.
if best_model is not None:
    with open("spam_model.pkl", "wb") as f:
        pickle.dump(best_model, f)
    with open("tfidf_vectorizer.pkl", "wb") as f:
        pickle.dump(tfidf, f)
    print(f"\n✅ Best model '{best_model_name}' saved with accuracy: {best_accuracy:.4f}")


📊 Model: Naive Bayes
Accuracy: 0.9695
Confusion Matrix:
 [[966   0]
 [ 34 115]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115


📊 Model: Logistic Regression
Accuracy: 0.9587
Confusion Matrix:
 [[963   3]
 [ 43 106]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       0.97      0.71      0.82       149

    accuracy                           0.96      1115
   macro avg       0.96      0.85      0.90      1115
weighted avg       0.96      0.96      0.96      1115


📊 Model: SVM
Accuracy: 0.9892
Confusion Matrix:
 [[965   1]
 [ 11 138]]
Classification Report:
               precisio

In [15]:
def predict_spam(message, model, vectorizer=None):
    """Classify a single raw message as "Spam" or "Ham".

    The message is cleaned with ``preprocess``, turned into features by the
    vectorizer, and passed to ``model.predict``.

    Args:
        message: The raw message text.
        model: A fitted classifier exposing ``predict``; a prediction of ``1``
            is reported as spam, anything else as ham.
        vectorizer: A fitted vectorizer exposing ``transform``. Defaults to the
            notebook-level ``tfidf`` so existing two-argument calls keep
            working; pass one explicitly when using a vectorizer loaded from
            ``tfidf_vectorizer.pkl``.

    Returns:
        str: ``"Spam"`` or ``"Ham"``.
    """
    if vectorizer is None:
        vectorizer = tfidf

    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([preprocess(message)])
    predicted_label = model.predict(features)[0]
    return "Spam" if predicted_label == 1 else "Ham"

# Example: classify one hand-written message.
# NOTE(review): this demo uses the Logistic Regression entry from `models`,
# not the `best_model` selected during training.
chosen_model = models['Logistic Regression']
sample_msg = "Congratulations! You've won a free ticket to Bahamas!"
prediction = predict_spam(sample_msg, chosen_model)
print("Prediction:", prediction)


Prediction: Ham
