In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score, classification_report

In [22]:
# Read the pre-cleaned spam corpus (a CSV in the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="cleaned_spam_dataset.csv")


In [7]:
# Sanity check after loading: report the (rows, columns) tuple and preview
# the first three rows as plain text.
dataset_shape = df.shape
print("‚úÖ Loaded dataset:", dataset_shape)
print(df.head(n=3))

‚úÖ Loaded dataset: (10852, 3)
   spam                                         clean_text  \
0     1  natur irresist corpor ident lt realli hard rec...   
1     1  stock trade gunsling fanni merril muzo colza a...   
2     1  unbeliev new home made easi im want show homeo...   

                                              tokens  
0  ['naturally', 'irresistible', 'your', 'corpora...  
1  ['the', 'stock', 'trading', 'gunslinger', 'fan...  
2  ['unbelievable', 'new', 'homes', 'made', 'easy...  


In [12]:
# Discard every row that has a missing value in any column. The cleaned frame
# is rebound to `df`, so the cells below keep reading the same name.
df = df.dropna()

In [13]:
# Model input is the pre-cleaned text column; the target is the `spam` label column.
y = df["spam"]
X = df["clean_text"]

# -----------------------------------------
# 2. Split data
# -----------------------------------------
# Hold out 20% of the rows for testing. `stratify=y` keeps the label
# proportions the same in both partitions, and the fixed `random_state`
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")

Train samples: 8676, Test samples: 2170


In [14]:
# Count the missing values in each column; the resulting Series is the
# cell's displayed output.
df.isna().sum()

spam          0
clean_text    0
tokens        0
dtype: int64

In [17]:
# Display the frame's (rows, columns) tuple as the cell's result.
df.shape

(10846, 3)

In [15]:
# Build TF-IDF features. The vocabulary is capped at 3000 terms and
# scikit-learn's built-in English stop-word list is removed.
# NOTE(review): the `clean_text` preview looks stemmed ("natur", "irresist"),
# so the unstemmed stop list may match fewer tokens than expected — confirm.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)

# Fit on the training split only, then apply the already-fitted vectorizer
# to the held-out split so no test text influences the vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("TF-IDF vectorization complete.")
train_matrix_shape = X_train_tfidf.shape
print("Feature matrix shape:", train_matrix_shape)

TF-IDF vectorization complete.
Feature matrix shape: (8676, 3000)


In [18]:
# Train a multinomial Naive Bayes classifier with default settings on the
# TF-IDF training matrix. `fit` returns the estimator itself, so construction
# and training are chained into one statement.
model = MultinomialNB().fit(X_train_tfidf, y_train)
print("‚úÖ Model training complete.")

‚úÖ Model training complete.


In [19]:
# Predict labels for the held-out split.
y_pred = model.predict(X_test_tfidf)

# Score the predictions against the true labels (both metrics use their
# default arguments).
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Emit the summary in a single call; sep="\n" places each piece on its own line.
print(
    "\nüìä Model Evaluation",
    "-------------------",
    f"Accuracy: {accuracy:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)

# Per-class precision / recall / F1 table.
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)


üìä Model Evaluation
-------------------
Accuracy: 0.9668
F1 Score: 0.9037

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1768
           1       0.98      0.84      0.90       402

    accuracy                           0.97      2170
   macro avg       0.97      0.92      0.94      2170
weighted avg       0.97      0.97      0.97      2170



In [21]:
# Persist the fitted classifier and TF-IDF vectorizer with pickle so they can
# be reloaded later without retraining. Both files are written to the current
# working directory; the file names are unchanged because the loading cell
# further down opens these exact names.
MODEL_PATH = "spam_model.pkl"
VECTORIZER_PATH = "tfidf_vectorizer.pkl"

with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(model, model_file)

with open(VECTORIZER_PATH, "wb") as vec_file:
    pickle.dump(vectorizer, vec_file)

# Report the paths that were actually written. The earlier message claimed a
# "/models folder", which this cell neither creates nor writes to.
print(f"\nüíæ Model and vectorizer saved successfully: {MODEL_PATH}, {VECTORIZER_PATH}")


üíæ Model and vectorizer saved successfully in /models folder!


In [23]:
# Hand-written example messages for a quick smoke test of the trained model.
# NOTE(review): these are raw sentences, whereas the training column
# `clean_text` looks pre-processed (stemmed) in the preview — confirm whether
# the same cleaning should be applied here before vectorizing.
sample_emails = [
    "Congratulations! You've won a $1000 Walmart gift card. Click here to claim your prize!",
    "Hi John, just wanted to check if we‚Äôre still meeting at 3 PM today?",
    "Earn money fast! Work from home and get paid $5000 per week. Apply now!",
    "Dear customer, your account has been suspended. Please verify your details immediately.",
    "Hey, can you send me the project report before tomorrow‚Äôs meeting?"
]

# Vectorize with the already-fitted TF-IDF vectorizer (transform only, no refit).
sample_tfidf = vectorizer.transform(sample_emails)

# One predicted label per message.
predictions = model.predict(sample_tfidf)

# Print each message followed by its verdict; label 1 is reported as spam.
separator = "-" * 80
for email, predicted in zip(sample_emails, predictions):
    if predicted == 1:
        verdict = "üö´ SPAM"
    else:
        verdict = "‚úÖ NOT SPAM"
    print("üìß", email)
    print("‚û°Ô∏è Prediction:", verdict)
    print(separator)


üìß Congratulations! You've won a $1000 Walmart gift card. Click here to claim your prize!
‚û°Ô∏è Prediction: üö´ SPAM
--------------------------------------------------------------------------------
üìß Hi John, just wanted to check if we‚Äôre still meeting at 3 PM today?
‚û°Ô∏è Prediction: ‚úÖ NOT SPAM
--------------------------------------------------------------------------------
üìß Earn money fast! Work from home and get paid $5000 per week. Apply now!
‚û°Ô∏è Prediction: üö´ SPAM
--------------------------------------------------------------------------------
üìß Dear customer, your account has been suspended. Please verify your details immediately.
‚û°Ô∏è Prediction: ‚úÖ NOT SPAM
--------------------------------------------------------------------------------
üìß Hey, can you send me the project report before tomorrow‚Äôs meeting?
‚û°Ô∏è Prediction: ‚úÖ NOT SPAM
--------------------------------------------------------------------------------


In [24]:
import pickle

# Restore the persisted classifier and vectorizer from the working directory,
# replacing the in-memory `model` and `vectorizer`.
# NOTE: unpickling can execute arbitrary code — only load files you created
# yourself or otherwise trust.
with open("spam_model.pkl", "rb") as fh:
    model = pickle.load(fh)

with open("tfidf_vectorizer.pkl", "rb") as fh:
    vectorizer = pickle.load(fh)

# Two fresh messages to confirm the reloaded artifacts still produce predictions.
new_emails = [
    "You have been selected for a free vacation to Hawaii! Click to claim!",
    "Hey team, please find attached the updated report for tomorrow‚Äôs review."
]

# Vectorize with the reloaded vectorizer, then classify with the reloaded model.
new_tfidf = vectorizer.transform(new_emails)
preds = model.predict(new_tfidf)

# Print each message followed by its verdict; label 1 is reported as spam.
divider = "-" * 70
for message, outcome in zip(new_emails, preds):
    if outcome == 1:
        verdict = "üö´ SPAM"
    else:
        verdict = "‚úÖ NOT SPAM"
    print("üìß", message)
    print("‚û°Ô∏è", verdict)
    print(divider)


üìß You have been selected for a free vacation to Hawaii! Click to claim!
‚û°Ô∏è üö´ SPAM
----------------------------------------------------------------------
üìß Hey team, please find attached the updated report for tomorrow‚Äôs review.
‚û°Ô∏è ‚úÖ NOT SPAM
----------------------------------------------------------------------
