In [56]:
# Importing essential libraries for data handling, text preprocessing,
# feature extraction, model training, and evaluation


import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [57]:
import random

# Updated Spam and Ham examples
spam_phrases = [
    "You've been chosen for an exclusive investment opportunity!",
    "Alert: Unusual login detected. Secure your account now.",
    "Earn cash daily with zero experience. Start today!",
    "Winner! Your email was selected for a prize.",
    "Hot deal! 90% off on premium gadgets for 24 hours.",
    "Re: Your invoice has a refund. Click to process.",
    "Final notice! Your subscription will expire soon.",
    "Claim your airline voucher before it expires.",
    "Act fast! Limited slots for free crypto training.",
    "Get your free smartwatch—just pay shipping!"
]

ham_phrases = [
    "Can we reschedule the call to tomorrow afternoon?",
    "I uploaded the project files to the shared folder.",
    "Thanks for your help earlier, really appreciate it!",
    "Will you be joining the team dinner tonight?",
    "The presentation has been moved to 11 am.",
    "Just parked outside your building. Coming up now.",
    "Let’s review the code changes after lunch.",
    "Had a great time at the reunion—let’s catch up again!",
    "Meeting went well. I'll send you the summary notes.",
    "Heading out for a walk. Need a break from the screen."
]

# Generate 5000 spam and 5000 ham messages
data = [("spam", random.choice(spam_phrases)) for _ in range(5000)] + \
       [("ham", random.choice(ham_phrases)) for _ in range(5000)]

random.shuffle(data)

# Create DataFrame and save
df = pd.DataFrame(data, columns=["label", "message"])
df.to_csv("synthetic_spam_dataset_10000.csv", index=False)

print("Saved as 'synthetic_spam_dataset_10000.csv'")


Saved as 'synthetic_spam_dataset_10000.csv'


In [58]:
df

Unnamed: 0,label,message
0,spam,Get your free smartwatch—just pay shipping!
1,ham,The presentation has been moved to 11 am.
2,spam,Act fast! Limited slots for free crypto training.
3,spam,Hot deal! 90% off on premium gadgets for 24 ho...
4,ham,Will you be joining the team dinner tonight?
...,...,...
9995,spam,Winner! Your email was selected for a prize.
9996,spam,Get your free smartwatch—just pay shipping!
9997,ham,Will you be joining the team dinner tonight?
9998,ham,Let’s review the code changes after lunch.


In [59]:
# 2. Text cleaning function
def clean_text(text):
    import re
    import string

    lowered = text.lower()
    no_digits = re.sub(r'\d+', '', lowered)
    no_punct = ''.join(char for char in no_digits if char not in string.punctuation)
    cleaned = ' '.join(no_punct.split())
    return cleaned


In [60]:
df['cleaned_message'] = df['message'].apply(clean_text)


In [61]:
# 3. Convert text into TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df.cleaned_message.values.tolist())
y = df.label.apply(lambda x: 0 if x == 'ham' else 1)


In [62]:
# 4. Split dataset into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)


In [63]:
# 5. Train the Naive Bayes model
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)


In [64]:
# 6. Evaluate the model# 6. Evaluate the model
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=["Ham", "Spam"]))

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=["Ham", "Spam"]))


Accuracy: 1.0000
Classification Report:

              precision    recall  f1-score   support

         Ham       1.00      1.00      1.00      1002
        Spam       1.00      1.00      1.00       998

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

         Ham       1.00      1.00      1.00      1002
        Spam       1.00      1.00      1.00       998

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



In [65]:
# 7. Function to predict new messages
def predict_spam(message):
    cleaned = clean_text(message)
    vec = vectorizer.transform([cleaned])
    result = model.predict(vec)[0]
    return "Spam" if result == 1 else "Ham"

In [66]:
# 8. Example predictions
print(predict_spam("Congratulations! You have won a free cruise trip."))
print(predict_spam("Hey, don't forget our dinner plan at 8."))
print(predict_spam("Urgent! Please verify your account immediately."))

Spam
Ham
Spam


In [67]:
print("Accuracy:", accuracy_score(y_test, model.predict(X_test)))


Accuracy: 1.0


In [68]:
# Save Model
import joblib

# Save the model to a file
joblib.dump(model, "Spam_mail_Classifier.joblib")

print("Model saved successfully as Spam_mail_Classifier.joblib")

Model saved successfully as Spam_mail_Classifier.joblib


In [69]:

from google.colab import files
files.download('Spam_mail_Classifier.joblib')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [70]:
import os
os.listdir()


['.config',
 'synthetic_spam_model.pkl',
 'synthetic_spam_dataset_10000.csv',
 'Spam_mail_Classifier.joblib',
 'sample_data']

In [71]:
print(model.get_params())


{'alpha': 1.0, 'class_prior': None, 'fit_prior': True, 'force_alpha': True}
