In [25]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import re

In [49]:
data=pd.read_csv("spam.csv")

In [50]:
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [51]:

data["spam"]=data.Category.apply(lambda x: 1 if x=="spam" else 0)
data.drop("Category",axis=1)

Unnamed: 0,Message,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,1
5568,Will ü b going to esplanade fr home?,0
5569,"Pity, * was in mood for that. So...any other s...",0
5570,The guy did some bitching but I acted like i'd...,0


In [52]:
data

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [53]:
emails = data["Message"]
labels=data["spam"]

In [29]:
def preprocess(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\W+', ' ', text)  # Remove non-word characters
    return text

In [45]:
emails = [preprocess(email) for email in emails]

In [31]:
X_train, X_test, y_train, y_test = train_test_split(emails, labels, test_size=0.3, random_state=42)

In [32]:
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=2)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [33]:
# Handle class imbalance with SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train_vectorized, y_train)

In [34]:
model = MultinomialNB()
model.fit(X_resampled, y_resampled)

MultinomialNB()

In [35]:
y_pred = model.predict(X_test_vectorized)

In [36]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1448
           1       0.89      0.95      0.92       224

    accuracy                           0.98      1672
   macro avg       0.94      0.97      0.95      1672
weighted avg       0.98      0.98      0.98      1672



In [37]:
emails=["Don't miss out on our amazing sale! Buy one, get one free on all items!",
       "Reminder: Your appointment is scheduled for tomorrow at 2 PM. See you then!"]

In [38]:
email_counter=vectorizer.transform(emails)
model.predict(email_counter)

array([1, 0], dtype=int64)

In [39]:
y_proba = model.predict_proba(X_test_vectorized)

In [40]:
# Adjust decision threshold (e.g., 0.4 instead of 0.5)
threshold = 0.4
y_pred_adjusted = (y_proba[:, 1] > threshold).astype(int)


In [41]:
# Evaluate with the adjusted threshold
print("Classification Report (Threshold 0.4):")
print(classification_report(y_test, y_pred_adjusted))

Classification Report (Threshold 0.4):
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      1448
           1       0.79      0.96      0.87       224

    accuracy                           0.96      1672
   macro avg       0.89      0.96      0.92      1672
weighted avg       0.97      0.96      0.96      1672

