In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string


In [2]:

# Load the SMS dataset (read with ISO-8859-1 encoding) and tidy it in one
# chain, without in-place mutation:
#   1. drop the three 'Unnamed' columns,
#   2. give v1/v2 descriptive names,
#   3. drop duplicate rows -- done after the column drop so duplicates are
#      judged on (Category, Message) only.
df = (
    pd.read_csv("spam.csv", encoding='ISO-8859-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={"v1": "Category", "v2": "Message"})
    .drop_duplicates()
)

# Last expression of the cell so the preview is actually rendered.
df.head()



In [3]:
# Binary target: 1 where the row is labelled 'spam', 0 for anything else.
df['Spam'] = df['Category'].eq('spam').astype('int64')

# Drop exact duplicate rows, then report (rows, columns).
df = df.drop_duplicates()

print(df.shape)



(5169, 3)


In [4]:
# Download the NLTK "stopwords" package; `stopwords.words('english')` reads from it.
nltk.download("stopwords")



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hassan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# English stopwords, loaded lazily on first use and then reused.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stopwords as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def process(text):
    """Tokenise a message: strip punctuation, split on whitespace, drop stopwords.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        The remaining words in their original order and original casing.
        Stopword matching is case-insensitive (each word is lower-cased
        only for the comparison).
    """
    # Fetch the cached set once per call instead of calling
    # stopwords.words('english') for every single word.
    stop_words = _english_stopwords()
    no_punct = text.translate(_PUNCTUATION_TABLE)
    return [word for word in no_punct.split() if word.lower() not in stop_words]

# Sanity check: show the tokenised form of the first few messages.
df['Message'].head().apply(process)



0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: Message, dtype: object

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Keep the fitted vectorizer in a variable so that unseen text can later be
# encoded with exactly the same vocabulary via `vectorizer.transform(...)`.
# `process` is used as the analyzer, so it defines the tokens that are counted.
vectorizer = CountVectorizer(analyzer=process)
message = vectorizer.fit_transform(df['Message'])

# NOTE(review): the vocabulary is learned from all messages before the split,
# so held-out messages influence the feature set. Consider splitting first and
# fitting the vectorizer on the training messages only.
xtrain, xtest, ytrain, ytest = train_test_split(
    message, df['Spam'], test_size=0.20, random_state=0
)

# (number of messages, vocabulary size)
print(message.shape)


(5169, 11304)


In [7]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes model on the training split.
classifier = MultinomialNB()
classifier.fit(xtrain, ytrain)

# Print the model's predictions on the training rows next to the true labels.
print("Predictions on training data:", classifier.predict(xtrain))
print("Actual training labels:", ytrain.values)


Predictions on training data: [0 0 0 ... 0 0 0]
Actual training labels: [0 0 0 ... 0 0 0]


In [8]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


def _print_metrics(split_name, y_true, y_pred):
    """Print the classification report, confusion matrix and accuracy for one split.

    Parameters
    ----------
    split_name : str
        Label inserted into each heading, e.g. "Training Data".
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the classifier for the same rows.
    """
    print(f"Classification Report ({split_name}):\n", classification_report(y_true, y_pred))
    print(f"Confusion Matrix ({split_name}):\n", confusion_matrix(y_true, y_pred))
    print(f"Accuracy ({split_name}):\n", accuracy_score(y_true, y_pred))


# Performance on the rows the model was fitted on.
pred_train = classifier.predict(xtrain)
_print_metrics("Training Data", ytrain, pred_train)

# Performance on the held-out rows.
pred_test = classifier.predict(xtest)

print("Predictions on test data:", pred_test)

print("Actual test labels:", ytest.values)

_print_metrics("Test Data", ytest, pred_test)


Classification Report (Training Data):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3631
           1       0.98      0.98      0.98       504

    accuracy                           1.00      4135
   macro avg       0.99      0.99      0.99      4135
weighted avg       1.00      1.00      1.00      4135

Confusion Matrix (Training Data):
 [[3623    8]
 [  11  493]]
Accuracy (Training Data):
 0.9954050785973397
Predictions on test data: [0 0 0 ... 0 0 0]
Actual test labels: [0 0 0 ... 0 0 0]
Classification Report (Test Data):
               precision    recall  f1-score   support

           0       0.99      0.96      0.97       885
           1       0.80      0.93      0.86       149

    accuracy                           0.96      1034
   macro avg       0.89      0.94      0.92      1034
weighted avg       0.96      0.96      0.96      1034

Confusion Matrix (Test Data):
 [[850  35]
 [ 11 138]]
Accuracy (Test Data):
 0.9

In [9]:
sample_messages = [
    "Congratulations! You've won a free ticket to Bahamas. Call now to claim.",
    "Can you send me the report by tomorrow morning?",
]

# The vectorizer is fitted on the same corpus (df['Message']) with the same
# analyzer as the training features, so the columns it produces line up with
# the ones the classifier was trained on.
sample_data = CountVectorizer(analyzer=process).fit(df['Message'])
sample_transformed = sample_data.transform(sample_messages)

sample_predictions = classifier.predict(sample_transformed)

# The loop variable is deliberately not called `message`: that name already
# holds the document-term matrix, and reusing it here would overwrite it.
for sample_text, predicted_label in zip(sample_messages, sample_predictions):
    print(f"Message: {sample_text}")
    print(f"Prediction: {'Spam' if predicted_label == 1 else 'Not Spam'}")
    print()

Message: Congratulations! You've won a free ticket to Bahamas. Call now to claim.
Prediction: Spam

Message: Can you send me the report by tomorrow morning?
Prediction: Not Spam

