In [181]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

In [182]:
df = pd.read_csv('../Spam-Classifier/dataset/spam.csv', encoding='latin1')

In [183]:
# Exploratory Data Analysis
df.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [184]:
# Shape of the dataset
df.shape

(5572, 5)

In [185]:
# Columns
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [186]:
# Explore the distribution of target variable/classes
df['v1'].value_counts()

v1
ham     4825
spam     747
Name: count, dtype: int64

In [187]:
# Summarize the statistical properties of numerical features
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [188]:
# Check for missing values (none in this dataset)
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [189]:
# Check datatypes
df.dtypes

v1            object
v2            object
Unnamed: 2    object
Unnamed: 3    object
Unnamed: 4    object
dtype: object

In [190]:
# Removing the unnecessary columns/features
df = df.iloc[:, :2]

In [191]:
# Recheck the columns
df.columns

Index(['v1', 'v2'], dtype='object')

In [192]:
df.sample(5)

Unnamed: 0,v1,v2
1113,ham,"No I'm good for the movie, is it ok if I leave..."
4421,ham,MMM ... Fuck .... Merry Christmas to me
3055,ham,Webpage s not available!
5492,spam,Marvel Mobile Play the official Ultimate Spide...
1755,ham,Really good:)dhanush rocks once again:)


In [193]:
# Renaming the columns for better practice
df.columns = ['Category', 'Message']

In [194]:
df.sample(10)

Unnamed: 0,Category,Message
1722,ham,Thought praps you meant another one. Goodo! I'...
1626,ham,Dear how you. Are you ok?
4941,ham,"I'm eatin now lor, but goin back to work soon...."
4845,ham,Pls help me tell Ashley that i cant find her n...
4808,ham,"Don't worry though, I understand how important..."
544,ham,This girl does not stay in bed. This girl does...
3896,ham,No. Thank you. You've been wonderful
109,ham,Dont worry. I guess he's busy.
630,ham,Please dont say like that. Hi hi hi
3539,ham,"I'll get there at 3, unless you guys want me t..."


In [195]:
# Check for missing values
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [196]:
# Split the dataset into features (X) and target variable (y)
X = df['Message']
y = df['Category']

In [197]:
# Shapes of Dependent(X) and Independent(y) features
print("Independent feature shape: ", X.shape)
print("Dependent feature shape: ", y.shape)

Independent feature shape:  (5572,)
Dependent feature shape:  (5572,)


In [198]:
# Split the dataset into Training and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [199]:
# Shapes of Train datsets for Dependent(X) and Independent(y) features
print("Independent feature shape: ", X_train.shape)
print("Dependent feature shape: ", y_train.shape)

Independent feature shape:  (4457,)
Dependent feature shape:  (4457,)


In [200]:
# Shapes of Train datsets for Dependent(X) and Independent(y) features
print("Independent feature shape: ", X_train.shape)
print("Dependent feature shape: ", y_train.shape)

Independent feature shape:  (4457,)
Dependent feature shape:  (4457,)


In [201]:
# Endoding the target feature with LabelEncode
label_encoder = LabelEncoder()

In [202]:
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [203]:
# CountVectorizer converts text data into a matrix of token counts for machine learning.
# Initializing CountVectorizer
count_vectorizer = CountVectorizer()

# Apply CountVectorizer to 'Message' column
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

In [204]:
# Checking target feature after applying encoder
pd.DataFrame(y_train).sample(10)

Unnamed: 0,0
4249,0
651,1
1939,0
3364,0
1983,0
2142,1
3670,0
2095,0
2818,0
1266,0


In [205]:
X_train_counts

<4457x7735 sparse matrix of type '<class 'numpy.int64'>'
	with 58978 stored elements in Compressed Sparse Row format>

In [206]:
# Initializing the model
clf = MultinomialNB()

In [207]:
# Fit the training set on the model
clf.fit(X_train_counts, y_train)

In [208]:
# Predict the test set (Test set predictions)
y_test_pred = clf.predict(X_test_counts)
label_encoder.inverse_transform(y_test_pred)

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'spam'], dtype=object)

In [209]:
y_test_pred.shape, y_test.shape

((1115,), (1115,))

In [210]:
# Accuracy of the test set prediction
accuracy_score(y_test_pred, y_test)

0.9838565022421525

In [172]:
# Classification Report
pd.DataFrame(classification_report(y_test_pred, y_test, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.997927,0.893333,0.983857,0.94563,0.98517
recall,0.983657,0.985294,0.983857,0.984475,0.983857
f1-score,0.990741,0.937063,0.983857,0.963902,0.984193
support,979.0,136.0,0.983857,1115.0,1115.0


In [173]:
# Prediciton on the training set
y_train_pred = clf.predict(X_train_counts)
label_encoder.inverse_transform(y_train_pred)

array(['ham', 'spam', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [174]:
y_train_pred.shape, y_train.shape

((4457,), (4457,))

In [175]:
# Accuracy of the test set prediction
accuracy_score(y_train_pred, y_train)

0.9943908458604442

In [176]:
# Classification Report
pd.DataFrame(classification_report(y_test_pred, y_test, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.997927,0.893333,0.983857,0.94563,0.98517
recall,0.983657,0.985294,0.983857,0.984475,0.983857
f1-score,0.990741,0.937063,0.983857,0.963902,0.984193
support,979.0,136.0,0.983857,1115.0,1115.0


In [177]:
# Initialize k-fold cross-validation on the training data
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [178]:
# Perform k-fold cross-validation on the training data
count_vectorizer_scores = cross_val_score(clf, X_train_counts, y_train, cv=kfold)
np.mean(count_vectorizer_scores)

0.9831720292108932

In [179]:
# new data
email = ['''
    You won a reward of $40000
''']

# Predict for the new data
prediction_1 = clf.predict(count_vectorizer.transform(email))
label_encoder.inverse_transform(prediction_1)

array(['spam'], dtype=object)

In [180]:
# new data
email = ['''
    Lets go for a walk!
''']

# Predict for the new data
prediction_1 = clf.predict(count_vectorizer.transform(email))
label_encoder.inverse_transform(prediction_1)

array(['ham'], dtype=object)