In [60]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

In [61]:
df = pd.read_csv('../Spam-Classifier/dataset/spam.csv', encoding='latin1')

In [62]:
# Exploratory Data Analysis
df.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [63]:
# Shape of the dataset
df.shape

(5572, 5)

In [64]:
# Columns
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [65]:
# Explore the distribution of target variable/classes
df['v1'].value_counts()

v1
ham     4825
spam     747
Name: count, dtype: int64

In [66]:
# Summarize the statistical properties of numerical features
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [67]:
# Check for missing values (none in this dataset)
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [68]:
# Check datatypes
df.dtypes

v1            object
v2            object
Unnamed: 2    object
Unnamed: 3    object
Unnamed: 4    object
dtype: object

In [69]:
# Removing the unnecessary columns/features
df = df.iloc[:, :2]

In [70]:
# Recheck the columns
df.columns

Index(['v1', 'v2'], dtype='object')

In [71]:
df.sample(5)

Unnamed: 0,v1,v2
4472,ham,S but not able to sleep.
5063,ham,I dunno lei... Like dun haf...
4432,ham,Can u look 4 me in da lib i got stuff havent f...
4660,ham,Ok no prob...
890,ham,Why do you ask princess?


In [72]:
# Renaming the columns for better practice
df.columns = ['Category', 'Message']

In [73]:
df.sample(10)

Unnamed: 0,Category,Message
4070,ham,I've reached home n i bathe liao... U can call...
3789,ham,I love you !!! You know? Can you feel it? Does...
2023,ham,Is there any movie theatre i can go to and wat...
81,ham,K. Did you call me just now ah?
4769,ham,CHEERS LOU! YEAH WAS A GOODNITE SHAME U NEVA C...
1646,ham,Thts wat Wright Brother did to fly..
3340,ham,Still i have not checked it da. . .
1726,ham,\ALRITE HUNNY!WOT U UP 2 2NITE? DIDNT END UP G...
5569,ham,"Pity, * was in mood for that. So...any other s..."
5131,ham,That sucks. I'll go over so u can do my hair. ...


In [74]:
# Split the dataset into features (X) and target variable (y)
X = df['Message']
y = df['Category']

In [75]:
# Shapes of Dependent(X) and Independent(y) features
print("Independent feature shape: ", X.shape)
print("Dependent feature shape: ", y.shape)

Independent feature shape:  (5572,)
Dependent feature shape:  (5572,)


In [76]:
# Split the dataset into Training and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [77]:
# Shapes of Train datsets for Dependent(X) and Independent(y) features
print("Independent feature shape: ", X_train.shape)
print("Dependent feature shape: ", y_train.shape)

Independent feature shape:  (4457,)
Dependent feature shape:  (4457,)


In [78]:
# Shapes of Train datsets for Dependent(X) and Independent(y) features
print("Independent feature shape: ", X_train.shape)
print("Dependent feature shape: ", y_train.shape)

Independent feature shape:  (4457,)
Dependent feature shape:  (4457,)


In [79]:
# Endoding the target feature with LabelEncode
label_encoder = LabelEncoder()

In [80]:
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [81]:
# CountVectorizer converts text data into a matrix of token counts for machine learning.
# Initializing CountVectorizer
cv = CountVectorizer()

# Apply CountVectorizer to 'Message' column
X_train_counts = cv.fit_transform(X_train)
X_test_counts = cv.transform(X_test)

In [82]:
# Checking target feature after applying encoder
pd.DataFrame(y_train).sample(10)

Unnamed: 0,0
1749,0
97,0
1062,0
2310,0
167,0
3631,0
4003,0
697,0
4344,0
1679,0


In [83]:
X_train_counts

<4457x7735 sparse matrix of type '<class 'numpy.int64'>'
	with 58978 stored elements in Compressed Sparse Row format>

In [85]:
# Initializing the model
clf = MultinomialNB()