In [108]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Data Preprocessing

In [109]:
# Load the labelled message dataset from the working directory, then show its
# dimensions and the first few rows as a sanity check.
data = pd.read_csv('spam.csv')
print(data.shape)
data.head()

(5572, 2)


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [110]:
# Inspect the column dtypes of the loaded frame.
data.dtypes

Category    object
Message     object
dtype: object

In [111]:
# Class balance: number of messages per label.
data['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [112]:
# Encode the label as a binary target: 1 for 'spam', 0 for everything else.
# A vectorized comparison replaces the per-row Python lambda; int64 is requested
# explicitly so the resulting dtype is the same on every platform.
data['Spam'] = (data['Category'] == 'spam').astype('int64')
data['Spam'].iloc[0:10]

0    0
1    0
2    1
3    0
4    0
5    1
6    0
7    0
8    1
9    1
Name: Spam, dtype: int64

In [113]:
# Preview the frame again to confirm the new Spam column sits beside the labels.
data.head(n=5)

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


# Train Test Split

In [114]:
# Hold out 20% of the messages for evaluation; the fixed random_state makes the
# split repeatable across runs.
# NOTE(review): the split is not stratified, so the spam share can differ
# between the train and test partitions — consider stratify=data['Spam'].
X_train, X_test, y_train, y_test = train_test_split(
    data['Message'],
    data['Spam'],
    test_size=0.2,
    random_state=2022,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(4457,) (1115,) (4457,) (1115,)


In [115]:
# Label distribution in the training partition, then in the test partition.
for labels in (y_train, y_test):
    print(labels.value_counts())

0    3879
1     578
Name: Spam, dtype: int64
0    946
1    169
Name: Spam, dtype: int64


In [116]:
# Display the training messages (a Series that keeps the original row labels).
X_train

4422    Still at west coast... Haiz... Ü'll take forev...
5398    Hi. Hope you had a good day. Have a better night.
4880    Yeah just open chat and click friend lists. Th...
3499    You said to me before i went back to bed that ...
4314    I wasn't well babe, i have swollen glands at m...
                              ...                        
5047    Desires- u going to doctor 4 liver. And get a ...
4720    Aiyo... Her lesson so early... I'm still sleep...
173                      What time you coming down later?
1244        No shoot me. I'm in the docs waiting room. :/
4989                            Which channel:-):-):):-).
Name: Message, Length: 4457, dtype: object

In [117]:
# The same messages as a plain NumPy object array, without the index.
X_train.to_numpy()

array(["Still at west coast... Haiz... Ü'll take forever to come back...",
       'Hi. Hope you had a good day. Have a better night.',
       'Yeah just open chat and click friend lists. Then make the list. Easy as pie',
       ..., 'What time you coming down later?',
       "No shoot me. I'm in the docs waiting room. :/",
       'Which channel:-):-):):-).'], dtype=object)

# Create bag of words representation using CountVectorizer

In [118]:
# Learn the vocabulary from the training messages only and encode them as a
# sparse document-term matrix of token counts.
v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)

In [119]:
# (number of training messages, vocabulary size)
print(X_train_cv.shape)

(4457, 7733)


In [120]:
# Dense copy of the count matrix, kept for the manual inspection cells that follow.
# NOTE(review): this materialises every cell of the documents-by-vocabulary
# matrix; for inspecting a single message, X_train_cv[0].toarray() would suffice.
array = X_train_cv.toarray()
array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [121]:
# Vocabulary indices of the tokens that occur in the first training message.
np.nonzero(array[0])

(array([1095, 1171, 1841, 1875, 2906, 3264, 4130, 6457, 6672, 6925, 7459],
       dtype=int64),)

In [122]:
# Count stored for vocabulary index 1095 in the first training message.
array[0][1095]

1

In [123]:
# Map vocabulary index 1095 back to the token it stands for.
v.get_feature_names_out()[1095]

'at'

# Train the Naive Bayes model

In [124]:
# Fit a multinomial Naive Bayes classifier on the bag-of-words counts.
model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [125]:
# Encode the test messages with the vocabulary already fitted on the training
# data (transform only — refitting here would let test tokens shape the features).
X_test_cv = v.transform(X_test)

# Evaluate Performance

In [126]:
# Predict on the held-out messages and summarise per-class precision, recall and F1.
y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       946
           1       0.97      0.91      0.94       169

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



# Train the model using sklearn pipeline

In [129]:
# Bundle vectorization and classification into a single estimator so raw
# message text can be passed straight to fit/predict.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

In [130]:
# Fit both pipeline steps on the raw training messages and their labels.
clf.fit(X_train , y_train)

In [132]:
# Predict directly from raw test messages via the pipeline and print the same
# per-class report as for the manually vectorized model.
# NOTE: this rebinds y_pred, replacing the predictions of the earlier model.
y_pred = clf.predict(X_test)

print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       946
           1       0.97      0.91      0.94       169

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

