In [323]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB, ComplementNB, BernoulliNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

import re

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [324]:
sms_spam_collection = pd.read_csv("data/smsspamcollection.csv", sep='\t', header=None, names=['label', 'text'])
sms_spam_collection

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [325]:
sms_spam_collection['text'] = sms_spam_collection['text'].str.lower()
sms_spam_collection['text'] = sms_spam_collection['text'].apply(lambda x: re.sub('[.,?!\'\"]', '', x)) 

X = sms_spam_collection['text']
y = sms_spam_collection['label']

sms_spam_collection

Unnamed: 0,label,text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will ü b going to esplanade fr home
5569,ham,pity * was in mood for that soany other sugges...
5570,ham,the guy did some bitching but i acted like id ...


In [326]:
vectorizer = CountVectorizer()
vectorized_data = vectorizer.fit_transform(X).toarray()

column_names = vectorizer.get_feature_names_out() 
print(len(column_names))
print(column_names)
vectorized_data

9327
['008704050406' '0089' '0121' ... 'ú120' 'üll' '〨ud']


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [327]:
vectorized_data_df = pd.DataFrame(vectorized_data, columns=column_names)
vectorized_data_df

Unnamed: 0,008704050406,0089,0121,01223585236,01223585334,0125698789,02,0207,02072069400,02073162414,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,ú120,üll,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [328]:
sms_vectorized = pd.concat([sms_spam_collection, vectorized_data_df], axis = 1)
sms_vectorized

Unnamed: 0,label,text,008704050406,0089,0121,01223585236,01223585334,0125698789,02,0207,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,ú120,üll,〨ud
0,ham,go until jurong point crazy available only in ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,ok lar joking wif u oni,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,free entry in 2 a wkly comp to win fa cup fina...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,u dun say so early hor u c already then say,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,nah i dont think he goes to usf he lives aroun...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,ham,will ü b going to esplanade fr home,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,ham,pity * was in mood for that soany other sugges...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,ham,the guy did some bitching but i acted like id ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [329]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(sms_vectorized.iloc[0])

label                                                                                      ham
text                                         go until jurong point crazy available only in ...
008704050406                                                                                 0
0089                                                                                         0
0121                                                                                         0
01223585236                                                                                  0
01223585334                                                                                  0
0125698789                                                                                   0
02                                                                                           0
0207                                                                                         0
02072069400                                       

In [330]:
X_train, X_test, y_train, y_test = train_test_split(vectorized_data_df, y, train_size=0.8)

In [331]:
model = BernoulliNB()
model.fit(X_train, y_train)

In [332]:
predicted = model.predict(X_test)
expected = y_test
 
print(f"Dokładność modelu: {metrics.accuracy_score(expected, predicted)}\n")
print(f"Macierz pomyłek: \n {metrics.confusion_matrix(expected, predicted)}")

Dokładność modelu: 0.9865470852017937

Macierz pomyłek: 
 [[980   3]
 [ 12 120]]


In [333]:
print(metrics.classification_report(expected, predicted))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       983
        spam       0.98      0.91      0.94       132

    accuracy                           0.99      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115

