In [None]:
import pandas as pd                                                          # type: ignore
from sklearn.feature_extraction.text import CountVectorizer                  # type: ignore

## load the tab-separated SMS file; it has no header row, so the two columns are named here
sms = pd.read_csv(
    'sms.tsv',
    sep='\t',
    header=None,
    names=['label', 'message'],
)

## report (rows, columns), then preview the first five rows as the cell output
print(sms.shape)
sms.head()

(5572, 2)


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
## examine the class distribution: how many messages carry each label
sms['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [None]:
## encode the text label as an integer target: ham -> 0, spam -> 1
label_to_num = {'ham': 0, 'spam': 1}
sms['label_num'] = sms['label'].map(label_to_num)

## the frame should now carry one additional column
print(sms.shape)
sms.head()

(5572, 3)


Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
from sklearn.model_selection import train_test_split    # type: ignore

## define X and y (from the SMS data) for use with COUNTVECTORIZER:
## X is kept as a 1-D Series of raw message text, y as the numeric label
X = sms['message']
y = sms['label_num']
print(X.shape)
print(y.shape,"\n==============")

## hold out a test partition; random_state fixes the shuffle so re-runs split identically
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## confirm that features and targets were split into matching sizes
print(f'X_train shape => {X_train.shape}')
print(f'X_test  shape => {X_test.shape}')
print(f'y_train shape => {y_train.shape}')
print(f'y_test  shape => {y_test.shape}')

(5572,)
(5572,) 
X_train shape => (4179,)
X_test  shape => (1393,)
y_train shape => (4179,)
y_test  shape => (1393,)


In [None]:
## instantiate the vectorizer
vect = CountVectorizer()

## learn the vocabulary from the training messages and build their document-term matrix
X_train_dtm = vect.fit_transform(X_train)

## densify once and reuse the result for both the printed preview and the labelled
## DataFrame: every .toarray() call allocates a full documents-by-vocabulary dense copy
## of the sparse matrix, so calling it twice doubles that cost for no benefit
X_train_dense = X_train_dtm.toarray()
print(X_train_dense)
pd.DataFrame(X_train_dense, columns=vect.get_feature_names_out())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


Unnamed: 0,00,000,008704050406,0121,01223585236,01223585334,0125698789,02,0207,02072069400,...,zed,zeros,zhong,zindgi,zoe,zoom,zouk,zyada,èn,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4174,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4175,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4176,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4177,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
## transform testing data (using fitted vocabulary) into a document-term matrix;
## only transform() is called here, so the vocabulary learned from the training set is reused
X_test_dtm = vect.transform(X_test)

## densify once and reuse the result for both the printed preview and the labelled
## DataFrame instead of allocating the full dense copy twice
X_test_dense = X_test_dtm.toarray()
print(X_test_dense)
pd.DataFrame(X_test_dense, columns=vect.get_feature_names_out())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


Unnamed: 0,00,000,008704050406,0121,01223585236,01223585334,0125698789,02,0207,02072069400,...,zed,zeros,zhong,zindgi,zoe,zoom,zouk,zyada,èn,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1388,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1389,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1390,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1391,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn.naive_bayes import MultinomialNB       # type: ignore

## create the classifier and train it on the training document-term matrix
## (fit() returns the estimator itself, so construction and training are chained)
nb = MultinomialNB().fit(X_train_dtm, y_train)

## make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [None]:
from sklearn import metrics     # type: ignore

## score the class predictions: overall accuracy, then the confusion matrix
## (rows are the true classes, columns are the predicted classes)
accuracy = metrics.accuracy_score(y_test, y_pred_class)
confusion = metrics.confusion_matrix(y_test, y_pred_class)

print(accuracy,"\n")
print(confusion)

0.9885139985642498 

[[1203    5]
 [  11  174]]


In [None]:
## calculate predicted probabilities for X_test_dtm (poorly calibrated);
## predict_proba yields one column per class, and column 1 is kept as the score for label 1
class_probs = nb.predict_proba(X_test_dtm)
y_pred_prob = class_probs[:, 1]
y_pred_prob

array([2.87744864e-03, 1.83488846e-05, 2.07301295e-03, ...,
       1.09026171e-06, 1.00000000e+00, 3.98279868e-09])

In [None]:
## calculate AUC from the true labels and the predicted probabilities of label 1
metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob)

0.9866431000536962