In [None]:
import pandas as pd
import numpy as np
# Plotting is currently disabled in this notebook.
#import matplotlib.pyplot as plt
#%matplotlib inline
import warnings
# NOTE(review): this silences every warning for the whole session, including
# library deprecation warnings; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Optional NLTK text-preprocessing imports, currently disabled.
#from nltk.tokenize import RegexpTokenizer  
#from nltk.corpus import stopwords
#from nltk.stem.snowball import SnowballStemmer

2. Import data

In [None]:
# Read only the two columns used in this notebook (v1 and v2); the file is
# decoded as latin-1.
data = pd.read_csv(
    "../data/spam.csv",
    usecols=["v1", "v2"],
    encoding="latin-1",
)

In [None]:
# v1: label (spam or not spam)
# v2: our features (the raw message text)
data.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Replace the cryptic CSV headers with descriptive names:
# v1 holds the class label and v2 the message text.
column_names = {"v1": "label", "v2": "text"}
data = data.rename(columns=column_names)

In [None]:
# Check that the columns are now called "label" and "text".
data.head(n=5)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Class balance: number of messages per label.
data["label"].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

Convert labels to numerical variables

In [None]:
# Encode the string labels as integers for the classifier: ham -> 0, spam -> 1.
# (sklearn.preprocessing.LabelEncoder is an alternative way to do this.)
LABEL_TO_NUM = {'ham': 0, 'spam': 1}
data['label_num'] = data.label.map(LABEL_TO_NUM)

# Series.map yields NaN for any label that is missing from the mapping.
# Fail loudly here instead of letting NaN targets reach the train/test split.
unmapped_labels = data.loc[data['label_num'].isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError("Unexpected label value(s): {}".format(list(unmapped_labels)))

In [None]:
# Check the new numeric label column next to the original label.
data.head(n=5)

Unnamed: 0,label,text,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# After encoding the labels as numbers, the new label_num column is our target
# vector y, and the text column is our input X.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["label_num"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Print the shape of each piece of the split, in the same order as before:
# X_train, X_test, y_train, y_test.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(4457,)
(1115,)
(4457,)
(1115,)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# CountVectorizer demo on a toy corpus: it learns a vocabulary from the
# documents it is fitted on, then encodes any text as a vector of per-token
# counts over that vocabulary.
text = ["I can't live, if living is without you, I can't live, I can't give any more."]

# Tokenize the toy corpus and build the vocabulary (token -> column index).
# fit() returns the fitted vectorizer itself.
vectorizer = CountVectorizer().fit(text)
vocabulary = vectorizer.vocabulary_

print("vocab: ", vocabulary, sep="\n")
print("nb of tokens/words: ", len(vocabulary))

# Encode a new sentence with the learned vocabulary; the result has one row
# per document and one column per vocabulary token.
vector = vectorizer.transform(["can you go with me you hddhdh kdkdkdk"])

print("dimension of the encoded vector", vector.shape, sep="\n")
print("vector", vector.toarray(), sep="\n")

vocab: 
{'can': 1, 'live': 5, 'if': 3, 'living': 6, 'is': 4, 'without': 8, 'you': 9, 'give': 2, 'any': 0, 'more': 7}
nb of tokens/words:  10
dimension of the encoded vector
(1, 10)
vector
[[0 1 0 0 0 0 0 0 0 2]]


In [None]:
# Encode another sentence with the vocabulary learned from the toy corpus;
# tokens that are not in that vocabulary do not contribute to the vector.
v1 = vectorizer.transform(["I can go now"])

In [None]:
# Show the encoded sentence as a dense array so the individual counts are visible.
v1.toarray()

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Its replacement, get_feature_names_out() (scikit-learn >= 1.0), returns a
# NumPy array, so convert it to a plain list to keep the familiar list display.
vectorizer.get_feature_names_out().tolist()

['any', 'can', 'give', 'if', 'is', 'live', 'living', 'more', 'without', 'you']

In [None]:
# Now apply CountVectorizer to our spam dataset

In [None]:
vect = CountVectorizer()  # new vectorizer with default settings, for the spam messages

In [None]:
vect.fit(X_train)  # tokenize the training messages only and build the vocabulary from them

CountVectorizer()

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() (scikit-learn >= 1.0) instead, and fetch the
# names once rather than twice. tolist() turns the NumPy array into a plain
# list of str so the printed output looks the same as before.
feature_names = vect.get_feature_names_out().tolist()
print(feature_names[:10])   # first ten tokens of the vocabulary
print(feature_names[-10:])  # last ten tokens of the vocabulary

['00', '000', '000pes', '008704050406', '0089', '0121', '01223585236', '0125698789', '02', '0207']
['ìï', 'û_', 'û_thanks', 'ûªm', 'ûªt', 'ûªve', 'ûï', 'ûïharry', 'ûò', 'ûówell']


In [None]:
# Number of distinct tokens learned from the training messages.
# (vect.vocabulary_ itself is too large to print in full.)
vocabulary_size = len(vect.vocabulary_)
print(f"Vocabulary size: {vocabulary_size}")

Vocabulary size: 7735


In [None]:
# Encode the training messages as token-count vectors using the fitted vocabulary.
# NOTE: despite the _df suffix, CountVectorizer.transform returns a SciPy sparse
# matrix (one row per message, one column per vocabulary token), not a DataFrame.
X_train_df = vect.transform(X_train)

In [None]:
# Row and column indices of the non-zero counts in the first three encoded messages.
X_train_df[:3].nonzero()

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
        2], dtype=int32),
 array([1105, 1415, 1758, 3308, 3416, 3637, 4390, 4549, 4646, 4661, 4773,
        4912, 4988, 5872, 6455, 6786, 7674,    0,  419, 1218, 1580, 1701,
        2741, 2954, 3237, 3739, 3749, 4423, 4675, 4869, 4986, 5003, 5126,
        5178, 6494, 6639, 6781, 7152, 1726, 2246, 3157, 5526, 5948, 6809,
        6906], dtype=int32))

In [None]:
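# e.g. (0, 1105) is a non-zero element: row 0 (the first training message)
# has a non-zero count in column 1105 of the vocabulary.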

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Results store: maps a model name to that model's predictions, so the
# output of different ML methods can be kept side by side.
prediction = {}

# Train a multinomial Naive Bayes classifier on the training count vectors.
model = MultinomialNB()
model.fit(X_train_df, y_train)

MultinomialNB()

In [None]:
# Encode the test messages with the vectorizer already fitted on the training
# set (transform only, no refit), so the columns line up with X_train_df.
X_test_df = vect.transform(X_test)
# Store this model's test-set predictions under its name in the results dict.
prediction["Multinomial"] = model.predict(X_test_df)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Fraction of test messages whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=prediction["Multinomial"])

0.9838565022421525

In [None]:
# Per-class precision, recall and F1 on the test set (0 = ham, 1 = spam).
report = classification_report(y_true=y_test, y_pred=prediction["Multinomial"])
print(report)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
# Confusion matrix on the test set; in scikit-learn, rows are the true classes
# and columns are the predicted classes.
conf_mat = confusion_matrix(y_true=y_test, y_pred=prediction['Multinomial'])
print(conf_mat)

# Divide each row by its total so that every row sums to 1
# (keepdims keeps the totals as a column so they broadcast across the row).
row_totals = conf_mat.sum(axis=1, keepdims=True)
conf_mat_normalized = conf_mat / row_totals

[[963   2]
 [ 16 134]]


In [None]:
# Row-normalised confusion matrix: each row sums to 1, so the diagonal holds
# the fraction of each true class that was predicted correctly.
print(conf_mat_normalized)

[[0.99792746 0.00207254]
 [0.10666667 0.89333333]]


In [None]:
# Compare mean accuracy on the training and test sets; a large gap would
# point to overfitting.
for caption, features, targets in (
    ("train score:", X_train_df, y_train),
    ("test score:", X_test_df, y_test),
):
    print(caption, model.score(features, targets))

train score: 0.9943908458604442
test score: 0.9838565022421525
