In [31]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

Clean Datsets

In [32]:
firstDataset = pd.read_csv("spam_data_A.csv")
secondDataset = pd.read_csv("spam_data_B.csv")


In [33]:
print(firstDataset.shape)

firstDataset.head()

(3000, 2)


Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [34]:
firstDataset = firstDataset[[firstDataset.columns[1]] + firstDataset.columns.tolist()[:1] + firstDataset.columns.tolist()[2:]]
firstDataset.head()

Unnamed: 0,label,email
0,0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...
1,0,martin a posted tassos papadopoulos the greek ...
2,0,man threatens explosion in moscow thursday aug...
3,0,klez the virus that won t die already the most...
4,0,in adding cream to spaghetti carbonara which ...


In [35]:
print(secondDataset.shape)

secondDataset.head()

(5572, 2)


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [36]:
secondDataset.rename(columns={'Category': 'label', 'Message': 'email'}, inplace=True)

secondDataset['label'] = secondDataset['label'].apply(lambda x: 1 if 'spam' in x else 0)

secondDataset.head()

Unnamed: 0,label,email
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [37]:
print(firstDataset.columns)
print(secondDataset.columns)

Index(['label', 'email'], dtype='object')
Index(['label', 'email'], dtype='object')


In [38]:
print(firstDataset.shape)
print(secondDataset.shape)

data = pd.concat([firstDataset, secondDataset], axis=0, ignore_index=True)

print(data.shape)
data.head()

(3000, 2)
(5572, 2)
(8572, 2)


Unnamed: 0,label,email
0,0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...
1,0,martin a posted tassos papadopoulos the greek ...
2,0,man threatens explosion in moscow thursday aug...
3,0,klez the virus that won t die already the most...
4,0,in adding cream to spaghetti carbonara which ...


In [39]:
print(data.isnull().sum())
data = data.dropna()
print(data.isnull().sum())


label    0
email    1
dtype: int64
label    0
email    0
dtype: int64


In [40]:
X = data.iloc[:, 1]
y = data.iloc[:, 0]

cv = CountVectorizer()
X_count = cv.fit_transform(X).toarray()

print(X_count.shape)

(8571, 38215)


In [41]:
train_X, test_X, train_y, test_y = train_test_split(X_count, y, test_size=0.2, random_state=42)

model = MultinomialNB()

param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],
    'fit_prior': [True, False],
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           scoring='accuracy', cv=5, n_jobs=-1, verbose=1)

grid_search.fit(train_X, train_y)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validated score: {:.2f}".format(grid_search.best_score_))

model = grid_search.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found:  {'alpha': 1.5, 'fit_prior': True}
Best cross-validated score: 0.97


In [42]:
test_self_ham = ["Down to go to the mall or what?"]

test_self_ham = cv.transform(test_self_ham)

model.predict(test_self_ham)

array([0], dtype=int64)

In [43]:
test_self_spam = ["Click here to redeem your reward"]

test_self_spam = cv.transform(test_self_spam)

model.predict(test_self_spam)

array([1], dtype=int64)

In [44]:
model.score(test_X, test_y)

0.9714285714285714

In [45]:
import pickle

pickle.dump(model, open('Naive_model.pkl', 'wb'))
pickle.dump(cv, open('Vectorizer.pkl', 'wb'))