In [1]:
import pandas as pd
import numpy as np 

In [2]:
# Load the labelled e-mail corpus from disk and preview its first rows.
csv_path = 'spam_ham.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [3]:
# Replace missing values with empty strings so that every `text` entry is a
# real string. `DataFrame.where` called without a replacement value puts NaN
# back in place of the nulls (i.e. it changes nothing), and the TF-IDF
# vectorizer used later cannot tokenize NaN.
df = df.where(pd.notnull(df), '')
df

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...
...,...,...
5166,ham,Subject: put the 10 on the ft\r\nthe transport...
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168,ham,Subject: calpine daily gas nomination\r\n>\r\n...
5169,ham,Subject: industrial worksheets for august 2000...


In [5]:
from sklearn.preprocessing import LabelEncoder

# Encode the string labels as integers in a new `type` column
# (the rows displayed below show ham -> 0 and spam -> 1).
encoder = LabelEncoder()
encoder.fit(df['label'])
df['type'] = encoder.transform(df['label'])
df

Unnamed: 0,label,text,type
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...
5166,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,ham,Subject: industrial worksheets for august 2000...,0


In [7]:
# Features are the raw message text; targets are the integer-encoded labels.
x, y = df['text'], df['type']

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

# Sanity check: features and labels must have matching lengths in each split.
for train_part, test_part in ((xtrain, xtest), (ytrain, ytest)):
    print(train_part.shape, test_part.shape)

(4136,) (1035,)
(4136,) (1035,)


In [9]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Cast the targets of both splits to integers for the classifiers.
ytr = ytrain.astype('int')
yts = ytest.astype('int')

# Learn the vocabulary and IDF weights from the training text only, then
# reuse the fitted vectorizer on the test text so no test information leaks
# into the features.
tf = TfidfVectorizer(stop_words='english', lowercase=True, min_df=1)
xtr = tf.fit_transform(xtrain)
xts = tf.transform(xtest)

In [11]:
# Persist the fitted vectorizer (the whole object, not only its vocabulary)
# so new text can be transformed the same way at prediction time.
with open('tfidf_vectorizer_vocab.pkl', mode='wb') as vectorizer_file:
    pickle.dump(tf, vectorizer_file)

In [12]:
# Preview only the first training document's non-zero TF-IDF entries, shown
# as "(row, vocabulary index)  weight". Looping over every row of `xtr` would
# print the entries of all training documents and flood the notebook output.
print(xtr[0])

  (0, 3871)	0.13387711316973605
  (0, 531)	0.14556222812251965
  (0, 30451)	0.08468916670398006
  (0, 43273)	0.14556222812251965
  (0, 3890)	0.14556222812251965
  (0, 548)	0.14556222812251965
  (0, 37262)	0.11275796314501375
  (0, 2908)	0.11535664415295803
  (0, 456)	0.14556222812251965
  (0, 26297)	0.09506000151609588
  (0, 36190)	0.11400727959297849
  (0, 2478)	0.13872687405852518
  (0, 521)	0.14556222812251965
  (0, 16808)	0.11843023142166303
  (0, 22041)	0.13387711316973605
  (0, 2706)	0.14556222812251965
  (0, 522)	0.14556222812251965
  (0, 32060)	0.07311834410351342
  (0, 19411)	0.04211028825505044
  (0, 2537)	0.13872687405852518
  (0, 517)	0.14556222812251965
  (0, 19429)	0.14556222812251965
  (0, 16637)	0.24438399643390496
  (0, 3875)	0.14556222812251965
  (0, 836)	0.14556222812251965
  (0, 31384)	0.14556222812251965
  (0, 2844)	0.11843023142166303
  (0, 1177)	0.13387711316973605
  (0, 2645)	0.12444307809779734
  (0, 27743)	0.44166331773844575
  (0, 1179)	0.13387711316973605
  

Model Building

In [13]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings, trained on the
# TF-IDF features of the training split.
lr_model = LogisticRegression()
lr_model.fit(X=xtr, y=ytr)

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees; the fixed seed makes training repeatable.
rf_params = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X=xtr, y=ytr)

In [15]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier that votes over the 2 closest training
# documents.
knn_model = KNeighborsClassifier(n_neighbors=2)
knn_model.fit(X=xtr, y=ytr)

In [16]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default smoothing, trained on the same TF-IDF
# features as the other models.
nb_model = MultinomialNB()
nb_model.fit(X=xtr, y=ytr)

Accuracy

In [17]:
from sklearn.metrics import accuracy_score

In [18]:
# Accuracy on the training split: the model has already seen these rows, so
# this number is optimistic and only shows how well it fits its own data.
ypred = lr_model.predict(xtr)
lr_acc = accuracy_score(ytr, ypred)

# Accuracy on the held-out split (xts / yts) is the estimate of how the model
# generalizes; it is kept in its own variable so the training score above
# remains available under its original name.
lr_test_acc = accuracy_score(yts, lr_model.predict(xts))
lr_acc

0.9970986460348162

In [19]:
# Accuracy on the training split: the forest has already seen these rows, so
# a perfect score here says nothing about performance on new messages.
ypred = rf_model.predict(xtr)
rf_acc = accuracy_score(ytr, ypred)

# Accuracy on the held-out split (xts / yts) is the estimate of how the model
# generalizes; it is kept in its own variable so the training score above
# remains available under its original name.
rf_test_acc = accuracy_score(yts, rf_model.predict(xts))
rf_acc

1.0

In [20]:
# Accuracy on the training split: the model has already seen these rows, so
# this number is optimistic and only shows how well it fits its own data.
ypred = knn_model.predict(xtr)
knn_acc = accuracy_score(ytr, ypred)

# Accuracy on the held-out split (xts / yts) is the estimate of how the model
# generalizes; it is kept in its own variable so the training score above
# remains available under its original name.
knn_test_acc = accuracy_score(yts, knn_model.predict(xts))
knn_acc

0.968568665377176

In [21]:
# Accuracy on the training split: the model has already seen these rows, so
# this number is optimistic and only shows how well it fits its own data.
ypred = nb_model.predict(xtr)
nb_acc = accuracy_score(ytr, ypred)

# Accuracy on the held-out split (xts / yts) is the estimate of how the model
# generalizes; it is kept in its own variable so the training score above
# remains available under its original name.
nb_test_acc = accuracy_score(yts, nb_model.predict(xts))
nb_acc

0.9683268858800773

Saving the Model

In [22]:
import pickle

# Serialize the fitted random-forest classifier to disk so it can be reloaded
# later without retraining.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(rf_model, model_file)

Using the Model

In [23]:
# Reload the persisted artifacts. Only unpickle files you created yourself:
# loading a pickle can execute code embedded in the file.
with open('model.pkl', mode='rb') as model_handle:
    model = pickle.load(model_handle)

with open('tfidf_vectorizer_vocab.pkl', mode='rb') as vectorizer_handle:
    tfidf_vectorizer = pickle.load(vectorizer_handle)

# The vectorizer expects an iterable of documents, hence the one-element list.
text_to_check = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]
text_vectorized = tfidf_vectorizer.transform(text_to_check)

prediction = model.predict(text_vectorized)

# Encoded class 1 is reported as spam; any other value is reported as ham.
print("spam." if prediction[0] == 1 else "ham.")

spam.
