In [8]:
import pandas as pd
import nltk
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Read the labelled ham/spam messages; the file is not UTF-8, hence the explicit
# ISO-8859-1 encoding, and it is parsed with pandas' Python engine.
data = pd.read_csv(
    'spam.csv',
    encoding='ISO-8859-1',
    engine='python',
)
# Preview the first rows.
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# Analyze Dataset

In [3]:
# Summary statistics for every column (count, unique, top, freq).
data.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5575,5575,50,12,6
unique,2,5172,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [4]:
# Discard the three mostly-empty overflow columns (only 50 / 12 / 6 non-null values).
# Rebinding `data` instead of dropping in place, together with errors='ignore',
# lets this cell be re-run without a KeyError once the columns are already gone.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
# Confirm that the remaining columns contain no missing values.
data.isnull().sum()

v1    0
v2    0
dtype: int64

In [5]:
# Encode the text labels as integers (ham -> 0, spam -> 1). The encoder is fitted
# on the explicit label list rather than on the column itself.
lencoder = LabelEncoder().fit(['ham', 'spam'])
data['v1'] = lencoder.transform(data['v1'])

In [6]:
# Inspect the last row after label encoding. A bare `data.head()` used to precede
# this line, but only the final expression of a cell is displayed, so its result
# was silently discarded; it has been removed as dead code.
data.iloc[-1]

v1                                   1
v2    Congrats! you won a credit card.
Name: 5574, dtype: object

In [7]:
# Pull the message text (features) and the encoded labels (targets) out of the
# frame as NumPy arrays.
X = np.array(data['v2'])
Y = np.array(data['v1'])
(X, Y)

(array(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
        'Ok lar... Joking wif u oni...',
        "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
        ..., 'You won a credit card for free',
        'A credit card only for 100 dollars',
        'Congrats! you won a credit card.'], dtype=object),
 array([0, 0, 1, ..., 1, 1, 1]))

# Cleaning data

In [9]:
# Normalise every message in X in place:
#   1. replace every non-letter character with a space,
#   2. lower-case and split on whitespace,
#   3. drop English stop words and stem what remains,
#   4. join the stems back into a single space-separated string.
stemmer = PorterStemmer()
# Build the stop-word set once. It was previously rebuilt from the corpus list
# for every single word of every message, which is loop-invariant work.
stop_words = set(stopwords.words('english'))
for i in range(len(X)):
    review = re.sub('[^a-zA-Z]', " ", X[i])
    words = [stemmer.stem(word) for word in review.lower().split() if word not in stop_words]
    review = ' '.join(words)
    X[i] = review
X

array(['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
       'ok lar joke wif u oni',
       'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
       ..., 'credit card free', 'credit card dollar',
       'congrat credit card'], dtype=object)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the vocabulary and IDF weights from the cleaned messages, then turn each
# message into a dense TF-IDF row vector.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split
# further down, so its IDF weights also see the held-out messages.
tiv = TfidfVectorizer()
tiv.fit(X)
X = tiv.transform(X).toarray()

In [11]:
# Sanity check: feature-matrix and label shapes, followed by the matrix itself.
print(*(arr.shape for arr in (X, Y)))
print(X)

(5575, 6221) (5575,)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [12]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=42,
)

In [13]:
# Shapes of the training features and labels.
print(*(arr.shape for arr in (X_train, y_train)))

(4738, 6221) (4738,)


# Train Model

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [15]:
# Shapes of all four pieces of the train/test split.
print(*(arr.shape for arr in (X_train, y_train, X_test, y_test)))

(4738, 6221) (4738,) (837, 6221) (837,)


In [16]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
# `fit` returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X_train, y_train)

# Accuracy Score

In [17]:
# Predict on the held-out rows, then print the confusion matrix followed by
# the overall accuracy.
y_pred = model.predict(X_test)
for metric in (confusion_matrix, accuracy_score):
    print(metric(y_test, y_pred))

[[720   0]
 [ 30  87]]
0.96415770609319


# Test on a Sample Message

In [19]:
# Run a single hand-written message through the same cleaning steps used for the
# dataset (strip non-letters, lower-case, drop stop words, stem), then vectorise it
# with the already-fitted TF-IDF vectorizer.
review = "Offer available on credit card"
review = re.sub('[^a-zA-Z]', " ", review)
# Build the stop-word set once instead of once per word of the message.
stop_words = set(stopwords.words('english'))
words = [stemmer.stem(word) for word in review.lower().split() if word not in stop_words]
# The vectorizer expects an iterable of documents, so wrap the string in a list.
review = [' '.join(words)]
print(review)
p = tiv.transform(review).toarray()
# Sum of the TF-IDF weights; a non-zero value means some tokens are in the vocabulary.
print(p.sum())

['offer avail credit card']
1.9969807996719735


In [20]:
# Classify the vectorised sample message with the trained model
# (labels were encoded as 0 = ham, 1 = spam).
model.predict(p)

array([0])

# Pickle Model and Vectorizer

In [21]:
import pickle

In [22]:
# Persist the trained classifier and the fitted TF-IDF vectorizer.
# The `with` blocks guarantee each file is flushed and closed; previously the
# handles returned by `open(...)` were never closed explicitly.
with open('trainedModel.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('transform.pkl', 'wb') as vectorizer_file:
    pickle.dump(tiv, vectorizer_file)