# Demo - Understanding data set

In [1]:
import pandas as pd
import numpy as np
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
data = pd.read_csv('https://techlearn-cdn.s3.amazonaws.com/project_GmailSpamClassification/spam.csv',encoding='cp1252')

In [4]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
data = data[['v1','v2']]

In [7]:
data.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
data['v1'] = data['v1'].apply(lambda x:0 if x=='ham' else 1)

In [8]:
data

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


# - Text Pre-Processing

In [9]:
def process(x):
    temp = []
    document = nlp(x.lower())
    print(document)
    for i in document:
        if i.is_stop!=True and i.is_punct!= True:
            print(i)
            temp.append(i.lemma_)
            print(temp)
        else:
            pass
        
    return (' '.join(temp))

In [10]:
data['v2'] = data['v2'].apply(lambda x: process(x))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
['dunno']
leh
['dunno', 'leh']
nt
['dunno', 'leh', 'not']
remember
['dunno', 'leh', 'not', 'remember']
mayb
['dunno', 'leh', 'not', 'remember', 'mayb']
lor
['dunno', 'leh', 'not', 'remember', 'mayb', 'lor']
wat
['dunno', 'leh', 'not', 'remember', 'mayb', 'lor', 'wat']
time
['dunno', 'leh', 'not', 'remember', 'mayb', 'lor', 'wat', 'time']
r
['dunno', 'leh', 'not', 'remember', 'mayb', 'lor', 'wat', 'time', 'r']
meeting
['dunno', 'leh', 'not', 'remember', 'mayb', 'lor', 'wat', 'time', 'r', 'meet']
tmr
['dunno', 'leh', 'not', 'remember', 'mayb', 'lor', 'wat', 'time', 'r', 'meet', 'tmr']
best msg: it's hard to be with a person, when u know that one more step foward will make u fall in love.. &amp; one step back can ruin ur friendship.. good night:-) ...
best
['good']
msg
['good', 'msg']
hard
['good', 'msg', 'hard']
person
['good', 'msg', 'hard', 'person']
u
['good', 'msg', 'hard', 'person', 'u']
know
['good', 'msg', 'hard', 'p

In [12]:
data.head()

Unnamed: 0,v1,v2
0,0,jurong point crazy available bugis n great wor...
1,0,ok lar joke wif u oni
2,1,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun early hor u c
4,0,nah think go usf live


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
vectorizer = TfidfVectorizer(analyzer='word',stop_words='english')

In [13]:
text_vector = vectorizer.fit_transform(data['v2'].values.tolist())

In [None]:
print(text_vector)

#  - Splitting Data set

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
x_train, x_test, y_train, y_test = train_test_split(text_vector.toarray(),data['v1'],test_size=0.2,random_state=20)

In [18]:
len(x_train)

4457

# Model Building

In [19]:
from sklearn.naive_bayes import BernoulliNB

In [20]:
modelB = BernoulliNB()
modelB.fit(x_train,y_train)
print(modelB.score(x_train,y_train))

0.9845187345748261


In [21]:
y_predictedB = modelB.predict(x_test)

In [22]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test,y_predictedB))

0.9829596412556054


## 98% Accuracy