# Demo 1 - Understanding the Data Set

In [None]:
import pandas as pd # mainly works with tabular data(spreadsheets)
import numpy as np  # for calculations
import spacy    #  natural language processing 

In [None]:
# Load spaCy's small English pipeline. This notebook only uses tokenization,
# the stop-word/punctuation flags and lemmas, so the dependency parser and the
# named-entity recognizer are switched off to speed up processing every message.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
# Read the raw data set from spam.csv, decoding it with the cp1252 code page.
data = pd.read_csv(
    'spam.csv',
    encoding='cp1252',
)

In [None]:
# Preview the first rows of the raw frame to see which columns it contains.
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Keep only the label column (v1) and the message text (v2). The explicit
# .copy() makes `data` an independent frame, so the column assignments in the
# following cells do not trigger pandas' SettingWithCopyWarning.
data = data[['v1', 'v2']].copy()

In [None]:
# Encode the label column: 'ham' -> 0, anything else (i.e. 'spam') -> 1.
# The dtype guard makes this cell safe to re-run: once the column is numeric it
# is left alone, whereas re-applying the comparison would turn every label into 1.
# assign() builds a new frame instead of writing into a slice of the raw one.
if not pd.api.types.is_numeric_dtype(data['v1']):
    data = data.assign(v1=data['v1'].ne('ham').astype('int64'))

In [None]:
# Show the frame after label encoding (the display is truncated to the first and last rows).
data

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


# Demo 2 - Text Pre-Processing

In [None]:
def process(x):
    """Normalize one message for vectorization.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``. Stop words and punctuation tokens are discarded and every
    remaining token is replaced by its lemma.

    Parameters
    ----------
    x : str
        Raw message text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces (an empty string if no
        token is kept).
    """
    tokens = nlp(x.lower())  # tokenization
    lemmas = [
        token.lemma_  # lemmatization
        for token in tokens
        if not (token.is_stop or token.is_punct)  # stop-word / punctuation removal
    ]
    return ' '.join(lemmas)  # rebuild a single sentence

In [None]:
# Replace every raw message with its cleaned, lemmatized form.
data['v2'] = data['v2'].apply(process)

In [None]:
# Check the first rows after text pre-processing.
data.head()

Unnamed: 0,v1,v2
0,0,jurong point crazy available bugis n great wor...
1,0,ok lar joke wif u oni
2,1,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun early hor u c
4,0,nah think go usf live


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Vectorization: turn words and phrases into numeric vectors of TF-IDF weights.
# Features are built per word, with the built-in 'english' stop-word list applied.
vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
)

In [None]:
# Fit the vectorizer on the cleaned messages and build the TF-IDF matrix.
# NOTE(review): the vectorizer is fitted on every row, including the ones that
# are later held out as the test set by train_test_split.
text_vector = vectorizer.fit_transform(data['v2'])

In [None]:
# Print the stored entries of the matrix, one per line, as "(row, column)  weight".
print(text_vector)

  (0, 7072)	0.19863129993143025
  (0, 1016)	0.35890799882424634
  (0, 1839)	0.3032069277494209
  (0, 1590)	0.34261647990985794
  (0, 3887)	0.3032069277494209
  (0, 7242)	0.24060094265073245
  (0, 3166)	0.1975304138199145
  (0, 1592)	0.3032069277494209
  (0, 1209)	0.2684514781881452
  (0, 2071)	0.2729205052435625
  (0, 5116)	0.257424652517704
  (0, 3779)	0.35890799882424634
  (1, 4789)	0.5647537939557097
  (1, 7165)	0.4459451111953121
  (1, 3746)	0.47451057922863127
  (1, 3921)	0.4218684931830353
  (1, 4765)	0.2811632882742994
  (2, 77)	0.2395244966956236
  (2, 1088)	0.17132935684973827
  (2, 5400)	0.16473274253951514
  (2, 6786)	0.1254671678341206
  (2, 6201)	0.2005836640243086
  (2, 5355)	0.16820504287497096
  (2, 5442)	0.16539371122554727
  (2, 775)	0.22517546975215236
  :	:
  (5567, 460)	0.24331799189705156
  (5567, 6755)	0.1924906650388016
  (5567, 4554)	0.25504800078010476
  (5567, 6610)	0.1645623122369073
  (5567, 5171)	0.2459387918665145
  (5567, 1845)	0.19354805892700622
  (556

# Demo 3 - Splitting the Data Set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split reproducible. The TF-IDF matrix is passed in its sparse form:
# train_test_split and BernoulliNB both accept SciPy sparse input, so the large,
# mostly-zero dense array that .toarray() would allocate is never materialized.
x_train, x_test, y_train, y_test = train_test_split(
    text_vector,
    data['v1'],
    test_size=0.2,
    random_state=20,
)

# Demo 4 - Model Building

In [None]:
from sklearn.naive_bayes import BernoulliNB   # Naive Bayes classifier (Bayes' theorem with an independence assumption between features); the Bernoulli variant models binary (present/absent) features

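# NAIVE BAYES CLASSIFIER -> two common types: BernoulliNB (binary features: word present / absent) and MultinomialNB (count features: word frequencies). Both can handle two or more classes.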

In [None]:
# Fit a Bernoulli Naive Bayes model on the training split, then report its
# accuracy on that same split (training accuracy, not a held-out estimate).
modelB = BernoulliNB().fit(x_train, y_train)
print(modelB.score(x_train, y_train))

0.9845187345748261


In [None]:
# Predict the labels (0 = ham, 1 = spam) of the held-out test messages.
y_predictedB = modelB.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score  

# Accuracy of the predictions on the held-out test split.
print(accuracy_score(y_true=y_test, y_pred=y_predictedB))

0.9829596412556054


## Result: BernoulliNB reaches about 98% accuracy on the held-out test set (98.5% on the training set)