In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the raw message table from disk, decoding it explicitly as ISO-8859-1.
data = pd.read_csv(
    "spam.csv",
    encoding="ISO-8859-1",
)

In [3]:
# Peek at the first five rows to see which columns the CSV provides.
data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Remove the three unnamed columns so only the label (v1) and message (v2)
# columns shown in the preview remain. Rebinding `data` instead of dropping
# in place, together with errors="ignore", lets this cell be re-run without
# raising KeyError once the columns are already gone.
data = data.drop(
    columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"],
    errors="ignore",
)

In [5]:
# Show the frame after the unnamed columns were removed (pandas truncates the middle rows).
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# (rows, columns) of the frame after the column drop.
data.shape

(5572, 2)

In [7]:
# Count missing values per column before any text processing.
data.isna().sum()

v1    0
v2    0
dtype: int64

In [8]:
# Full, untruncated text of the message stored under index label 0.
data.loc[0, "v2"]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [9]:
from nltk import RegexpTokenizer

In [10]:
# Matches runs of ASCII letters only; digits and punctuation never become tokens.
regex = r"[A-Za-z]+"

In [11]:
# gaps=False: the pattern describes the tokens themselves, not the separators between them.
tokenizer = RegexpTokenizer(pattern=regex, gaps=False)

In [13]:
import spacy
import nltk
# spaCy pipeline that supplies lemmas and stop-word flags to text_preprocess.
nlp = spacy.load("en_core_web_sm")

# The NLTK "words" corpus is a separate download and is not shipped with the
# nltk package. Fetch it once if it is missing so this cell does not fail with
# LookupError on a fresh environment (Restart & Run All).
try:
    nltk.data.find("corpora/words")
except LookupError:
    nltk.download("words")

# A set gives constant-time membership checks for the vocabulary filter.
words = set(nltk.corpus.words.words())



In [14]:
def text_preprocess(text):
    """Reduce a raw message to a space-joined string of filtered lemmas.

    Steps, in order:
      1. Run ``text`` through the spaCy pipeline ``nlp`` and take each token's lemma.
      2. Re-tokenize the joined lemmas with the module-level ``tokenizer`` and keep a
         token when its lower-cased form is in ``words`` (or when it is not purely
         alphabetic).
      3. Run the surviving tokens through ``nlp`` again and drop stop words and
         tokens of two characters or fewer.

    Relies on the notebook globals ``nlp``, ``tokenizer`` and ``words``.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    lemma_string = " ".join(tok.lemma_ for tok in nlp(text))

    vocabulary_hits = []
    for piece in tokenizer.tokenize(lemma_string):
        if piece.lower() in words or not piece.isalpha():
            vocabulary_hits.append(piece)

    # Second pass through spaCy so stop-word flags are computed on the filtered text.
    filtered_doc = nlp(" ".join(vocabulary_hits))
    survivors = [
        tok.text
        for tok in filtered_doc
        if not tok.is_stop and len(tok) > 2
    ]
    return " ".join(survivors)

In [15]:
# Start the modelling frame from the label column only.
df1 = data.drop(columns="v2")

# Encode the labels as integers (ham -> 0, spam -> 1) with an explicit
# assignment. The previous `df1.v1.replace(..., inplace=True)` was an in-place
# call on a column selection, which under pandas Copy-on-Write does not
# propagate back to `df1`. The int64 cast fails loudly if a label other than
# "ham"/"spam" appears (map() would turn it into NaN).
df1["v1"] = df1["v1"].map({"ham": 0, "spam": 1}).astype("int64")
df1.head()

Unnamed: 0,v1
0,0
1,0
2,1
3,0
4,0


In [16]:
# Clean every message with text_preprocess and store the result next to its label.
df1["text_preprocessed"] = data["v2"].apply(text_preprocess)

In [17]:
# Inspect the encoded labels alongside the preprocessed text.
df1

Unnamed: 0,v1,text_preprocessed
0,0,point crazy available great world buffet Cine wat
1,0,lar joke
2,1,free entry win Cup final text receive entry qu...
3,0,dun early
4,0,think live
...,...,...
5567,1,time try contact win Pound prize claim easy mi...
5568,0,esplanade home
5569,0,pity mood suggestion
5570,0,guy act like interested buy week free


In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# Bag-of-words counts over single words and adjacent word pairs (unigrams + bigrams).
vector = CountVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
)

In [20]:
# Learn the vocabulary and build the document-term count matrix in one pass.
# NOTE(review): the vocabulary is fitted on every row, before the train/test
# split performed further down — confirm this is intended.
corpus = df1["text_preprocessed"]
vector_words = vector.fit_transform(corpus)
vector_words.shape

(5572, 19993)

In [21]:
# Wrap the count matrix in a DataFrame with one column per n-gram. The matrix
# is kept sparse: calling .toarray() would materialise every one of the
# rows x vocabulary cells as a dense integer, although almost all are zero.
df2 = pd.DataFrame.sparse.from_spmatrix(
    vector_words,
    columns=vector.get_feature_names_out(),
)

In [22]:
# Put the n-gram counts and the label column side by side (aligned on the row index).
df3 = pd.concat(
    [df2, df1.drop(columns="text_preprocessed")],
    axis="columns",
)

In [23]:
# First five rows of the combined feature/label frame.
df3.head(n=5)

Unnamed: 0,abbey,abbey happy,ability,ability listen,ability question,able,able buy,able class,able come,able deliver,...,yummy lip,zac,zac stand,zebra,zebra animation,zero,zero saving,zoom,zoom cine,v1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Features are every n-gram count column; the target is the encoded label.
X = df3.drop(columns="v1")
Y = df3["v1"]

In [25]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [28]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes classifier with its default settings.
mnb = MultinomialNB()

In [27]:
# Fit on the training split, then report mean accuracy as (train, test).
mnb.fit(x_train, y_train)
train_accuracy = mnb.score(x_train, y_train)
test_accuracy = mnb.score(x_test, y_test)
train_accuracy, test_accuracy

(0.9925959165357864, 0.9695067264573991)