In [1]:
import pandas as pd
import string

In [2]:
data=pd.read_csv("spamdata.csv")
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
data['label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: label, dtype: float64

Text Preprocessing

In [4]:
from spacy.lang.en import English
nlp=English()

In [5]:
from spacy.lang.en.stop_words import STOP_WORDS

In [6]:
def clean_text(s):
    cleans=s.lower()
    punct=string.punctuation
    cleans="".join(i for i in cleans if i not in punct)
    doc=nlp(cleans)
    toklst=[]
    for i in doc:
        toklst.append(i.text)
    flt=[]
    for i in toklst:
        le=nlp.vocab[i]
        if le.is_stop==False:
            flt.append(i)
    cleans=" ".join(flt)
    return cleans

In [7]:
data['Cleaned']=data['text'].apply(lambda x: clean_text(x))

In [8]:
data.head()

Unnamed: 0,label,text,Cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives


Feature Engineering and Model Training

In [9]:
data['word_count_org']=data['text'].apply(lambda x: len(x.split()))
data['word_count_cln']=data['Cleaned'].apply(lambda x: len(x.split()))
data['char_count_with_space']=data['Cleaned'].apply(lambda x: len(x))
data['char_count-w/o_space']=data['Cleaned'].apply(lambda x: len(x.replace(" ","")))
data['digit_count']=data['Cleaned'].apply(lambda x: sum( [1 if i.isdigit() else 0 for i in x.split()]))

In [10]:
data.head()

Unnamed: 0,label,text,Cleaned,word_count_org,word_count_cln,char_count_with_space,char_count-w/o_space,digit_count
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,20,15,79,65,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,23,18,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,22,131,110,3
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,11,6,19,14,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives,13,6,27,22,0


Counting Nouns and Verbs

In [11]:
import spacy
nlp=spacy.load("en_core_web_sm")

In [12]:
dic={"noun":["NNP","NN","NNS","NNPS"],"verb":["VBZ","VB","VBN","VBG","VBD"]}

In [13]:
def pos_tags(s,family):
    doc=nlp(s)
    tags=[]
    for i in doc:
        tags.append(i.tag_)
    c=0
    for i in tags:
        if i in dic[family]:
            c+=1
    return c

In [14]:
data["noun_cnt"]=data["Cleaned"].apply(lambda x: pos_tags(x,"noun"))
data["verb_cnt"]=data["Cleaned"].apply(lambda x: pos_tags(x,"verb"))

In [15]:
data.head()

Unnamed: 0,label,text,Cleaned,word_count_org,word_count_cln,char_count_with_space,char_count-w/o_space,digit_count,noun_cnt,verb_cnt
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,20,15,79,65,0,10,1
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,23,18,0,3,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,22,131,110,3,12,0
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,11,6,19,14,0,6,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives,13,6,27,22,0,1,1


Model Building

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
lb=LabelEncoder()
trgt=data['label'].values
trgt=lb.fit_transform(trgt)

In [18]:
train=data[['word_count_org','word_count_cln','char_count_with_space','char_count-w/o_space','digit_count','noun_cnt','verb_cnt']]

In [19]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(train,trgt,random_state=20,stratify=trgt)

In [20]:
(x_train.shape,y_train.shape),(x_test.shape,y_test.shape)

(((4179, 7), (4179,)), ((1393, 7), (1393,)))

In [21]:
from sklearn import naive_bayes
md=naive_bayes.MultinomialNB()
md.fit(x_train,y_train)

In [22]:
pred_train=md.predict(x_train)
pred_test=md.predict(x_test)

In [23]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train,pred_train)

0.9418521177315147

In [24]:
accuracy_score(y_test,pred_test)

0.9368269921033741

TF-IDF

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
tfidf=TfidfVectorizer(max_features=500)

In [27]:
tfidf.fit(data['Cleaned'].values)

In [28]:
vectors=tfidf.transform(data['Cleaned'].values)

In [29]:
vectors

<5572x500 sparse matrix of type '<class 'numpy.float64'>'
	with 21920 stored elements in Compressed Sparse Row format>

In [30]:
from scipy.sparse import hstack, csr_matrix

In [31]:
meta=['word_count_org','word_count_cln','char_count_with_space','char_count-w/o_space','digit_count','noun_cnt','verb_cnt']
ft_s1=data[meta]
#combining features (meta and tf-idf)
train=hstack([vectors,csr_matrix(ft_s1)],"csr")

In [32]:
x_train,x_test,y_train,y_test=train_test_split(train,trgt,random_state=20,stratify=trgt)
(x_train.shape,y_train.shape),(x_test.shape,y_test.shape)

(((4179, 507), (4179,)), ((1393, 507), (1393,)))

In [33]:
md=naive_bayes.MultinomialNB()
md.fit(x_train,y_train)

In [34]:
pred_train=md.predict(x_train)
pred_test=md.predict(x_test)

In [37]:
pred_test

array([0, 0, 0, ..., 0, 0, 0])

In [35]:
accuracy_score(y_train,pred_train)

0.9679349126585307

In [36]:
accuracy_score(y_test,pred_test)

0.9583632447954056