In [1]:
import re
import pandas as pd
import nltk
import string
# Single Porter stemmer instance reused by the cleaning and stemming steps.
ps = nltk.PorterStemmer()

In [2]:
# English stopword list from the NLTK stopwords corpus; clean_data filters against it.
stopwords = nltk.corpus.stopwords.words('english')

In [3]:
# Load the tab-separated file; header=None means the first row is treated as data.
data = pd.read_csv(
    "SMSSpamCollection.tsv",
    sep='\t',
    header=None,
)

In [4]:
# Name the two unlabeled columns: the class label and the message text.
data.columns = ["label", "body"]

In [5]:
data.head()

Unnamed: 0,label,body
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [6]:
#Cleaning the data

In [64]:
def clean_data(text):
    """Normalize a message into a space-separated string of stemmed tokens.

    The text is lowercased, every character found in ``string.punctuation``
    is removed, the remainder is split on runs of non-word characters, empty
    tokens and stopwords are dropped, and the surviving tokens are
    Porter-stemmed and joined with single spaces.

    Args:
        text: The message text (str).

    Returns:
        str: The stemmed tokens joined by single spaces ("" if nothing is left).
    """
    # Lowercase here: the stopword membership test below is case-sensitive,
    # so it must not depend on the caller having lowercased the text already.
    no_punct = "".join(char for char in text.lower() if char not in string.punctuation)
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # re.split() returns '' at the edges when the text starts or ends with a
    # separator; skip those so they do not add stray spaces to the result.
    return " ".join(ps.stem(word) for word in tokens if word and word not in stopwords)

In [65]:
# Lowercase each message body, then run it through clean_data.
data["clean_data"] = data["body"].map(lambda message: clean_data(message.lower()))

In [66]:
data.head()

Unnamed: 0,label,body,clean_data,body_stem,body_stem_lem
0,ham,I've been searching for the right words to tha...,ive search right word thank breather promis wo...,"[ive, search, right, word, thank, breather, pr...","[ive, search, right, word, thank, breather, pr..."
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...,"[free, entri, 2, wkli, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin..."
2,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though,"[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, goe, usf, live, around, tho..."
3,ham,Even my brother is not like to speak with me. ...,even brother like speak treat like aid patent,"[even, brother, like, speak, treat, like, aid,...","[even, brother, like, speak, treat, like, aid,..."
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,date sunday,"[date, sunday]","[date, sunday]"


In [10]:
#Stem text

In [11]:
def stemming(tokenized_list):
    """Return a list holding the Porter stem of each item in ``tokenized_list``."""
    stems = []
    for token in tokenized_list:
        stems.append(ps.stem(token))
    return stems

In [12]:
# clean_data() returns one space-joined string per message, so split it back
# into tokens first; iterating the string directly would stem single characters.
data["body_stem"] = data["clean_data"].apply(lambda x: stemming(x.split()))

In [13]:
data.head()

Unnamed: 0,label,body,clean_data,body_stem
0,ham,I've been searching for the right words to tha...,"[ive, searching, right, words, thank, breather...","[ive, search, right, word, thank, breather, pr..."
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin..."
2,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho..."
3,ham,Even my brother is not like to speak with me. ...,"[even, brother, like, speak, treat, like, aids...","[even, brother, like, speak, treat, like, aid,..."
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]","[date, sunday]"


In [14]:
#Supplement data cleaning using a lemmatizer

In [15]:
# Single WordNet lemmatizer instance reused by lemmatizing().
wn = nltk.WordNetLemmatizer()

In [16]:
def lemmatizing(tokenized_list):
    """Return a list holding the WordNet lemma of each item in ``tokenized_list``."""
    return list(map(wn.lemmatize, tokenized_list))

In [17]:
# Lemmatize every stemmed token list; the function is passed directly to apply.
data["body_stem_lem"] = data["body_stem"].apply(lemmatizing)

In [18]:
data.head()

Unnamed: 0,label,body,clean_data,body_stem,body_stem_lem
0,ham,I've been searching for the right words to tha...,"[ive, searching, right, words, thank, breather...","[ive, search, right, word, thank, breather, pr...","[ive, search, right, word, thank, breather, pr..."
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin..."
2,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, goe, usf, live, around, tho..."
3,ham,Even my brother is not like to speak with me. ...,"[even, brother, like, speak, treat, like, aids...","[even, brother, like, speak, treat, like, aid,...","[even, brother, like, speak, treat, like, aid,..."
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]","[date, sunday]","[date, sunday]"


In [21]:
#Apply Count Vectorizer

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

In [45]:
# A callable analyzer must return the sequence of features for one document.
# clean_data() returns a single string, which would be consumed character by
# character, so lowercase the raw body and split the cleaned text into tokens.
count_vect = CountVectorizer(analyzer=lambda text: clean_data(text.lower()).split())

In [54]:
# Learn the vocabulary from the raw message bodies and build the document-term matrix.
X_counts = count_vect.fit_transform(data["body"])

In [55]:
print(X_counts.shape)

(5568, 8191)


In [42]:
# Apply CountVectorizer to a smaller sample

In [56]:
# Work on the first 20 rows only so the resulting matrix is small enough to inspect.
data_sample = data.iloc[:20]
# A callable analyzer must return the sequence of features for one document.
# clean_data() returns a single string, which would be consumed character by
# character, so lowercase the raw body and split the cleaned text into tokens.
count_vect_sample = CountVectorizer(
    analyzer=lambda text: clean_data(text.lower()).split()
)
X_counts_sample = count_vect_sample.fit_transform(data_sample['body'])
print(X_counts_sample.shape)

(20, 220)


In [57]:
X_counts_sample

<20x220 sparse matrix of type '<class 'numpy.int64'>'
	with 253 stored elements in Compressed Sparse Row format>

In [59]:
# Densify the sparse sample matrix so it can be viewed as a DataFrame.
X_count_df = pd.DataFrame(data=X_counts_sample.toarray())

In [60]:
X_count_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,210,211,212,213,214,215,216,217,218,219
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [61]:
# Ngrams

In [62]:
from sklearn.feature_extraction.text import CountVectorizer

In [68]:
# ngram_range=(2, 2) restricts the features to bigrams of the cleaned text.
ngram_vect = CountVectorizer(ngram_range=(2, 2))
X_count = ngram_vect.fit_transform(data["clean_data"])
print(X_count.shape)



(5568, 31275)


In [70]:
X_count

<5568x31275 sparse matrix of type '<class 'numpy.int64'>'
	with 43729 stored elements in Compressed Sparse Row format>

In [71]:
# Apply the n-gram vectorizer to the sample data

In [74]:
# Refit a bigram vectorizer on the first 20 rows only.
data_sample = data.iloc[:20]
ngram_vect = CountVectorizer(ngram_range=(2, 2))
X_sample_count = ngram_vect.fit_transform(data_sample["clean_data"])
print(X_sample_count.shape)

(20, 209)


In [75]:
# Densify the sparse bigram sample matrix so it can be viewed as a DataFrame.
X_counts_df = pd.DataFrame(data=X_sample_count.toarray())

In [76]:
X_counts_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,199,200,201,202,203,204,205,206,207,208
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0
1,0,0,0,0,0,0,0,0,1,1,...,1,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
