# Pre-processing the Text data:
#### 1. Remove Punctuation
#### 2. Tokenization
#### 3. Remove Stopwords
#### 4. Lemmatize & Stemming

In [19]:
# import libraries and data:
import pandas as pd
import nltk
from nltk.corpus import stopwords
# NOTE: this rebinds the name `stopwords` from the object imported above to the
# value returned by .words('english'), so the corpus reader is no longer
# reachable under this name in later cells.
# Presumably requires the NLTK 'stopwords' corpus to be available locally
# (nltk.download('stopwords')) - confirm on a fresh environment.
stopwords=stopwords.words('english')

In [3]:
# Load the tab-separated SMS collection. The file has no header row, so the
# two column names are supplied explicitly at read time.
data = pd.read_csv(
    "data//SMSSpamCollection.tsv",
    sep='\t',
    header=None,
    names=['label', 'body_text'],
)

In [4]:
data.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


### Remove Punctuation:

In [5]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
def remove_punc(var):
    """Return ``var`` with every character found in ``string.punctuation`` removed.

    All other characters (letters, digits, whitespace) are kept in their
    original order and case.
    """
    kept_chars = []
    for ch in var:
        if ch in string.punctuation:
            # Punctuation is dropped, not replaced with a space.
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

In [7]:
# Add a new column, punc_cleaned, holding body_text with punctuation removed:
data['punc_cleaned'] = data['body_text'].apply(remove_punc)

In [8]:
data.head()

Unnamed: 0,label,body_text,punc_cleaned
0,ham,I've been searching for the right words to tha...,Ive been searching for the right words to than...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...
3,ham,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me T...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL


### Tokenization:

In [11]:
# Convert the punctuation-free text into a list of lower-cased tokens:
import re

In [12]:
def cleaned_token(var):
    """Lower-case ``var`` and return its word tokens as a list.

    A token is a maximal run of regex word characters (letters, digits and
    underscore); any run of other characters acts as a separator.

    Unlike splitting on the separators, this never yields empty-string tokens
    when the text is empty or starts/ends with a separator (for example
    trailing whitespace), so those cannot leak into later steps.

    Args:
        var: The text to tokenize.

    Returns:
        A list of lower-case tokens; an empty list if ``var`` has no word
        characters.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    return re.findall(r'\w+', var.lower())

In [13]:
# Add a column holding the token list built from the punctuation-free text:
data['cleaned_token'] = data['punc_cleaned'].apply(cleaned_token)

In [14]:
data.head()

Unnamed: 0,label,body_text,punc_cleaned,cleaned_token
0,ham,I've been searching for the right words to tha...,Ive been searching for the right words to than...,"[ive, been, searching, for, the, right, words,..."
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
2,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."
3,ham,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me T...,"[even, my, brother, is, not, like, to, speak, ..."
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]"


### Remove Stopwords:

In [15]:
def remove_stop(var_list):
    """Return the tokens of ``var_list`` that are not in ``stopwords``.

    The relative order of the surviving tokens is preserved. ``stopwords`` is
    the module-level collection defined earlier in the notebook.
    """
    def is_content_word(token):
        return token not in stopwords

    return list(filter(is_content_word, var_list))

In [16]:
# Add a column holding the tokens with stopwords removed:
data['cleaned_stopword'] = data['cleaned_token'].apply(remove_stop)

In [17]:
data.head()

Unnamed: 0,label,body_text,punc_cleaned,cleaned_token,cleaned_stopword
0,ham,I've been searching for the right words to tha...,Ive been searching for the right words to than...,"[ive, been, searching, for, the, right, words,...","[ive, searching, right, words, thank, breather..."
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
2,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t..."
3,ham,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me T...,"[even, my, brother, is, not, like, to, speak, ...","[even, brother, like, speak, treat, like, aids..."
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[date, sunday]"


### Stemming:

In [21]:
# Use the PorterStemmer algorithm from nltk.
# It approximates the root of a word by stripping or rewriting a few trailing
# characters, so the result is not always a real word (see 'entri' and 'goe'
# in the output below).
# It is built into the nltk library: nothing extra to download, and processing is fast.
ps = nltk.PorterStemmer()

In [22]:
def text_stem(var_list):
    """Return a new list with each token of ``var_list`` stemmed by ``ps``.

    ``ps`` is the module-level PorterStemmer instance; the output has the same
    length and order as the input.
    """
    stemmed = []
    for token in var_list:
        stemmed.append(ps.stem(token))
    return stemmed

In [23]:
# Add a column holding the stemmed, stopword-free tokens:
data['stem_text'] = data['cleaned_stopword'].apply(text_stem)

In [24]:
data.head()

Unnamed: 0,label,body_text,punc_cleaned,cleaned_token,cleaned_stopword,stem_text
0,ham,I've been searching for the right words to tha...,Ive been searching for the right words to than...,"[ive, been, searching, for, the, right, words,...","[ive, searching, right, words, thank, breather...","[ive, search, right, word, thank, breather, pr..."
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin..."
2,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho..."
3,ham,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me T...,"[even, my, brother, is, not, like, to, speak, ...","[even, brother, like, speak, treat, like, aids...","[even, brother, like, speak, treat, like, aid,..."
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[date, sunday]","[date, sunday]"


### Lemmatizer Cleaning:

In [29]:
# A lemma is the root (dictionary) form of a word. Lemmatization returns the actual root word accurately,
# whereas stemming only approximates it. The process is slower, because the WordNet data has to be
# downloaded before it can be used, but the results are more accurate.
# Prerequisite: WordNetLemmatizer (with the 'wordnet' corpus downloaded).

In [28]:
# Fetch the WordNet corpus; when it is already present nltk only reports
# that the package is up to date (see the log below).
nltk.download('wordnet')

# Lemmatizer instance shared by word_lemmatize.
wn = nltk.WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [30]:
def word_lemmatize(var_list):
    """Return a new list with each token of ``var_list`` lemmatized by ``wn``.

    ``wn`` is the module-level WordNetLemmatizer instance. Each token is
    lemmatized on its own with the lemmatizer's default settings, so some
    inflected forms are left unchanged (e.g. 'searching' in the output below).
    """
    lemmas = []
    for token in var_list:
        lemmas.append(wn.lemmatize(token))
    return lemmas

In [31]:
# Add a column holding the lemmatized, stopword-free tokens:
data['lemmatized_text'] = data['cleaned_stopword'].apply(word_lemmatize)

In [32]:
data.head()

Unnamed: 0,label,body_text,punc_cleaned,cleaned_token,cleaned_stopword,stem_text,lemmatized_text
0,ham,I've been searching for the right words to tha...,Ive been searching for the right words to than...,"[ive, been, searching, for, the, right, words,...","[ive, searching, right, words, thank, breather...","[ive, search, right, word, thank, breather, pr...","[ive, searching, right, word, thank, breather,..."
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
2,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, goe, usf, live, around, tho...","[nah, dont, think, go, usf, life, around, though]"
3,ham,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me T...,"[even, my, brother, is, not, like, to, speak, ...","[even, brother, like, speak, treat, like, aids...","[even, brother, like, speak, treat, like, aid,...","[even, brother, like, speak, treat, like, aid,..."
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[date, sunday]","[date, sunday]","[date, sunday]"
