**Importing the required libraries**

In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
import pandas as pd
import sklearn
import regex as re
import numpy as np
from zipfile import ZipFile
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import MultiLabelBinarizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet 
import string
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))

**Reading the data**

In [2]:
# Relative location of the raw SMS Spam Collection file that read_txt/read_csv load
path = './data/SMSSpamCollection'

In [3]:
def read_txt(path, encoding="utf-8"):
    """Read a text file and return its lines with trailing whitespace removed.

    Args:
        path: Location of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding
            (which otherwise turns characters such as the pound sign into
            mojibake on some systems).

    Returns:
        list[str]: One right-stripped string per line of the file.
    """
    # The context manager guarantees the file handle is closed, even on error.
    with open(path, encoding=encoding) as handle:
        messages = [line.rstrip() for line in handle]
    print("No. of rows of data =",len(messages))
    return messages

In [4]:
def read_csv(path):
    """Load the tab-separated file at ``path`` into a two-column DataFrame.

    The file is read without a header row; the columns are named ``label``
    and ``message``. The number of rows loaded is printed.

    Args:
        path: Location of the tab-separated file.

    Returns:
        pandas.DataFrame: Frame with columns ``label`` and ``message``.
    """
    column_names = ["label", "message"]
    # QUOTE_NONE: quote characters inside a message are kept as literal text.
    frame = pd.read_csv(
        path,
        sep='\t',
        names=column_names,
        quoting=csv.QUOTE_NONE,
    )
    row_count = len(frame)
    print("No. of rows of data", row_count)
    return frame

In [5]:
# Read the dataset as a list of raw lines via read_txt (one string per line of the file)
data_txt = read_txt(path)

# Print the first 10 raw lines, each prefixed with its position in the list
for idx, info in enumerate(data_txt[:10]):
    print(idx, info)

No. of rows of data = 5574
0 ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
1 ham	Ok lar... Joking wif u oni...
2 spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3 ham	U dun say so early hor... U c already then say...
4 ham	Nah I don't think he goes to usf, he lives around here though
5 spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, Â£1.50 to rcv
6 ham	Even my brother is not like to speak with me. They treat me like aids patent.
7 ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
8 spam	WINNER!! As a valued network customer you have been selected to receivea Â£900 prize reward! To claim call 09061701461

In [6]:
# Read the same file via read_csv, which returns a pandas DataFrame
# with the columns "label" and "message"
data_df = read_csv(path)

# Display the first 10 rows of the DataFrame
data_df.head(10)


No. of rows of data 5574


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


**Preprocessing the data**

In [7]:
#Function to split the words into tokens
def split_into_tokens(data):
    """Tokenise every entry of ``data.message`` into runs of word characters.

    Args:
        data: Object exposing a ``message`` attribute that yields strings when
            iterated (e.g. a DataFrame with a ``message`` column).

    Returns:
        list[list[str]]: One token list per message, in iteration order. A
        token is a maximal run of characters matching ``\\w``.
    """
    regex=r"\w+"

    # Iterate over the values themselves rather than looking them up with
    # data.message[i]: on a pandas Series that is a label lookup, which fails
    # or returns the wrong row whenever the index is not exactly 0..n-1
    # (e.g. after filtering or splitting the frame).
    return [re.findall(regex, message) for message in data.message]

In [1]:
#Function to perform lemmatization and stopword removal
def lemmatize(data):
    """Drop stop words and punctuation tokens, then lemmatize what remains.

    Args:
        data: Iterable of token sequences (one sequence of strings per message).

    Returns:
        list[list[str]]: For every input sequence, the lower-cased lemmas of
        the tokens that are neither in the module-level ``stop_words`` set nor
        made up solely of punctuation characters. Order is preserved.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = []

    for tokens in data:
        kept = []

        for token in tokens:
            # Lower-case first: the stop-word set is lower-case, and the
            # WordNet lookup is case-sensitive, so "Cars" would otherwise be
            # returned unchanged instead of being reduced to "car".
            word = token.lower()

            if word in stop_words:
                continue

            # string.punctuation is a str, so `word in string.punctuation`
            # would be a substring test. Check character by character instead;
            # all() on an empty token is True, so empty tokens are dropped too.
            if all(char in string.punctuation for char in word):
                continue

            kept.append(lemmatizer.lemmatize(word))

        lemmatized_words.append(kept)

    return lemmatized_words

In [9]:
# Run the pre-processing pipeline: tokenise every message, then remove
# stop words / punctuation and lemmatize the remaining tokens
token_words = split_into_tokens(data_df)
processed_words = lemmatize(token_words)

# Store the cleaned token lists as a new column; both helpers return one
# entry per input message, so the list lines up with the rows of data_df
data_df['processed_message'] = processed_words

**Performing train, test and validation split**

In [10]:
# Split off the test set: 10% of all rows are held out for testing, the
# remaining 90% form the combined train+validation pool.
# random_state is fixed so the same partition is produced on every run.
train_test_split_size = 0.1
X_train_val, X_test, y_train_val, y_test = train_test_split(data_df.processed_message, data_df.label, test_size = train_test_split_size, random_state = 42)

In [11]:
# Split the train+validation pool: 10% of the pool (about 9% of all rows)
# becomes the validation set, the rest is used for training.
train_val_split_size = 0.1
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size = train_val_split_size, random_state = 42)

In [12]:
# Build one DataFrame per split, pairing the processed messages (X_*)
# with their labels (y_*)
train_df = pd.DataFrame({'X_train': X_train,'y_train': y_train})
val_df = pd.DataFrame({'X_val': X_val,'y_val': y_val})
test_df = pd.DataFrame({'X_test': X_test,'y_test': y_test})

**Storing the splits and the modified data frame in csv format**

In [16]:
# Write the three splits and the augmented full frame to ./data as CSV,
# without the row index.
# NOTE: the token-list columns are written as text, so they come back as
# plain strings (not lists) when the files are read again.
train_df.to_csv('./data/train.csv',index = False)
# Path fixed from './data./validation.csv': the stray dot pointed at a
# non-existent 'data.' directory on POSIX systems.
val_df.to_csv('./data/validation.csv',index = False)
test_df.to_csv('./data/test.csv',index = False)
data_df.to_csv('./data/mod_df.csv',index = False)