### Import Necessary Libraries

In [10]:
# Data Analysis and Manipulation
import pandas as pd
import numpy as np
import re
import string

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# NLP
import nltk
from nltk import word_tokenize

### Load the Dataset

In [11]:
df = pd.read_csv('../data/processed/spam_v2.csv')

In [12]:
df.head()

Unnamed: 0,target,text,num_chars
0,0,"Go until jurong point, crazy.. Available only ...",111
1,0,Ok lar... Joking wif u oni...,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,0,U dun say so early hor... U c already then say...,49
4,0,"Nah I don't think he goes to usf, he lives aro...",61


### Data Preprocessing

We will be transforming the text as follow:
- Lowercase
- Tokenization
- Remove URL and HTML Tags
- Remove Special Characters
- Remove Newlines
- Remove Punctuations
- Lemmatization - To bring back words in its root form without changing the meaning of the text.

Note that we will not be removing stopwords as it is likely to change the context of the text. Also, we will be removing the numbers from the string not converting it into words as there are phone numbers to make SMS for certain service. Converting them into word form will certainly change the context. 

In [13]:
def remove_url(text):
    pattern = re.compile(r'https?: ?//\S+|www\.\S+')
    return pattern.sub(r'', text)

def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

def remove_special_char(text):
    text = nltk.word_tokenize(text) 
    text = [i for i in text if i.isalnum()]
    return ' '.join(text)

lemma = nltk.wordnet.WordNetLemmatizer()

def lemmatizer(text):
    text = ' '.join(lemma.lemmatize(word) for word in text.split(' '))
    return text

def preprocess_text(text):
    '''
    Transform the original text by:
        Lowercase,
        Remove URL and HTML tags,
        Remove all Special Characters and Punctuation,
        Remove Newlines,
        Remove the words containing numbers,
        Lemmatize the text
    '''
    text = str(text).lower()
    text = remove_url(text)
    text = remove_html_tags(text)
    text = remove_special_char(text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = lemmatizer(text)
    return text

In [14]:
df['text_clean'] = df['text'].apply(preprocess_text)
df['num_chars_clean'] = df['text_clean'].apply(len)

In [15]:
df

Unnamed: 0,target,text,num_chars,text_clean,num_chars_clean
0,0,"Go until jurong point, crazy.. Available only ...",111,go until jurong point crazy available only in ...,102
1,0,Ok lar... Joking wif u oni...,29,ok lar joking wif u oni,23
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,free entry in a wkly comp to win fa cup final...,118
3,0,U dun say so early hor... U c already then say...,49,u dun say so early hor u c already then say,43
4,0,"Nah I don't think he goes to usf, he lives aro...",61,nah i do think he go to usf he life around her...,54
...,...,...,...,...,...
5164,1,This is the 2nd time we have tried 2 contact u...,161,this is the time we have tried contact u hav...,106
5165,0,Will Ì_ b going to esplanade fr home?,37,will b going to esplanade fr home,33
5166,0,"Pity, * was in mood for that. So...any other s...",57,pity wa in mood for that so any other suggestion,48
5167,0,The guy did some bitching but I acted like i'd...,125,the guy did some bitching but i acted like i b...,122


To understand our data more better, lets visualize the tokens.

In [16]:
df.to_csv('../data/processed/spam_v3.csv', index=False)