## Compare NLP Techniques: Prep The Data For Modeling

### Read In & Clean Text

In [3]:
# Read in and clean data
import nltk
import numpy as np
import pandas as pd
import re
import string

from sklearn.model_selection import train_test_split

stopwords = nltk.corpus.stopwords.words('english')

messages = pd.read_csv('data/spam.csv', encoding='latin-1').drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
messages.columns = ['label', 'text']

messages['label'] = messages['label'].map({'spam':1, 'ham':0})


def clean_text(text):
    text = ''.join([char.lower() for char in text if char not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text

messages['clean_text'] = messages['text'].apply(lambda x: clean_text(x))
messages.head()

Unnamed: 0,label,text,clean_text
0,0,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,0,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,0,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


In [4]:
# Split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(messages['clean_text'], messages['label'], test_size=0.2)

In [5]:
# What do the first ten messages in the training set look like?
X_train[:10]

5443    [guaranteed, 32000, award, maybe, even, å, 100...
352     [today, sorry, day, ever, angry, ever, misbeha...
5327                           [wishing, wonderful, week]
62                                   [part, checking, iq]
4424                              [saw, messageit, k, da]
522                               [gonna, go, get, tacos]
5244     [thanks, temales, wonderful, thank, great, week]
3146    [oh, thats, late, well, good, night, give, u, ...
2176                         [get, ready, moan, scream, ]
3477              [ask, around, theres, lot, terms, mids]
Name: clean_text, dtype: object

In [6]:
# What do the labels look like?
y_train[:10]

5443    1
352     0
5327    0
62      0
4424    0
522     0
5244    0
3146    0
2176    0
3477    0
Name: label, dtype: int64

In [7]:
# Let's save the training and test sets to ensure we are using the same data for each model
X_train.to_csv('data/X_train.csv', index=False, header=True)
X_test.to_csv('data/X_test.csv', index=False, header=True)
y_train.to_csv('data/y_train.csv', index=False, header=True)
y_test.to_csv('data/y_test.csv', index=False, header=True)