# Compare NLP Techniques: Prep The Data For Modeling

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Read In & Clean Text

In [3]:
# Read in and clean data
import nltk
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import string
nltk.download('stopwords')

stopwords = nltk.corpus.stopwords.words('english')

messages = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]
messages['label'] = np.where(messages['label']=='spam', 1, 0)

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text

messages['clean_text'] = messages['text'].apply(lambda x: clean_text(x))
messages.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,label,text,clean_text
0,0,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,0,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,0,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


In [4]:
# Split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(messages['clean_text'],
                                                    messages['label'], test_size=0.2)

In [5]:
# What do the first ten messages in the training set look like?
X_train[:]

3135    [let, know, details, fri, u, find, cos, im, to...
4656    [private, 2003, account, statement, shows, 800...
2812    [enjoy, showers, possessiveness, poured, u, ur...
2012    [beautiful, truth, gravity, read, carefully, h...
2456                              [onum, ela, pa, normal]
                              ...                        
1596    [registered, subscriber, yr, draw, 4, å, 100, ...
279        [done, handed, celebrations, full, swing, yet]
1130                         [howz, painhope, u, r, fine]
862     [sorry, guess, whenever, get, hold, connection...
466     [dont, put, stuff, roads, keep, getting, slipp...
Name: clean_text, Length: 4457, dtype: object

In [6]:
# What do the labels look like?
y_train[:10]

3135    0
4656    1
2812    0
2012    0
2456    0
5498    0
2863    0
381     0
1252    0
307     0
Name: label, dtype: int64

In [7]:
# Let's save the training and test sets to ensure we are using the same data for each model
X_train.to_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/X_train.csv', index=False, header=True)
X_test.to_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/X_test.csv', index=False, header=True)
y_train.to_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/y_train.csv', index=False, header=True)
y_test.to_csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn Learning/03_Advanced NLP with Python for Machine Learning/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/y_test.csv', index=False, header=True)