# NLP !

For this homework we will perform several steps of the data cleaning and preparation of the data. We will perform some text data classification into spam or ham categories

In [63]:
#Import all the necessary packages
import pandas as pd
from nltk import stem
from nltk.corpus import stopwords
import re
import string
import nltk
import enchant

# sklearn Libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import confusion_matrix

# Want to get rid of later.
import sys  
!{sys.executable} -m pip install contractions
import contractions


from nltk import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import LancasterStemmer
from nltk.corpus import gutenberg


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize




#### Import the data. Please rename columns to label and text.

In [64]:
# Importing Data sheet
df = pd.read_csv("spam.csv", encoding = "latin-1")

# Dropping Unneccessary Coumns
df.drop(labels=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

# Renaming the columns
df.rename({'v1':'label', "v2":"text"}, axis=1, inplace=True)

# Printing Head to show transformation
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,ham,Even my brother is not like to speak with me. ...


### 1) Let's normalize the text data, i.e. remove punctuation, make lower case, expand contractions, remove stopwords and so on...

In [65]:
list_2_norm = df['text'].to_list()
norm_list = []
for line in enumerate(list_2_norm):
    print("Raw: ", line[1])

    # Making Whole String Lowercase
    new_line = str(line[1]).lower()
        
    # Expanding Contractions
    new_sentence = []
    for word in new_line.split():
        new_sentence.append(contractions.fix(word))
    new_line = ' '.join(new_sentence)

    # Removing punctuation
    new_line = new_line.translate(str.maketrans('', '', string.punctuation))

    # Removing StopWords
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(new_line)
    new_line = ' '.join([word for word in word_tokens if word not in stop_words])
    print("Scrubbed", new_line)
    norm_list.append(new_line)


Raw:  Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Scrubbed go jurong point crazy available bugis n great world la e buffet cine got amore wat
Raw:  Ok lar... Joking wif u oni...
Scrubbed ok lar joking wif oni
Raw:  U dun say so early hor... U c already then say...
Scrubbed dun say early hor c already say
Raw:  Nah I don't think he goes to usf, he lives around here though
Scrubbed nah think goes usf lives around though
Raw:  Even my brother is not like to speak with me. They treat me like aids patent.
Scrubbed even brother like speak treat like aids patent
Raw:  As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
Scrubbed per request melle melle oru minnaminunginte nurungu vettam set callertune callers press 9 copy friends callertune
Raw:  I'm gonna be home soon and i don't want to talk about this stuff anymore t

In [66]:
df['text_scrubbed'] = norm_list
df.head()

Unnamed: 0,label,text,text_scrubbed
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif oni
2,ham,U dun say so early hor... U c already then say...,dun say early hor c already say
3,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though
4,ham,Even my brother is not like to speak with me. ...,even brother like speak treat like aids patent


### 2) Stem and Tokenize the messages
# Will have to come back through this

In [67]:
# First we want to convert the sentence into word tokens before stemming
ps = LancasterStemmer()
new_messages = []
for line in norm_list:
    token_line = nltk.word_tokenize(line)  # Tokenized the Line
    stemmed_line = [ps.stem(word) for word in token_line]  # Stemming the Different Words in the Sentence
    new_messages.append(stemmed_line)

print(new_messages[0:5])

[['go', 'jurong', 'point', 'crazy', 'avail', 'bug', 'n', 'gre', 'world', 'la', 'e', 'buffet', 'cin', 'got', 'am', 'wat'], ['ok', 'lar', 'jok', 'wif', 'on'], ['dun', 'say', 'ear', 'hor', 'c', 'already', 'say'], ['nah', 'think', 'goe', 'usf', 'liv', 'around', 'though'], ['ev', 'broth', 'lik', 'speak', 'tre', 'lik', 'aid', 'pat']]


In [68]:
df['text_token'] = new_messages
df.head()

Unnamed: 0,label,text,text_scrubbed,text_token
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,"[go, jurong, point, crazy, avail, bug, n, gre,..."
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif oni,"[ok, lar, jok, wif, on]"
2,ham,U dun say so early hor... U c already then say...,dun say early hor c already say,"[dun, say, ear, hor, c, already, say]"
3,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though,"[nah, think, goe, usf, liv, around, though]"
4,ham,Even my brother is not like to speak with me. ...,even brother like speak treat like aids patent,"[ev, broth, lik, speak, tre, lik, aid, pat]"


### 3) Split your data into a training and testing set (fillin the #Indepedent variable and #Dependent variable below)

In [69]:
#Let's first map spam/ham label to a numberic value
df.label = df.label.map({"ham":1, "spam":0})
df.head()

Unnamed: 0,label,text,text_scrubbed,text_token
0,1,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,"[go, jurong, point, crazy, avail, bug, n, gre,..."
1,1,Ok lar... Joking wif u oni...,ok lar joking wif oni,"[ok, lar, jok, wif, on]"
2,1,U dun say so early hor... U c already then say...,dun say early hor c already say,"[dun, say, ear, hor, c, already, say]"
3,1,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though,"[nah, think, goe, usf, liv, around, though]"
4,1,Even my brother is not like to speak with me. ...,even brother like speak treat like aids patent,"[ev, broth, lik, speak, tre, lik, aid, pat]"


In [71]:
X_train, X_test, y_train, y_test = train_test_split(
    df['text_scrubbed'],
    df['label'], 
    test_size = 0.1, random_state = 1)

X_test    

1078                       tee hee lecture cheery bye bye
4028                    pick phone right pls send message
958                                      sorry call later
4642             hey iouri gave number wylie ryans friend
4674    hard true much show amp express love someoneth...
                              ...                        
3529                   played smash bros ltgt religiously
5488    got takes 2 take part wrc rally oz lucozade en...
5134    hi ya babe x 4goten bout scammers getting smar...
5       per request melle melle oru minnaminunginte nu...
1289    wen lovable bcums angry wid dnt take seriously...
Name: text_scrubbed, Length: 558, dtype: object

In [None]:
# Checking size of the training and test shape
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(5014,) (558,) (5014,) (558,)


### 4) Transform the data using the TF-IDF method and fit the SVM model on the data.

In [None]:
# Get built-in TF-IDF method
vectorizer = TfidfVectorizer()

In [72]:
#Let's perform our bag of words model fit on the train and test data :
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
# Creating two different documents
ham_sentences = df['text_scrubbed'][df['label'] == 1].to_list().copy()
spam_sentences = df['text_scrubbed'][df['label'] == 0].to_list().copy()

In [None]:
#This steps learns the vocabulary in the documents and returns the term-document matrix
vectors = vectorizer.fit_transform([firstDoc, secondDoc])

AttributeError: 'list' object has no attribute 'lower'