# NLP !

For this homework we will perform several steps of the data cleaning and preparation of the data. We will perform some text data classification into spam or ham categories

In [30]:
#Import all the necessary packages
import pandas as pd
import numpy as np
from collections import defaultdict

# nltk Libraries
from nltk import stem
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn
import nltk
from nltk.stem import PorterStemmer

# sklearn Libraries
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from collections import defaultdict

# Regulare Expressions import
import re
import string


# Imported library to deal with contractions
import sys  
# !{sys.executable} -m pip install contractions
import contractions

#### Import the data. Please rename columns to label and text.

In [31]:
# Importing Data sheet
df = pd.read_csv("spam.csv", encoding = "latin-1")

# Dropping Unneccessary Coumns
df.drop(labels=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

# Renaming the columns
df.rename({'v1':'label', "v2":"text"}, axis=1, inplace=True)

# Printing Head to show transformation
df.head()

Unnamed: 0,label,text
0,ham,Stuning even for the non-gamer:
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [32]:
df.shape

(5573, 2)

### 1) Let's normalize the text data, i.e. remove punctuation, make lower case, expand contractions, remove stopwords and so on...

In [33]:
# Step 1: Drop anything that is a blank row just incase it came in
df['text'].dropna(inplace=True)
df.shape
# Note, there were no na drops

(5573, 2)

In [34]:
# Step 2: Make everything lower case
df['text'] = [word.lower() for word in df['text']]
df.head()

Unnamed: 0,label,text
0,ham,stuning even for the non-gamer:
1,ham,"go until jurong point, crazy.. available only ..."
2,ham,ok lar... joking wif u oni...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


In [35]:
# Step 3: Expanding Contractions
df['text'] = [contractions.fix(word) for word in df['text']]
df.head()

Unnamed: 0,label,text
0,ham,stuning even for the non-gamer:
1,ham,"go until jurong point, crazy.. available only ..."
2,ham,ok lar... joking wif you oni...
3,ham,you dun say so early hor... you c already then...
4,ham,"nah i do not think he goes to usf, he lives ar..."


In [36]:
# Step 3: Tokenization of the corpus/df
df['text']= [word_tokenize(line) for line in df['text']]
df.head()

Unnamed: 0,label,text
0,ham,"[stuning, even, for, the, non-gamer, :]"
1,ham,"[go, until, jurong, point, ,, crazy, .., avail..."
2,ham,"[ok, lar, ..., joking, wif, you, oni, ...]"
3,ham,"[you, dun, say, so, early, hor, ..., you, c, a..."
4,ham,"[nah, i, do, not, think, he, goes, to, usf, ,,..."


In [37]:
# Step 4: Removing the stop words

# Creating List of Stop Words
stopwords = nltk.corpus.stopwords.words('english')

no_stop_words = []  # Creating a blank list to store new lines in
for line in df['text']:
    # Appending new line with no stop words to the list
    no_stop_words.append([word for word in line if word not in stopwords])

# reseting text to not have stop words
df['text'] = no_stop_words
df.head()

Unnamed: 0,label,text
0,ham,"[stuning, even, non-gamer, :]"
1,ham,"[go, jurong, point, ,, crazy, .., available, b..."
2,ham,"[ok, lar, ..., joking, wif, oni, ...]"
3,ham,"[dun, say, early, hor, ..., c, already, say, ...]"
4,ham,"[nah, think, goes, usf, ,, lives, around, though]"


In [38]:
# Step 5: Removing the punctation
# Subset Creating List of punctuations
pattern = '[{}]'.format(re.escape(string.punctuation))
pattern

'[!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~]'

In [39]:
punc_regex = re.compile(pattern)

# Creating a list of no punctuation
no_punc = []
for line in df['text']:
    no_punc.append(list(filter(None , [punc_regex.sub('', word)  for word in line])))

df['text'] = no_punc
df.head()

Unnamed: 0,label,text
0,ham,"[stuning, even, nongamer]"
1,ham,"[go, jurong, point, crazy, available, bugis, n..."
2,ham,"[ok, lar, joking, wif, oni]"
3,ham,"[dun, say, early, hor, c, already, say]"
4,ham,"[nah, think, goes, usf, lives, around, though]"


In [40]:
#  Step 6: Mapping the labels over to 1/0
df.label = df.label.map({"ham":1, "spam":0})
df.head()

Unnamed: 0,label,text
0,1,"[stuning, even, nongamer]"
1,1,"[go, jurong, point, crazy, available, bugis, n..."
2,1,"[ok, lar, joking, wif, oni]"
3,1,"[dun, say, early, hor, c, already, say]"
4,1,"[nah, think, goes, usf, lives, around, though]"


### 2) Stem and Tokenize the messages

In [41]:
# Stemming
pstemmer = PorterStemmer()

# Creating a list of stemmed words
pstem = []
for line in df['text']:
    pstem.append([pstemmer.stem(word) for word in line])

df['text'] = pstem
df.head()

Unnamed: 0,label,text
0,1,"[stune, even, nongam]"
1,1,"[go, jurong, point, crazi, avail, bugi, n, gre..."
2,1,"[ok, lar, joke, wif, oni]"
3,1,"[dun, say, earli, hor, c, alreadi, say]"
4,1,"[nah, think, goe, usf, live, around, though]"


In [42]:
# Tokenization/Lemmatization

# Mapping all the Nouns verbs adj, adverbs
word_map = defaultdict(lambda : wn.NOUN)
word_map['V'], word_map['J'], word_map['R'] = wn.VERB, wn.ADJ, wn.ADV

tokens = []
for line in enumerate(df['text']):
    # Starting the WordNetLemmatizer()
    lemmatize = WordNetLemmatizer()
    # Maping the word lemmatization and adding it back as a string list to the tokens list
    tokens.append(str([lemmatize.lemmatize(word, word_map[map[0]]) for word, map in pos_tag(line[1])]))
df['text'] = tokens
df.head()


Unnamed: 0,label,text
0,1,"['stune', 'even', 'nongam']"
1,1,"['go', 'jurong', 'point', 'crazi', 'avail', 'b..."
2,1,"['ok', 'lar', 'joke', 'wif', 'oni']"
3,1,"['dun', 'say', 'earli', 'hor', 'c', 'alreadi',..."
4,1,"['nah', 'think', 'goe', 'usf', 'live', 'around..."


### 3) Split your data into a training and testing set (fillin the #Indepedent variable and #Dependent variable below)

In [43]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'], 
    test_size = 0.1, random_state = 1)

X_test    

1447             ['cramp', 'stop', 'go', 'back', 'sleep']
2032    ['hi', 'sorri', 'miss', 'call', 'pl', 'call', ...
4432    ['wonder', 'okor', 'great', 'month', 'cherish'...
4643    ['hey', 'iouri', 'give', 'number', 'wyli', 'ry...
5275    ['plea', 'call', '08712402972', 'immedi', 'urg...
                              ...                        
4849    ['want', '2', 'get', 'lay', 'tonight', 'want',...
2506    ['hi', 'darlin', 'hope', 'nice', 'night', 'wis...
2128    ['lol', 'well', 'without', 'could', 'big', 'sa...
2463    ['go', 'death', 'go', 'leav', 'note', 'say', '...
5       ['even', 'brother', 'like', 'speak', 'treat', ...
Name: text, Length: 558, dtype: object

In [44]:
# Checking the shape of the tests
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(5015,) (558,) (5015,) (558,)


In [45]:
# Encoding y variabels to arrays
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.fit_transform(y_test)

### 4) Transform the data using the TF-IDF method and fit the SVM model on the data.

In [46]:
#Fit the model :
# Get built-in TF-IDF method
vectorizer = TfidfVectorizer()
vectorizer.fit(df['text'])
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [47]:
X_train

<5015x7513 sparse matrix of type '<class 'numpy.float64'>'
	with 40846 stored elements in Compressed Sparse Row format>

In [48]:
#Fit in the transformed data to the SVM classifier. Not covered in class, so code is given for fitting 
#the data to the SVM classifier

#Creates an SVM classidier object 
svm = svm.SVC(C=1000)

#Fits the SVM model on the TF-IDF transformed data along with the train labels
svm.fit(X_train, y_train)

SVC(C=1000)

Let's check out the confusion matrix for our test set to review the performance of our model. 

Please comment on how accurate was the model, i.e. what percentage did the model predict correctly ?

In [49]:
# Printing the accuracy of the test
print("Accuracy Score:", accuracy_score(svm.predict(X_test), y_test)*100)

Accuracy Score: 97.4910394265233


In [50]:
svm.predict(X_test)

array([1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,

In [51]:
#Check the accuracy of the model on the 
print("The training score was:", round(svm.score(X_train,y_train)*100,2))

print("The testing score was:", round(svm.score(X_test,y_test)*100,2))

The training score was: 100.0
The testing score was: 97.49


In [52]:
# Creating Confusion Matrix
#We now want to see the prediction our model makes with the review in the test set
svm_pred = svm.predict(X_test)

#By looking at the confusion matrix we get a nice summary of how our model performed :
pd.DataFrame(confusion_matrix(y_test, svm_pred), 
             columns=["Positive", "Negative"], 
             index=["Positive", "Negative"])

Unnamed: 0,Positive,Negative
Positive,56,14
Negative,0,488


The model was extremely accurate. The training model scored 100% while the testing data scored 97.49%
THe confusion matrix shows that there were only 14 False Positives Type 1 Erros and 0 False negatives, type 2 errors. 

### 5) Create a function that will allow you to pass in a phrase and have the classifier tell you if it is spam or not.

In [53]:
def twist_it_turn_it_bop_it(pass_phrase, svm, vectorizer):
    """This function goes through and Scrubs a phrase and then passes it through the classifier to see if it is spam or not
    My appologies for making this a long function that is essentially the code that I wrote above for the scrubbing
    aspect. It was easier to repeat to get the phrase into the proper form. I did this by adding it to a list and reusing code.
    This is a 4am edit due the night before. """
    # Step 1: Make Lower Case
    phrase = []
    phrase.append(pass_phrase)
    phrase  =pd.DataFrame(phrase,columns=['text'])
    

    phrase['text'] = [word.lower() for word in phrase['text']]
    

    # Step 2: Expand Contractions
    phrase['text'] = [contractions.fix(word) for word in phrase['text']]
    

    # Step 3: Tokenize
    phrase['text']= [word_tokenize(line) for line in phrase['text']]
    
    # Step 4: Remove Stopwords
    # Creating List of Stop Words
    stopwords = nltk.corpus.stopwords.words('english')
    
    no_stop_words = []  # Creating a blank list to store new lines in
    for line in phrase['text']:
        # Appending new line with no stop words to the list
        no_stop_words.append([word for word in line if word not in stopwords])
    

    # reseting text to not have stop words
    phrase['text'] = no_stop_words

    # Step 5: Remove Punctuation
    
    # Subset Creating List of punctuations
    pattern = '[{}]'.format(re.escape(string.punctuation))
    punc_regex = re.compile(pattern)

    # Creating a list of no punctuation
    no_punc = []
    for line in phrase['text']:
        no_punc.append(list(filter(None , [punc_regex.sub('', word)  for word in line])))
    phrase['text'] = no_punc
    
    
    # Step 6: Stemming words
    # Stemming
    pstemmer = PorterStemmer()

    # Creating a list of stemmed words
    pstem = []
    for line in phrase['text']:
        pstem.append([pstemmer.stem(word) for word in line])
        
    phrase['text'] = pstem
    
    # Step 7: Lemmatization and Tokens
    word_map = defaultdict(lambda : wn.NOUN)
    word_map['V'], word_map['J'], word_map['R'] = wn.VERB, wn.ADJ, wn.ADV
    tokens = []
    for line in enumerate(phrase['text']):
        # Starting the WordNetLemmatizer()
        lemmatize = WordNetLemmatizer()
        # Maping the word lemmatization and adding it back as a string list to the tokens list
        tokens.append(str([lemmatize.lemmatize(word, word_map[map[0]]) for word, map in pos_tag(line[1])]))
    phrase['text'] = tokens

    phrase_out = vectorizer.transform(phrase['text'])
    prediction = svm.predict(phrase_out)
    if prediction[0] == 1:
        print("The following phrase:", pass_phrase, "\n This is from HAM!")
    elif prediction[0] == 0:
        print("The following phrase:", pass_phrase, "\n This is !!!!!!spam!!!!!!")            
    






In [54]:
# Note that Ham 1 is a modiest change
ham1 = "I DON'T A DATE ON SUNDAY WITH WILL!!"
ham2 = 'whatever, im pretty pissed off.'
spam1 = 'WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'
spam2 = 'You are a winner U have been specially selected 2 receive å£1000 cash or a 4* holiday (flights inc) speak to a live operator 2 claim 0871277810810'

In [55]:
twist_it_turn_it_bop_it(ham1, svm, vectorizer)

The following phrase: I DON'T A DATE ON SUNDAY WITH WILL!! 
 This is from HAM!


In [56]:
twist_it_turn_it_bop_it(ham2, svm, vectorizer)

The following phrase: whatever, im pretty pissed off. 
 This is from HAM!


In [57]:
twist_it_turn_it_bop_it(spam1, svm, vectorizer)

The following phrase: WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only. 
 This is !!!!!!spam!!!!!!


In [58]:
twist_it_turn_it_bop_it(spam2, svm, vectorizer)

The following phrase: You are a winner U have been specially selected 2 receive å£1000 cash or a 4* holiday (flights inc) speak to a live operator 2 claim 0871277810810 
 This is !!!!!!spam!!!!!!
