# 4.4.2 [NLP: as a Supervised Problem](https://courses.thinkful.com/data-201v1/project/4.4.2)

Supervised NLP:
    * Requires a pre-labelled dataset for training and testing 
    * generally interested in categorizing text in various ays

Feature Generation - Bag of Words: for each sentence count how many times each words appears. We then use those counts as features (is this every word or just the relevant entitites)

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import time

from sklearn.svm import SVC
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from nltk.corpus import gutenberg, stopwords
from collections import Counter

In [100]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')
leaves = gutenberg.raw('whitman-leaves.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)
leaves = text_cleaner(leaves)

# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)
leaves_doc = nlp(leaves)

# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
leaves_sents = [[sent, "Whitman"] for sent in leaves_doc.sents]
leaves_sents = leaves_sents[:len(alice_sents)]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(alice_sents + leaves_sents)
df_leaves_sents = pd.DataFrame(leaves_sents)

# Load and clean the data.

In [101]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop
                and not token.is_space]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]

# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [102]:
# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    start_time = time.time()
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        
        if i % 50 == 0:
            print("Processing row {}".format(i))
            print("Calcuation Time %.3f"%((time.time()-start_time)/60.0))
            start_time = time.time()
            
    return df

In [105]:
# Parse the sentences from the 3 datasets
# This takes forever
alice_wordcounts = bow_features(pd.DataFrame(alice_sents), common_words)
persuasion_wordcounts = bow_features(pd.DataFrame(persuasion_sents), common_words)
leaves_wordcounts = bow_features(pd.DataFrame(leaves_sents), common_words)

# Create 1 dataframe with all 3 datasources, 
# keep index so the sentence number can be used 
# if dataset pulled from pickle file (which was not accepting spacy objects)
all_wordcounts = alice_wordcounts.append(
    persuasion_wordcounts,ignore_index= False).append(
    leaves_wordcounts, ignore_index=False)

# Creating index that does not repeat itself, original index can still 
# be accessed for matching purposes
all_wordcounts = all_wordcounts.reset_index()

# Create function for pickling a file
import pickle
def pickle_file(data, file_name):
    file = open(file_name, 'wb')
    pickle.dump(data,file, protocol=-1)
    file.close()

# Pickle word count, must drop the scpacy objecte
pickle_file(all_wordcounts.drop(['text_sentence'], 1), 'nlp_wordcounts')

Processing row 0
Calcuation Time 0.054
Processing row 50
Calcuation Time 1.069
Processing row 100
Calcuation Time 1.488
Processing row 150
Calcuation Time 1.068
Processing row 200
Calcuation Time 1.075
Processing row 250
Calcuation Time 1.022
Processing row 300
Calcuation Time 1.131
Processing row 350
Calcuation Time 0.986
Processing row 400
Calcuation Time 1.065
Processing row 450
Calcuation Time 0.644
Processing row 500
Calcuation Time 1.317
Processing row 550
Calcuation Time 0.986
Processing row 600
Calcuation Time 1.031
Processing row 650
Calcuation Time 1.136
Processing row 700
Calcuation Time 1.090
Processing row 750
Calcuation Time 1.138
Processing row 800
Calcuation Time 0.940
Processing row 850
Calcuation Time 0.927
Processing row 900
Calcuation Time 0.841
Processing row 950
Calcuation Time 0.919
Processing row 1000
Calcuation Time 1.282
Processing row 1050
Calcuation Time 0.846
Processing row 1100
Calcuation Time 1.583
Processing row 1150
Calcuation Time 1.148
Processing row 

In [165]:
all_wordcounts = alice_wordcounts.append(
    persuasion_wordcounts.iloc[:len(alice_sents),:],ignore_index= True).append(
    leaves_wordcounts, ignore_index=True)

In [166]:
all_wordcounts.groupby('text_source').count()['text_sentence']

text_source
Austen     1669
Carroll    1669
Whitman    1669
Name: text_sentence, dtype: int64

# Challenge 0:

Logistic regression best performance: 93% 

Exploration Areas:
* Other modeling techniques (SVM?), 
* More spaCy features - grammar, phrases, POS, etc.,  
* Sentence level features - (number of words, amount of punctuation)
* Including contextual info - words repeated from one sentence to the next, etc

Make sure to design your models on the test set, or use cross_validation with multiple folds, and see if you can get accuracy above 90%.  


# Try Support Vector Machine


In [168]:
# Creating the training set
carroll_austen = all_wordcounts[(
    all_wordcounts['text_source']=='Carroll')|(
    all_wordcounts['text_source']=='Austen')].reset_index(drop=True)

In [169]:
X = np.array(carroll_austen.drop(['text_sentence','text_source'], 1))
Y = carroll_austen['text_source']

X_train, X_test, y_train, y_test = train_test_split(X,Y,
                                                    test_size=0.4,
                                                    random_state=0)

In [170]:
svc = SVC(gamma='auto', kernel='rbf', C = 0.1)
train = svc.fit(X_train, y_train)

print('Training set score: %.3f'% svc.score(X_train, y_train))
print('\nTest set score: %.3f'% svc.score(X_test, y_test))

Training set score: 0.510

Test set score: 0.484


In [132]:
svc = SVC(gamma='auto', kernel='rbf', C = 2)
train = svc.fit(X_train, y_train)

print('Training set score: %.3f'% svc.score(X_train, y_train))
print('\nTest set score: %.3f'% svc.score(X_test, y_test))

Training set score: 0.687

Test set score: 0.693


In [133]:
svc = SVC(gamma='auto', kernel='rbf', C = 30)
train = svc.fit(X_train, y_train)

print('Training set score: %.3f'% svc.score(X_train, y_train))
print('\nTest set score: %.3f'% svc.score(X_test, y_test))

Training set score: 0.866

Test set score: 0.850


In [134]:
svc = SVC(gamma='auto', kernel='rbf', C = 100)
train = svc.fit(X_train, y_train)

print('Training set score: %.3f'% svc.score(X_train, y_train))
print('\nTest set score: %.3f'% svc.score(X_test, y_test))

Training set score: 0.900

Test set score: 0.883


### Support Vector Classifier is overfitting as we increase C (which is to be expected). I don't have huge confidence in this performing well on Walt Whitman but lets see how frequently the model is able to correctly identify Louis Carroll


# Challenge 1:
Find out whether your new model is good at identifying 
* Alice in Wonderland vs any other work, 
* Persuasion vs any other work, or 
* Austen vs any other work.  

This will involve pulling a new book from the Project Gutenberg corpus (print(gutenberg.fileids()) for a list) and processing it.

Record your work for each challenge in a notebook and submit it below.

In [138]:
# Creating the data set with just Carroll and Walt Whitman
carroll_whitman = all_wordcounts[(
    all_wordcounts['text_source']=='Carroll')|(
    all_wordcounts['text_source']=='Whitman')].drop(
    'index',1).reset_index(drop=True)

In [151]:
whitman = all_wordcounts[all_wordcounts['text_source']=='Whitman'].drop(
    'index',1).reset_index(drop=True)

In [172]:
# Combine the Emma sentence data with the Alice data from the test set.
X_test_whitman = np.concatenate((X_train[y_train[y_train=='Carroll'].index],
    whitman.drop(['text_sentence','text_source'], 1)), axis=0)
y_test_whitman = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Whitman'] * whitman.shape[0])])

In [171]:
svc = SVC(gamma='auto', kernel='rbf', C = 100)
train = svc.fit(X, Y)

print('Score Leaves: %.3f'% svc.score(X_test_whitman, y_test_whitman))
# Model.

svc_whitman_predicted = svc.predict(X_test_whitman)
pd.crosstab(y_test_whitman, svc_whitman_predicted)

Score Leaves: 0.131


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Carroll,661,352
Whitman,1277,392


In [158]:
svc = SVC(gamma='auto', kernel='rbf', C = 100)
train = svc.fit(X_train, y_train)

print('Score Leaves: %.3f'% svc.score(X_test_whitman, y_test_whitman))
# Model.

svc_whitman_predicted = svc.predict(X_test_whitman)
pd.crosstab(y_test_whitman, svc_whitman_predicted)

Score Leaves: 0.089


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Carroll,774,239
Whitman,1569,100


This model failed to accurately predict Carroll well smh. 
I'm going to try this where I use all of the Carroll/Austen text to train the model and then all of the Carroll/Whitman to text the model's ability to accurately classify.
I don't see this going well since the classification is using information from Austen and Carroll to make predictions rather than Carroll/NOT Carroll. Setting up the model as Carroll NOT Carroll may have had more success.

In [173]:
# Train the model
svc = SVC(gamma='auto', kernel='rbf', C = 100)
train = svc.fit(X, Y)

Y_whitman = carroll_whitman['text_source']
X_whitman = carroll_whitman.drop(['text_sentence','text_source'], 1)
print('Score Leaves: %.3f'% svc.score(X_test_whitman, y_test_whitman))

# To get a random set of sentences I'm creating a training and tear set
# Could also just sort my data
svc_whitman_predicted = svc.predict(X_whitman)
pd.crosstab(Y_whitman, svc_whitman_predicted)

Score Leaves: 0.176


col_0,Austen,Carroll
text_source,Unnamed: 1_level_1,Unnamed: 2_level_1
Carroll,163,1506
Whitman,1277,392


In [175]:
print('Percent Correct: %.3f'% ((1277+1506)/len(Y_whitman)))

Percent Correct: 0.834


If I do the calculatin myself of Carroll not Carroll its actually not that awful.

# Exploring Spacy

In [60]:
# what we can extract from tokens
doc = nlp("Next week I'll   be in Madrid.")
for token in doc:
    print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
        token.text,
        token.idx,       # index of where the token starts
        token.lemma_,    # the base word
        token.is_punct,  # bool of if the token is punctuation
        token.is_space,  # bool of if the token is a space
        token.shape_,    # capital/lower case X's in length of the token
        token.pos_,      # the part of speach of the token
        token.tag_       # seem to be abreiations for the pos
    ))

Next	0	next	False	False	Xxxx	ADJ	JJ
week	5	week	False	False	xxxx	NOUN	NN
I	10	-PRON-	False	False	X	PRON	PRP
'll	11	will	False	False	'xx	VERB	MD
  	15	  	False	True	  	SPACE	_SP
be	17	be	False	False	xx	VERB	VB
in	20	in	False	False	xx	ADP	IN
Madrid	23	madrid	False	False	Xxxxx	PROPN	NNP
.	29	.	True	False	.	PUNCT	.


In [77]:
# sentence tagging
doc = nlp("These are apples. These are oranges.")
[print('{}'.format(sent)) for sent in doc.sents]

for sent in doc.sents:
    print(sent)

These are apples.
These are oranges.
These are apples.
These are oranges.


In [110]:
for ent in sentences.iloc[0,0].ents:
    #print(len(token.shape_))
    print(ent.text)
    print(ent.label_)

Alice
PERSON


In [174]:
list_pos = [token.pos_ for token in sentences.iloc[0,0]]

In [176]:
Counter(list_pos)

Counter({'PROPN': 2,
         'VERB': 13,
         'PART': 2,
         'ADV': 3,
         'ADJ': 3,
         'ADP': 8,
         'NOUN': 12,
         'DET': 5,
         'PUNCT': 10,
         'CCONJ': 6,
         'PRON': 3})

In [160]:
[item[0] for item in Counter(allwords).most_common(10)]

['-PRON-', 'say', 'alice', 'be', 'not', 'think', 'go', 'little', 'the', 'know']

In [165]:
def bag_of_nouns(text):
    noun_chunks = [str(noun_chunk)
                   for noun_chunk 
                   in text.noun_chunks]
    return [item[0] for item in Counter(noun_chunks).most_common(20)]

alice_nouns = bag_of_nouns(alice_doc)
persuasion_nouns = bag_of_nouns(persuasion_doc)


In [7]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(1000)]

# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [118]:
for token in sentences.iloc[0,0][:7]:
    #print(len(token.shape_))
    print(token.pos_)
    #print(ent.label_)#

PROPN
VERB
VERB
PART
VERB
ADV
ADJ


In [28]:
# alice_doc = Doc
# t = token 

# for every token in the doc print the token
[t for t in alice_doc]

# for every token in the doc print the token as a string
[t.text for t in alice_doc[:15]]

# length of the doc (in tokens)
len(alice_doc)

34363

In [59]:
list(alice_doc[3].subtree)

[to]

# Bag some words
* exclude stopwords & punctuation
* stick to the most common 2000 lemmas for each text

# Random Forest on BOW
* overfitting is a known problem when using bag of words since it basically involves throwing a massive number of features at a model
* some features will capture noise in the training set 

In [17]:
rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score: %.3f'% rfc.score(X_train, y_train))
print('\nTest set score: %.3f'% rfc.score(X_test, y_test))

Training set score: 0.987460815047022

Test set score: 0.8895676691729323


In [18]:
rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score: %.3f'% rfc.score(X_train, y_train))
print('\nTest set score: %.3f'% rfc.score(X_test, y_test))

Training set score: 0.9862068965517241

Test set score: 0.8905075187969925


In [55]:
word_counts.drop(['text_sentence','text_source'], 1).columns

Index(['mary', 'town', 'leave', 'continue', 'aware', 'pardon', 'nearer',
       'curiosity', 'are', 'thought',
       ...
       'day', 'doubt', 'picture', 'beat', 'sharp', 'growl', 'hall', 'game',
       'vanish', 'fan'],
      dtype='object', length=774)

In [57]:
df_importance = pd.DataFrame(data = {'importances':rfc.feature_importances_, 'features':word_counts.drop(['text_sentence','text_source'], 1).columns})
df_importance.sort_values(by='importances', ascending=False)

Unnamed: 0,importances,features
474,0.078957,alice
566,0.056796,be
282,0.053473,say
565,0.037857,not
753,0.027391,-PRON-
506,0.017797,will
373,0.015814,anne
550,0.014607,turtle
676,0.013414,queen
526,0.013019,mrs


## BoW with Logistic Regression

Let's try a technique with some protection against overfitting due to extraneous features – logistic regression with ridge regularization (from ridge regression, also called L2 regularization).

In [36]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score: %.3f'%lr.score(X_train, y_train))
print('\nTest set score: %.3f'%lr.score(X_test, y_test))

(3190, 1561) (3190,)
Training set score: 0.952

Test set score: 0.918


In [37]:
lr = LogisticRegression(penalty='l2', C=.1)
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score: %.3f'% lr.score(X_train, y_train))
print('\nTest set score: %.3f'% lr.score(X_test, y_test))

(3190, 1561) (3190,)
Training set score: 0.897

Test set score: 0.883


In [38]:
lr = LogisticRegression(penalty='l2', C=10)
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score: %.3f'% lr.score(X_train, y_train))
print('\nTest set score: %.3f'% lr.score(X_test, y_test))

(3190, 1561) (3190,)
Training set score: 0.979

Test set score: 0.906


In [39]:
lr = LogisticRegression(penalty='l2', C=100)
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)

print('Training set score: %.3f'% lr.score(X_train, y_train))
print('\nTest set score: %.3f'% lr.score(X_test, y_test))

(3190, 1561) (3190,)
Training set score: 0.985

Test set score: 0.895


Logistic regression performs a bit better than the random forest.  

# BoW with Gradient Boosting

And finally, let's see what gradient boosting can do:

In [40]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score: %.3f'% clf.score(X_train, y_train))
print('\nTest set score: %.3f'% clf.score(X_test, y_test))

Training set score: 0.886

Test set score: 0.874


In [43]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score: %.3f'% clf.score(X_train, y_train))
print('\nTest set score: %.3f'% clf.score(X_test, y_test))

Training set score: 0.8830721003134796

Test set score: 0.8698308270676691


Looks like logistic regression is the winner, but there's room for improvement.

# Same model, new inputs

What if we feed the model a different novel by Jane Austen, like _Emma_?  Will it be able to distinguish Austen from Carroll with the same level of accuracy if we insert a different sample of Austen's writing?

First, we need to process _Emma_ the same way we processed the other data, and combine it with the Alice data:

In [44]:
# Clean the Emma data.
emma = gutenberg.raw('austen-emma.txt')
emma = re.sub(r'VOLUME \w+', '', emma)
emma = re.sub(r'CHAPTER \w+', '', emma)
emma = text_cleaner(emma)
print(emma[:100])

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to


In [45]:
# Parse our cleaned data.
emma_doc = nlp(emma)

In [46]:
# Group into sentences.
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
emma_sents = [[sent, "Austen"] for sent in emma_doc.sents]

# Emma is quite long, let's cut it down to the same length as Alice.
emma_sents = emma_sents[0:len(alice_sents)]

In [47]:
# Build a new Bag of Words data frame for Emma word counts.
# We'll use the same common words from Alice and Persuasion.
emma_sentences = pd.DataFrame(emma_sents)
emma_bow = bow_features(emma_sentences, common_words)

print('done')

Processing row 0
Calcuation Time 0.003
Processing row 50
Calcuation Time 0.072
Processing row 100
Calcuation Time 0.046
Processing row 150
Calcuation Time 0.058
Processing row 200
Calcuation Time 0.073
Processing row 250
Calcuation Time 0.094
Processing row 300
Calcuation Time 0.072
Processing row 350
Calcuation Time 0.061
Processing row 400
Calcuation Time 0.056
Processing row 450
Calcuation Time 0.041
Processing row 500
Calcuation Time 0.054
Processing row 550
Calcuation Time 0.043
Processing row 600
Calcuation Time 0.048
Processing row 650
Calcuation Time 0.047
Processing row 700
Calcuation Time 0.050
Processing row 750
Calcuation Time 0.042
Processing row 800
Calcuation Time 0.045
Processing row 850
Calcuation Time 0.043
Processing row 900
Calcuation Time 0.046
Processing row 950
Calcuation Time 0.050
Processing row 1000
Calcuation Time 0.044
Processing row 1050
Calcuation Time 0.045
Processing row 1100
Calcuation Time 0.056
Processing row 1150
Calcuation Time 0.073
Processing row 

In [14]:
# Now we can model it!
# Let's use logistic regression again.

# Combine the Emma sentence data with the Alice data from the test set.
X_Emma_test = np.concatenate((
    X_train[y_train[y_train=='Carroll'].index],
    emma_bow.drop(['text_sentence','text_source'], 1)
), axis=0)
y_Emma_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Austen'] * emma_bow.shape[0])])

# Model.
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test set score: 0.703466666667


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1158,21
Carroll,535,161


In [48]:
# Now we can model it!
# Let's use logistic regression again.

# Combine the Emma sentence data with the Alice data from the test set.
X_Emma_test = np.concatenate((
    X_train[y_train[y_train=='Carroll'].index],
    emma_bow.drop(['text_sentence','text_source'], 1)
), axis=0)
y_Emma_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Austen'] * emma_bow.shape[0])])

# Model.
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test set score: 0.6498881431767338


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1413,256
Carroll,683,330
