In [190]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import time
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import gutenberg, stopwords
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [3]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [4]:
len(alice_doc)

34363

In [5]:
# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(alice_sents + persuasion_sents)

print(sentences.head())
print(sentences.shape)

                                                   0        1
0  (Alice, was, beginning, to, get, very, tired, ...  Carroll
1  (So, she, was, considering, in, her, own, mind...  Carroll
2  (There, was, nothing, so, VERY, remarkable, in...  Carroll
3                                      (Oh, dear, !)  Carroll
4                         (I, shall, be, late, !, ')  Carroll
(5318, 2)


In [6]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(200)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    T0=time.time()
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    t0=time.time()
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = (token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 ))
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 100 == 0:
            print("Processing row {}".format(i))
            t1=time.time()
            print('Time for {} rows = {:0.5f}s'.format(i, (t1-t0)))
    T1 = time.time()
    print('Total time = {0:0.5f}s'.format(T1-T0))
    return df

# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)
print(len(common_words))

309


In [7]:
# Create our data frame with features. This can take a while to run.

word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Time for 0 rows = 0.17154s
Processing row 100
Time for 100 rows = 9.29116s
Processing row 200
Time for 200 rows = 17.52628s
Processing row 300
Time for 300 rows = 25.49949s
Processing row 400
Time for 400 rows = 33.76852s
Processing row 500
Time for 500 rows = 40.23813s
Processing row 600
Time for 600 rows = 48.22478s
Processing row 700
Time for 700 rows = 55.44547s
Processing row 800
Time for 800 rows = 64.27077s
Processing row 900
Time for 900 rows = 71.85670s
Processing row 1000
Time for 1000 rows = 79.51190s
Processing row 1100
Time for 1100 rows = 87.07448s
Processing row 1200
Time for 1200 rows = 95.38172s
Processing row 1300
Time for 1300 rows = 101.58715s
Processing row 1400
Time for 1400 rows = 108.47380s
Processing row 1500
Time for 1500 rows = 115.56490s
Processing row 1600
Time for 1600 rows = 122.92627s
Processing row 1700
Time for 1700 rows = 132.39205s
Processing row 1800
Time for 1800 rows = 141.15434s
Processing row 1900
Time for 1900 rows = 150.84981s

Unnamed: 0,cousin,oh,country,if,thing,try,pass,mouth,world,mouse,...,manage,visit,woman,week,attention,beautiful,why,daughter,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, shall, be, late, !, ')",Carroll


With 2000 common words the function takes too long

Processing row 0
Time for 0 rows = 10.17418s
Processing row 100
Time for 100 rows = 548.92410s
Processing row 200
Time for 200 rows = 1028.07908s
Processing row 300
Time for 300 rows = 1487.26214s
Processing row 400
Time for 400 rows = 1941.50908s
Processing row 500
Time for 500 rows = 2401.70543s
Processing row 600
Time for 600 rows = 3042.22495s
Processing row 700
Time for 700 rows = 3443.65342s
Processing row 800
Time for 800 rows = 3822.90835s
Processing row 900
Time for 900 rows = 4141.80880s
Processing row 1000
Time for 1000 rows = 4521.30246s
Processing row 1100
Time for 1100 rows = 4940.44947s
Processing row 1200
Time for 1200 rows = 5361.15024s
Processing row 1300
Time for 1300 rows = 5727.72254s
Processing row 1400
Time for 1400 rows = 6130.96701s
Processing row 1500
Time for 1500 rows = 6499.83867s
Processing row 1600
Time for 1600 rows = 7209.84054s
Processing row 1700
Time for 1700 rows = 8593.04145s
Processing row 1800
Time for 1800 rows = 10007.88465s
Processing row 1900
Time for 1900 rows = 11400.94565s
Processing row 2000
Time for 2000 rows = 12996.87139s
Processing row 2100
Time for 2100 rows = 13917.08054s
Processing row 2200
Time for 2200 rows = 15433.19193s
Processing row 2300
Time for 2300 rows = 16726.30992s
Processing row 2400
Time for 2400 rows = 17762.62421s
Processing row 2500
Time for 2500 rows = 18690.94225s
Processing row 2600
Time for 2600 rows = 19865.10566s

In [8]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.961128526645768

Test set score: 0.8782894736842105


In [9]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(3190, 309) (3190,)
Training set score: 0.9200626959247649

Test set score: 0.9041353383458647


In [10]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.8836990595611285

Test set score: 0.8740601503759399


In [181]:
# Clean the Emma data.
emma = gutenberg.raw('austen-emma.txt')
emma = re.sub(r'VOLUME \w+', '', emma)
emma = re.sub(r'CHAPTER \w+', '', emma)
emma = text_cleaner(emma)
print(emma[:100])

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to


In [12]:
# Parse our cleaned data.
emma_doc = nlp(emma)

In [13]:
# Group into sentences.
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
emma_sents = [[sent, "Austen"] for sent in emma_doc.sents]

# Emma is quite long, let's cut it down to the same length as Alice.
emma_sents = emma_sents[0:len(alice_sents)]

In [14]:
# Build a new Bag of Words data frame for Emma word counts.
# We'll use the same common words from Alice and Persuasion.
emma_sentences = pd.DataFrame(emma_sents)
emma_bow = bow_features(emma_sentences, common_words)

print('done')

Processing row 0
Time for 0 rows = 0.04289s
Processing row 100
Time for 100 rows = 2.37203s
Processing row 200
Time for 200 rows = 5.38755s
Processing row 300
Time for 300 rows = 8.90265s
Processing row 400
Time for 400 rows = 11.56865s
Processing row 500
Time for 500 rows = 13.62802s
Processing row 600
Time for 600 rows = 15.74547s
Processing row 700
Time for 700 rows = 17.92800s
Processing row 800
Time for 800 rows = 19.94725s
Processing row 900
Time for 900 rows = 22.04791s
Processing row 1000
Time for 1000 rows = 24.24186s
Processing row 1100
Time for 1100 rows = 26.38671s
Processing row 1200
Time for 1200 rows = 28.94586s
Processing row 1300
Time for 1300 rows = 30.49934s
Processing row 1400
Time for 1400 rows = 32.12400s
Processing row 1500
Time for 1500 rows = 34.22155s
Processing row 1600
Time for 1600 rows = 36.89616s
Total time = 38.60148s
done


In [15]:
# Now we can model it!
# Let's use logistic regression again.

# Combine the Emma sentence data with the Alice data from the test set.
X_Emma_test = np.concatenate((
    X_train[y_train[y_train=='Carroll'].index],
    emma_bow.drop(['text_sentence','text_source'], 1)
), axis=0)
y_Emma_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Austen'] * emma_bow.shape[0])])

# Model.
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test set score: 0.6823266219239373


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1552,117
Carroll,735,278


# Challenge 0: See what we can do to improve LogReg test score

### Ideas: 
- other modeling techniques (SVM?)
- make more features (POS, grammar, phrases, etc)
- sentence level features: (number of words, amount of punctuation)
- contextual information: (length of previous and next sentence, words repeated from one sentence to the next, etc)


In [160]:
# find the parts of speech
def pos(df):
    # initialize empty list to collect unique keys and values
    parts_of_speech = []
    dicts = []
    
    # get the sentences from df to find part of speech
    for sentence in df['text_sentence']:
        p_o_s = dict(Counter([token.pos_ for token in sentence]))
        dicts.append(p_o_s)
        for label in p_o_s:
            parts_of_speech.append(label)
            
    # get the unique values for each part of speech
    labels_pos = np.unique(parts_of_speech)

    # dataframe to store values
    count_df = pd.DataFrame(dicts, columns=labels_pos)
    
    # fill nan with 0
    count_df.fillna(0,inplace=True)
    
    # concatenate the old dataframe with parts of speech
    fin_df=pd.concat([df,count_df],1)
    return fin_df

In [161]:
# add parts of speech count to dataframe
df = pos(word_counts)
print(df.head())

  cousin oh country if thing try pass mouth world mouse ...   DET INTJ  NOUN  \
0      0  0       0  0     0   0    0     0     0     0 ...   5.0  0.0  12.0   
1      0  0       0  0     0   0    0     0     0     0 ...   6.0  0.0   8.0   
2      0  1       0  0     0   0    0     0     0     0 ...   3.0  2.0   2.0   
3      0  1       0  0     0   0    0     0     0     0 ...   0.0  2.0   0.0   
4      0  0       0  0     0   0    0     0     0     0 ...   0.0  0.0   0.0   

   NUM PART PRON PROPN PUNCT  VERB    X  
0  0.0  2.0  3.0   2.0  10.0  13.0  0.0  
1  0.0  1.0  4.0   2.0   7.0  11.0  0.0  
2  0.0  1.0  2.0   2.0   4.0   5.0  0.0  
3  0.0  0.0  0.0   0.0   1.0   0.0  0.0  
4  0.0  0.0  1.0   0.0   2.0   2.0  0.0  

[5 rows x 325 columns]


In [162]:
# add a word count for each sentence
def num_of_words(df):
    
    num_words = []
    for i, sentence in enumerate(df['text_sentence']):
        counts = [token.text for token in sentence if not token.is_punct]
        num_words.append(len(counts))
    df['num_words'] = num_words
    return df

In [163]:
df = num_of_words(df)
#print(df.head())
print(df.X[df.X>0])
df = df.drop('X',1)

3633    1.0
3850    1.0
Name: X, dtype: float64


In [271]:
lr = LogisticRegression(C=1)
Y = df['text_source']
X = np.array(df.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)

train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(3190, 323) (3190,)
Training set score: 0.9225705329153605

Test set score: 0.9130639097744361


In [272]:
from sklearn.svm import SVC
svc = SVC(kernel='linear', C=.7)
Y = df['text_source']
X = df.drop(['text_sentence','text_source'], 1)

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)

train = svc.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', svc.score(X_train, y_train))
print('\nTest set score:', svc.score(X_test, y_test))

(3190, 323) (3190,)
Training set score: 0.9244514106583072

Test set score: 0.9116541353383458


In [273]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
Y = df['text_source']
X = np.array(df.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)

train = bnb.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', bnb.score(X_train, y_train))
print('\nTest set score:', bnb.score(X_test, y_test))

(3190, 323) (3190,)
Training set score: 0.8840125391849529

Test set score: 0.8773496240601504


In [274]:
emma_df = pd.DataFrame()
emma_df = pos(emma_bow)
emma_df = num_of_words(emma_df)
print(emma_df.head())
print(emma_df.shape)
print(df.shape)

  cousin oh country if thing try pass mouth world mouse    ...     DET INTJ  \
0      0  0       0  0     0   0    0     0     1     0    ...     4.0  0.0   
1      0  0       0  0     0   0    0     0     0     0    ...     4.0  0.0   
2      0  0       0  0     0   0    0     0     0     0    ...     3.0  0.0   
3      0  0       0  0     0   0    0     0     0     0    ...     3.0  0.0   
4      0  0       0  0     0   0    0     0     0     0    ...     0.0  0.0   

  NOUN  NUM PART PRON PROPN PUNCT VERB num_words  
0  6.0  2.0  2.0  1.0   2.0   8.0  6.0        41  
1  8.0  1.0  1.0  1.0   0.0   5.0  3.0        33  
2  9.0  0.0  1.0  1.0   0.0   3.0  8.0        41  
3  5.0  1.0  1.0  0.0   5.0   4.0  2.0        27  
4  0.0  0.0  0.0  1.0   0.0   0.0  1.0         2  

[5 rows x 325 columns]
(1669, 325)
(5318, 325)


In [275]:
# Combine the Emma sentence data with the Alice data from the test set.
alice_train = X_train[y_train[y_train=='Carroll'].index]
emma_data = emma_df.drop(['text_sentence','text_source'], 1)
X_Emma_test = np.concatenate([alice_train, emma_data], axis=0)
y_Emma_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Austen'] * emma_df.shape[0])])


In [276]:
# Model.
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test set score: 0.680089485458613


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1532,137
Carroll,721,292


# Try new books

In [171]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [241]:
# Clean the Moby Dick data.
melville = gutenberg.raw('melville-moby_dick.txt')
num_chars = len(gutenberg.raw('melville-moby_dick.txt'))
num_words = len(gutenberg.words('melville-moby_dick.txt'))
alice_char = len(gutenberg.raw('carroll-alice.txt'))
print('Number of words in Moby Dick:',num_words)
print('Number of Characters in Moby Dick', num_chars)
print('Number of Characters in Alice', alice_char)
melville = re.sub(r'VOLUME \w+', '', melville)
melville = re.sub(r'CHAPTER \w+', '', melville)
melville = text_cleaner(melville)

# Moby dick is too large to tokenize- sample down same size as alice
start = 20919
end = 20919+alice_char
melville = melville[start:end]
print(melville[:100])

# Parse our cleaned data.
melville_doc = nlp(melville)

# Get sentences
melville_sents = [[sent, "Melville"] for sent in melville_doc.sents]
melville_sentences = pd.DataFrame(melville_sents)

# Get bag of words
melville_bow = bow_features(melville_sentences, common_words)

# Add additional features
melville_df = pd.DataFrame()
melville_df = pos(melville_bow)
melville_df = num_of_words(melville_df)

Number of words in Moby Dick: 260819
Number of Characters in Moby Dick 1242990
Number of Characters in Alice 144395
Call me Ishmael. Some years ago never mind how long precisely having little or no money in my purse,
Processing row 0
Time for 0 rows = 0.01562s
Processing row 100
Time for 100 rows = 0.87128s
Processing row 200
Time for 200 rows = 1.60265s
Processing row 300
Time for 300 rows = 2.62646s
Processing row 400
Time for 400 rows = 3.66332s
Processing row 500
Time for 500 rows = 4.60979s
Processing row 600
Time for 600 rows = 5.45877s
Processing row 700
Time for 700 rows = 6.20634s
Processing row 800
Time for 800 rows = 7.06966s
Processing row 900
Time for 900 rows = 8.07613s
Processing row 1000
Time for 1000 rows = 8.99900s
Processing row 1100
Time for 1100 rows = 9.82930s
Total time = 10.67689s


In [277]:
# alice in wonderland vs moby dick
alice_train = X_train[y_train[y_train=='Carroll'].index]
melville_data = melville_df.drop(['text_sentence','text_source'], 1)
X_Melville_test = np.concatenate([alice_train, melville_data], axis=0)
y_Melville_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Melville'] * melville_df.shape[0])])

In [279]:
print(y_train[y_train=='Carroll'].index)

Int64Index([1300,  422, 1182,  998,  935, 1292, 1504,  249,  305,  273,
            ...
             544,  423,  659,  797,  755,   99,  537,  705, 1033, 1653],
           dtype='int64', length=1013)


In [247]:
# Model.
print('\nTest set score:', lr.score(X_Melville_test, y_Melville_test))
lr_Melville_predicted = lr.predict(X_Melville_test)
pd.crosstab(y_Melville_test, lr_Melville_predicted)


Test set score: 0.1343146274149034


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Carroll,721,292
Melville,921,240


In [254]:
print(lr_Melville_predicted)

['Carroll' 'Austen' 'Austen' ... 'Austen' 'Carroll' 'Austen']


In [287]:
# persuasion vs moby dick
persuasion_train = X_train[y_train[y_train=='Austen'].index]
melville_data = melville_df.drop(['text_sentence','text_source'], 1)
X_Melville_test = np.concatenate([persuasion_train, melville_data], axis=0)
y_Melville_test = pd.concat([y_train[y_train=='Austen'],
                         pd.Series(['Melville'] * melville_df.shape[0])])

IndexError: index 3709 is out of bounds for axis 0 with size 3190

In [251]:
# Model.
print('\nTest set score:', lr.score(X_Melville_test, y_Melville_test))
lr_Melville_predicted = lr.predict(X_Melville_test)
pd.crosstab(y_Melville_test, lr_Melville_predicted)


Test set score: 0.4661473936488916


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1556,621
Melville,921,240


In [253]:
print(lr_Melville_predicted)

['Carroll' 'Austen' 'Austen' ... 'Austen' 'Carroll' 'Austen']


The predicted author is always going to be Austen or Carroll because the data was trained on Alice in Wonderland (Lewis Carroll) and Persuasion (Jane Austen).

Also, index for AUSTEN is out of range.  The index for Carroll comes first due to how the data was created. Not sure if data is matched appropriately when testing the EMMA data.

In [289]:
print(X_train.shape)
#print([y_train[y_train=='Austen']])
#print([y_train[y_train=='Carroll']])
print(y_train)


(3190, 323)
1300    Carroll
2125     Austen
3709     Austen
3244     Austen
422     Carroll
3515     Austen
1182    Carroll
998     Carroll
2466     Austen
4637     Austen
935     Carroll
2952     Austen
1292    Carroll
2730     Austen
4555     Austen
3579     Austen
4179     Austen
1504    Carroll
3138     Austen
2719     Austen
4024     Austen
249     Carroll
3521     Austen
305     Carroll
273     Carroll
3978     Austen
3559     Austen
2982     Austen
4878     Austen
1473    Carroll
         ...   
423     Carroll
3219     Austen
659     Carroll
797     Carroll
755     Carroll
2008     Austen
99      Carroll
2496     Austen
1871     Austen
2046     Austen
4851     Austen
5072     Austen
2163     Austen
2893     Austen
537     Carroll
1701     Austen
2897     Austen
2222     Austen
2135     Austen
2599     Austen
705     Carroll
3468     Austen
4373     Austen
1033    Carroll
4859     Austen
4931     Austen
3264     Austen
1653    Carroll
2607     Austen
2732     Austen
Name: text_s

In [293]:
y_train = y_train.reset_index(drop=True)
print(y_train)
print([y_train[y_train=='Austen'].index])
print([y_train[y_train=='Carroll'].index])

0       Carroll
1        Austen
2        Austen
3        Austen
4       Carroll
5        Austen
6       Carroll
7       Carroll
8        Austen
9        Austen
10      Carroll
11       Austen
12      Carroll
13       Austen
14       Austen
15       Austen
16       Austen
17      Carroll
18       Austen
19       Austen
20       Austen
21      Carroll
22       Austen
23      Carroll
24      Carroll
25       Austen
26       Austen
27       Austen
28       Austen
29      Carroll
         ...   
3160    Carroll
3161     Austen
3162    Carroll
3163    Carroll
3164    Carroll
3165     Austen
3166    Carroll
3167     Austen
3168     Austen
3169     Austen
3170     Austen
3171     Austen
3172     Austen
3173     Austen
3174    Carroll
3175     Austen
3176     Austen
3177     Austen
3178     Austen
3179     Austen
3180    Carroll
3181     Austen
3182     Austen
3183    Carroll
3184     Austen
3185     Austen
3186     Austen
3187    Carroll
3188     Austen
3189     Austen
Name: text_source, Lengt

Now data should match X_train correctly. Instead of using the original indices from the full dataset.

In [294]:
# Combine the Emma sentence data with the Alice data from the test set.
alice_train = X_train[y_train[y_train=='Carroll'].index]
emma_data = emma_df.drop(['text_sentence','text_source'], 1)
X_Emma_test = np.concatenate([alice_train, emma_data], axis=0)
y_Emma_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Austen'] * emma_df.shape[0])])


In [295]:
# Model.
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test set score: 0.8791946308724832


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1532,137
Carroll,187,826


In [296]:
# alice in wonderland vs moby dick
alice_train = X_train[y_train[y_train=='Carroll'].index]
melville_data = melville_df.drop(['text_sentence','text_source'], 1)
X_Melville_test = np.concatenate([alice_train, melville_data], axis=0)
y_Melville_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Melville'] * melville_df.shape[0])])

In [297]:
# Model.
print('\nTest set score:', lr.score(X_Melville_test, y_Melville_test))
lr_Melville_predicted = lr.predict(X_Melville_test)
pd.crosstab(y_Melville_test, lr_Melville_predicted)


Test set score: 0.37994480220791166


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Carroll,187,826
Melville,921,240


In [298]:
# persuasion vs moby dick
persuasion_train = X_train[y_train[y_train=='Austen'].index]
melville_data = melville_df.drop(['text_sentence','text_source'], 1)
X_Melville_test = np.concatenate([persuasion_train, melville_data], axis=0)
y_Melville_test = pd.concat([y_train[y_train=='Austen'],
                         pd.Series(['Melville'] * melville_df.shape[0])])

In [299]:
# Model.
print('\nTest set score:', lr.score(X_Melville_test, y_Melville_test))
lr_Melville_predicted = lr.predict(X_Melville_test)
pd.crosstab(y_Melville_test, lr_Melville_predicted)


Test set score: 0.634212103055722


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,2117,60
Melville,921,240


I don't believe I am testing the correctly.  The number of Melvile goods and bad is not changing, most likely because the model is trained on different authors.  I think to continue to use this mess here I would need to call Melville the author I have replaced with him to test against the other. For example, I would have to label Melville as Carroll to test against austen and vice versa.