Because the Word2Vec-based model did not return ideal results, we instead will try models that do not attempt to interpret the contexts or meanings of words. This will hopefully obscure content to the model, allowing it to better pick up on style.

Imports.

In [1]:
import numpy as np

import pandas as pd

from ast import literal_eval

from sklearn.feature_extraction.text import CountVectorizer, \
    TfidfVectorizer
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC

Load `chaps_df` and undo array-to-list.

In [2]:
chaps_df = pd.read_csv('../data/animorphs_chaps.csv')

for col in ['clean','clean_no_stops', 'vec_clean']:
    chaps_df[col] = chaps_df[col].map(literal_eval) # as lists
    
for col in ['vec_clean']:
    chaps_df[col] = chaps_df[col].map(np.array) # as array
    
chaps_df.head()

Unnamed: 0,book,chapter,text,authenticity,clean,clean_no_stops,vec_clean
0,1,1,"My name is Jake. That's my first name, obvious...",1,"[name, thats, first, name, obviously, cant, te...","[my, name, is, jake, thats, my, first, name, o...","[0.050813198, 0.21546867, -0.11217794, 0.05301..."
1,1,2,"""A flying saucer?"" Marco said. He did laugh. T...",1,"[flying, saucer, qstinpunc, laugh, looked, cou...","[a, flying, saucer, qstinpunc, marco, said, he...","[0.0059409393, 0.23846294, -0.09167185, 0.0067..."
2,1,3,<They have come to destroy you.>\nIt was stran...,1,"[anglepunc, come, destroy, anglepunc, strange,...","[anglepunc, they, have, come, to, destroy, you...","[0.00516426, 0.25513598, -0.040639274, -0.0519..."
3,1,4,<Yeerks!>\nThe twin red lights slowed. They tu...,1,"[anglepunc, yeerks, exclmpunc, anglepunc, twin...","[anglepunc, yeerks, exclmpunc, anglepunc, the,...","[-0.027513022, 0.24088845, 0.00715725, -0.0299..."
4,1,5,"The Hork-Bajir pointed his gun, or whatever it...",1,"[hork, odashpunc, bajir, pointed, gun, whateve...","[the, hork, odashpunc, bajir, pointed, his, gu...","[-0.037028935, 0.29484826, 0.030976577, -0.076..."


Load `books_df` and undo array-to-list.

In [3]:
books_df = pd.read_csv('../data/animorphs_books.csv')

for col in ['clean','book_vec']:
    books_df[col] = books_df[col].map(literal_eval) # as lists
    
for col in ['book_vec']:
    books_df[col] = books_df[col].map(np.array) # as array
    
books_df.head()

Unnamed: 0,book,text,authenticity,clean,book_vec
0,1,"My name is Jake. That's my first name, obvious...",1,"[name, thats, first, name, obviously, cant, te...","[0.0010895184, 0.2081184, -0.05931624, 0.01286..."
1,2,My name is Rachel. I won't tell you my last na...,1,"[name, tell, last, name, none, us, ever, tell,...","[-0.00226549, 0.22271992, -0.065148115, 0.0113..."
2,3,My name is Tobias. A freak of nature. One of a...,1,"[name, freak, nature, kind, tell, last, name, ...","[-0.0129256295, 0.21558568, -0.05230921, 0.003..."
3,4,My name is Cassie.\nI can't tell you my last n...,1,"[name, cant, tell, last, name, wish, could, ca...","[-0.0025382496, 0.21817772, -0.069260634, 0.00..."
4,5,My name is Marco.\nI can't tell you my last na...,1,"[name, cant, tell, last, name, live, believe, ...","[-0.00024333985, 0.22410813, -0.054545447, 0.0..."


In [56]:
seed = 1007

In [4]:
cvec = CountVectorizer(max_features=500)

books_cvec = pd.DataFrame(cvec.fit_transform(
    books_df['clean'].map(' '.join)).toarray(),
                          columns=cvec.get_feature_names())

books_cvec.head()

Unnamed: 0,able,across,actually,agreed,ahead,air,alien,alive,almost,alone,...,work,world,would,wrong,yeah,yeerk,yeerks,yelled,yes,yet
0,13,16,11,14,13,27,18,8,28,8,...,7,13,52,13,29,38,38,20,29,10
1,13,11,8,12,2,24,4,7,18,9,...,9,12,89,18,22,42,32,11,30,13
2,14,6,4,5,9,43,5,6,20,4,...,7,16,102,10,23,25,45,9,16,11
3,6,14,9,16,9,28,6,8,23,2,...,3,19,74,9,17,10,31,10,39,9
4,5,5,5,8,13,22,4,13,23,7,...,7,17,94,12,33,42,30,21,33,8


We'll use similar functions to those we used for the W2V SVM to do leave-one-out analysis to maximize the training data available when checking each book.

In [5]:
def cvec_all(df, col, max_features=100):
    cvec = CountVectorizer(max_features=max_features)
    return pd.DataFrame(cvec.fit_transform(df[col].map(' '.join)).toarray(),
                        columns=cvec.get_feature_names())

def tvec_all(df, col, max_features=100):
    tvec = TfidfVectorizer(max_features=max_features)
    return pd.DataFrame(tvec.fit_transform(df[col].map(' '.join)).toarray(),
                        columns=tvec.get_feature_names())

In [6]:
def do_gs(model, params, X, y): # do gridsearch
    # adapted from David at https://stackoverflow.com/questions/34624978
    
    best_score = -1
    best_p = {}
    
    for p in ParameterGrid(params):
        
        model.set_params(**p)
        model.fit(X,y)
        
        if model.score(X,y) > best_score:
            best_score = model.score(X,y)
            best_p = p
            
    model.set_params(**best_p)
    model.fit(X,y)
    
    return model

In [39]:
max_features = 500

In [40]:
def p1o_book(book_out, model, params, books_df=books_df):

    train_inds = books_df['book'] != book_out
    test_inds = books_df['book'] == book_out
    
    vec_df = cvec_all(books_df, 'clean', max_features=max_features)
    
    X_b = vec_df.loc[train_inds]
    y_b = books_df['authenticity'].loc[train_inds]
    
    X_b_test = vec_df.loc[test_inds]

    model = do_gs(model, params, X_b, y_b)
    return model.predict(X_b_test)

In [41]:
def p1o_chap(book_out, model, params, chaps_df=chaps_df): # predict 1 out

    train_inds = chaps_df['book'] != book_out
    test_inds = chaps_df['book'] == book_out
    
    vec_df = cvec_all(chaps_df, 'clean', max_features=max_features)
    
    X_c = vec_df.loc[train_inds]
    y_c = chaps_df['authenticity'].loc[train_inds]
    
    X_c_test = vec_df.loc[test_inds]
    
    model = do_gs(model, params, X_c, y_c)
#     print(f'book {book_out} complete')
    return np.mean(model.predict(X_c_test))

In [57]:
mnb_params = {'alpha':np.logspace(-3,0,4)}
mnb = MultinomialNB()

gnb_params = {}
gnb = GaussianNB()

svm_params = {
    'C':np.logspace(2,4,5),
    'kernel':['rbf']
}
svm = SVC(random_state=seed, gamma='scale',
          max_iter=5e4, probability=True)

In [69]:
new_results = pd.DataFrame({'book':range(1,55)})
new_results['mnb book pred'] = new_results['book'].map(
    lambda b:p1o_book(b, mnb, mnb_params)[0])
new_results['mnb chap pred avg'] = new_results['book'].map(
    lambda b:p1o_chap(b, mnb, mnb_params))
new_results['svm book pred'] = new_results['book'].map(
    lambda b:p1o_book(b, svm, svm_params)[0])
new_results['svm chap pred avg'] = new_results['book'].map(
    lambda b:p1o_chap(b, svm, svm_params))

In [70]:
new_results['authenticity'] = books_df['authenticity']

In [71]:
new_results

Unnamed: 0,book,mnb book pred,mnb chap pred avg,svm book pred,svm chap pred avg,authenticity
0,1,1,0.814815,1,0.962963,1
1,2,1,0.869565,1,0.913043,1
2,3,1,0.888889,1,0.925926,1
3,4,1,0.84,1,0.72,1
4,5,1,0.875,1,0.958333,1
5,6,1,0.84,1,0.88,1
6,7,1,0.777778,1,0.777778,1
7,8,1,0.636364,1,0.681818,1
8,9,1,0.833333,1,0.875,1
9,10,1,0.884615,1,0.884615,1


In [75]:
sum((new_results['svm book pred'] - new_results['authenticity']).map(abs))

2