In [41]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.cluster import AffinityPropagation
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn.metrics import accuracy_score
import xgboost as xgb
from scipy import spatial
import gc

In [2]:
# Load the dataset and shuffle the rows with a fixed seed so the shuffle is
# reproducible. reset_index(drop=True) replaces the shuffled labels with a fresh
# 0..n-1 index for any number of rows, instead of assuming exactly 100.
df = (
    pd.read_csv('docs.csv')
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)

In [3]:
#Reserve holdout group
# .loc slices by label and includes both endpoints: labels up to and including
# 74 form the training group, labels from 75 onward form the holdout group.
# Copies are taken so later column assignments do not touch df.
train, test = df.loc[:74].copy(), df.loc[75:].copy()

In [4]:
def get_lemmas(document):
    """Flatten a parsed document into a single space-separated string.

    Each token contributes one word followed by a space (so a non-empty
    result ends with a trailing space):

    * tokens whose lemma is the '-PRON-' placeholder contribute their
      original text (``orth_``);
    * other tokens contribute their lemma, unless they are whitespace or
      punctuation, in which case they are dropped.

    Parameters
    ----------
    document : iterable of tokens exposing ``lemma_``, ``orth_``,
        ``is_space`` and ``is_punct``.

    Returns
    -------
    str
        The concatenated words, or '' when nothing is kept.
    """
    words = []
    for tok in document:
        if tok.lemma_ == '-PRON-':
            # The placeholder lemma carries no information; keep the surface form.
            words.append(tok.orth_)
        elif not (tok.is_space or tok.is_punct):
            words.append(tok.lemma_)
    return ''.join(word + ' ' for word in words)

In [5]:
# Parse every training text with spaCy, then collapse each parse to a lemma string.
prs = spacy.load('en')
train['raw_parse'] = train['text'].apply(prs)
train['lemmas'] = train['raw_parse'].apply(get_lemmas)

# TF-IDF over uni-, bi- and tri-grams of the lemma strings.
vec = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    min_df=2,
    max_df=.99,
)
raw_vec_train = vec.fit_transform(train['lemmas'])

# LSA: truncated SVD followed by per-row normalisation, fitted on the
# training matrix only.
svd = TruncatedSVD(n_components=74, algorithm='arpack', random_state=0)
norm = Normalizer(copy=False)
lsa = make_pipeline(svd, norm)
train_mat = lsa.fit_transform(raw_vec_train)

In [55]:
#Encode the authors and the source texts for cluster evaluation
# Codes are assigned in order of first appearance in the full (shuffled) frame,
# so the train and holdout groups share one consistent encoding.
author_encoding = {auth: idx for idx, auth in enumerate(df.author.unique())}
# Reverse lookup: integer code -> author name.
inverted_authcode = {idx: auth for auth, idx in author_encoding.items()}

train['author_code'] = train.author.map(author_encoding)
test['author_code'] = test.author.map(author_encoding)

title_encoding = {tit: idx for idx, tit in enumerate(df.title.unique())}
# Reverse lookup: integer code -> title.
inverted_titcode = {idx: tit for tit, idx in title_encoding.items()}

train['title_code'] = train.title.map(title_encoding)
test['title_code'] = test.title.map(title_encoding)

In [7]:
# vec and lsa were already fitted on train.lemmas when raw_vec_train and
# train_mat were first computed; reuse those results rather than fitting the
# same estimators on the same data a second time.
trans = raw_vec_train
X = train_mat

In [12]:
# Run the holdout texts through the parser, vectorizer and LSA pipeline.
# Only transform() is called here, so nothing is refitted on holdout data.
test['raw_parse'] = test['text'].apply(prs)
test['lemmas'] = test['raw_parse'].apply(get_lemmas)
trans = vec.transform(test['lemmas'])
test_mat = lsa.transform(trans)

In [39]:
# Wrap the LSA features and the integer author codes in xgboost DMatrix objects.
dtrain = xgb.DMatrix(train_mat, label=train.author_code)
dtest = xgb.DMatrix(test_mat, label=test.author_code)

param = {
    'max_depth': 7,
    'eta': .001,
    'silent': 1,
    'objective': 'multi:softmax',
    'nthread': 4,
    'eval_metric': 'mlogloss',
    # One class per encoded author, derived from the encoding rather than
    # hard-coded, so it stays correct if the set of authors changes.
    'num_class': len(author_encoding),
    'colsample_bytree': .4,
}

# Kept as a concrete list of (key, value) pairs; a bare dict_items view is
# neither a dict nor a list.
plst = list(param.items())
num_round = 11670
# Both sets are evaluated each round; verbose_eval=False suppresses the log.
evallist = [(dtest, 'eval'), (dtrain, 'train')]
# Pass the parameter dict itself rather than an items() view.
bst = xgb.train(param, dtrain, num_round, evallist, verbose_eval=False)

In [44]:
predictions = bst.predict(dtest)
# accuracy_score takes (y_true, y_pred): ground-truth labels first.
accuracy_score(test.author_code, predictions)

0.92000000000000004

In [70]:
# Store the predicted class ids as integers, then translate each id back to
# the author name through the inverse encoding.
test['predicted'] = predictions.astype(int)
test['predicted_author'] = test['predicted'].map(inverted_authcode)

In [71]:
# Show the holdout rows whose predicted author code differs from the true one.
test.loc[test['predicted'].ne(test['author_code'])]

Unnamed: 0,text,title,author,date,fiction,length,author_code,title_code,raw_parse,lemmas,predicted,predicted_author
78,\nA SHIFTING REEF\n\nThe year 1866 was signali...,20000leagues,verne,1870,1,1146,4,4,"(\n, A, SHIFTING, REEF, \n\n, The, year, 1866,...",a shifting reef the year 1866 be signalise by ...,5,wells
93,\nEverything was perfectly swell.\n\nThere wer...,2BR02B,vonnegut,1962,1,1152,1,19,"(\n, Everything, was, perfectly, swell, ., \n\...",everything be perfectly swell there be no pris...,7,herbert


It is interesting that the gradient-boosted classifier mislabels the same example as the cluster analysis did, although it assigns that example to a different author.