In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.markers import MarkerStyle
from matplotlib.colors import Colormap
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
from matplotlib import cm
from cycler import cycler
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.cluster import AffinityPropagation
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from scipy import spatial
import gc
%matplotlib inline

In [2]:
df = pd.read_csv('docs.csv').sample(frac=1, random_state=0) #load dataset and shuffle rows
df.index = range(0, 100) #reindex

In [3]:
#Reserve holdout group
train = df.loc[:74].copy() #train group
test = df.loc[75:].copy() #holdout group

In [4]:
def get_lemmas(document):
    """takes raw spacy parse and returns only
    word lemmas, in or out of vocab.
    """
    result = ''
    for token in document:
        if not token.is_space and not token.is_punct and not (token.lemma_ == '-PRON-'):
            result += token.lemma_ + ' '
        elif token.lemma_ == '-PRON-':
            result += token.orth_ + ' '
    return result

In [5]:
prs = spacy.load('en')
train['raw_parse'] = train.text.apply(prs)
train['lemmas'] = train.raw_parse.apply(get_lemmas)
vec = TfidfVectorizer(stop_words='english', min_df=2, max_df=.99, ngram_range=(1, 3))
svd = TruncatedSVD(n_components=74, random_state=0, algorithm='arpack')
norm = Normalizer(copy=False)
lsa = make_pipeline(svd, norm)
raw_vec_train = vec.fit_transform(train.lemmas)
train_mat = lsa.fit_transform(raw_vec_train)

In [6]:
#Encode the authors and the source texts for cluster evaluation
author_encoding = {}
code = iter(range(0, len(df.author.unique())))
for auth in df.author.unique():
    author_encoding[auth] = next(code)
    
train['author_code'] = train.author.apply(lambda x: author_encoding[x])
test['author_code'] = test.author.apply(lambda x: author_encoding[x])

title_encoding = {}
code = iter(range(0, len(df.title.unique())))
for tit in df.title.unique():
    title_encoding[tit] = next(code)
    
train['title_code'] = train.title.apply(lambda x: title_encoding[x])
test['title_code'] = test.title.apply(lambda x: title_encoding[x])

In [7]:
trans = vec.fit_transform(train.lemmas)
train_mat = lsa.fit_transform(trans)
ap = AffinityPropagation()
np.unique(ap.fit_predict(train_mat))

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18])

In [8]:
train['ap_cluster'] = ap.predict(train_mat) #record cluster assignments for training group
clusters = {}
for clust in train.ap_cluster.unique():
    clusters[clust] = (
        list(train[train.ap_cluster==clust].author.unique()), 
        list(train[train.ap_cluster==clust].title.unique()),
    )

authors = {}
for auth in train.author.unique():
    authors[auth] = train[train.author==auth].ap_cluster.unique()
    
#for clust in range(0, len(clusters)):
#    print('cluster ', clust, 'authors: ', clusters[clust][0], 'source texts: ', clusters[clust][1])

In [9]:
cluster_frame = pd.DataFrame(index=[clust for clust in clusters], columns=['number', 'author', 'titles', 'center'])
cluster_frame['number'] = cluster_frame.index
for clust in clusters:
    cluster_frame.at[clust, 'author'] = clusters[clust][0][0]
    cluster_frame.at[clust, 'titles'] = clusters[clust][1]
    cluster_frame.at[clust, 'center'] = ap.cluster_centers_[clust]
cf = cluster_frame.copy()

In [10]:
cf = cf.sort_values('number')
cf

Unnamed: 0,number,author,titles,center
0,0,dick,[variableman],"[0.284126030993, -0.125934183839, -0.039002128..."
1,1,reynolds,[spaceman_spree],"[0.272280459418, -0.0409121095981, -0.02227118..."
2,2,reynolds,[off_course],"[0.292357811895, -0.0209455648265, -0.00598433..."
3,3,asimov,[nuclear_energy_3],"[0.142144408993, -0.342149389719, -0.203688871..."
4,4,dick,[mrspaceship],"[0.304801510613, 0.04299048711, 0.001864242480..."
5,5,herbert,[haystack],"[0.351572381815, 0.648848397327, -0.2115489725..."
6,6,asimov,[youth],"[0.307579761401, 0.0531959177178, -0.030131251..."
7,7,rockwell,"[space_pirates, venus_revolt]","[0.352480631284, -0.00555398358782, 0.68154701..."
8,8,verne,[20000leagues],"[0.239652523079, -0.152336798533, -0.042455368..."
9,9,pohl,[skysearch],"[0.288039319237, 0.0217771658353, -0.070395180..."


## Cluster Similarity
We can use cosine similarity to find similarity between cluster centers.

In [11]:
def get_similarity_matrix(cf):
    """Computes a similarity matrix between all cluster centers.
    The returned matrix axes are ordered by the numeric values
    of the clusters.
    """
    result = np.matrix(np.zeros((len(cf), len(cf))))
    for i in range(0, len(cf)):
        for j in range(0, len(cf)):
            similarity = 1 - spatial.distance.cosine(cf.loc[i].center, cf.loc[j].center)
            result[i, j] = similarity
    return result

def get_most_similar(cluster, sim, get_similarity=False):
    """Takes a cluster number and the cluster frame and
    computes the most similar cluster in terms of maximum
    cosine between cluster centers.
    """
    row = np.array(sim[cluster, :])
    result = np.argsort(row)[0][1]
    if get_similarity:
        return result, row[:, result][0]
    else:
        return result

def get_similarity(c1, c2, sim):
    """Takes a cluster number and the cluster frame and
    computes the most similar cluster in terms of maximum
    cosine between cluster centers.
    """
    return sim[c1, c2]

sim = get_similarity_matrix(cf) #compute similarity matrix

When I look at similarity in this way, we can get some interesting snippets of information.

In [12]:
most_similar = []
for clust in cf.index:
    most_similar.append(get_most_similar(clust, sim, get_similarity=True)[1])
mean_sim = np.mean(most_similar)
print('Mean cluster similarity to most similar cluster: ', mean_sim)
print('\n')
print('Cluster with least similarity to any others: \n')
print(cf.loc[np.argmin(most_similar)][['author', 'titles']])

Mean cluster similarity to most similar cluster:  0.0313463122032


Cluster with least similarity to any others: 

author                asimov
titles    [nuclear_energy_3]
Name: 3, dtype: object


The mean similarity to othe nearest cluster, of course, has little meaning with no other values to compare it to. It does, however, give us a benchmark for other similarity values.<br>
I can use this similarity to investigate some of the problems I had. The biggest problem was a misassignment: An excerpt from Vonnegut was assigned to a cluster belonging to Wells.

In [13]:
def get_similar_authors(author, cf, sim):
    """Takes an author name, the cluster dataframe, and similarity matrix
    and returns a list of the most similar authors (can be one or many
    because of multiple clusters per author.)
    """
    author_clusters = cf[cf.author==author].number.unique()
    similar_clusters = []
    for clust in author_clusters:
        similar_clusters.append(get_most_similar(clust, sim))
    similar_clusters = list(set(similar_clusters))
    similar_authors = []
    for clust in similar_clusters:
        similar_authors.append(cf.loc[clust].author)
    return set(similar_authors)

def get_similar_document(title, cf, sim):
    """Takes a document title, the cluster dataframe, and similarity matrix
    and returns a list of the most similar documents (can be one or many
    because of multiple clusters per title.)
    """
    title_clusters = []
    for i in cf.index:
        if title in cf.loc[i].titles:
            title_clusters.append(i)
    similar_clusters = []
    for clust in title_clusters:
        similar_clusters.append(get_most_similar(clust, sim))
    similar_clusters = list(set(similar_clusters))
    similar_titles = []
    for clust in similar_clusters:
        similar_titles += [x for x in cf.loc[clust].titles]
    return set(similar_titles)


In [14]:
print('Similar authors to Wells: ')
print(get_similar_authors('wells', cf, sim))
print('Similarity between Vonnegut cluster and Disovery of the Future by Wells cluster: ', get_similarity(13, 16, sim))
print('Similarity scaled by mean maximum similarity between clusters: ', get_similarity(13, 16, sim)/mean_sim)

Similar authors to Wells: 
{'vonnegut', 'herbert'}
Similarity between Vonnegut cluster and Disovery of the Future by Wells cluster:  0.0552353680766
Similarity scaled by mean maximum similarity between clusters:  1.7621009999


There is a high similarity between Wells' Discovery of the Future cluster and Vonnegut's only cluster, which gives us some insight into why the misassignment occured.<br><br>
With these tools, we can "most similar" authors and "most similar" titles.

In [15]:
for auth in authors:
    print('Most similar authors to {}: '.format(auth))
    print(get_similar_authors(auth, cf, sim))
    
print('\n')
for title in df.title.unique():
    print('Most similar titles to {}: '.format(title))
    print(get_similar_document(title, cf, sim))

Most similar authors to dick: 
{'vonnegut', 'pohl'}
Most similar authors to vonnegut: 
{'verne'}
Most similar authors to asimov: 
{'dick', 'reynolds'}
Most similar authors to reynolds: 
{'vonnegut', 'verne'}
Most similar authors to verne: 
{'herbert', 'asimov'}
Most similar authors to wells: 
{'vonnegut', 'herbert'}
Most similar authors to bradbury: 
{'vonnegut', 'verne', 'herbert'}
Most similar authors to herbert: 
{'verne'}
Most similar authors to pohl: 
{'verne'}
Most similar authors to rockwell: 
{'pohl'}


Most similar titles to variableman: 
{'skysearch'}
Most similar titles to trip_up_yonder: 
{'20000leagues'}
Most similar titles to youth: 
{'variableman'}
Most similar titles to spaceman_spree: 
{'moon_journey'}
Most similar titles to 20000leagues: 
{'old_rambling_house'}
Most similar titles to discovery_future: 
{'haystack'}
Most similar titles to futuria: 
{'trip_up_yonder', '2BR02B', 'moon_journey'}
Most similar titles to off_course: 
{'trip_up_yonder', '2BR02B'}
Most similar

Looking at these results, we can see that many of the "most similar" cluster pairs do not belong to the same author. This is a bad sign for our cluster analysis' ability to pick up on differences between authors; it seems to do a much better job of identifying source texts than authors.