## Progress Report 2
With some inspiration from <a href='http://scikit-learn.org/stable/auto_examples/text/document_clustering.html#sphx-glr-auto-examples-text-document-clustering-py'>this sklearn text clustering tutorial</a>, I tried four different clustering algorithms and tested their performance in clustering by author using the adjusted rand index, as well as homogeniety, completeness, and V-score (harmonic of homogeniety and completeness). Affinity propogation outperforms all the others and ends up grouping almost perfectly by the source texts (each chapter comes from one of two works by each author.)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans, MeanShift, SpectralClustering, AffinityPropagation
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import gc
import gensim

In [2]:
df = pd.read_csv('docs.csv').sample(frac=1, random_state=0) #shuffle rows
df.index = range(0, 100) #reindex

In [3]:
#Encode the authors and the source texts

author_encoding = {}
code = iter(range(0, len(df.author.unique())))
for auth in df.author.unique():
    author_encoding[auth] = next(code)
    
df['author_code'] = df.author.apply(lambda x: author_encoding[x])

title_encoding = {}
code = iter(range(0, len(df.title.unique())))
for tit in df.title.unique():
    title_encoding[tit] = next(code)
    
df['title_code'] = df.title.apply(lambda x: title_encoding[x])

In [4]:
train = df.loc[:74].copy() #train group
test = df.loc[75:].copy() #holdout group
len(test)

25

## Parse and process text as before

In [5]:
prs = spacy.load('en')

In [6]:
train['raw_parse'] = train.text.apply(prs)

In [7]:
def get_lemmas(document):
    """takes raw spacy parse and returns only
    word lemmas, in or out of vocab.
    """
    result = ''
    for token in document:
        if not token.is_space and not token.is_punct and not (token.lemma_ == '-PRON-'):
            result += token.lemma_ + ' '
        elif token.lemma_ == '-PRON-':
            result += token.orth_ + ' '
    return result

In [8]:
train['lemmas'] = train.raw_parse.apply(get_lemmas)

## Add normalization to LSA

In [9]:
vec = TfidfVectorizer(stop_words='english', min_df=2, max_df=.99, ngram_range=(1, 3))
svd = TruncatedSVD(n_components=71, random_state=0, algorithm='arpack')
norm = Normalizer(copy=False)
lsa = make_pipeline(svd, norm)
trans = vec.fit_transform(train.lemmas)
train_mat = lsa.fit_transform(trans)

## KMeans, Mean Shift, Spectral Clustering, and Affinity Propagation

In [48]:
#KMeans gets best ARI at 19 clusters
km = KMeans(random_state=0, n_clusters=19)
np.unique(km.fit_predict(train_mat))
print('number of clusters: ', len(km.cluster_centers_))
labels = train.author_code
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(train_mat, km.labels_, sample_size=1000))

number of clusters:  19
Homogeneity: 0.975
Completeness: 0.760
V-measure: 0.854
Adjusted Rand-Index: 0.583
Silhouette Coefficient: 0.172


In [77]:
#Mean shift does best at bandwidth=1.025, but not nearly as well as kmeans
ms = MeanShift(bandwidth=1.025)
np.unique(ms.fit_predict(train_mat))
print('number of clusters: ', len(ms.cluster_centers_))
labels = train.author_code
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, ms.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, ms.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, ms.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, ms.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(train_mat, ms.labels_, sample_size=-1))

number of clusters:  50
Homogeneity: 1.000
Completeness: 0.605
V-measure: 0.754
Adjusted Rand-Index: 0.272
Silhouette Coefficient: 0.128


In [83]:
#Spectral clustering does best with 20 clusters, suggesting grouping by 20 unique source texts.
sc = SpectralClustering(n_clusters=20)
np.unique(sc.fit_predict(train_mat))
print('number of clusters: ', n_clusters)
labels = train.author_code
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, sc.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, sc.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, sc.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, sc.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(train_mat, sc.labels_, sample_size=-1))

number of clusters:  10
Homogeneity: 0.941
Completeness: 0.728
V-measure: 0.821
Adjusted Rand-Index: 0.520
Silhouette Coefficient: 0.193


In [84]:
#Sure enough, we see high correlation with the source texts.
labels = train.title_code
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, sc.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, sc.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, sc.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, sc.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(train_mat, sc.labels_, sample_size=-1))

Homogeneity: 0.939
Completeness: 0.937
V-measure: 0.938
Adjusted Rand-Index: 0.804
Silhouette Coefficient: 0.193


In [13]:
#Finally, affinity propogation "finds" 19 clusters without any supervision.
ap = AffinityPropagation()
np.unique(ap.fit_predict(train_mat))
print('number of clusters: ', len(ap.cluster_centers_))
labels = train.author_code
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, ap.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, ap.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, ap.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, ap.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(train_mat, ap.labels_, sample_size=-1))

number of clusters:  19
Homogeneity: 0.989
Completeness: 0.768
V-measure: 0.865
Adjusted Rand-Index: 0.592
Silhouette Coefficient: 0.187


In [14]:
#The 19 clusters line up really well with the 20 source texts, with the highest ARI we've seen yet.
labels = train.title_code
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, ap.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, ap.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, ap.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, ap.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(train_mat, ap.labels_, sample_size=-1))

Homogeneity: 0.967
Completeness: 0.970
V-measure: 0.968
Adjusted Rand-Index: 0.901
Silhouette Coefficient: 0.189


In [86]:
#Improve performance further by adjusting LSA hyperparameters.
svd.n_components = 74
svd.algorithm = 'arpack'
trans = vec.fit_transform(train.lemmas)
train_mat = lsa.fit_transform(trans)
ap = AffinityPropagation()
np.unique(ap.fit_predict(train_mat))
labels = train.title_code
print('number of clusters: ', len(ap.cluster_centers_))
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, ap.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, ap.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, ap.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, ap.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(train_mat, ap.labels_, sample_size=-1))

number of clusters:  19
Homogeneity: 0.976
Completeness: 0.981
V-measure: 0.978
Adjusted Rand-Index: 0.927
Silhouette Coefficient: 0.182
