In [1]:
import numpy as np

from __future__ import division
import numpy as np
import itertools, operator
import scipy.stats

In [2]:
#
def parse_file(path):
    """Parse a comma-separated record file into ``[id, topic, text]`` lists.

    Each non-blank line is split on ``','`` and decoded as follows:

    * field 0: the first character is dropped and the rest converted with
      ``int`` (presumably an opening parenthesis precedes the id -- confirm
      against the data files);
    * field 1: surrounding whitespace is stripped, then the first and last
      characters are dropped (presumably quotes);
    * remaining fields: re-joined with a single space (so commas inside the
      text are replaced by spaces), stripped, then the first character and
      the last two characters are dropped.

    Blank or whitespace-only lines (e.g. a trailing empty line) are skipped
    instead of raising ``ValueError`` from ``int('')``.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    list of list
        One ``[int_id, topic_str, text_str]`` entry per non-blank line, in
        file order.
    """
    data = []
    with open(path) as infile:
        # Iterate the file lazily; there is no need to hold every raw line.
        for line in infile:
            if not line.strip():
                continue
            splits = line.split(',')
            doc_id = int(splits[0][1:])
            topic = splits[1].strip()[1:-1]
            text = ' '.join(splits[2:]).strip()[1:-2]
            data.append([doc_id, topic, text])
    return data

# Two parsed corpora: `pb2_data` supplies the ground-truth labels below and
# `pb2_data_full` is the text the vectorizer is later fitted on.
pb2_data = parse_file('../data/exam/pb2_data.txt')
pb2_data_full = parse_file('../data/exam/pb2_data_full.txt')

# Sorted distinct topic names, and a name -> integer id lookup built from them.
pb2_topics = np.unique([record[1] for record in pb2_data])
topic_map = dict(zip(pb2_topics, range(len(pb2_topics))))

# Append the integer topic id as a fourth field of every record (in place).
for record in pb2_data:
    topic_id = topic_map[record[1]]
    record.append(topic_id)

# Ground-truth label vector, one integer id per record of `pb2_data`.
ytrue = np.array([record[3] for record in pb2_data])

In [6]:
#
def group_by_label(l):
    """Sum the weights attached to each distinct label.

    Parameters
    ----------
    l : iterable of (weight, label)
        Pairs whose first item is a number and whose second item is a
        hashable label.

    Returns
    -------
    list
        One total per distinct label, ordered by the label's first
        appearance in ``l``.

    Notes
    -----
    Totals are accumulated in a dict rather than with ``itertools.groupby``:
    ``groupby`` only merges *consecutive* equal keys, so input that is not
    already sorted by label would be split into several partial sums for the
    same label. For input that is already grouped by label the result is
    unchanged.
    """
    totals = {}
    for weight, label in l:
        # dicts preserve insertion order, giving first-appearance ordering.
        totals[label] = totals.get(label, 0) + weight
    return list(totals.values())

def compute_homogeneity(preds, labels):
    """Mean, over clusters, of the entropy of each cluster's label distribution.

    ``preds`` is transposed so that each iterated row holds one cluster's
    membership weight for every sample (assumes ``preds`` is an ndarray of
    shape (n_samples, n_clusters), as returned by ``predict_proba`` -- confirm
    for other callers). Those weights are paired with ``labels``, summed per
    label by ``group_by_label``, and passed to ``scipy.stats.entropy``, which
    normalises the totals before computing the entropy.

    Returns the unweighted mean of the per-cluster entropies.
    """
    per_cluster_entropy = [
        scipy.stats.entropy(group_by_label(list(zip(memberships, labels))))
        for memberships in preds.transpose()
    ]
    return np.mean(per_cluster_entropy)

def compute_completeness(preds, labels, num_clusters, num_labels):
    """Mean, over labels, of the entropy of each label's cluster distribution.

    For every label id in ``range(num_labels)`` a length-``num_clusters``
    vector accumulates the membership rows of the samples carrying that
    label. ``scipy.stats.entropy`` (which normalises its input) is applied to
    each accumulated vector and the unweighted mean of those entropies is
    returned.

    A label outside ``range(num_labels)`` raises ``KeyError``.
    """
    mass_by_label = {lbl: np.zeros(num_clusters) for lbl in range(num_labels)}

    for membership, lbl in zip(preds, labels):
        running = mass_by_label[lbl]
        mass_by_label[lbl] = running + membership

    per_label_entropy = [
        scipy.stats.entropy(mass) for mass in mass_by_label.values()
    ]
    return np.mean(per_label_entropy)


def v_measure(preds, labels, num_clusters, num_labels):
    """Return ``(homogeneity, completeness, combined)`` for a soft clustering.

    ``homogeneity`` and ``completeness`` come from ``compute_homogeneity`` and
    ``compute_completeness``; ``combined`` is their harmonic mean.

    Special cases:
    * empty ``labels`` -> ``(1.0, 1.0, 1.0)``;
    * both components exactly zero -> ``(0.0, 0.0, 0.0)`` (avoids 0/0).

    NOTE(review): both components are mean entropies, so the returned values
    are not confined to [0, 1].
    """
    if not len(labels):
        return 1.0, 1.0, 1.0

    hom = compute_homogeneity(preds, labels)
    comp = compute_completeness(preds, labels, num_clusters, num_labels)

    if hom != 0.0 or comp != 0.0:
        harmonic = 2.0 * hom * comp / (hom + comp)
        return hom, comp, harmonic

    return 0.0, 0.0, 0.0

# ___

In [7]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Bag-of-words counts over the full corpus. English stop words are removed,
# terms appearing in more than 95% of documents or in fewer than 2 documents
# are dropped, and the vocabulary is capped at 1000 terms.
vectorizer = CountVectorizer(
    max_features=1000,
    min_df=2,
    max_df=0.95,
    stop_words='english',
)
# The text is the last field of every parsed record.
features = vectorizer.fit_transform([record[-1] for record in pb2_data_full])

In [100]:
%%time
# Fit a 50-topic LDA model on the term counts of the full corpus.
# random_state pins the seed so the fit is repeatable; n_jobs=-1 uses all
# available cores.
lda = LDA(n_components=50, random_state=666, learning_method='online', n_jobs=-1)
lda.fit(features)

CPU times: user 1min 41s, sys: 9.82 s, total: 1min 50s
Wall time: 1min 54s


# ___

In [9]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score

In [121]:
# Topic-space representation of the labelled documents.
# NOTE(review): assumes the first field of each `pb2_data` record is that
# document's row index in `features` -- confirm against the data files.
feature_sub = lda.transform(
    features[[record[0] for record in pb2_data]]
)

In [122]:
# Fit a 20-component full-covariance Gaussian mixture on the topic features.
# random_state is fixed (same seed as the LDA fit) because the mixture's
# initialisation is random; without it the clustering, and every score
# derived from it, changes from run to run.
gm = GaussianMixture(n_components=20, covariance_type='full', random_state=666).fit(feature_sub)

In [123]:
# Soft assignments: one row per document, one column per mixture component.
yhat = gm.predict_proba(feature_sub)

In [130]:
# Arguments are (preds, labels, num_clusters, num_labels). The first 20 matches
# the 20 mixture components fitted above; the second 20 is presumably the
# number of distinct topics (len(pb2_topics)) -- TODO confirm.
v_measure(yhat, ytrue, 20, 20)

(2.4229688422600804, 2.2604688931316064, 2.338899759707366)

# ___

In [12]:
#
%%time
_log = {}
for t in [5, 10, 20, 50]:
    if t not in _log: _log[t] = {}
    print('fitting lda for t=%d' % t)
    lda = LDA(n_components=t, random_state=666, learning_method='online', n_jobs=-1)
    lda.fit(features)
    feature_sub = lda.transform(features[[i[0] for i in pb2_data]])
    for k in [10, 20]:
        if k == 10 and t == 50:
            continue
        if k not in _log[t]: _log[t][k] = {}
        print('gm for k=%d' % k)
        gm = GaussianMixture(n_components=k, covariance_type='full').fit(feature_sub)
        ypred = gm.predict_proba(feature_sub)
        _log[t][k]['preds'] = ypred
        _log[t][k]['score'] = v_measure(ypred, ytrue, k, 20)
        print()

fitting lda for t=5
gm for k=10

gm for k=20

fitting lda for t=10
gm for k=10

gm for k=20

fitting lda for t=20
gm for k=10

gm for k=20

fitting lda for t=50
gm for k=20

CPU times: user 2min 26s, sys: 12.1 s, total: 2min 38s
Wall time: 6min 27s


In [19]:
# Report the (homogeneity, completeness, combined) tuple for every (t, k) pair
# that was evaluated, in insertion order.
for t, runs_for_t in _log.items():
    for k, run in runs_for_t.items():
        header = "For k=%d, t=%d; scores =>" % (k, t)
        print(header, run['score'])

For k=10, t=5; scores => (2.2508772199057216, 1.452443332602814, 1.7655893213700347)
For k=20, t=5; scores => (2.149050214457521, 2.021878199290378, 2.083525463285308)
For k=10, t=10; scores => (2.3490649477850067, 1.6331981971545126, 1.926788109217026)
For k=20, t=10; scores => (2.2120924175451955, 2.0636307314306075, 2.1352841306876327)
For k=10, t=20; scores => (2.469589982940709, 1.6879094517421909, 2.0052651068857656)
For k=20, t=20; scores => (2.2897352572189655, 2.16601332968742, 2.2261566117826983)
For k=20, t=50; scores => (2.3345403962827347, 2.255390308879039, 2.2942829091260255)


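For k=10 and t=5 we get the lowest combined score (about 1.77). Because `v_measure` as defined above returns mean entropies rather than normalized scores, lower is better, so this is the best configuration.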

# Using k=10 and t=5

In [20]:
%%time
lda = LDA(n_components=5, random_state=666, learning_method='online', n_jobs=-1)
lda.fit(features)
feature_sub = lda.transform(features[[i[0] for i in pb2_data]])
gm = GaussianMixture(n_components=10, covariance_type='full').fit(feature_sub)
yhat = gm.predict_proba(feature_sub)

CPU times: user 6.35 s, sys: 370 ms, total: 6.72 s
Wall time: 1min 24s


In [29]:
# Soft size of each cluster = sum of its membership probabilities over all
# documents; the biggest cluster is the one with the largest total.
_soft_sizes = np.sum(yhat, axis=0)
_biggest = np.argmax(_soft_sizes)
print("Biggest cluster: %d with total %f memeberships" % (_biggest, _soft_sizes[_biggest]))

Biggest cluster: 6 with total 1200.929314 memeberships


Biggest soft cluster is cluster 6

In [30]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    For each row of ``model.components_`` the ``n_top_words`` largest entries
    are located, mapped to words through ``feature_names`` and printed on one
    line as ``Topic #<index>: w1 w2 ...`` (heaviest first). A blank line is
    printed after the last topic.
    """
    for idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[j] for j in ranked]
        print("Topic #%d: " % idx + " ".join(words))
    print()

In [31]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); use the new accessor when it exists and
# fall back to the old one on earlier releases.
_names_getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
print_top_words(lda, _names_getter(), 5)

Topic #0: ax max g9v b8f stephanopoulos
Topic #1: like know just don use
Topic #2: 10 20 12 15 00
Topic #3: people don god think just
Topic #4: edu file space use information

