In [1]:
import numpy as np

In [2]:
def parse_file(path):
    data = []
    with open(path) as infile:
        lines = infile.readlines()
        for line in lines:
            splits = line.split(',')
            data.append([int(splits[0][1:]), splits[1].strip()[1:-1], ' '.join(splits[2:]).strip()[1:-2]])
    return data

In [3]:
pb2_data = parse_file('../data/exam/pb2_data.txt')
pb2_data_full = parse_file('../data/exam/pb2_data_full.txt')

In [4]:
len(pb2_data), len(pb2_data_full)

(4491, 18846)

# ___

In [5]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
vectorizer = CountVectorizer(stop_words='english', max_df=0.95, 
                        min_df=2, max_features=1000)
features = vectorizer.fit_transform([d[-1] for d in pb2_data_full])

In [11]:
%%time
lda = LDA(n_components=50, random_state=666, learning_method='online', n_jobs=-1)
lda.fit(features)

CPU times: user 1min 37s, sys: 10.1 s, total: 1min 47s
Wall time: 1min 55s


In [18]:
feature_sub = lda.transform(features[[i[0] for i in pb2_data]])

# ___

In [33]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score

In [28]:
%%time
gm = GaussianMixture(n_components=20, covariance_type='full').fit(feature_sub)

In [29]:
yhat = gm.predict(feature_sub)

In [30]:
ytrue = [i[0] for i in pb2_data]

In [31]:
homogeneity_score(ytrue, yhat), completeness_score(ytrue, yhat)

(0.32615463520727106, 1.0)

In [41]:
np.argmax(gm.predict_proba(feature_sub[:1]))

3

# ___

In [45]:
def runner(k, t, X):
    print('lda for t=%d' % t)
    lda = LDA(n_components=t, random_state=666, learning_method='online', n_jobs=-1)
    lda.fit(features)
    feature_sub = lda.transform(features[[i[0] for i in pb2_data]])
    print('gm for k=%d' % k)
    gm = GaussianMixture(n_components=k, covariance_type='full').fit(feature_sub)
    return gm.predict(feature_sub)

In [None]:
%%time
_log = {}
for k in [10, 20]:
    for t in [5, 10, 20, 50]:
        if k == 10 and t == 50:
            continue
        _log[k] = {}
        _log[k][t] = {}
        ypred = runner(k, t, features)
        _log[k][t]['preds'] = ypred
        _log[k][t]['score'] = [homogeneity_score(ytrue, ypred), completeness_score(ytrue, ypred), v_measure_score(ytrue, ypred)]
        print()

lda for t=5
gm for k=10

lda for t=10


# ___

In [57]:
np.argmax(np.bincount(ypred))

1