In [1]:
# Load in libraries
import cPickle as pickle
import urllib2
import shutil
from time import time
import os
import random
import io
from __future__ import print_function

# ML libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation

### Load the Data

In [4]:
# Reading UTF-16 text on Python 2:
# http://stackoverflow.com/questions/10058591/how-can-i-open-utf-16-files-on-python-2-x

# Load the paper database. Its keys are used as paper ids; the values (`j`)
# are not used in this cell.
# NOTE(security): pickle.load can execute arbitrary code -- only load a db.p
# that comes from a trusted source.
with open('db.p', 'rb') as db_file:
    db = pickle.load(db_file)

txts = []  # document texts, kept parallel to `pids`
pids = []  # ids of the papers whose text file exists on disk
n = 0      # entries visited so far (only needed by the optional progress print)
for pid, j in db.iteritems():
    n += 1
    fname = os.path.join('txt', pid) + '.txt'
    if os.path.isfile(fname):
        try:
            # Try UTF-16 first; io.open decodes while reading.
            with io.open(fname, 'r', encoding='utf-16') as f:
                txt = f.read()
        except UnicodeError:
            # Not decodable as UTF-16: fall back to the undecoded file contents.
            with open(fname, 'r') as f:
                txt = f.read()
        txts.append(txt)
        pids.append(pid)
    #print('reading %d/%d' % (n, len(db)))

In [None]:
# Sanity-check the first loaded document. Only a short prefix is shown so the
# cell output does not contain an entire paper.
txts[0][:1000]

In [52]:
# Bundle the texts with their paper ids (parallel lists) and cache them on disk.
out = {}
out['text'] = txts
out['pids'] = pids

print('writing txt.p')
# The context manager guarantees the pickle is flushed and the handle closed,
# even if dumping fails part-way through.
with open("txt.p", "wb") as out_file:
    pickle.dump(out, out_file)

writing txt.p


### Compute NMF

In [None]:
# Settings for the topic-model cells below.
n_topics = 30      # number of NMF components
n_top_words = 20   # words printed per topic by print_top_words
n_features = 5000  # vocabulary size; matches max_features in the vectorizer cells
# NOTE(review): nothing in the visible cells subsamples `txts` down to
# n_samples documents -- confirm whether that was intended.
n_samples = 2000

#### First, Compute TF-IDF

In [6]:
print("Extracting tf-idf features for NMF...")

# Word uni- and bi-gram TF-IDF features with English stop words removed.
# token_pattern only keeps tokens of two or more characters that start with a
# letter or underscore, so tokens beginning with a digit are dropped.
# The vocabulary cap comes from the shared `n_features` setting instead of a
# second hard-coded 5000, so the two cannot drift apart.
tf_izer = TfidfVectorizer(input='content',
        encoding='utf-8', decode_error='replace', strip_accents='unicode', lowercase=True,
        analyzer='word', stop_words='english',
        token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
        ngram_range=(1, 2), max_features=n_features,
        norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

t0 = time()
tfidf = tf_izer.fit_transform(txts)
print("done in %0.3fs." % (time() - t0))

Extracting tf-idf features for NMF...
done in 374.267s.


In [7]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    model         -- object exposing ``components_``, iterated one row per
                     topic; each row must support ``argsort()``.
    feature_names -- sequence indexed by column position, giving the word
                     to print for each column.
    n_top_words   -- how many words to list per topic, heaviest first.

    A single blank line is printed after the last topic.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        heaviest = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[col] for col in heaviest]
        print("Topic #%d:" % topic_no)
        print(" ".join(words))
    print()

In [None]:
# Fit the NMF model
# The message reports the real shape of the matrix being factorized (rows =
# documents, columns = vocabulary terms) rather than the nominal settings,
# because the model is fit on the whole `tfidf` matrix.
print("Fitting the NMF model with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (tfidf.shape[0], tfidf.shape[1]))
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
# Vocabulary of the TF-IDF vectorizer, used to turn column indices into words.
tfidf_feature_names = tf_izer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

### Compute LDA

In [13]:
# Use tf (raw term count) features for LDA.
# Same tokenization settings as the TF-IDF vectorizer; the vocabulary cap
# comes from the shared `n_features` setting instead of a hard-coded 5000.
t0 = time()  # started before construction, so the timing includes vectorizer setup
count_izer = CountVectorizer(input='content',
                             encoding='utf-8', decode_error='replace', strip_accents='unicode', lowercase=True,
                             analyzer='word', stop_words='english',
                             token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
                             ngram_range=(1, 2), max_features=n_features)
cf = count_izer.fit_transform(txts)
print("done in %0.3fs." % (time() - t0))

done in 398.395s.


In [14]:
# Fit LDA on the raw term counts using the 'online' learning method.
# NOTE(review): no topic count is passed here, so the library default applies
# rather than `n_topics` -- confirm this is intended.
lda = LatentDirichletAllocation(
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,
)
# Only the fit itself is timed; model construction happens above.
t0 = time()
lda.fit(cf)
print("done in %0.3fs." % (time() - t0))

done in 173.940s.


In [15]:
# Show the n_top_words highest-weighted terms of every LDA topic.
print("\nTopics in LDA model:")
# Vocabulary of the count vectorizer; print_top_words indexes it with the
# column positions taken from lda.components_.
tf_feature_names = count_izer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0:
image pp images based vol recognition using vol pp ieee face proposed used method fig detection processing features analysis different results
Topic #1:
matrix algorithm problem sparse rank convex optimization norm method linear function solution gradient methods matrices data algorithms using non vector
Topic #2:
learning network training neural layer networks model deep al input et et al word words using layers models neural networks language used
Topic #3:
data learning set classification training class feature kernel features classifier number using used based performance methods algorithm dataset test accuracy
Topic #4:
model data models distribution time number using variables parameters bayesian likelihood inference figure set al given information probability used et
Topic #5:
image images using figure time fig camera motion used reconstruction scale noise point filter error pixel data frame method space
Topic #6:
graph clustering points algorithm