In [1]:
# Load in libraries
import cPickle as pickle
import urllib2
import shutil
from time import time
import os
import random
import io
from __future__ import print_function

# Data libraries
import pandas as pd
from pandas import DataFrame

# ML libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation

### Load the Data (raw)

In [4]:
# http://stackoverflow.com/questions/10058591/how-can-i-open-utf-16-files-on-python-2-x

db = pickle.load(open('db.p', 'rb'))
txts = []
pids = []
n=0
for pid,j in db.iteritems():
  n+=1
  fname = os.path.join('txt', pid) + '.txt'
  if os.path.isfile(fname):
    try:
        txt = io.open(fname, 'r', encoding = 'utf-16').read()
        txts.append(txt)
    except UnicodeError:
        txt = open(fname, 'r').read()
        txts.append(txt)
    pids.append(pid)
  #print 'reading %d/%d' % (n, len(db))

In [None]:
txts[0]

In [52]:
out = {}
out['text'] = txts
out['pids'] = pids

print('writing txt.p')
pickle.dump(out, open("txt.p", "wb"))

writing txt.p


### Load the Data (already processed)

In [2]:
txt_db = pickle.load(open('txt.p', 'rb'))

In [3]:
txts = txt_db['text']
pids = txt_db['pids']

### Compute LDA

In [4]:
# Use tf (raw term count) features for LDA.
t0 = time()
tf = CountVectorizer(input='content',
                             encoding='utf-8', decode_error='replace', strip_accents='unicode', lowercase=True, 
                             analyzer='word', stop_words='english',
                             token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
                             ngram_range=(1, 2), max_features = 5000)
cf = tf.fit_transform(txts)
print("done in %0.3fs." % (time() - t0))
# completes in 

KeyboardInterrupt: 

In [15]:
# Now compute LDA
lda = LatentDirichletAllocation(n_topics = 30,
                                max_iter=10,
                                learning_method='online', 
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))
# done in 703.415s (12 minutes)

done in 703.415s.


In [18]:
print("\nTopics in LDA model:")
tf_feature_names = cv.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words = 30)


Topics in LDA model:
Topic #0:
pp vol vol pp signal ieee fig proposed performance based noise using algorithm processing ii information signals iii data trans ieee trans mean iv vector proc signal processing transactions channel rate sensing ieee transactions
Topic #1:
sparse norm matrix l1 sparsity rank problem low lasso recovery low rank l2 et dictionary al et al convex min solution robust regularization model group non subspace optimization minimization completion following zero
Topic #2:
word words language text model document al speech et sentence et al documents semantic models corpus using sentences sequence based used vector table use context training character set lstm vectors results
Topic #3:
xt online wt gradient stochastic algorithm yt ft learning zt update xt xt function gp algorithms batch using rate step descent optimization ht tt dt ut gt time sgd loss mt
Topic #4:
features feature data selection number set methods method test used using performance results al selecte

### Save the Topic Model

* For each word, what is its score per topic
* For each document, what is its score per topic
* Which topic does each document belong to

In [57]:
# Create a dataframe with topic weights for each word
word_scores = pd.DataFrame(columns=['Word','Topic', 'Weight'])
for topic_idx, topic in enumerate(lda.components_):
    pd_to_add = pd.DataFrame( {'Word': tf_feature_names, 
                               'Topic': [str(topic_idx)] * len(tf_feature_names),
                               'Weight': topic} )
    word_scores = pd.concat([word_scores, pd_to_add])

In [59]:
# Create a dataframe with docs to words. Use pids
doc_to_word = pd.DataFrame(columns=['PID','Word', 'Count'])



In [78]:
doc_scratch = DataFrame(cf.A, columns=cv.get_feature_names())
doc_scratch = doc_scratch.unstack()
doc_scratch = doc_scratch.reset_index(name='value')
doc_scratch.rename(columns={'level_0': 'Word', 'level_1': 'PidIndex', 'value': 'WordCount'}, inplace=True)
doc_scratch = doc_scratch[doc_scratch['WordCount'] > 0]

# doc_scratch.head()

In [79]:
doc_scratch

Unnamed: 0,Word,PidIndex,WordCount
19,a0,19,5
70,a0,70,1
74,a0,74,3
106,a0,106,2
126,a0,126,3
154,a0,154,1
168,a0,168,3
192,a0,192,1
204,a0,204,5
205,a0,205,2


In [80]:
doc_weights = pd.merge(doc_scratch, word_scores, how='inner', on='Word')

KeyboardInterrupt: 

In [None]:
doc_weights

In [58]:
pids = txt_db['pids']
pd.merge(word_scores, )
[feature_names[i]
 cf = tf.fit_transform(txts)

Unnamed: 0,Topic,Weight,Word
0,0,14.357561,a0
1,0,118.602701,a1
2,0,83.979146,a1 a2
3,0,188.381372,a2
4,0,36.414756,a3
5,0,12.817502,aa
6,0,0.033335,aaai
7,0,168.789133,ab
8,0,0.033334,abc
9,0,139.746212,ability


In [51]:
cv.get_feature_names()[30]



u'accuracies'

In [52]:
word_scores.ix[30]

Word       accuracies
Topic0        3.25047
Topic1       0.075337
Topic2        107.479
Topic3      0.0383341
Topic4        617.372
Topic5      0.0478734
Topic6      0.0333353
Topic7      0.0333339
Topic8        5.75345
Topic9      0.0333448
Topic10        283.35
Topic11       814.822
Topic12       535.882
Topic13     0.0333589
Topic14      0.211527
Topic15     0.0337312
Topic16       400.844
Topic17       81.9922
Topic18     0.0341962
Topic19      0.158811
Topic20     0.0339305
Topic21       292.875
Topic22     0.0334796
Topic23       198.575
Topic24     0.0333528
Topic25     0.0333464
Topic26     0.0342419
Topic27     0.0333338
Topic28       8.76951
Topic29       140.629
Name: 30, dtype: object

In [None]:
# Save tf
out = {}
out['word'] = txts
out['counts'] = pids

print('writing count_vectors.p')
pickle.dump(out, open("count_vectors.p", "wb"))

In [None]:
# Save LDA
lda

out = {}
out['word'] = txts
out['counts'] = pids

print('writing count_vectors.p')
pickle.dump(out, open("count_vectors.p", "wb"))