This repository has been archived by the owner on Jun 14, 2018. It is now read-only.

Commit 13c08fd: Merge 3f938c7 into a6c55c2
msarahan committed Oct 9, 2015 (2 parents: a6c55c2 + 3f938c7)
Showing 6 changed files with 54 additions and 209 deletions.
8 changes: 0 additions & 8 deletions conda.recipe/meta.yaml
@@ -30,10 +30,6 @@ requirements:
     - gensim
     - pattern
     - textblob
-    - r
-    - r-ldavis
-    - r-matrix
-    - r-data.table
     - click
     - solrpy
     - elasticsearch
@@ -47,10 +43,6 @@ requirements:
     - gensim
     - pattern
     - textblob
-    - r
-    - r-ldavis
-    - r-matrix
-    - r-data.table
     - click
     - solrpy
     - elasticsearch
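Both hunks drop the same four R packages (r, r-ldavis, r-matrix, r-data.table) from the build and run requirements. That matches the rest of the commit: topik/utils.py, which drove the R-based LDAvis output, is deleted below, and run.py now calls the Python plot_lda_vis instead, so the recipe no longer needs an R toolchain.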
66 changes: 11 additions & 55 deletions topik/models/lda.py
@@ -65,72 +65,28 @@ def get_top_words(self, topn):
     def get_model_name_with_parameters(self):
         return "LDA_{}_topics{}".format(self._model.num_topics, self._corpus.filter_string)
 
-    def _get_term_data(self):
-        term_doc_count_df = pd.DataFrame.from_records([{'tokenid': tokenid, 'token': token,
-                                                        'doc_count': self._corpus._dict.dfs.get(tokenid, 0)}
-                                                       for tokenid, token in self._corpus._dict.id2token.items()], index="tokenid")
-
+    def _get_topic_term_dists(self):
         term_topic_df = pd.DataFrame([
             pd.DataFrame.from_records(self._model.show_topic(topic_no, None),
                                       columns=['topic' + str(topic_no) + 'dist', 'token'],
                                       index='token')['topic' + str(topic_no) + 'dist']
             for topic_no in range(self._model.num_topics)]).T
 
         token2id_df = pd.DataFrame(self._corpus._dict.token2id.items())
         token2id_df = token2id_df.set_index(0)
         term_topic_df = pd.concat([term_topic_df, token2id_df], axis=1)
         term_topic_df = term_topic_df.set_index(1)
-
-        term_data_df = pd.concat([term_doc_count_df, term_topic_df], axis=1)
-        term_data_df.index.name = 'token_id'
-        return term_data_df
-
-    def _get_vocab(self):
-        return self._corpus._dict.values()
-
-    def _get_term_frequency(self):
-        self._corpus._dict.save_as_text(os.path.join(test_data_path, 'dictionary'),
-                                        sort_by_word=False)
-        # TODO: see gensim source to see how it's saving this to file, then use that
-
-        df = pd.read_csv(os.path.join(test_data_path, 'dictionary'), sep='\t',
-                         index_col=0, header=None)
-        df = df.sort_index()
-        return df[2]
-
-    def _get_topic_term_dists(self):
-        term_topic_df = pd.DataFrame()
-        for topic_no in range(self._model.num_topics):
-            topic_df = pd.DataFrame(self._model.show_topic(topic_no, None))
-            topic_df = topic_df.set_index(1)
-            topic_df.columns = ['topic' + str(topic_no)]
-            term_topic_df = pd.concat([term_topic_df, topic_df], axis=1)
-        term_topic_df = term_topic_df.T
-        term_topic_df.columns.name = 'terms'
-        term_topic_df.index.name = 'topics'
-        term_topic_df['term_id'] = pd.Series(dict(self._corpus._dict.token2id.items()))
-        term_topic_df = term_topic_df.set_index('term_id')
         return term_topic_df
-
-    def _get_doc_data(self):
-        doc_data_df = self._get_doc_topic_dists()
-        doc_data_df['doc_length'] = self._get_doc_lengths()
-        return doc_data_df
 
     def _get_doc_topic_dists(self):
         id_index, bow_corpus = zip(*[(id, self._corpus._dict.doc2bow(doc_tokens))
                                      for id, doc_tokens in self._corpus._corpus])
 
-        doc_topic_dists = list(self._model[bow_corpus])
+        doc_topic = list(self._model[bow_corpus])
 
-        for i, doc in enumerate(doc_topic_dists):
+        for i, doc in enumerate(doc_topic):
             for j, topic in enumerate(doc):
-                doc_topic_dists[i][j] = doc_topic_dists[i][j][1]
-
-        doc_topic_dists_df = pd.DataFrame(doc_topic_dists, index=id_index)
-        doc_topic_dists_df.columns = ['topic'+str(i)+'dist' for i in range(
-            doc_topic_dists_df.shape[1])]
-        return doc_topic_dists_df
+                doc_topic[i][j] = doc_topic[i][j][1]
 
-    def _get_doc_lengths(self):
-        id_index, doc_lengths = zip(*[(id, len(doc)) for id, doc in list(
-            self._corpus._corpus)])
-        return pd.Series(doc_lengths, index=id_index)
+        doc_topic_df = pd.DataFrame(doc_topic, index=id_index)
+        doc_topic_df.columns = ['topic'+str(i)+'dist' for i in range(
+            doc_topic_df.shape[1])]
+        doc_topic_df.index.name = 'doc_id'
+        return doc_topic_df
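The net effect of the lda.py rewrite is easier to see outside the diff. A minimal standalone sketch (not topik code; doc_topics_to_frame and the sample values are invented for illustration) of what the new _get_doc_topic_dists does: gensim returns one list of (topic_id, weight) pairs per document, the loop strips the ids, and the result becomes a dense doc-by-topic DataFrame. Like the method above, it assumes every document reports a weight for every topic, so positional indexing lines up.

import pandas as pd

def doc_topics_to_frame(doc_topic_pairs, doc_ids):
    # One [(topic_id, weight), ...] list per document, all topics present,
    # in topic-id order, mirroring the doc_topic list built above.
    rows = [[weight for _topic_id, weight in doc] for doc in doc_topic_pairs]
    df = pd.DataFrame(rows, index=doc_ids)
    df.columns = ['topic' + str(i) + 'dist' for i in range(df.shape[1])]
    df.index.name = 'doc_id'
    return df

# Two documents, two topics:
frame = doc_topics_to_frame([[(0, 0.9), (1, 0.1)], [(0, 0.25), (1, 0.75)]],
                            doc_ids=['doc_a', 'doc_b'])
# frame now has columns topic0dist and topic1dist, one row per doc_id.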
43 changes: 26 additions & 17 deletions topik/models/model_base.py
@@ -1,4 +1,5 @@
 from abc import ABCMeta, abstractmethod
+from collections import Counter
 import logging
 
 import pandas as pd
@@ -77,41 +78,49 @@ def get_model_name_with_parameters(self):
         """Abstract method. Primarily internal function, used to name configurations in persisted metadata for later retrieval."""
         raise NotImplementedError
 
-    @abstractmethod
     def _get_term_data(self):
-        raise NotImplementedError
+        vocab = self._get_vocab()
+        tf = self._get_term_frequency()
+        ttd = self._get_topic_term_dists()
+        term_data_df = ttd
+        term_data_df['term_frequency'] = tf
+        term_data_df['term'] = vocab
+        return term_data_df
 
-    @abstractmethod
     def _get_vocab(self):
-        raise NotImplementedError
+        return pd.Series(dict(self._corpus._dict.items()))
 
-    @abstractmethod
     def _get_term_frequency(self):
-        raise NotImplementedError
-
-    @abstractmethod
-    def _get_topic_term_dists(self):
-        raise NotImplementedError
+        tf = Counter()
+        [tf.update(dict(doc)) for doc in self._corpus]
+        # TODO update term documents in intermediate store
+        return pd.Series(dict(tf))
 
-    @abstractmethod
     def _get_doc_data(self):
-        raise NotImplementedError
+        doc_data_df = self._get_doc_topic_dists()
+        doc_data_df['doc_length'] = self._get_doc_lengths()
+        return doc_data_df
+
+    def _get_doc_lengths(self):
+        id_index, doc_lengths = zip(*[(id, len(doc)) for id, doc in list(
+            self._corpus._corpus)])
+        return pd.Series(doc_lengths, index=id_index)
 
     @abstractmethod
-    def _get_doc_topic_dists(self):
+    def _get_topic_term_dists(self):
         raise NotImplementedError
 
     @abstractmethod
-    def _get_doc_lengths(self):
+    def _get_doc_topic_dists(self):
         raise NotImplementedError
 
     def to_py_lda_vis(self):
         doc_data_df = self._get_doc_data()
         term_data_df = self._get_term_data()
 
-        model_lda_vis_data = { 'vocab': term_data_df['token'],
-                               'term_frequency': term_data_df['doc_count'],
-                               'topic_term_dists': term_data_df.iloc[:,2:].T,
+        model_lda_vis_data = { 'vocab': term_data_df['term'],
+                               'term_frequency': term_data_df['term_frequency'],
+                               'topic_term_dists': term_data_df.iloc[:,:-2].T,
                                'doc_topic_dists': doc_data_df.iloc[:,:-1],
                                'doc_lengths': doc_data_df['doc_length']}
         return model_lda_vis_data
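The dict keys returned here line up with the keyword parameters of pyLDAvis.prepare, which is presumably the intended consumer. A minimal usage sketch, assuming the pyLDAvis package is installed and that model is a trained topik model instance:

import pyLDAvis

vis_data = model.to_py_lda_vis()
# The keys 'topic_term_dists', 'doc_topic_dists', 'doc_lengths', 'vocab'
# and 'term_frequency' match pyLDAvis.prepare's parameter names, so the
# dict can be unpacked straight into the call.
prepared = pyLDAvis.prepare(**vis_data)
pyLDAvis.save_html(prepared, 'ldavis.html')  # write a shareable HTML page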
28 changes: 11 additions & 17 deletions topik/models/plsa.py
@@ -6,6 +6,7 @@
 import random
 
 import numpy as np
+import pandas as pd
 
 from .model_base import TopicModelBase, register_model
 from topik.intermediaries.raw_data import load_persisted_corpus
@@ -197,24 +198,17 @@ def get_top_words(self, topn):
             top_words.append([(topic[word_id], self._corpus.get_id2word_dict()[word_id]) for word_id in word_ids])
         return top_words
 
-    def _get_doc_data(self):
-        raise NotImplementedError
-
-    def _get_term_data(self):
-        raise NotImplementedError
-
-    def _get_vocab(self):
-        raise NotImplementedError
-
-    def _get_term_frequency(self):
-        raise NotImplementedError
-
     def _get_topic_term_dists(self):
-        raise NotImplementedError
+        term_topic_df = pd.DataFrame(self.zw,
+                                     index=['topic'+str(t)+'dist' for t in range(self.topics)]).T
 
-    def _get_doc_topic_dists(self):
-        raise NotImplementedError
+        term_topic_df.index.name = 'term_id'
+        return term_topic_df
 
-    def _get_doc_lengths(self):
-        raise NotImplementedError
+    def _get_doc_topic_dists(self):
+        doc_topic_df = pd.DataFrame(self.dz,
+                                    index=[doc[0] for doc in self._corpus._corpus],
+                                    columns=['topic'+str(t)+'dist' for t in range(self.topics)])
+
+        doc_topic_df.index.name = 'doc_id'
+        return doc_topic_df
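The PLSA additions are mostly a shape contract: self.zw is a topics-by-terms matrix (term weights per topic) and self.dz is a docs-by-topics matrix, which is why zw gets transposed into a term-indexed frame while dz maps directly onto one row per document. An illustrative sketch with made-up dimensions (the variable names mirror the diff; nothing here is topik code):

import numpy as np
import pandas as pd

n_topics, n_terms, n_docs = 2, 4, 3
zw = np.random.dirichlet(np.ones(n_terms), size=n_topics)  # topics x terms
dz = np.random.dirichlet(np.ones(n_topics), size=n_docs)   # docs x topics

cols = ['topic' + str(t) + 'dist' for t in range(n_topics)]
term_topic = pd.DataFrame(zw, index=cols).T  # transpose: rows are terms
term_topic.index.name = 'term_id'
doc_topic = pd.DataFrame(dz, columns=cols)   # one row per document
doc_topic.index.name = 'doc_id'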
27 changes: 6 additions & 21 deletions topik/run.py
@@ -10,8 +10,7 @@
 
 from topik.readers import read_input
 import topik.models
-from topik.viz import Termite
-from topik.utils import to_r_ldavis, generate_csv_output_file
+from topik.viz import plot_lda_vis, Termite
 
 
 logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
@@ -22,7 +21,7 @@
 
 def run_model(data_source, source_type="auto", year_field=None, start_year=None, stop_year=None,
               content_field=None, tokenizer='simple', n_topics=10, dir_path='./topic_model', model='LDA',
-              termite_plot=True, output_file=False, r_ldavis=False, seed=42, **kwargs):
+              termite_plot=True, output_file=False, ldavis=False, seed=42, **kwargs):
 
     """Run your data through all topik functionality and save all results to a specified directory.
@@ -54,7 +53,7 @@ def run_model(data_source, source_type="auto", year_field=None, start_year=None,
         Generate termite plot of your model if True. Default is True.
     output_file : bool
         Generate a final summary csv file of your results. For each document: text, tokens, lda_probabilities and topic.
-    r_ldavis : bool
+    ldavis : bool
         Generate an interactive data visualization of your topics. Default is False.
     seed : int
         Set random number generator to seed, to be able to reproduce results. Default 42.
@@ -74,21 +73,7 @@ def run_model(data_source, source_type="auto", year_field=None, start_year=None,
         termite = Termite(model.termite_data(n_topics), "Termite Plot")
         termite.plot(os.path.join(dir_path, 'termite.html'))
 
-    if output_file:
-        filtered_documents = raw_data.get_data_by_year(start_year, stop_year, year_field)
-        generate_csv_output_file(filtered_documents, raw_data, processed_data, lda.model)
-
-    if r_ldavis:
-        to_r_ldavis(processed_data, dir_name=os.path.join(dir_path, 'ldavis'), lda=lda)
-        os.environ["LDAVIS_DIR"] = os.path.join(dir_path, 'ldavis')
-        try:
-            subprocess.call(['Rscript', os.path.join(BASEDIR, 'R/runLDAvis.R')])
-        except ValueError:
-            logging.warning("Unable to run runLDAvis.R")
-        os.chdir(os.path.join(dir_path, 'ldavis', 'output'))
-        sp = subprocess.Popen(['python', '-m', 'SimpleHTTPServer', '8000'])
-        webbrowser.open_new_tab('127.0.0.1:8000')
-        time.sleep(3)
-        sp.kill()
-        os.chdir(os.path.dirname(BASEDIR))
+    if ldavis:
+        plot_lda_vis(model.to_py_lda_vis())
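With the R pipeline gone, the visualization toggle reduces to a single call into topik.viz. A hypothetical invocation of the updated entry point (the data path and content_field value are placeholders, not from this commit):

from topik.run import run_model

# Train an LDA model and open the pyLDAvis-based view, replacing the old
# Rscript + SimpleHTTPServer round-trip.
run_model('./reviews.json', content_field='text', n_topics=10,
          model='LDA', termite_plot=False, ldavis=True)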


91 changes: 0 additions & 91 deletions topik/utils.py

This file was deleted.
