
add start of PLSA output to LDAvis
msarahan committed Oct 9, 2015
1 parent a6c55c2 commit 101e912
Showing 4 changed files with 23 additions and 60 deletions.
conda.recipe/meta.yaml: 8 changes (0 additions, 8 deletions)
@@ -30,10 +30,6 @@ requirements:
- gensim
- pattern
- textblob
- r
- r-ldavis
- r-matrix
- r-data.table
- click
- solrpy
- elasticsearch
@@ -47,10 +43,6 @@ requirements:
- gensim
- pattern
- textblob
- r
- r-ldavis
- r-matrix
- r-data.table
- click
- solrpy
- elasticsearch
topik/models/lda.py: 23 changes (0 additions, 23 deletions)
@@ -85,19 +85,6 @@ def _get_term_data(self):
term_data_df.index.name = 'token_id'
return term_data_df

def _get_vocab(self):
return self._corpus._dict.values()

def _get_term_frequency(self):
self._corpus._dict.save_as_text(os.path.join(test_data_path, 'dictionary'),
sort_by_word=False)
# TODO: see gensim source to see how it's saving this to file, then use that

df = pd.read_csv(os.path.join(test_data_path, 'dictionary'), sep='\t',
index_col=0, header=None)
df = df.sort_index()
return df[2]

def _get_topic_term_dists(self):
term_topic_df = pd.DataFrame()
for topic_no in range(self._model.num_topics):
@@ -110,11 +97,6 @@ def _get_topic_term_dists(self):
term_topic_df.index.name = 'topics'
return term_topic_df

def _get_doc_data(self):
doc_data_df = self._get_doc_topic_dists()
doc_data_df['doc_length'] = self._get_doc_lengths()
return doc_data_df

def _get_doc_topic_dists(self):
id_index, bow_corpus = zip(*[(id, self._corpus._dict.doc2bow(doc_tokens))
for id, doc_tokens in self._corpus._corpus])
@@ -129,8 +111,3 @@ def _get_doc_topic_dists(self):
doc_topic_dists_df.columns = ['topic'+str(i)+'dist' for i in range(
doc_topic_dists_df.shape[1])]
return doc_topic_dists_df

def _get_doc_lengths(self):
id_index, doc_lengths = zip(*[(id, len(doc)) for id, doc in list(
self._corpus._corpus)])
return pd.Series(doc_lengths, index=id_index)
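For reference, the doc2bow call that remains in _get_doc_topic_dists is gensim's standard way of turning a token list into a sparse bag-of-words vector. A minimal, self-contained illustration follows; the example tokens are made up and are not part of this repository.

    from gensim.corpora import Dictionary

    # Hypothetical tokenized documents, only to show the shape of the data.
    docs = [["topic", "model", "topic"], ["model", "corpus"]]
    dictionary = Dictionary(docs)

    # doc2bow maps a token list to a sparse list of (token_id, count) pairs,
    # the representation gensim's LdaModel consumes when inferring topic mixtures.
    bow = dictionary.doc2bow(["topic", "topic", "corpus"])
    print(bow)  # e.g. [(0, 1), (2, 2)], exact ids depend on the dictionary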
topik/models/model_base.py: 35 changes (21 additions, 14 deletions)
@@ -77,32 +77,39 @@ def get_model_name_with_parameters(self):
"""Abstract method. Primarily internal function, used to name configurations in persisted metadata for later retrieval."""
raise NotImplementedError

@abstractmethod
def _get_term_data(self):
raise NotImplementedError

@abstractmethod
def _get_vocab(self):
raise NotImplementedError
return self._corpus._dict.values()

@abstractmethod
def _get_term_frequency(self):
raise NotImplementedError
self._corpus._dict.save_as_text(os.path.join(test_data_path, 'dictionary'),
sort_by_word=False)
# TODO: see gensim source to see how it's saving this to file, then use that

@abstractmethod
def _get_topic_term_dists(self):
raise NotImplementedError
df = pd.read_csv(os.path.join(test_data_path, 'dictionary'), sep='\t',
index_col=0, header=None)
df = df.sort_index()
return df[2]

@abstractmethod
def _get_doc_data(self):
doc_data_df = self._get_doc_topic_dists()
doc_data_df['doc_length'] = self._get_doc_lengths()
return doc_data_df

def _get_doc_lengths(self):
id_index, doc_lengths = zip(*[(id, len(doc)) for id, doc in list(
self._corpus._corpus)])
return pd.Series(doc_lengths, index=id_index)

@abstractmethod
def _get_term_data(self):
raise NotImplementedError

@abstractmethod
def _get_doc_topic_dists(self):
def _get_topic_term_dists(self):
raise NotImplementedError

@abstractmethod
def _get_doc_lengths(self):
def _get_doc_topic_dists(self):
raise NotImplementedError

def to_py_lda_vis(self):
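The to_py_lda_vis() hook at the end of the base class is where these getters come together. The following is only a rough sketch of that wiring, not the repository's implementation: it assumes each getter returns data in the shape pyLDAvis expects, and uses the parameter names of pyLDAvis's prepare() function.

    import pyLDAvis

    def to_py_lda_vis(model):
        # Assemble the five quantities pyLDAvis.prepare() needs from the
        # getters the model base class exposes.
        return pyLDAvis.prepare(
            topic_term_dists=model._get_topic_term_dists(),
            doc_topic_dists=model._get_doc_topic_dists(),
            doc_lengths=model._get_doc_lengths(),
            vocab=model._get_vocab(),
            term_frequency=model._get_term_frequency())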
topik/models/plsa.py: 17 changes (2 additions, 15 deletions)
@@ -197,24 +197,11 @@ def get_top_words(self, topn):
top_words.append([(topic[word_id], self._corpus.get_id2word_dict()[word_id]) for word_id in word_ids])
return top_words

def _get_doc_data(self):
raise NotImplementedError

def _get_term_data(self):
raise NotImplementedError

def _get_vocab(self):
raise NotImplementedError

def _get_term_frequency(self):
raise NotImplementedError

def _get_topic_term_dists(self):
raise NotImplementedError
return self.zw

def _get_doc_topic_dists(self):
raise NotImplementedError

def _get_doc_lengths(self):
raise NotImplementedError

return self.dz
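In the PLSA model the estimated topic-term matrix is stored as zw and the document-topic matrix as dz, so the two getters can return them directly. If those matrices ever held unnormalized weights rather than probabilities (an assumption made purely for illustration), a row normalization like the sketch below would convert them into the distributions a visualization expects.

    import numpy as np

    def row_normalize(matrix):
        # Normalize each row to sum to 1, turning raw weights into
        # probability distributions; all-zero rows are left as zeros.
        matrix = np.asarray(matrix, dtype=float)
        row_sums = matrix.sum(axis=1, keepdims=True)
        row_sums[row_sums == 0] = 1.0  # avoid division by zero on empty rows
        return matrix / row_sums

    # Hypothetical usage with the PLSA attributes named above:
    # topic_term_dists = row_normalize(model.zw)
    # doc_topic_dists = row_normalize(model.dz)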
