
add start of PLSA output to LDAvis
msarahan committed Oct 9, 2015
1 parent a6c55c2 commit 101e912
Showing 4 changed files with 23 additions and 60 deletions.
conda.recipe/meta.yaml: 8 changes (0 additions, 8 deletions)
@@ -30,10 +30,6 @@ requirements:
- gensim
- pattern
- textblob
- r
- r-ldavis
- r-matrix
- r-data.table
- click
- solrpy
- elasticsearch
@@ -47,10 +43,6 @@ requirements:
- gensim
- pattern
- textblob
- r
- r-ldavis
- r-matrix
- r-data.table
- click
- solrpy
- elasticsearch
topik/models/lda.py: 23 changes (0 additions, 23 deletions)
@@ -85,19 +85,6 @@ def _get_term_data(self):
term_data_df.index.name = 'token_id'
return term_data_df

def _get_vocab(self):
return self._corpus._dict.values()

def _get_term_frequency(self):
self._corpus._dict.save_as_text(os.path.join(test_data_path, 'dictionary'),
sort_by_word=False)
# TODO: see gensim source to see how it's saving this to file, then use that

df = pd.read_csv(os.path.join(test_data_path, 'dictionary'), sep='\t',
index_col=0, header=None)
df = df.sort_index()
return df[2]

def _get_topic_term_dists(self):
term_topic_df = pd.DataFrame()
for topic_no in range(self._model.num_topics):
@@ -110,11 +97,6 @@ def _get_topic_term_dists(self):
term_topic_df.index.name = 'topics'
return term_topic_df

def _get_doc_data(self):
doc_data_df = self._get_doc_topic_dists()
doc_data_df['doc_length'] = self._get_doc_lengths()
return doc_data_df

def _get_doc_topic_dists(self):
id_index, bow_corpus = zip(*[(id, self._corpus._dict.doc2bow(doc_tokens))
for id, doc_tokens in self._corpus._corpus])
@@ -129,8 +111,3 @@ def _get_doc_topic_dists(self):
doc_topic_dists_df.columns = ['topic'+str(i)+'dist' for i in range(
doc_topic_dists_df.shape[1])]
return doc_topic_dists_df

def _get_doc_lengths(self):
id_index, doc_lengths = zip(*[(id, len(doc)) for id, doc in list(
self._corpus._corpus)])
return pd.Series(doc_lengths, index=id_index)
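For reference, the doc2bow call that remains in _get_doc_topic_dists is gensim's standard way of turning a token list into a sparse bag-of-words vector. A minimal, self-contained illustration follows; the example tokens are made up and are not part of this repository.

    from gensim.corpora import Dictionary

    # Hypothetical tokenized documents, only to show the shape of the data.
    docs = [["topic", "model", "topic"], ["model", "corpus"]]
    dictionary = Dictionary(docs)

    # doc2bow maps a token list to a sparse list of (token_id, count) pairs,
    # the representation gensim's LdaModel consumes when inferring topic mixtures.
    bow = dictionary.doc2bow(["topic", "topic", "corpus"])
    print(bow)  # e.g. [(0, 1), (2, 2)], exact ids depend on the dictionary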
topik/models/model_base.py: 35 changes (21 additions, 14 deletions)
@@ -77,32 +77,39 @@ def get_model_name_with_parameters(self):
"""Abstract method. Primarily internal function, used to name configurations in persisted metadata for later retrieval."""
raise NotImplementedError

@abstractmethod
def _get_term_data(self):
raise NotImplementedError

@abstractmethod
def _get_vocab(self):
raise NotImplementedError
return self._corpus._dict.values()

@abstractmethod
def _get_term_frequency(self):
raise NotImplementedError
self._corpus._dict.save_as_text(os.path.join(test_data_path, 'dictionary'),
sort_by_word=False)
# TODO: see gensim source to see how it's saving this to file, then use that

@abstractmethod
def _get_topic_term_dists(self):
raise NotImplementedError
df = pd.read_csv(os.path.join(test_data_path, 'dictionary'), sep='\t',
index_col=0, header=None)
df = df.sort_index()
return df[2]

@abstractmethod
def _get_doc_data(self):
doc_data_df = self._get_doc_topic_dists()
doc_data_df['doc_length'] = self._get_doc_lengths()
return doc_data_df

def _get_doc_lengths(self):
id_index, doc_lengths = zip(*[(id, len(doc)) for id, doc in list(
self._corpus._corpus)])
return pd.Series(doc_lengths, index=id_index)

@abstractmethod
def _get_term_data(self):
raise NotImplementedError

@abstractmethod
def _get_doc_topic_dists(self):
def _get_topic_term_dists(self):
raise NotImplementedError

@abstractmethod
def _get_doc_lengths(self):
def _get_doc_topic_dists(self):
raise NotImplementedError

def to_py_lda_vis(self):
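The to_py_lda_vis() hook at the end of the base class is where these getters come together. The following is only a rough sketch of that wiring, not the repository's implementation: it assumes each getter returns data in the shape pyLDAvis expects, and uses the parameter names of pyLDAvis's prepare() function.

    import pyLDAvis

    def to_py_lda_vis(model):
        # Assemble the five quantities pyLDAvis.prepare() needs from the
        # getters the model base class exposes.
        return pyLDAvis.prepare(
            topic_term_dists=model._get_topic_term_dists(),
            doc_topic_dists=model._get_doc_topic_dists(),
            doc_lengths=model._get_doc_lengths(),
            vocab=model._get_vocab(),
            term_frequency=model._get_term_frequency())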
topik/models/plsa.py: 17 changes (2 additions, 15 deletions)
@@ -197,24 +197,11 @@ def get_top_words(self, topn):
top_words.append([(topic[word_id], self._corpus.get_id2word_dict()[word_id]) for word_id in word_ids])
return top_words

def _get_doc_data(self):
raise NotImplementedError

def _get_term_data(self):
raise NotImplementedError

def _get_vocab(self):
raise NotImplementedError

def _get_term_frequency(self):
raise NotImplementedError

def _get_topic_term_dists(self):
raise NotImplementedError
return self.zw

def _get_doc_topic_dists(self):
raise NotImplementedError

def _get_doc_lengths(self):
raise NotImplementedError

return self.dz
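In the PLSA model the estimated topic-term matrix is stored as zw and the document-topic matrix as dz, so the two getters can return them directly. If those matrices ever held unnormalized weights rather than probabilities (an assumption made purely for illustration), a row normalization like the sketch below would convert them into the distributions a visualization expects.

    import numpy as np

    def row_normalize(matrix):
        # Normalize each row to sum to 1, turning raw weights into
        # probability distributions; all-zero rows are left as zeros.
        matrix = np.asarray(matrix, dtype=float)
        row_sums = matrix.sum(axis=1, keepdims=True)
        row_sums[row_sums == 0] = 1.0  # avoid division by zero on empty rows
        return matrix / row_sums

    # Hypothetical usage with the PLSA attributes named above:
    # topic_term_dists = row_normalize(model.zw)
    # doc_topic_dists = row_normalize(model.dz)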
