This repository has been archived by the owner on Jun 14, 2018. It is now read-only.

Commit 13c08fd: Merge 3f938c7 into a6c55c2
msarahan committed Oct 9, 2015 (2 parents: a6c55c2 + 3f938c7)
Showing 6 changed files with 54 additions and 209 deletions.
8 changes: 0 additions & 8 deletions conda.recipe/meta.yaml
@@ -30,10 +30,6 @@ requirements:
     - gensim
     - pattern
     - textblob
-    - r
-    - r-ldavis
-    - r-matrix
-    - r-data.table
     - click
     - solrpy
     - elasticsearch
@@ -47,10 +43,6 @@ requirements:
     - gensim
     - pattern
     - textblob
-    - r
-    - r-ldavis
-    - r-matrix
-    - r-data.table
     - click
     - solrpy
     - elasticsearch
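Both hunks drop the same four R packages (r, r-ldavis, r-matrix, r-data.table) from the build and run requirements. That matches the rest of the commit: topik/utils.py, which drove the R-based LDAvis output, is deleted below, and run.py now calls the Python plot_lda_vis instead, so the recipe no longer needs an R toolchain.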
66 changes: 11 additions & 55 deletions topik/models/lda.py
@@ -65,72 +65,28 @@ def get_top_words(self, topn):
     def get_model_name_with_parameters(self):
         return "LDA_{}_topics{}".format(self._model.num_topics, self._corpus.filter_string)
 
-    def _get_term_data(self):
-        term_doc_count_df = pd.DataFrame.from_records([{'tokenid': tokenid, 'token': token,
-                                                        'doc_count': self._corpus._dict.dfs.get(tokenid, 0)}
-                                                       for tokenid, token in self._corpus._dict.id2token.items()], index="tokenid")
-
+    def _get_topic_term_dists(self):
         term_topic_df = pd.DataFrame([
             pd.DataFrame.from_records(self._model.show_topic(topic_no, None),
                                       columns=['topic' + str(topic_no) + 'dist', 'token'],
                                       index='token')['topic' + str(topic_no) + 'dist']
             for topic_no in range(self._model.num_topics)]).T
 
         token2id_df = pd.DataFrame(self._corpus._dict.token2id.items())
         token2id_df = token2id_df.set_index(0)
         term_topic_df = pd.concat([term_topic_df, token2id_df], axis=1)
         term_topic_df = term_topic_df.set_index(1)
-
-        term_data_df = pd.concat([term_doc_count_df, term_topic_df], axis=1)
-        term_data_df.index.name = 'token_id'
-        return term_data_df
-
-    def _get_vocab(self):
-        return self._corpus._dict.values()
-
-    def _get_term_frequency(self):
-        self._corpus._dict.save_as_text(os.path.join(test_data_path, 'dictionary'),
-                                        sort_by_word=False)
-        # TODO: see gensim source to see how it's saving this to file, then use that
-
-        df = pd.read_csv(os.path.join(test_data_path, 'dictionary'), sep='\t',
-                         index_col=0, header=None)
-        df = df.sort_index()
-        return df[2]
-
-    def _get_topic_term_dists(self):
-        term_topic_df = pd.DataFrame()
-        for topic_no in range(self._model.num_topics):
-            topic_df = pd.DataFrame(self._model.show_topic(topic_no, None))
-            topic_df = topic_df.set_index(1)
-            topic_df.columns = ['topic' + str(topic_no)]
-            term_topic_df = pd.concat([term_topic_df, topic_df], axis=1)
-        term_topic_df = term_topic_df.T
-        term_topic_df.columns.name = 'terms'
-        term_topic_df.index.name = 'topics'
-        term_topic_df['term_id'] = pd.Series(dict(self._corpus._dict.token2id.items()))
-        term_topic_df = term_topic_df.set_index('term_id')
         return term_topic_df
-
-    def _get_doc_data(self):
-        doc_data_df = self._get_doc_topic_dists()
-        doc_data_df['doc_length'] = self._get_doc_lengths()
-        return doc_data_df
 
     def _get_doc_topic_dists(self):
         id_index, bow_corpus = zip(*[(id, self._corpus._dict.doc2bow(doc_tokens))
                                      for id, doc_tokens in self._corpus._corpus])
 
-        doc_topic_dists = list(self._model[bow_corpus])
+        doc_topic = list(self._model[bow_corpus])
 
-        for i, doc in enumerate(doc_topic_dists):
+        for i, doc in enumerate(doc_topic):
             for j, topic in enumerate(doc):
-                doc_topic_dists[i][j] = doc_topic_dists[i][j][1]
-
-        doc_topic_dists_df = pd.DataFrame(doc_topic_dists, index=id_index)
-        doc_topic_dists_df.columns = ['topic'+str(i)+'dist' for i in range(
-            doc_topic_dists_df.shape[1])]
-        return doc_topic_dists_df
+                doc_topic[i][j] = doc_topic[i][j][1]
 
-    def _get_doc_lengths(self):
-        id_index, doc_lengths = zip(*[(id, len(doc)) for id, doc in list(
-            self._corpus._corpus)])
-        return pd.Series(doc_lengths, index=id_index)
+        doc_topic_df = pd.DataFrame(doc_topic, index=id_index)
+        doc_topic_df.columns = ['topic'+str(i)+'dist' for i in range(
+            doc_topic_df.shape[1])]
+        doc_topic_df.index.name = 'doc_id'
+        return doc_topic_df
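The net effect of the lda.py rewrite is easier to see outside the diff. A minimal standalone sketch (not topik code; doc_topics_to_frame and the sample values are invented for illustration) of what the new _get_doc_topic_dists does: gensim returns one list of (topic_id, weight) pairs per document, the loop strips the ids, and the result becomes a dense doc-by-topic DataFrame. Like the method above, it assumes every document reports a weight for every topic, so positional indexing lines up.

import pandas as pd

def doc_topics_to_frame(doc_topic_pairs, doc_ids):
    # One [(topic_id, weight), ...] list per document, all topics present,
    # in topic-id order, mirroring the doc_topic list built above.
    rows = [[weight for _topic_id, weight in doc] for doc in doc_topic_pairs]
    df = pd.DataFrame(rows, index=doc_ids)
    df.columns = ['topic' + str(i) + 'dist' for i in range(df.shape[1])]
    df.index.name = 'doc_id'
    return df

# Two documents, two topics:
frame = doc_topics_to_frame([[(0, 0.9), (1, 0.1)], [(0, 0.25), (1, 0.75)]],
                            doc_ids=['doc_a', 'doc_b'])
# frame now has columns topic0dist and topic1dist, one row per doc_id.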
43 changes: 26 additions & 17 deletions topik/models/model_base.py
@@ -1,4 +1,5 @@
 from abc import ABCMeta, abstractmethod
+from collections import Counter
 import logging
 
 import pandas as pd
@@ -77,41 +78,49 @@ def get_model_name_with_parameters(self):
         """Abstract method. Primarily internal function, used to name configurations in persisted metadata for later retrieval."""
         raise NotImplementedError
 
-    @abstractmethod
     def _get_term_data(self):
-        raise NotImplementedError
+        vocab = self._get_vocab()
+        tf = self._get_term_frequency()
+        ttd = self._get_topic_term_dists()
+        term_data_df = ttd
+        term_data_df['term_frequency'] = tf
+        term_data_df['term'] = vocab
+        return term_data_df
 
-    @abstractmethod
     def _get_vocab(self):
-        raise NotImplementedError
+        return pd.Series(dict(self._corpus._dict.items()))
 
-    @abstractmethod
     def _get_term_frequency(self):
-        raise NotImplementedError
-
-    @abstractmethod
-    def _get_topic_term_dists(self):
-        raise NotImplementedError
+        tf = Counter()
+        [tf.update(dict(doc)) for doc in self._corpus]
+        # TODO update term documents in intermediate store
+        return pd.Series(dict(tf))
 
-    @abstractmethod
     def _get_doc_data(self):
-        raise NotImplementedError
+        doc_data_df = self._get_doc_topic_dists()
+        doc_data_df['doc_length'] = self._get_doc_lengths()
+        return doc_data_df
+
+    def _get_doc_lengths(self):
+        id_index, doc_lengths = zip(*[(id, len(doc)) for id, doc in list(
+            self._corpus._corpus)])
+        return pd.Series(doc_lengths, index=id_index)
 
     @abstractmethod
-    def _get_doc_topic_dists(self):
+    def _get_topic_term_dists(self):
         raise NotImplementedError
 
     @abstractmethod
-    def _get_doc_lengths(self):
+    def _get_doc_topic_dists(self):
         raise NotImplementedError
 
     def to_py_lda_vis(self):
         doc_data_df = self._get_doc_data()
         term_data_df = self._get_term_data()
 
-        model_lda_vis_data = { 'vocab': term_data_df['token'],
-                               'term_frequency': term_data_df['doc_count'],
-                               'topic_term_dists': term_data_df.iloc[:,2:].T,
+        model_lda_vis_data = { 'vocab': term_data_df['term'],
+                               'term_frequency': term_data_df['term_frequency'],
+                               'topic_term_dists': term_data_df.iloc[:,:-2].T,
                                'doc_topic_dists': doc_data_df.iloc[:,:-1],
                                'doc_lengths': doc_data_df['doc_length']}
         return model_lda_vis_data
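The dict keys returned here line up with the keyword parameters of pyLDAvis.prepare, which is presumably the intended consumer. A minimal usage sketch, assuming the pyLDAvis package is installed and that model is a trained topik model instance:

import pyLDAvis

vis_data = model.to_py_lda_vis()
# The keys 'topic_term_dists', 'doc_topic_dists', 'doc_lengths', 'vocab'
# and 'term_frequency' match pyLDAvis.prepare's parameter names, so the
# dict can be unpacked straight into the call.
prepared = pyLDAvis.prepare(**vis_data)
pyLDAvis.save_html(prepared, 'ldavis.html')  # write a shareable HTML page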
28 changes: 11 additions & 17 deletions topik/models/plsa.py
@@ -6,6 +6,7 @@
 import random
 
 import numpy as np
+import pandas as pd
 
 from .model_base import TopicModelBase, register_model
 from topik.intermediaries.raw_data import load_persisted_corpus
@@ -197,24 +198,17 @@ def get_top_words(self, topn):
             top_words.append([(topic[word_id], self._corpus.get_id2word_dict()[word_id]) for word_id in word_ids])
         return top_words
 
-    def _get_doc_data(self):
-        raise NotImplementedError
-
-    def _get_term_data(self):
-        raise NotImplementedError
-
-    def _get_vocab(self):
-        raise NotImplementedError
-
-    def _get_term_frequency(self):
-        raise NotImplementedError
-
     def _get_topic_term_dists(self):
-        raise NotImplementedError
+        term_topic_df = pd.DataFrame(self.zw,
+                                     index=['topic'+str(t)+'dist' for t in range(self.topics)]).T
 
-    def _get_doc_topic_dists(self):
-        raise NotImplementedError
+        term_topic_df.index.name = 'term_id'
+        return term_topic_df
 
-    def _get_doc_lengths(self):
-        raise NotImplementedError
+    def _get_doc_topic_dists(self):
+        doc_topic_df = pd.DataFrame(self.dz,
+                                    index=[doc[0] for doc in self._corpus._corpus],
+                                    columns=['topic'+str(t)+'dist' for t in range(self.topics)])
+
+        doc_topic_df.index.name = 'doc_id'
+        return doc_topic_df
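The PLSA additions are mostly a shape contract: self.zw is a topics-by-terms matrix (term weights per topic) and self.dz is a docs-by-topics matrix, which is why zw gets transposed into a term-indexed frame while dz maps directly onto one row per document. An illustrative sketch with made-up dimensions (the variable names mirror the diff; nothing here is topik code):

import numpy as np
import pandas as pd

n_topics, n_terms, n_docs = 2, 4, 3
zw = np.random.dirichlet(np.ones(n_terms), size=n_topics)  # topics x terms
dz = np.random.dirichlet(np.ones(n_topics), size=n_docs)   # docs x topics

cols = ['topic' + str(t) + 'dist' for t in range(n_topics)]
term_topic = pd.DataFrame(zw, index=cols).T  # transpose: rows are terms
term_topic.index.name = 'term_id'
doc_topic = pd.DataFrame(dz, columns=cols)   # one row per document
doc_topic.index.name = 'doc_id'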
27 changes: 6 additions & 21 deletions topik/run.py
@@ -10,8 +10,7 @@
 
 from topik.readers import read_input
 import topik.models
-from topik.viz import Termite
-from topik.utils import to_r_ldavis, generate_csv_output_file
+from topik.viz import plot_lda_vis, Termite
 
 
 logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
@@ -22,7 +21,7 @@
 
 def run_model(data_source, source_type="auto", year_field=None, start_year=None, stop_year=None,
               content_field=None, tokenizer='simple', n_topics=10, dir_path='./topic_model', model='LDA',
-              termite_plot=True, output_file=False, r_ldavis=False, seed=42, **kwargs):
+              termite_plot=True, output_file=False, ldavis=False, seed=42, **kwargs):
 
     """Run your data through all topik functionality and save all results to a specified directory.
@@ -54,7 +53,7 @@ def run_model(data_source, source_type="auto", year_field=None, start_year=None,
         Generate termite plot of your model if True. Default is True.
     output_file : bool
         Generate a final summary csv file of your results. For each document: text, tokens, lda_probabilities and topic.
-    r_ldavis : bool
+    ldavis : bool
         Generate an interactive data visualization of your topics. Default is False.
     seed : int
         Set random number generator to seed, to be able to reproduce results. Default 42.
@@ -74,21 +73,7 @@ def run_model(data_source, source_type="auto", year_field=None, start_year=None,
         termite = Termite(model.termite_data(n_topics), "Termite Plot")
         termite.plot(os.path.join(dir_path, 'termite.html'))
 
-    if output_file:
-        filtered_documents = raw_data.get_data_by_year(start_year, stop_year, year_field)
-        generate_csv_output_file(filtered_documents, raw_data, processed_data, lda.model)
-
-    if r_ldavis:
-        to_r_ldavis(processed_data, dir_name=os.path.join(dir_path, 'ldavis'), lda=lda)
-        os.environ["LDAVIS_DIR"] = os.path.join(dir_path, 'ldavis')
-        try:
-            subprocess.call(['Rscript', os.path.join(BASEDIR, 'R/runLDAvis.R')])
-        except ValueError:
-            logging.warning("Unable to run runLDAvis.R")
-        os.chdir(os.path.join(dir_path, 'ldavis', 'output'))
-        sp = subprocess.Popen(['python', '-m', 'SimpleHTTPServer', '8000'])
-        webbrowser.open_new_tab('127.0.0.1:8000')
-        time.sleep(3)
-        sp.kill()
-        os.chdir(os.path.dirname(BASEDIR))
+    if ldavis:
+        plot_lda_vis(model.to_py_lda_vis())
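With the R pipeline gone, the visualization toggle reduces to a single call into topik.viz. A hypothetical invocation of the updated entry point (the data path and content_field value are placeholders, not from this commit):

from topik.run import run_model

# Train an LDA model and open the pyLDAvis-based view, replacing the old
# Rscript + SimpleHTTPServer round-trip.
run_model('./reviews.json', content_field='text', n_topics=10,
          model='LDA', termite_plot=False, ldavis=True)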


91 changes: 0 additions & 91 deletions topik/utils.py

This file was deleted.
