This repository has been archived by the owner on Jun 14, 2018. It is now read-only.

Commit

fix cli example
msarahan committed Oct 8, 2015
1 parent 0521bfe commit d43e616
Showing 6 changed files with 47 additions and 125 deletions.
30 changes: 28 additions & 2 deletions docs/example.rst
@@ -39,10 +39,36 @@ For quick, one-off studies, the command line interface allows you to specify
minimal information and obtain topic model plot output. For all available
options, please run ``topik --help``

.. code-block:: bash

   $ topik --help
   Usage: topik [OPTIONS]

     Run topic modeling

   Options:
     -d, --data TEXT        Path to input data for topic modeling  [required]
     -f, --format TEXT      Data format provided: json_stream, folder_files,
                            large_json, elastic, solr
     -m, --model TEXT       Statistical topic model: lda_batch, lda_online
     -o, --output TEXT      Topic modeling output path
     -t, --tokenizer TEXT   Tokenize method to use: simple, collocations,
                            entities, mix
     -n, --ntopics INTEGER  Number of topics to find
     --prefix_value TEXT    In 'large json' files, the prefix_value to extract
                            text from
     --event_value TEXT     In 'large json' files, the event_value to extract
                            text from
     --field TEXT           In 'json stream' files, the field to extract text
                            from
     --help                 Show this message and exit.

To run this on our movie reviews data set:

.. code-block:: shell

   $ topik --help
   $ topik reviews
   $ topik -d reviews

The shell command is a front end to :func:`~.run_model`, which is also
accessible in Python:
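As a rough sketch of that correspondence, each CLI flag maps onto a keyword argument of ``run_model``. The stub below is illustrative only: it mirrors the signature shown in the ``topik/run.py`` hunk of this commit, but stands in for the real function so the mapping can be shown without importing topik.

```python
# Illustrative stand-in for topik.run.run_model, echoing the keyword
# arguments it would receive. The signature follows the run_model
# definition in topik/run.py from this commit; it is a sketch, not
# the project's actual implementation.
def run_model(data_source, source_type="auto", content_field="text",
              n_topics=10, dir_path="./topic_model", model="LDA", **kwargs):
    return {"data_source": data_source, "source_type": source_type,
            "content_field": content_field, "n_topics": n_topics,
            "dir_path": dir_path, "model": model}

# `topik -d reviews` on the command line corresponds roughly to:
result = run_model(data_source="reviews")
```

Every other flag falls back to its default, just as omitted CLI options do.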
54 changes: 0 additions & 54 deletions docs/index.rst
@@ -50,60 +50,6 @@ features into an easy callable function and a command line interface.
- Be an easy and beginner-friendly module to contribute to.


Getting Started
===============

To demonstrate the ease of a typical `topik` workflow, we'll provide two examples: using the command line
interface and using the method :func:`topik.run.run_model`.

- Using the command line interface

To get help you can always type ``topik --help``.

.. code-block:: bash

   $ topik --help
   Usage: topik [OPTIONS]

     Run topic modeling

   Options:
     -d, --data TEXT        Path to input data for topic modeling  [required]
     -f, --format TEXT      Data format provided: json_stream, folder_files,
                            large_json  [required]
     -m, --model TEXT       Statistical topic model: lda_batch, lda_online
     -o, --output TEXT      Topic modeling output path
     -t, --tokenizer TEXT   Tokenize method to use: simple, collocations,
                            entities, mix
     -n, --ntopics INTEGER  Number of topics to find
     --prefix_value TEXT    In 'large json' files, the prefix_value to extract
                            text from
     --event_value TEXT     In 'large json' files, the event_value to extract
                            text from
     --field TEXT           In 'json stream' files, the field to extract text
                            from
     --help                 Show this message and exit.

The following example runs the default model LDA(batch) over a json stream, extracting the field `text` with simple
word tokenization.

.. code-block:: bash

   $ topik -d ./topik/tests/data/test-data-1.json -f json_stream -o ./test -n 3 --field text -t entities

- Using :func:`topik.run.run_model`

The same previous example using :func:`run_model` would be:

.. code-block:: python

   >>> from topik.run import run_topic_model
   >>> run_topic_model(data='./topik/tests/data/test-data-1.json', format='json_stream', n_topics=3,
   ...                 field='text', dir_path='./topic_model')

Contents
========

17 changes: 0 additions & 17 deletions docs/installation.rst
@@ -24,20 +24,3 @@ There is also the option of just installing the Python features with pip.

The pip installation option will not provide all the available features, e.g. the LDAvis R package will not be
available.



Requirements
============

`Topik`'s requirements are:

* gensim
* pattern
* textblob
* nltk
* pandas
* blaze
* bokeh
* numpy
* into
23 changes: 10 additions & 13 deletions topik/cli.py
@@ -5,20 +5,17 @@

@click.command(help='Run topic modeling')
@click.option("-d", "--data", required=True, help="Path to input data for topic modeling")
@click.option("-f", "--format", required=True, help="Data format provided: "
"json_stream, folder_files, large_json, solr")
@click.option("-m", "--model", help="Statistical topic model: lda_batch, lda_online", default="lda_batch")
@click.option("-f", "--format", default="auto", help="Data format provided: "
"json_stream, folder_files, large_json, solr, elastic")
@click.option("-m", "--model", help="Statistical topic model: lda, plsa", default="LDA")
@click.option("-o", "--output", help="Topic modeling output path", default="./topic_model")
@click.option("-t", "--tokenizer", help="Tokenize method to use: "
"simple, collocations, entities, mix", default='simple')
@click.option("-n", "--ntopics", help="Number of topics to find", default=10)
@click.option("--prefix_value", help="In 'large json' format, the prefix_value to extract text from", default=None)
@click.option("--event_value", help="In 'large json' format the event_value to extract text from", default=None)
@click.option("--field", help="In 'json stream' and 'solr' formats, the field to extract text from", default=None)
@click.option("--query", help="In 'solr' format, an optional solr query", default='*:*')
@click.option("--index", help="In 'elastic' format, the index to use", default=None)
@click.option("--subfield", help="In 'elastic' format, if the content is in a nested structure, the subfield name", default=None)
def run(data, format, output, tokenizer, ntopics, prefix_value, event_value, field, model, query, index, subfield):
run_model(data=data, format=format, dir_path=output, tokenizer=tokenizer,n_topics=ntopics,
prefix_value=prefix_value, event_value=event_value, field=field, model=model,
query=query, index=index, subfield=subfield)
@click.option("--field", help="the field to extract text from, or for folders, the field to store text as",
default="text")
@click.option("--termite", help="Whether to output a termite plot as a result", default=True)
@click.option("--ldavis", help="Whether to output an LDAvis-type plot as a result", default=False)
def run(data, format, output, tokenizer, ntopics, field, model, termite, ldavis):
run_model(data_source=data, source_type=format, dir_path=output, tokenizer=tokenizer, n_topics=ntopics,
content_field=field, model=model, r_ldavis=ldavis, termite_plot=termite)
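The ``click`` decorators above make the CLI a thin wrapper: each option becomes a parameter of ``run``, which forwards everything to ``run_model``. An analogous wrapper written against the standard library's ``argparse`` (an illustrative sketch, not topik's actual code) shows the same flag-to-keyword mapping:

```python
# Minimal argparse analogue of topik/cli.py's click wrapper (a sketch,
# not the project's real code): parse the flags, then forward them as
# keyword arguments to a run_model-style function.
import argparse

def build_parser():
    p = argparse.ArgumentParser(description="Run topic modeling")
    p.add_argument("-d", "--data", required=True,
                   help="Path to input data for topic modeling")
    p.add_argument("-f", "--format", default="auto",
                   help="Data format: json_stream, folder_files, "
                        "large_json, solr, elastic")
    p.add_argument("-m", "--model", default="LDA",
                   help="Statistical topic model: lda, plsa")
    p.add_argument("-n", "--ntopics", type=int, default=10,
                   help="Number of topics to find")
    p.add_argument("--field", default="text",
                   help="Field to extract text from")
    return p

# Equivalent of running: topik -d reviews -n 3
args = build_parser().parse_args(["-d", "reviews", "-n", "3"])
```

Unspecified options keep their defaults, matching the ``default=`` values in the click decorators.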
47 changes: 9 additions & 38 deletions topik/run.py
Expand Up @@ -9,7 +9,7 @@
import numpy as np

from topik.readers import read_input
from topik.models import LDA
import topik.models
from topik.viz import Termite
from topik.utils import to_r_ldavis, generate_csv_output_file

@@ -21,7 +21,7 @@


def run_model(data_source, source_type="auto", year_field=None, start_year=None, stop_year=None,
content_field=None, tokenizer='simple', n_topics=10, dir_path='./topic_model', model='lda_batch',
content_field=None, tokenizer='simple', n_topics=10, dir_path='./topic_model', model='LDA',
termite_plot=True, output_file=False, r_ldavis=False, seed=42, **kwargs):

"""Run your data through all topik functionality and save all results to a specified directory.
Expand All @@ -34,73 +34,44 @@ def run_model(data_source, source_type="auto", year_field=None, start_year=None,
source_type : {'json_stream', 'folder_files', 'json_large', 'solr', 'elastic'}.
The format of your data input. Currently available: a json stream or a folder containing text files.
Default is 'json_stream'.
year_field : str
The field name (if any) that contains the year associated with each document (for filtering).
start_year : int
For beginning of range filter on year_field values
stop_year : int
For end of range filter on year_field values
content_field : string
The primary text field to parse.
clear_es_index : bool
On true, delete and re-create destination elasticsearch index prior to loading in new documents. Otherwise leave any previously
existing documents and just add/update with the new documents.
tokenizer : {'simple', 'collocations', 'entities', 'mixed'}
The type of tokenizer to use. Default is 'simple'.
n_topics : int
Number of topics to find in your data
dir_path : str
Directory path to store all topic modeling results files. Default is `./topic_model`.
model : {'lda_batch', 'lda_online'}.
Statistical modeling algorithm to use. Default 'lda_batch'.
model : {'LDA', 'PLSA'}.
Statistical modeling algorithm to use. Default 'LDA'.
termite_plot : bool
Generate termite plot of your model if True. Default is True.
output_file : bool
Generate a final summary csv file of your results. For each document: text, tokens, lda_probabilities and topic.
r_ldavis : bool
Generate an interactive data visualization of your topics. Default is False.
json_prefix : str
For 'large json' format reader, the prefix value to parse.
seed : int
Set random number generator to seed, to be able to reproduce results. Default 42.
**kwargs : additional keyword arguments, passed through to each individual step
"""

np.random.seed(seed)

raw_data = read_input(data_source, content_field=content_field,
source_type=source_type, **kwargs)
processed_data = raw_data.tokenize(method=tokenizer, **kwargs)
model = topik.models.registered_models[model](processed_data, n_topics, **kwargs)
if not os.path.exists(dir_path):
os.mkdir(dir_path)

# Serialize and store the corpus
# Create LDA model from corpus and dictionary
if model == 'lda_batch':
# To perform lda in batch mode set update_every=0 and passes=20)
# https://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation
lda = LDA(processed_data, n_topics, update_every=0, passes=20)
elif model == 'lda_online':
# To perform lda in online mode set variables update_every, chunksize and passes.
lda = LDA(processed_data, n_topics, update_every=1,
chunksize=10000, passes=1)
else:
logging.warning('model provided not valid. Using lda_batch.')
lda = LDA(processed_data, n_topics, update_every=0, passes=20)
# Get termite plot for this model
if termite_plot:
termite = Termite(lda.termite_data(n_topics), "Termite Plot")
termite = Termite(model.termite_data(n_topics), "Termite Plot")
termite.plot(os.path.join(dir_path, 'termite.html'))

if output_file:
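The key change in ``run.py`` replaces the hard-coded ``from topik.models import LDA`` (and the ``lda_batch``/``lda_online`` branching) with a lookup in ``topik.models.registered_models``. A self-contained sketch of that registry pattern (class bodies here are hypothetical placeholders, not topik's real models) looks like:

```python
# Sketch of the registry pattern behind topik.models.registered_models:
# model classes register themselves in a dict keyed by name, and run_model
# dispatches by looking the class up instead of importing it directly.
# The classes below are illustrative stand-ins, not topik's implementation.
registered_models = {}

def register(cls):
    """Class decorator that records the class under its own name."""
    registered_models[cls.__name__] = cls
    return cls

@register
class LDA:
    def __init__(self, corpus, n_topics, **kwargs):
        self.corpus, self.n_topics = corpus, n_topics

@register
class PLSA:
    def __init__(self, corpus, n_topics, **kwargs):
        self.corpus, self.n_topics = corpus, n_topics

# run_model's dispatch then reduces to a dict lookup plus a call:
model = registered_models["LDA"](corpus=["doc one", "doc two"], n_topics=3)
```

The payoff is that adding a new model means registering one class, with no edits to ``run_model`` itself.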
1 change: 0 additions & 1 deletion topik/viz.py
@@ -59,5 +59,4 @@ def plot(self, output_file="termite.html"):
title=self.title)

p.circle(x="topic", y="word", size="size", fill_alpha=0.6, source=data_source)
logging.info("generating termite plot for file %s" % self.input_file)
plt.show(p)
