Skip to content

Commit

Permalink
add multithreading hint, Fix #62
Browse files Browse the repository at this point in the history
  • Loading branch information
ThoraHagen committed Jul 24, 2018
1 parent e0a3112 commit d17658f
Showing 1 changed file with 81 additions and 28 deletions.
109 changes: 81 additions & 28 deletions notebooks/IntroducingMallet.ipynb
Expand Up @@ -73,7 +73,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from cophi_toolbox import preprocessing\n",
Expand All @@ -92,7 +94,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import metadata_toolbox.utils as metadata\n",
Expand All @@ -110,7 +114,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import warnings\n",
Expand Down Expand Up @@ -143,7 +149,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"path_to_corpus = Path('data', 'grenzboten_sample')"
Expand Down Expand Up @@ -173,7 +181,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pattern = '{author}_{year}_{title}'"
Expand All @@ -190,7 +200,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"meta = pd.concat([metadata.fname2metadata(str(path), pattern=pattern) for path in path_to_corpus.glob('*.txt')])\n",
Expand All @@ -207,7 +219,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"corpus = list(preprocessing.read_files(meta.index))\n",
Expand Down Expand Up @@ -239,7 +253,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tokenized_corpus = [list(preprocessing.tokenize(document)) for document in corpus]"
Expand All @@ -255,7 +271,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tokenized_corpus[0][0:13]"
Expand All @@ -282,7 +300,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"document_term_matrix, document_ids, type_ids = preprocessing.create_document_term_matrix(tokenized_corpus,\n",
Expand All @@ -302,7 +322,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"document_term_matrix = preprocessing.create_document_term_matrix(tokenized_corpus, meta['title'])\n",
Expand Down Expand Up @@ -336,7 +358,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"stopwords = preprocessing.list_mfw(document_term_matrix, most_frequent_tokens=100)"
Expand All @@ -352,7 +376,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"stopwords[:5]"
Expand All @@ -368,7 +394,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"hapax_legomena = preprocessing.find_hapax_legomena(document_term_matrix)\n",
Expand All @@ -386,7 +414,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"path_to_stopwordlist = Path('data', 'stopwords', 'de.txt')\n",
Expand All @@ -403,7 +433,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"features = stopwords + hapax_legomena + external_stopwords\n",
Expand Down Expand Up @@ -431,7 +463,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"path_to_mallet = 'mallet'"
Expand All @@ -449,7 +483,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"Mallet = utils.Mallet(path_to_mallet)"
Expand All @@ -465,7 +501,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"mallet_corpus = Mallet.import_tokenized_corpus(clean_tokenized_corpus, meta['title'])"
Expand Down Expand Up @@ -527,7 +565,7 @@
"\n",
"So, now you know how to define the number of topics and the number of sampling iterations as well. A higher number of iterations will probably yield a better model, but also increases processing time. `alpha` and `beta` are so-called *hyperparameters*. They influence the model's performance, so feel free to play around with them. In the present example, we will leave the default values. Furthermore, there exist various methods for hyperparameter optimization, e.g. grid search or Gaussian optimization.\n",
"\n",
"**Warning: This step can take quite a while!** Meaning something between some seconds and some hours depending on corpus size and the number of iterations. Our example corpus should be done within a minute or two at `num_iterations=1000`."
"**Warning: This step can take quite a while!** This can mean anything from a few seconds to several hours, depending on corpus size and the number of iterations. Our example corpus should be done within a minute or two at `num_iterations=1000`. You can, however, increase the number of threads (`num_threads`, defaults to 1) for parallel training."
]
},
{
Expand All @@ -540,7 +578,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"output = Path('data', 'mallet_output')\n",
Expand All @@ -552,7 +592,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%%time\n",
Expand All @@ -561,6 +603,7 @@
" output_topic_keys=str(Path(output, 'topic_keys.txt')),\n",
" output_doc_topics=str(Path(output, 'doc_topics.txt')),\n",
" num_topics=10,\n",
" num_threads=1,\n",
" num_iterations=1000)"
]
},
Expand All @@ -583,7 +626,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"topics = postprocessing.show_topics(topic_keys_file=str(Path(output, 'topic_keys.txt')))\n",
Expand All @@ -609,7 +654,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"document_topics = postprocessing.show_document_topics(topics=topics,\n",
Expand All @@ -636,7 +683,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from bokeh.io import output_notebook, show\n",
Expand All @@ -647,7 +696,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"PlotDocumentTopics = visualization.PlotDocumentTopics(document_topics)\n",
Expand All @@ -664,7 +715,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"static_heatmap = PlotDocumentTopics.static_heatmap()\n",
Expand All @@ -691,7 +744,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.5.1"
}
},
"nbformat": 4,
Expand Down

0 comments on commit d17658f

Please sign in to comment.