diff --git a/IntroducingLda.ipynb b/IntroducingLda.ipynb index 81a6e17..db17531 100755 --- a/IntroducingLda.ipynb +++ b/IntroducingLda.ipynb @@ -91,9 +91,8 @@ "outputs": [], "source": [ "from dariah_topics import preprocessing\n", - "from dariah_topics import doclist\n", "from dariah_topics import meta\n", - "from dariah_topics import mallet\n", + "#from dariah_topics import postprocessing\n", "from dariah_topics import visualization" ] }, @@ -110,7 +109,6 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", "from bokeh.io import show\n", "import lda" ] @@ -136,23 +134,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 1. Preprocessing" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1.2. Reading a corpus of documents" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Defining the path to the corpus folder\n", - "\n", - "In the present example code, we are using a folder of 'txt' documents provided with the package. For using your own corpus, change the path accordingly." + "The following line will just tell the notebook to show graphics in the output frames." ] }, { @@ -161,173 +143,62 @@ "metadata": {}, "outputs": [], "source": [ - "path = \"grenzboten_sample\"" + "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### List all documents in the folder\n", - "We begin by creating a list of all the documents in the folder specified above. That list will tell function `pre.read_from_txt()` (see below) which text documents to read." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "pathdoclist = doclist.PathDocList(path)\n", - "document_list = pathdoclist.full_paths(as_str=True)" + "## 1. 
Preprocessing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The current list of documents looks like this:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['grenzboten_sample/Grenzboten_1844_Tagebuch_56.txt',\n", - " 'grenzboten_sample/Grenzboten_1846_Tagebuch_82.txt',\n", - " 'grenzboten_sample/Grenzboten_1916_Kriegstagebuch_69.txt',\n", - " 'grenzboten_sample/Grenzboten_1915_Kriegstagebuch_73.txt',\n", - " 'grenzboten_sample/Grenzboten_1914_Kriegstagebuch_95.txt',\n", - " 'grenzboten_sample/Grenzboten_1915_Kriegstagebuch_33.txt',\n", - " 'grenzboten_sample/Grenzboten_1914_Kriegstagebuch_68.txt',\n", - " 'grenzboten_sample/Grenzboten_1846_Tagebuch_51.txt',\n", - " 'grenzboten_sample/Grenzboten_1845_Tagebuch_81.txt',\n", - " 'grenzboten_sample/Grenzboten_1844_Tagebuch_82.txt',\n", - " 'grenzboten_sample/Grenzboten_1916_Kriegstagebuch_48.txt',\n", - " 'grenzboten_sample/Grenzboten_1915_Kriegstagebuch_94.txt',\n", - " 'grenzboten_sample/Grenzboten_1915_Kriegstagebuch_39.txt',\n", - " 'grenzboten_sample/Grenzboten_1845_Tagebuch_85.txt',\n", - " 'grenzboten_sample/Grenzboten_1846_Tagebuch_96.txt',\n", - " 'grenzboten_sample/Grenzboten_1845_Tagebuch_93.txt',\n", - " 'grenzboten_sample/Grenzboten_1916_Kriegstagebuch_81.txt',\n", - " 'grenzboten_sample/Grenzboten_1845_Tagebuch_62.txt',\n", - " 'grenzboten_sample/Grenzboten_1844_Tagebuch_77.txt',\n", - " 'grenzboten_sample/Grenzboten_1914_Kriegstagebuch_97.txt',\n", - " 'grenzboten_sample/Grenzboten_1916_Kriegstagebuch_41.txt',\n", - " 'grenzboten_sample/Grenzboten_1916_Kriegstagebuch_49.txt',\n", - " 'grenzboten_sample/Grenzboten_1844_Tagebuch_70.txt',\n", - " 'grenzboten_sample/Grenzboten_1914_Kriegstagebuch_37.txt',\n", - " 'grenzboten_sample/Grenzboten_1844_Tagebuch_88.txt',\n", - " 'grenzboten_sample/Grenzboten_1845_Tagebuch_52.txt',\n", - " 'grenzboten_sample/Grenzboten_1915_Kriegstagebuch_99.txt',\n", - " 
'grenzboten_sample/Grenzboten_1914_Kriegstagebuch_94.txt',\n", - " 'grenzboten_sample/Grenzboten_1846_Tagebuch_88.txt',\n", - " 'grenzboten_sample/Grenzboten_1846_Tagebuch_72.txt']" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "document_list" + "### 1.2. Reading a corpus of documents" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Alternatively**, if we want to use other documents, or just a selction of those in the specified folder, we can define our own `doclist` by creating a list of strings containing paths to text files. For example, to use only the texts from 1916, we would define the list as\n", + "#### Defining the path to the corpus folder\n", "\n", - "`\n", - " doclist = ['grenzboten_sample/grenzboten_1916_Kriegstagebuch_41.txt',\n", - " 'grenzboten_sample/grenzboten_1916_Kriegstagebuch_48.txt',\n", - " 'grenzboten_sample/grenzboten_1916_Kriegstagebuch_49.txt',\n", - " 'grenzboten_sample/grenzboten_1916_Kriegstagebuch_69.txt',\n", - " 'grenzboten_sample/grenzboten_1916_Kriegstagebuch_81.txt']\n", - "`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Generate document labels" + "In the present example code, we are using the 30 diary excerpts from the folder `grenzboten_sample`. To use your own corpus, change the path accordingly." 
] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Grenzboten_1844_Tagebuch_56',\n", - " 'Grenzboten_1846_Tagebuch_82',\n", - " 'Grenzboten_1916_Kriegstagebuch_69',\n", - " 'Grenzboten_1915_Kriegstagebuch_73',\n", - " 'Grenzboten_1914_Kriegstagebuch_95',\n", - " 'Grenzboten_1915_Kriegstagebuch_33',\n", - " 'Grenzboten_1914_Kriegstagebuch_68',\n", - " 'Grenzboten_1846_Tagebuch_51',\n", - " 'Grenzboten_1845_Tagebuch_81',\n", - " 'Grenzboten_1844_Tagebuch_82',\n", - " 'Grenzboten_1916_Kriegstagebuch_48',\n", - " 'Grenzboten_1915_Kriegstagebuch_94',\n", - " 'Grenzboten_1915_Kriegstagebuch_39',\n", - " 'Grenzboten_1845_Tagebuch_85',\n", - " 'Grenzboten_1846_Tagebuch_96',\n", - " 'Grenzboten_1845_Tagebuch_93',\n", - " 'Grenzboten_1916_Kriegstagebuch_81',\n", - " 'Grenzboten_1845_Tagebuch_62',\n", - " 'Grenzboten_1844_Tagebuch_77',\n", - " 'Grenzboten_1914_Kriegstagebuch_97',\n", - " 'Grenzboten_1916_Kriegstagebuch_41',\n", - " 'Grenzboten_1916_Kriegstagebuch_49',\n", - " 'Grenzboten_1844_Tagebuch_70',\n", - " 'Grenzboten_1914_Kriegstagebuch_37',\n", - " 'Grenzboten_1844_Tagebuch_88',\n", - " 'Grenzboten_1845_Tagebuch_52',\n", - " 'Grenzboten_1915_Kriegstagebuch_99',\n", - " 'Grenzboten_1914_Kriegstagebuch_94',\n", - " 'Grenzboten_1846_Tagebuch_88',\n", - " 'Grenzboten_1846_Tagebuch_72']" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "document_labels = pathdoclist.labels()\n", - "document_labels" + "path_to_corpus = 'grenzboten_sample/*.txt'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Optional: Accessing metadata" + "By adding `/*.txt` to the actual path, we make sure to select only files with the suffix `.txt`." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In case you want a more structured overview of your corpus, execute the following cell:" + "#### Accessing file paths and metadata\n", + "We begin by creating a list of all the documents in the folder specified above. That list will tell the function `preprocessing.read_from_pathlist` (see below) which text documents to read. Furthermore, based on filenames we can create some metadata, e.g. author and title." ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, + "execution_count": 6, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -359,292 +230,67 @@ " \n", " \n", " 0\n", - " Grenzboten\n", - " Grenzboten_1844_Tagebuch_56\n", - " grenzboten_sample/Grenzboten_1844_Tagebuch_56.txt\n", + " Beck\n", + " Beck_1844_Tagebuch_56\n", + " grenzboten_sample/Beck_1844_Tagebuch_56.txt\n", " 1844_Tagebuch_56\n", " \n", " \n", " 1\n", - " Grenzboten\n", - " Grenzboten_1846_Tagebuch_82\n", - " grenzboten_sample/Grenzboten_1846_Tagebuch_82.txt\n", - " 1846_Tagebuch_82\n", + " Unbekannt\n", + " Unbekannt_1844_Tagebuch_70\n", + " grenzboten_sample/Unbekannt_1844_Tagebuch_70.txt\n", + " 1844_Tagebuch_70\n", " \n", " \n", " 2\n", - " Grenzboten\n", - " Grenzboten_1916_Kriegstagebuch_69\n", - " grenzboten_sample/Grenzboten_1916_Kriegstagebu...\n", - " 1916_Kriegstagebuch_69\n", + " Nimmer\n", + " Nimmer_1844_Tagebuch_77\n", + " grenzboten_sample/Nimmer_1844_Tagebuch_77.txt\n", + " 1844_Tagebuch_77\n", " \n", " \n", " 3\n", - " Grenzboten\n", - " Grenzboten_1915_Kriegstagebuch_73\n", - " grenzboten_sample/Grenzboten_1915_Kriegstagebu...\n", - " 1915_Kriegstagebuch_73\n", - " \n", - " \n", - " 4\n", - " Grenzboten\n", - " Grenzboten_1914_Kriegstagebuch_95\n", - " grenzboten_sample/Grenzboten_1914_Kriegstagebu...\n", - " 1914_Kriegstagebuch_95\n", - " \n", - " \n", - " 5\n", - " Grenzboten\n", - " Grenzboten_1915_Kriegstagebuch_33\n", - " 
grenzboten_sample/Grenzboten_1915_Kriegstagebu...\n", - " 1915_Kriegstagebuch_33\n", - " \n", - " \n", - " 6\n", - " Grenzboten\n", - " Grenzboten_1914_Kriegstagebuch_68\n", - " grenzboten_sample/Grenzboten_1914_Kriegstagebu...\n", - " 1914_Kriegstagebuch_68\n", - " \n", - " \n", - " 7\n", - " Grenzboten\n", - " Grenzboten_1846_Tagebuch_51\n", - " grenzboten_sample/Grenzboten_1846_Tagebuch_51.txt\n", - " 1846_Tagebuch_51\n", - " \n", - " \n", - " 8\n", - " Grenzboten\n", - " Grenzboten_1845_Tagebuch_81\n", - " grenzboten_sample/Grenzboten_1845_Tagebuch_81.txt\n", - " 1845_Tagebuch_81\n", - " \n", - " \n", - " 9\n", - " Grenzboten\n", - " Grenzboten_1844_Tagebuch_82\n", - " grenzboten_sample/Grenzboten_1844_Tagebuch_82.txt\n", + " Unbekannt\n", + " Unbekannt_1844_Tagebuch_82\n", + " grenzboten_sample/Unbekannt_1844_Tagebuch_82.txt\n", " 1844_Tagebuch_82\n", " \n", " \n", - " 10\n", - " Grenzboten\n", - " Grenzboten_1916_Kriegstagebuch_48\n", - " grenzboten_sample/Grenzboten_1916_Kriegstagebu...\n", - " 1916_Kriegstagebuch_48\n", - " \n", - " \n", - " 11\n", - " Grenzboten\n", - " Grenzboten_1915_Kriegstagebuch_94\n", - " grenzboten_sample/Grenzboten_1915_Kriegstagebu...\n", - " 1915_Kriegstagebuch_94\n", - " \n", - " \n", - " 12\n", - " Grenzboten\n", - " Grenzboten_1915_Kriegstagebuch_39\n", - " grenzboten_sample/Grenzboten_1915_Kriegstagebu...\n", - " 1915_Kriegstagebuch_39\n", - " \n", - " \n", - " 13\n", - " Grenzboten\n", - " Grenzboten_1845_Tagebuch_85\n", - " grenzboten_sample/Grenzboten_1845_Tagebuch_85.txt\n", - " 1845_Tagebuch_85\n", - " \n", - " \n", - " 14\n", - " Grenzboten\n", - " Grenzboten_1846_Tagebuch_96\n", - " grenzboten_sample/Grenzboten_1846_Tagebuch_96.txt\n", - " 1846_Tagebuch_96\n", - " \n", - " \n", - " 15\n", - " Grenzboten\n", - " Grenzboten_1845_Tagebuch_93\n", - " grenzboten_sample/Grenzboten_1845_Tagebuch_93.txt\n", - " 1845_Tagebuch_93\n", - " \n", - " \n", - " 16\n", - " Grenzboten\n", - " Grenzboten_1916_Kriegstagebuch_81\n", - " 
grenzboten_sample/Grenzboten_1916_Kriegstagebu...\n", - " 1916_Kriegstagebuch_81\n", - " \n", - " \n", - " 17\n", - " Grenzboten\n", - " Grenzboten_1845_Tagebuch_62\n", - " grenzboten_sample/Grenzboten_1845_Tagebuch_62.txt\n", - " 1845_Tagebuch_62\n", - " \n", - " \n", - " 18\n", - " Grenzboten\n", - " Grenzboten_1844_Tagebuch_77\n", - " grenzboten_sample/Grenzboten_1844_Tagebuch_77.txt\n", - " 1844_Tagebuch_77\n", - " \n", - " \n", - " 19\n", - " Grenzboten\n", - " Grenzboten_1914_Kriegstagebuch_97\n", - " grenzboten_sample/Grenzboten_1914_Kriegstagebu...\n", - " 1914_Kriegstagebuch_97\n", - " \n", - " \n", - " 20\n", - " Grenzboten\n", - " Grenzboten_1916_Kriegstagebuch_41\n", - " grenzboten_sample/Grenzboten_1916_Kriegstagebu...\n", - " 1916_Kriegstagebuch_41\n", - " \n", - " \n", - " 21\n", - " Grenzboten\n", - " Grenzboten_1916_Kriegstagebuch_49\n", - " grenzboten_sample/Grenzboten_1916_Kriegstagebu...\n", - " 1916_Kriegstagebuch_49\n", - " \n", - " \n", - " 22\n", - " Grenzboten\n", - " Grenzboten_1844_Tagebuch_70\n", - " grenzboten_sample/Grenzboten_1844_Tagebuch_70.txt\n", - " 1844_Tagebuch_70\n", - " \n", - " \n", - " 23\n", - " Grenzboten\n", - " Grenzboten_1914_Kriegstagebuch_37\n", - " grenzboten_sample/Grenzboten_1914_Kriegstagebu...\n", - " 1914_Kriegstagebuch_37\n", - " \n", - " \n", - " 24\n", - " Grenzboten\n", - " Grenzboten_1844_Tagebuch_88\n", - " grenzboten_sample/Grenzboten_1844_Tagebuch_88.txt\n", + " 4\n", + " Jörgel\n", + " Jörgel_1844_Tagebuch_88\n", + " grenzboten_sample/Jörgel_1844_Tagebuch_88.txt\n", " 1844_Tagebuch_88\n", " \n", - " \n", - " 25\n", - " Grenzboten\n", - " Grenzboten_1845_Tagebuch_52\n", - " grenzboten_sample/Grenzboten_1845_Tagebuch_52.txt\n", - " 1845_Tagebuch_52\n", - " \n", - " \n", - " 26\n", - " Grenzboten\n", - " Grenzboten_1915_Kriegstagebuch_99\n", - " grenzboten_sample/Grenzboten_1915_Kriegstagebu...\n", - " 1915_Kriegstagebuch_99\n", - " \n", - " \n", - " 27\n", - " Grenzboten\n", - " 
Grenzboten_1914_Kriegstagebuch_94\n", - " grenzboten_sample/Grenzboten_1914_Kriegstagebu...\n", - " 1914_Kriegstagebuch_94\n", - " \n", - " \n", - " 28\n", - " Grenzboten\n", - " Grenzboten_1846_Tagebuch_88\n", - " grenzboten_sample/Grenzboten_1846_Tagebuch_88.txt\n", - " 1846_Tagebuch_88\n", - " \n", - " \n", - " 29\n", - " Grenzboten\n", - " Grenzboten_1846_Tagebuch_72\n", - " grenzboten_sample/Grenzboten_1846_Tagebuch_72.txt\n", - " 1846_Tagebuch_72\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " author basename \\\n", - "0 Grenzboten Grenzboten_1844_Tagebuch_56 \n", - "1 Grenzboten Grenzboten_1846_Tagebuch_82 \n", - "2 Grenzboten Grenzboten_1916_Kriegstagebuch_69 \n", - "3 Grenzboten Grenzboten_1915_Kriegstagebuch_73 \n", - "4 Grenzboten Grenzboten_1914_Kriegstagebuch_95 \n", - "5 Grenzboten Grenzboten_1915_Kriegstagebuch_33 \n", - "6 Grenzboten Grenzboten_1914_Kriegstagebuch_68 \n", - "7 Grenzboten Grenzboten_1846_Tagebuch_51 \n", - "8 Grenzboten Grenzboten_1845_Tagebuch_81 \n", - "9 Grenzboten Grenzboten_1844_Tagebuch_82 \n", - "10 Grenzboten Grenzboten_1916_Kriegstagebuch_48 \n", - "11 Grenzboten Grenzboten_1915_Kriegstagebuch_94 \n", - "12 Grenzboten Grenzboten_1915_Kriegstagebuch_39 \n", - "13 Grenzboten Grenzboten_1845_Tagebuch_85 \n", - "14 Grenzboten Grenzboten_1846_Tagebuch_96 \n", - "15 Grenzboten Grenzboten_1845_Tagebuch_93 \n", - "16 Grenzboten Grenzboten_1916_Kriegstagebuch_81 \n", - "17 Grenzboten Grenzboten_1845_Tagebuch_62 \n", - "18 Grenzboten Grenzboten_1844_Tagebuch_77 \n", - "19 Grenzboten Grenzboten_1914_Kriegstagebuch_97 \n", - "20 Grenzboten Grenzboten_1916_Kriegstagebuch_41 \n", - "21 Grenzboten Grenzboten_1916_Kriegstagebuch_49 \n", - "22 Grenzboten Grenzboten_1844_Tagebuch_70 \n", - "23 Grenzboten Grenzboten_1914_Kriegstagebuch_37 \n", - "24 Grenzboten Grenzboten_1844_Tagebuch_88 \n", - "25 Grenzboten Grenzboten_1845_Tagebuch_52 \n", - "26 Grenzboten Grenzboten_1915_Kriegstagebuch_99 \n", - "27 Grenzboten 
Grenzboten_1914_Kriegstagebuch_94 \n", - "28 Grenzboten Grenzboten_1846_Tagebuch_88 \n", - "29 Grenzboten Grenzboten_1846_Tagebuch_72 \n", - "\n", - " filename title \n", - "0 grenzboten_sample/Grenzboten_1844_Tagebuch_56.txt 1844_Tagebuch_56 \n", - "1 grenzboten_sample/Grenzboten_1846_Tagebuch_82.txt 1846_Tagebuch_82 \n", - "2 grenzboten_sample/Grenzboten_1916_Kriegstagebu... 1916_Kriegstagebuch_69 \n", - "3 grenzboten_sample/Grenzboten_1915_Kriegstagebu... 1915_Kriegstagebuch_73 \n", - "4 grenzboten_sample/Grenzboten_1914_Kriegstagebu... 1914_Kriegstagebuch_95 \n", - "5 grenzboten_sample/Grenzboten_1915_Kriegstagebu... 1915_Kriegstagebuch_33 \n", - "6 grenzboten_sample/Grenzboten_1914_Kriegstagebu... 1914_Kriegstagebuch_68 \n", - "7 grenzboten_sample/Grenzboten_1846_Tagebuch_51.txt 1846_Tagebuch_51 \n", - "8 grenzboten_sample/Grenzboten_1845_Tagebuch_81.txt 1845_Tagebuch_81 \n", - "9 grenzboten_sample/Grenzboten_1844_Tagebuch_82.txt 1844_Tagebuch_82 \n", - "10 grenzboten_sample/Grenzboten_1916_Kriegstagebu... 1916_Kriegstagebuch_48 \n", - "11 grenzboten_sample/Grenzboten_1915_Kriegstagebu... 1915_Kriegstagebuch_94 \n", - "12 grenzboten_sample/Grenzboten_1915_Kriegstagebu... 1915_Kriegstagebuch_39 \n", - "13 grenzboten_sample/Grenzboten_1845_Tagebuch_85.txt 1845_Tagebuch_85 \n", - "14 grenzboten_sample/Grenzboten_1846_Tagebuch_96.txt 1846_Tagebuch_96 \n", - "15 grenzboten_sample/Grenzboten_1845_Tagebuch_93.txt 1845_Tagebuch_93 \n", - "16 grenzboten_sample/Grenzboten_1916_Kriegstagebu... 1916_Kriegstagebuch_81 \n", - "17 grenzboten_sample/Grenzboten_1845_Tagebuch_62.txt 1845_Tagebuch_62 \n", - "18 grenzboten_sample/Grenzboten_1844_Tagebuch_77.txt 1844_Tagebuch_77 \n", - "19 grenzboten_sample/Grenzboten_1914_Kriegstagebu... 1914_Kriegstagebuch_97 \n", - "20 grenzboten_sample/Grenzboten_1916_Kriegstagebu... 1916_Kriegstagebuch_41 \n", - "21 grenzboten_sample/Grenzboten_1916_Kriegstagebu... 
1916_Kriegstagebuch_49 \n", - "22 grenzboten_sample/Grenzboten_1844_Tagebuch_70.txt 1844_Tagebuch_70 \n", - "23 grenzboten_sample/Grenzboten_1914_Kriegstagebu... 1914_Kriegstagebuch_37 \n", - "24 grenzboten_sample/Grenzboten_1844_Tagebuch_88.txt 1844_Tagebuch_88 \n", - "25 grenzboten_sample/Grenzboten_1845_Tagebuch_52.txt 1845_Tagebuch_52 \n", - "26 grenzboten_sample/Grenzboten_1915_Kriegstagebu... 1915_Kriegstagebuch_99 \n", - "27 grenzboten_sample/Grenzboten_1914_Kriegstagebu... 1914_Kriegstagebuch_94 \n", - "28 grenzboten_sample/Grenzboten_1846_Tagebuch_88.txt 1846_Tagebuch_88 \n", - "29 grenzboten_sample/Grenzboten_1846_Tagebuch_72.txt 1846_Tagebuch_72 " + " author basename \\\n", + "0 Beck Beck_1844_Tagebuch_56 \n", + "1 Unbekannt Unbekannt_1844_Tagebuch_70 \n", + "2 Nimmer Nimmer_1844_Tagebuch_77 \n", + "3 Unbekannt Unbekannt_1844_Tagebuch_82 \n", + "4 Jörgel Jörgel_1844_Tagebuch_88 \n", + "\n", + " filename title \n", + "0 grenzboten_sample/Beck_1844_Tagebuch_56.txt 1844_Tagebuch_56 \n", + "1 grenzboten_sample/Unbekannt_1844_Tagebuch_70.txt 1844_Tagebuch_70 \n", + "2 grenzboten_sample/Nimmer_1844_Tagebuch_77.txt 1844_Tagebuch_77 \n", + "3 grenzboten_sample/Unbekannt_1844_Tagebuch_82.txt 1844_Tagebuch_82 \n", + "4 grenzboten_sample/Jörgel_1844_Tagebuch_88.txt 1844_Tagebuch_88 " ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "metadata = meta.fn2metadata(os.path.join(path, '*.txt'))\n", - "metadata" + "metadata = meta.fn2metadata(path_to_corpus)\n", + "metadata[:5] # by adding '[:5]' to the variable, only the first 5 elements will be printed" ] }, { @@ -656,18 +302,37 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'Tagebuch von Karl Beck. 
Man spricht seit vierzehn Tagen von einem vollständigen Ministerwechsel und es circuliren im Publicum die verschiedensten Combinationen, wobei heute ganz andere Namen genannt werden, als gestern und morgen wieder andere, als heute.'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "corpus = preprocessing.read_from_txt(document_list)" + "corpus = list(preprocessing.read_from_pathlist(metadata['filename']))\n", + "corpus[0][:255] # printing the first 255 characters of the first document" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "At this point, the corpus is generator object." + "Your `corpus` contains as many elements (`documents`) as there are texts in your corpus. Each element of `corpus` is the text of one document as a single string, including all whitespace and punctuation:\n", + "\n", + "```\n", + "['This is the content of your first document.',\n", + " 'This is the content of your second document.',\n", + " ...\n", + " 'This is the content of your last document.']\n", + "```" ] }, { @@ -675,51 +340,55 @@ "metadata": {}, "source": [ "### 1.3. Tokenize corpus\n", - "Your text files will be tokenized. Tokenization is the task of cutting a stream of characters into linguistic units, simply words or, more precisely, tokens. The tokenize function the library provides is a simple unicode tokenizer. Depending on the corpus it might be useful to use an external tokenizer function, or even develop your own, since its efficiency varies with language, epoch and text type." + "Now, your `documents` in `corpus` will be tokenized. Tokenization is the task of cutting a stream of characters into linguistic units, simply words or, more precisely, tokens. The tokenize function `dariah_topics` provides is a simple Unicode tokenizer. 
Depending on the corpus, it might be useful to use an external tokenizer function, or even develop your own, since its efficiency varies with language, epoch and text type." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "tokens = [list(preprocessing.tokenize(document)) for document in list(corpus)]" + "tokenized_corpus = [list(preprocessing.tokenize(document)) for document in corpus]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "At this point, each text is represented by a list of separate token strings. If we want to look e.g. into the first text (which has the index `0` as Python starts counting at 0) and show its first 10 words/tokens (that have the indeces `0:9` accordingly) by typing:" + "At this point, each `document` is represented by a list of separate token strings. As above, have a look at the first document (which has the index `0` as Python starts counting at 0) and show its first 13 words/tokens (selected by the slice `0:13`, i.e. the indices 0 to 12)." ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['es',\n", - " 'berlin',\n", - " 'und',\n", - " 'paris',\n", - " 'sprcchscligkeir',\n", - " 'credit',\n", - " 'und',\n", - " 'religion',\n", - " 'priester']" + "['tagebuch',\n", + " 'von',\n", + " 'karl',\n", + " 'beck',\n", + " 'man',\n", + " 'spricht',\n", + " 'seit',\n", + " 'vierzehn',\n", + " 'tagen',\n", + " 'von',\n", + " 'einem',\n", + " 'vollständigen',\n", + " 'ministerwechsel']" ] }, - "execution_count": 11, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tokens[0][0:9]" + "tokenized_corpus[0][0:13]" ] }, { @@ -728,12 +397,12 @@ "source": [ "### 1.4. Create a document-term matrix\n", "\n", - "The LDA topic model is based on a [document-term matrix](https://en.wikipedia.org/wiki/Document-term_matrix) of the corpus. 
To improve performance in large corpora, the matrix describes the frequency of terms that occur in the collection. In a document-term matrix, rows correspond to documents in the collection and columns correspond to terms." + "The LDA topic model is based on a [document-term matrix](https://en.wikipedia.org/wiki/Document-term_matrix) of the corpus. In a document-term matrix, rows correspond to documents and columns correspond to terms or tokens respectively. The values are token frequencies for each document." ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": { "scrolled": false }, @@ -770,27 +439,27 @@ " des\n", " nicht\n", " ...\n", - " mördern\n", - " mühevolle\n", - " münch-bellinghausen\n", - " mühling\n", - " mühsame\n", - " mühsamen\n", - " müht\n", - " mül\n", - " müllers\n", - " a!s\n", + " staatsmonopol\n", + " steigernde\n", + " staatspapieren\n", + " staatsrücksichten\n", + " staatszeitung\n", + " stallknecht\n", + " stammen\n", + " starkem\n", + " statu\n", + " subscriben\n", " \n", " \n", " \n", " \n", - " Grenzboten_1844_Tagebuch_56\n", + " 1844_Tagebuch_56\n", " 90.0\n", " 92.0\n", - " 88.0\n", + " 84.0\n", " 70.0\n", " 30.0\n", - " 25.0\n", + " 26.0\n", " 25.0\n", " 16.0\n", " 25.0\n", @@ -808,41 +477,17 @@ " 0.0\n", " \n", " \n", - " Grenzboten_1846_Tagebuch_82\n", - " 319.0\n", - " 346.0\n", - " 275.0\n", - " 164.0\n", - " 106.0\n", - " 87.0\n", - " 110.0\n", - " 94.0\n", - " 75.0\n", - " 96.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " \n", - " \n", - " Grenzboten_1916_Kriegstagebuch_69\n", - " 39.0\n", + " 1844_Tagebuch_70\n", + " 205.0\n", + " 224.0\n", + " 193.0\n", + " 133.0\n", + " 78.0\n", " 64.0\n", - " 51.0\n", - " 24.0\n", - " 14.0\n", - " 28.0\n", - " 1.0\n", - " 7.0\n", - " 10.0\n", - " 1.0\n", + " 112.0\n", + " 86.0\n", + " 45.0\n", + " 67.0\n", " ...\n", " 0.0\n", " 0.0\n", @@ -851,22 +496,22 @@ " 
0.0\n", " 0.0\n", " 0.0\n", - " 1.0\n", + " 0.0\n", " 0.0\n", " 0.0\n", " \n", " \n", - " Grenzboten_1915_Kriegstagebuch_73\n", + " 1844_Tagebuch_77\n", + " 181.0\n", + " 153.0\n", + " 141.0\n", + " 98.0\n", + " 55.0\n", + " 59.0\n", + " 72.0\n", + " 59.0\n", + " 44.0\n", " 41.0\n", - " 51.0\n", - " 43.0\n", - " 31.0\n", - " 27.0\n", - " 7.0\n", - " 1.0\n", - " 7.0\n", - " 9.0\n", - " 1.0\n", " ...\n", " 0.0\n", " 0.0\n", @@ -880,41 +525,17 @@ " 0.0\n", " \n", " \n", - " Grenzboten_1914_Kriegstagebuch_95\n", - " 80.0\n", + " 1844_Tagebuch_82\n", + " 213.0\n", + " 207.0\n", + " 169.0\n", + " 128.0\n", " 85.0\n", - " 62.0\n", - " 65.0\n", - " 42.0\n", - " 35.0\n", - " 11.0\n", - " 13.0\n", - " 14.0\n", - " 8.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1915_Kriegstagebuch_33\n", - " 93.0\n", - " 95.0\n", - " 87.0\n", - " 78.0\n", - " 50.0\n", - " 48.0\n", - " 1.0\n", - " 8.0\n", - " 21.0\n", - " 4.0\n", + " 86.0\n", + " 79.0\n", + " 80.0\n", + " 66.0\n", + " 67.0\n", " ...\n", " 0.0\n", " 0.0\n", @@ -928,41 +549,17 @@ " 0.0\n", " \n", " \n", - " Grenzboten_1914_Kriegstagebuch_68\n", - " 32.0\n", - " 31.0\n", - " 26.0\n", - " 37.0\n", - " 15.0\n", + " 1844_Tagebuch_88\n", + " 68.0\n", + " 59.0\n", + " 70.0\n", + " 54.0\n", + " 22.0\n", + " 27.0\n", + " 28.0\n", + " 19.0\n", " 13.0\n", - " 2.0\n", - " 10.0\n", - " 16.0\n", - " 3.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1846_Tagebuch_51\n", - " 226.0\n", - " 177.0\n", - " 188.0\n", - " 111.0\n", - " 73.0\n", - " 62.0\n", - " 93.0\n", - " 60.0\n", - " 35.0\n", - " 78.0\n", + " 11.0\n", " ...\n", " 0.0\n", " 0.0\n", @@ -975,1359 +572,241 @@ " 0.0\n", " 0.0\n", " \n", - " \n", - " Grenzboten_1845_Tagebuch_81\n", - " 344.0\n", - " 351.0\n", - " 311.0\n", - " 178.0\n", - " 
107.0\n", - " 118.0\n", - " 156.0\n", - " 116.0\n", - " 91.0\n", - " 112.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1844_Tagebuch_82\n", - " 213.0\n", - " 207.0\n", - " 169.0\n", - " 128.0\n", - " 85.0\n", - " 86.0\n", - " 79.0\n", - " 80.0\n", - " 66.0\n", - " 67.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1916_Kriegstagebuch_48\n", - " 47.0\n", - " 69.0\n", - " 64.0\n", - " 19.0\n", - " 17.0\n", - " 42.0\n", - " 0.0\n", - " 8.0\n", - " 16.0\n", - " 1.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1915_Kriegstagebuch_94\n", - " 11.0\n", - " 32.0\n", - " 24.0\n", - " 12.0\n", - " 8.0\n", - " 17.0\n", - " 0.0\n", - " 3.0\n", - " 5.0\n", - " 1.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1915_Kriegstagebuch_39\n", - " 43.0\n", - " 67.0\n", - " 64.0\n", - " 26.0\n", - " 28.0\n", - " 18.0\n", - " 1.0\n", - " 8.0\n", - " 9.0\n", - " 2.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1845_Tagebuch_85\n", - " 302.0\n", - " 299.0\n", - " 246.0\n", - " 174.0\n", - " 111.0\n", - " 89.0\n", - " 125.0\n", - " 73.0\n", - " 67.0\n", - " 61.0\n", - " ...\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1846_Tagebuch_96\n", - " 637.0\n", - " 456.0\n", - " 482.0\n", - " 299.0\n", - " 217.0\n", - " 170.0\n", - " 239.0\n", - " 173.0\n", - " 97.0\n", - " 169.0\n", 
- " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1845_Tagebuch_93\n", - " 215.0\n", - " 193.0\n", - " 176.0\n", - " 114.0\n", - " 82.0\n", - " 83.0\n", - " 92.0\n", - " 71.0\n", - " 65.0\n", - " 56.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1916_Kriegstagebuch_81\n", - " 39.0\n", - " 75.0\n", - " 64.0\n", - " 33.0\n", - " 26.0\n", - " 35.0\n", - " 3.0\n", - " 8.0\n", - " 16.0\n", - " 2.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1845_Tagebuch_62\n", - " 191.0\n", - " 239.0\n", - " 232.0\n", - " 159.0\n", - " 72.0\n", - " 86.0\n", - " 104.0\n", - " 77.0\n", - " 84.0\n", - " 82.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1844_Tagebuch_77\n", - " 181.0\n", - " 153.0\n", - " 141.0\n", - " 98.0\n", - " 55.0\n", - " 59.0\n", - " 72.0\n", - " 59.0\n", - " 44.0\n", - " 41.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1914_Kriegstagebuch_97\n", - " 21.0\n", - " 24.0\n", - " 20.0\n", - " 22.0\n", - " 8.0\n", - " 16.0\n", - " 3.0\n", - " 5.0\n", - " 3.0\n", - " 1.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1916_Kriegstagebuch_41\n", - " 37.0\n", - " 60.0\n", - " 53.0\n", - " 19.0\n", - " 17.0\n", - " 31.0\n", - " 1.0\n", - " 7.0\n", - " 13.0\n", - " 1.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", 
- " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1916_Kriegstagebuch_49\n", - " 80.0\n", - " 86.0\n", - " 81.0\n", - " 21.0\n", - " 34.0\n", - " 23.0\n", - " 1.0\n", - " 14.0\n", - " 29.0\n", - " 1.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1844_Tagebuch_70\n", - " 205.0\n", - " 224.0\n", - " 193.0\n", - " 133.0\n", - " 78.0\n", - " 64.0\n", - " 112.0\n", - " 86.0\n", - " 45.0\n", - " 67.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1914_Kriegstagebuch_37\n", - " 40.0\n", - " 34.0\n", - " 15.0\n", - " 17.0\n", - " 10.0\n", - " 19.0\n", - " 5.0\n", - " 6.0\n", - " 18.0\n", - " 3.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1844_Tagebuch_88\n", - " 68.0\n", - " 59.0\n", - " 70.0\n", - " 54.0\n", - " 22.0\n", - " 27.0\n", - " 28.0\n", - " 19.0\n", - " 13.0\n", - " 11.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1845_Tagebuch_52\n", - " 385.0\n", - " 351.0\n", - " 331.0\n", - " 215.0\n", - " 138.0\n", - " 142.0\n", - " 130.0\n", - " 114.0\n", - " 105.0\n", - " 93.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1915_Kriegstagebuch_99\n", - " 46.0\n", - " 47.0\n", - " 39.0\n", - " 29.0\n", - " 23.0\n", - " 39.0\n", - " 0.0\n", - " 8.0\n", - " 9.0\n", - " 1.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 
Grenzboten_1914_Kriegstagebuch_94\n", - " 39.0\n", - " 48.0\n", - " 34.0\n", - " 28.0\n", - " 15.0\n", - " 25.0\n", - " 4.0\n", - " 5.0\n", - " 11.0\n", - " 3.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1846_Tagebuch_88\n", - " 126.0\n", - " 118.0\n", - " 116.0\n", - " 62.0\n", - " 43.0\n", - " 44.0\n", - " 46.0\n", - " 45.0\n", - " 20.0\n", - " 37.0\n", - " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " Grenzboten_1846_Tagebuch_72\n", - " 286.0\n", - " 288.0\n", - " 252.0\n", - " 192.0\n", - " 104.0\n", - " 115.0\n", - " 121.0\n", - " 74.0\n", - " 85.0\n", - " 85.0\n", - " ...\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " \n", - " \n", - "\n", - "

30 rows × 24448 columns

\n", - "" - ], - "text/plain": [ - " die der und in den von \\\n", - "Grenzboten_1844_Tagebuch_56 90.0 92.0 88.0 70.0 30.0 25.0 \n", - "Grenzboten_1846_Tagebuch_82 319.0 346.0 275.0 164.0 106.0 87.0 \n", - "Grenzboten_1916_Kriegstagebuch_69 39.0 64.0 51.0 24.0 14.0 28.0 \n", - "Grenzboten_1915_Kriegstagebuch_73 41.0 51.0 43.0 31.0 27.0 7.0 \n", - "Grenzboten_1914_Kriegstagebuch_95 80.0 85.0 62.0 65.0 42.0 35.0 \n", - "Grenzboten_1915_Kriegstagebuch_33 93.0 95.0 87.0 78.0 50.0 48.0 \n", - "Grenzboten_1914_Kriegstagebuch_68 32.0 31.0 26.0 37.0 15.0 13.0 \n", - "Grenzboten_1846_Tagebuch_51 226.0 177.0 188.0 111.0 73.0 62.0 \n", - "Grenzboten_1845_Tagebuch_81 344.0 351.0 311.0 178.0 107.0 118.0 \n", - "Grenzboten_1844_Tagebuch_82 213.0 207.0 169.0 128.0 85.0 86.0 \n", - "Grenzboten_1916_Kriegstagebuch_48 47.0 69.0 64.0 19.0 17.0 42.0 \n", - "Grenzboten_1915_Kriegstagebuch_94 11.0 32.0 24.0 12.0 8.0 17.0 \n", - "Grenzboten_1915_Kriegstagebuch_39 43.0 67.0 64.0 26.0 28.0 18.0 \n", - "Grenzboten_1845_Tagebuch_85 302.0 299.0 246.0 174.0 111.0 89.0 \n", - "Grenzboten_1846_Tagebuch_96 637.0 456.0 482.0 299.0 217.0 170.0 \n", - "Grenzboten_1845_Tagebuch_93 215.0 193.0 176.0 114.0 82.0 83.0 \n", - "Grenzboten_1916_Kriegstagebuch_81 39.0 75.0 64.0 33.0 26.0 35.0 \n", - "Grenzboten_1845_Tagebuch_62 191.0 239.0 232.0 159.0 72.0 86.0 \n", - "Grenzboten_1844_Tagebuch_77 181.0 153.0 141.0 98.0 55.0 59.0 \n", - "Grenzboten_1914_Kriegstagebuch_97 21.0 24.0 20.0 22.0 8.0 16.0 \n", - "Grenzboten_1916_Kriegstagebuch_41 37.0 60.0 53.0 19.0 17.0 31.0 \n", - "Grenzboten_1916_Kriegstagebuch_49 80.0 86.0 81.0 21.0 34.0 23.0 \n", - "Grenzboten_1844_Tagebuch_70 205.0 224.0 193.0 133.0 78.0 64.0 \n", - "Grenzboten_1914_Kriegstagebuch_37 40.0 34.0 15.0 17.0 10.0 19.0 \n", - "Grenzboten_1844_Tagebuch_88 68.0 59.0 70.0 54.0 22.0 27.0 \n", - "Grenzboten_1845_Tagebuch_52 385.0 351.0 331.0 215.0 138.0 142.0 \n", - "Grenzboten_1915_Kriegstagebuch_99 46.0 47.0 39.0 29.0 23.0 39.0 \n", - 
"Grenzboten_1914_Kriegstagebuch_94 39.0 48.0 34.0 28.0 15.0 25.0 \n", - "Grenzboten_1846_Tagebuch_88 126.0 118.0 116.0 62.0 43.0 44.0 \n", - "Grenzboten_1846_Tagebuch_72 286.0 288.0 252.0 192.0 104.0 115.0 \n", - "\n", - " zu das des nicht ... mördern \\\n", - "Grenzboten_1844_Tagebuch_56 25.0 16.0 25.0 23.0 ... 0.0 \n", - "Grenzboten_1846_Tagebuch_82 110.0 94.0 75.0 96.0 ... 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_69 1.0 7.0 10.0 1.0 ... 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_73 1.0 7.0 9.0 1.0 ... 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_95 11.0 13.0 14.0 8.0 ... 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_33 1.0 8.0 21.0 4.0 ... 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_68 2.0 10.0 16.0 3.0 ... 0.0 \n", - "Grenzboten_1846_Tagebuch_51 93.0 60.0 35.0 78.0 ... 0.0 \n", - "Grenzboten_1845_Tagebuch_81 156.0 116.0 91.0 112.0 ... 0.0 \n", - "Grenzboten_1844_Tagebuch_82 79.0 80.0 66.0 67.0 ... 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_48 0.0 8.0 16.0 1.0 ... 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_94 0.0 3.0 5.0 1.0 ... 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_39 1.0 8.0 9.0 2.0 ... 0.0 \n", - "Grenzboten_1845_Tagebuch_85 125.0 73.0 67.0 61.0 ... 0.0 \n", - "Grenzboten_1846_Tagebuch_96 239.0 173.0 97.0 169.0 ... 0.0 \n", - "Grenzboten_1845_Tagebuch_93 92.0 71.0 65.0 56.0 ... 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_81 3.0 8.0 16.0 2.0 ... 0.0 \n", - "Grenzboten_1845_Tagebuch_62 104.0 77.0 84.0 82.0 ... 0.0 \n", - "Grenzboten_1844_Tagebuch_77 72.0 59.0 44.0 41.0 ... 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_97 3.0 5.0 3.0 1.0 ... 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_41 1.0 7.0 13.0 1.0 ... 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_49 1.0 14.0 29.0 1.0 ... 0.0 \n", - "Grenzboten_1844_Tagebuch_70 112.0 86.0 45.0 67.0 ... 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_37 5.0 6.0 18.0 3.0 ... 0.0 \n", - "Grenzboten_1844_Tagebuch_88 28.0 19.0 13.0 11.0 ... 0.0 \n", - "Grenzboten_1845_Tagebuch_52 130.0 114.0 105.0 93.0 ... 
0.0 \n", - "Grenzboten_1915_Kriegstagebuch_99 0.0 8.0 9.0 1.0 ... 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_94 4.0 5.0 11.0 3.0 ... 0.0 \n", - "Grenzboten_1846_Tagebuch_88 46.0 45.0 20.0 37.0 ... 0.0 \n", - "Grenzboten_1846_Tagebuch_72 121.0 74.0 85.0 85.0 ... 1.0 \n", - "\n", - " mühevolle münch-bellinghausen mühling \\\n", - "Grenzboten_1844_Tagebuch_56 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_82 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_69 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_73 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_95 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_33 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_68 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_51 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_81 0.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_82 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_48 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_94 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_39 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_85 1.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_96 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_93 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_81 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_62 0.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_77 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_97 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_41 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_49 0.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_70 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_37 0.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_88 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_52 0.0 1.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_99 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_94 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_88 0.0 0.0 1.0 \n", - "Grenzboten_1846_Tagebuch_72 0.0 0.0 0.0 \n", - "\n", - " mühsame mühsamen müht mül müllers a!s \n", - "Grenzboten_1844_Tagebuch_56 0.0 0.0 0.0 0.0 0.0 0.0 \n", - 
"Grenzboten_1846_Tagebuch_82 0.0 0.0 0.0 0.0 0.0 1.0 \n", - "Grenzboten_1916_Kriegstagebuch_69 0.0 0.0 0.0 1.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_73 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_95 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_33 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_68 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_51 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_81 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_82 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_48 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_94 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_39 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_85 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_96 1.0 1.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_93 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_81 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_62 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_77 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_97 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_41 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_49 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_70 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_37 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_88 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_52 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_99 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_94 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_88 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_72 0.0 0.0 1.0 0.0 1.0 0.0 \n", - "\n", - "[30 rows x 24448 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "doc_terms = 
preprocessing.create_doc_term_matrix(tokens, document_labels)\n", - "doc_terms" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1.5. Feature removal\n", - "\n", - "In topic modeling, it is often usefull (if not vital) to remove some types before modeling. In this example, the 100 most frequent words and the *hapax legomena* in the corpus will be removed. This step is very easy to handle using the benefits of indexing." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### List the 100 most frequent words" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "mfw100 = preprocessing.find_stopwords(doc_terms, 100)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "These are the five most frequent words:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['die', 'der', 'und', 'in', 'den']" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mfw100[:5]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### List hapax legomena" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "hapax_list = preprocessing.find_hapax(doc_terms)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Optional: Use external stopwordlist" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "path_to_stopwordlist = \"tutorial_supplementals/stopwords/de.txt\"\n", - "\n", - "extern_stopwords = [line.strip() for line in open(path_to_stopwordlist, 'r')]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Combine lists and remove content from `doc_term_matrix`" - ] - }, - { - "cell_type": "code", - 
"execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "features = set(mfw100 + hapax_list + extern_stopwords)\n", - "doc_terms = preprocessing.remove_features_from_df(doc_terms, features)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, this is how your clean corpus looks like now." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + "
franzosengenommenabgewiesensüdlichberlinlassengeschützeenglischejanuardeutschland...tilemanstausendmaltatengeldinstitutetatkraftgemeinemtausendenteilangriffetendenzstückgemeingefährlichkeitsmaßstab
Grenzboten_1844_Tagebuch_560.01.00.00.05.03.00.00.00.04.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1846_Tagebuch_824.02.00.00.01.07.00.01.01.04.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1916_Kriegstagebuch_6912.06.09.010.02.00.05.09.00.00.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1915_Kriegstagebuch_738.011.06.07.02.00.06.03.00.00.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1914_Kriegstagebuch_952.09.012.06.02.01.08.010.01.01.0...0.00.02.00.02.00.00.00.00.00.0
Grenzboten_1915_Kriegstagebuch_3317.013.024.08.04.01.016.011.086.00.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1914_Kriegstagebuch_688.03.05.03.02.00.03.03.01.03.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1846_Tagebuch_511.01.01.00.01.07.00.01.00.09.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1845_Tagebuch_811.03.00.00.07.010.00.00.00.010.0...0.00.00.02.00.02.02.00.00.00.0
Grenzboten_1844_Tagebuch_820.00.00.00.05.010.00.00.00.02.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1916_Kriegstagebuch_487.06.012.012.02.00.02.03.00.00.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1915_Kriegstagebuch_940.09.03.05.03.00.06.03.00.00.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1915_Kriegstagebuch_3912.06.013.06.02.00.014.05.00.00.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1845_Tagebuch_851.02.00.01.010.05.00.00.00.04.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1846_Tagebuch_9610.03.00.00.010.015.00.02.00.07.0...2.02.00.00.00.00.00.00.00.00.0
Grenzboten_1845_Tagebuch_931.01.00.00.05.08.00.00.00.05.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1916_Kriegstagebuch_816.06.019.015.01.00.03.012.00.03.0...0.00.00.00.00.00.00.02.00.00.0
Grenzboten_1845_Tagebuch_623.02.00.00.04.05.00.00.00.02.0...0.00.00.00.00.00.00.00.00.00.0
\n", + "

5 rows × 24451 columns

\n", + "
" + ], + "text/plain": [ + " die der und in den von zu das des \\\n", + "1844_Tagebuch_56 90.0 92.0 84.0 70.0 30.0 26.0 25.0 16.0 25.0 \n", + "1844_Tagebuch_70 205.0 224.0 193.0 133.0 78.0 64.0 112.0 86.0 45.0 \n", + "1844_Tagebuch_77 181.0 153.0 141.0 98.0 55.0 59.0 72.0 59.0 44.0 \n", + "1844_Tagebuch_82 213.0 207.0 169.0 128.0 85.0 86.0 79.0 80.0 66.0 \n", + "1844_Tagebuch_88 68.0 59.0 70.0 54.0 22.0 27.0 28.0 19.0 13.0 \n", + "\n", + " nicht ... staatsmonopol steigernde \\\n", + "1844_Tagebuch_56 23.0 ... 0.0 0.0 \n", + "1844_Tagebuch_70 67.0 ... 0.0 0.0 \n", + "1844_Tagebuch_77 41.0 ... 0.0 0.0 \n", + "1844_Tagebuch_82 67.0 ... 0.0 0.0 \n", + "1844_Tagebuch_88 11.0 ... 0.0 0.0 \n", + "\n", + " staatspapieren staatsrücksichten staatszeitung \\\n", + "1844_Tagebuch_56 0.0 0.0 0.0 \n", + "1844_Tagebuch_70 0.0 0.0 0.0 \n", + "1844_Tagebuch_77 0.0 0.0 0.0 \n", + "1844_Tagebuch_82 0.0 0.0 0.0 \n", + "1844_Tagebuch_88 0.0 0.0 0.0 \n", + "\n", + " stallknecht stammen starkem statu subscriben \n", + "1844_Tagebuch_56 0.0 0.0 0.0 0.0 0.0 \n", + "1844_Tagebuch_70 0.0 0.0 0.0 0.0 0.0 \n", + "1844_Tagebuch_77 0.0 0.0 0.0 0.0 0.0 \n", + "1844_Tagebuch_82 0.0 0.0 0.0 0.0 0.0 \n", + "1844_Tagebuch_88 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + "[5 rows x 24451 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "document_term_matrix = preprocessing.create_document_term_matrix(tokenized_corpus, metadata['title'])\n", + "document_term_matrix[:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.5. Feature removal\n", + "\n", + "*Stopwords* (also known as *most frequent tokens*) and *hapax legomena* are harmful for LDA and have to be removed from the corpus or the document-term matrix respectively. 
In this example, the 100 most frequent tokens will be categorized as stopwords.\n", + "\n", + "**Hint**: Be careful with removing most frequent tokens, you might remove tokens quite important for LDA. Anyway, to gain better results, it is highly recommended to use an external stopwords list.\n", + "\n", + "In this notebook, we combine the 100 most frequent tokens, hapax legomena and an external stopword list." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### List the 100 most frequent tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "stopwords = preprocessing.find_stopwords(document_term_matrix, most_frequent_tokens=100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These are the five most frequent words:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['die', 'der', 'und', 'in', 'den']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stopwords[:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### List hapax legomena" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of types in corpus: 24451\n", + "Total number of hapax legomena: 19757\n" + ] + } + ], + "source": [ + "hapax_legomena = preprocessing.find_hapax_legomena(document_term_matrix)\n", + "print(\"Total number of types in corpus:\", document_term_matrix.shape[1])\n", + "print(\"Total number of hapax legomena:\", len(hapax_legomena))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Optional: Use external stopwordlist" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + 
"path_to_stopwordlist = \"tutorial_supplementals/stopwords/de.txt\"\n", + "external_stopwords = [line.strip() for line in open(path_to_stopwordlist, 'r', encoding='utf-8')]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Combine lists and remove content from `document_term_matrix`" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "features = stopwords + hapax_legomena + external_stopwords\n", + "document_term_matrix = preprocessing.remove_features(features, document_term_matrix=document_term_matrix)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, this is what your clean corpus looks like now." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2341,7 +820,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2356,31 +835,7 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -2389,17 +844,17 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2413,41 +868,17 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " 
\n", " \n", " \n", @@ -2461,65 +892,17 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -2534,244 +917,55 @@ " \n", " \n", "
franzosengenommenabgewiesensüdlichberlinlassengeschützeenglischejanuardeutschland...bankprojectsii&gtpflanzenausstellungunbekümmertnischtthiaumont-waldekompositionenmonarchischerehrenwerthenzaturcy
Grenzboten_1844_Tagebuch_770.01.00.00.05.09.00.02.02.02.0...0.00.00.00.00.00.00.00.00.01844_Tagebuch_560.0
Grenzboten_1914_Kriegstagebuch_972.03.03.01.02.00.03.03.00.00.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1916_Kriegstagebuch_415.011.07.018.03.00.04.03.010.00.00.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1916_Kriegstagebuch_4912.017.011.017.02.00.017.06.00.00.04.0...0.00.00.0
Grenzboten_1844_Tagebuch_701844_Tagebuch_702.01.00.00.00.00.00.00.00.00.00.00.02.0
Grenzboten_1914_Kriegstagebuch_376.01.00.00.00.00.05.01.00.01.0...0.00.00.00.00.00.00.00.0
Grenzboten_1844_Tagebuch_882.00.00.00.01844_Tagebuch_770.01.00.00.05.09.00.04.02.02.02.0...0.00.00.0
Grenzboten_1845_Tagebuch_524.01.00.00.08.08.00.00.02.012.0...0.01844_Tagebuch_820.00.00.00.05.010.00.00.00.02.00.0
Grenzboten_1915_Kriegstagebuch_994.012.04.06.03.00.08.03.00.00.0...0.00.00.0
Grenzboten_1914_Kriegstagebuch_944.03.02.03.02.00.01844_Tagebuch_882.07.00.01.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1846_Tagebuch_881.02.00.00.03.02.00.00.00.03.0...0.00.00.00.00.00.00.00.00.00.0
Grenzboten_1846_Tagebuch_724.00.00.00.010.010.00.02.02.017.0...0.00.0
\n", - "

30 rows × 4258 columns

\n", + "

5 rows × 4242 columns

\n", "
" ], "text/plain": [ - " franzosen genommen abgewiesen südlich \\\n", - "Grenzboten_1844_Tagebuch_56 0.0 1.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_82 4.0 2.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_69 12.0 6.0 9.0 10.0 \n", - "Grenzboten_1915_Kriegstagebuch_73 8.0 11.0 6.0 7.0 \n", - "Grenzboten_1914_Kriegstagebuch_95 2.0 9.0 12.0 6.0 \n", - "Grenzboten_1915_Kriegstagebuch_33 17.0 13.0 24.0 8.0 \n", - "Grenzboten_1914_Kriegstagebuch_68 8.0 3.0 5.0 3.0 \n", - "Grenzboten_1846_Tagebuch_51 1.0 1.0 1.0 0.0 \n", - "Grenzboten_1845_Tagebuch_81 1.0 3.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_82 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_48 7.0 6.0 12.0 12.0 \n", - "Grenzboten_1915_Kriegstagebuch_94 0.0 9.0 3.0 5.0 \n", - "Grenzboten_1915_Kriegstagebuch_39 12.0 6.0 13.0 6.0 \n", - "Grenzboten_1845_Tagebuch_85 1.0 2.0 0.0 1.0 \n", - "Grenzboten_1846_Tagebuch_96 10.0 3.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_93 1.0 1.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_81 6.0 6.0 19.0 15.0 \n", - "Grenzboten_1845_Tagebuch_62 3.0 2.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_77 0.0 1.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_97 2.0 3.0 3.0 1.0 \n", - "Grenzboten_1916_Kriegstagebuch_41 5.0 11.0 7.0 18.0 \n", - "Grenzboten_1916_Kriegstagebuch_49 12.0 17.0 11.0 17.0 \n", - "Grenzboten_1844_Tagebuch_70 2.0 1.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_37 6.0 1.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_88 2.0 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_52 4.0 1.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_99 4.0 12.0 4.0 6.0 \n", - "Grenzboten_1914_Kriegstagebuch_94 4.0 3.0 2.0 3.0 \n", - "Grenzboten_1846_Tagebuch_88 1.0 2.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_72 4.0 0.0 0.0 0.0 \n", - "\n", - " berlin lassen geschütze englische \\\n", - "Grenzboten_1844_Tagebuch_56 5.0 3.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_82 1.0 7.0 0.0 1.0 \n", - "Grenzboten_1916_Kriegstagebuch_69 2.0 0.0 5.0 9.0 \n", - "Grenzboten_1915_Kriegstagebuch_73 2.0 
0.0 6.0 3.0 \n", - "Grenzboten_1914_Kriegstagebuch_95 2.0 1.0 8.0 10.0 \n", - "Grenzboten_1915_Kriegstagebuch_33 4.0 1.0 16.0 11.0 \n", - "Grenzboten_1914_Kriegstagebuch_68 2.0 0.0 3.0 3.0 \n", - "Grenzboten_1846_Tagebuch_51 1.0 7.0 0.0 1.0 \n", - "Grenzboten_1845_Tagebuch_81 7.0 10.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_82 5.0 10.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_48 2.0 0.0 2.0 3.0 \n", - "Grenzboten_1915_Kriegstagebuch_94 3.0 0.0 6.0 3.0 \n", - "Grenzboten_1915_Kriegstagebuch_39 2.0 0.0 14.0 5.0 \n", - "Grenzboten_1845_Tagebuch_85 10.0 5.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_96 10.0 15.0 0.0 2.0 \n", - "Grenzboten_1845_Tagebuch_93 5.0 8.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_81 1.0 0.0 3.0 12.0 \n", - "Grenzboten_1845_Tagebuch_62 4.0 5.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_77 5.0 9.0 0.0 2.0 \n", - "Grenzboten_1914_Kriegstagebuch_97 2.0 0.0 3.0 3.0 \n", - "Grenzboten_1916_Kriegstagebuch_41 3.0 0.0 3.0 10.0 \n", - "Grenzboten_1916_Kriegstagebuch_49 2.0 0.0 17.0 6.0 \n", - "Grenzboten_1844_Tagebuch_70 8.0 6.0 0.0 3.0 \n", - "Grenzboten_1914_Kriegstagebuch_37 0.0 0.0 5.0 1.0 \n", - "Grenzboten_1844_Tagebuch_88 0.0 1.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_52 8.0 8.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_99 3.0 0.0 8.0 3.0 \n", - "Grenzboten_1914_Kriegstagebuch_94 2.0 0.0 2.0 7.0 \n", - "Grenzboten_1846_Tagebuch_88 3.0 2.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_72 10.0 10.0 0.0 2.0 \n", - "\n", - " januar deutschland \\\n", - "Grenzboten_1844_Tagebuch_56 0.0 4.0 \n", - "Grenzboten_1846_Tagebuch_82 1.0 4.0 \n", - "Grenzboten_1916_Kriegstagebuch_69 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_73 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_95 1.0 1.0 \n", - "Grenzboten_1915_Kriegstagebuch_33 86.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_68 1.0 3.0 \n", - "Grenzboten_1846_Tagebuch_51 0.0 9.0 \n", - "Grenzboten_1845_Tagebuch_81 0.0 10.0 \n", - "Grenzboten_1844_Tagebuch_82 0.0 2.0 \n", - 
"Grenzboten_1916_Kriegstagebuch_48 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_94 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_39 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_85 0.0 4.0 \n", - "Grenzboten_1846_Tagebuch_96 0.0 7.0 \n", - "Grenzboten_1845_Tagebuch_93 0.0 5.0 \n", - "Grenzboten_1916_Kriegstagebuch_81 0.0 3.0 \n", - "Grenzboten_1845_Tagebuch_62 0.0 2.0 \n", - "Grenzboten_1844_Tagebuch_77 2.0 2.0 \n", - "Grenzboten_1914_Kriegstagebuch_97 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_41 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_49 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_70 0.0 1.0 \n", - "Grenzboten_1914_Kriegstagebuch_37 0.0 1.0 \n", - "Grenzboten_1844_Tagebuch_88 0.0 4.0 \n", - "Grenzboten_1845_Tagebuch_52 2.0 12.0 \n", - "Grenzboten_1915_Kriegstagebuch_99 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_94 0.0 1.0 \n", - "Grenzboten_1846_Tagebuch_88 0.0 3.0 \n", - "Grenzboten_1846_Tagebuch_72 2.0 17.0 \n", - "\n", - " ... tilemans \\\n", - "Grenzboten_1844_Tagebuch_56 ... 0.0 \n", - "Grenzboten_1846_Tagebuch_82 ... 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_69 ... 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_73 ... 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_95 ... 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_33 ... 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_68 ... 0.0 \n", - "Grenzboten_1846_Tagebuch_51 ... 0.0 \n", - "Grenzboten_1845_Tagebuch_81 ... 0.0 \n", - "Grenzboten_1844_Tagebuch_82 ... 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_48 ... 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_94 ... 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_39 ... 0.0 \n", - "Grenzboten_1845_Tagebuch_85 ... 0.0 \n", - "Grenzboten_1846_Tagebuch_96 ... 2.0 \n", - "Grenzboten_1845_Tagebuch_93 ... 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_81 ... 0.0 \n", - "Grenzboten_1845_Tagebuch_62 ... 0.0 \n", - "Grenzboten_1844_Tagebuch_77 ... 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_97 ... 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_41 ... 
0.0 \n", - "Grenzboten_1916_Kriegstagebuch_49 ... 0.0 \n", - "Grenzboten_1844_Tagebuch_70 ... 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_37 ... 0.0 \n", - "Grenzboten_1844_Tagebuch_88 ... 0.0 \n", - "Grenzboten_1845_Tagebuch_52 ... 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_99 ... 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_94 ... 0.0 \n", - "Grenzboten_1846_Tagebuch_88 ... 0.0 \n", - "Grenzboten_1846_Tagebuch_72 ... 0.0 \n", - "\n", - " tausendmal taten geldinstitute tatkraft \\\n", - "Grenzboten_1844_Tagebuch_56 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_82 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_69 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_73 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_95 0.0 2.0 0.0 2.0 \n", - "Grenzboten_1915_Kriegstagebuch_33 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_68 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_51 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_81 0.0 0.0 2.0 0.0 \n", - "Grenzboten_1844_Tagebuch_82 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_48 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_94 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_39 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_85 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_96 2.0 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_93 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_81 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_62 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_77 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_97 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_41 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_49 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_70 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_37 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_88 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_52 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_99 0.0 0.0 0.0 0.0 \n", - 
"Grenzboten_1914_Kriegstagebuch_94 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_88 0.0 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_72 0.0 0.0 0.0 0.0 \n", - "\n", - " gemeinem tausenden teilangriffe \\\n", - "Grenzboten_1844_Tagebuch_56 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_82 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_69 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_73 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_95 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_33 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_68 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_51 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_81 2.0 2.0 0.0 \n", - "Grenzboten_1844_Tagebuch_82 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_48 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_94 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_39 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_85 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_96 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_93 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_81 0.0 0.0 2.0 \n", - "Grenzboten_1845_Tagebuch_62 0.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_77 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_97 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_41 0.0 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_49 0.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_70 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_37 0.0 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_88 0.0 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_52 0.0 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_99 0.0 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_94 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_88 0.0 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_72 0.0 0.0 0.0 \n", - "\n", - " tendenzstück gemeingefährlichkeitsmaßstab \n", - "Grenzboten_1844_Tagebuch_56 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_82 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_69 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_73 0.0 0.0 \n", - 
"Grenzboten_1914_Kriegstagebuch_95 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_33 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_68 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_51 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_81 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_82 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_48 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_94 0.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_39 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_85 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_96 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_93 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_81 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_62 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_77 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_97 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_41 0.0 0.0 \n", - "Grenzboten_1916_Kriegstagebuch_49 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_70 0.0 2.0 \n", - "Grenzboten_1914_Kriegstagebuch_37 0.0 0.0 \n", - "Grenzboten_1844_Tagebuch_88 0.0 0.0 \n", - "Grenzboten_1845_Tagebuch_52 2.0 0.0 \n", - "Grenzboten_1915_Kriegstagebuch_99 0.0 0.0 \n", - "Grenzboten_1914_Kriegstagebuch_94 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_88 0.0 0.0 \n", - "Grenzboten_1846_Tagebuch_72 0.0 0.0 \n", - "\n", - "[30 rows x 4258 columns]" + " franzosen genommen abgewiesen südlich berlin lassen \\\n", + "1844_Tagebuch_56 0.0 1.0 0.0 0.0 4.0 3.0 \n", + "1844_Tagebuch_70 2.0 1.0 0.0 0.0 8.0 6.0 \n", + "1844_Tagebuch_77 0.0 1.0 0.0 0.0 5.0 9.0 \n", + "1844_Tagebuch_82 0.0 0.0 0.0 0.0 5.0 10.0 \n", + "1844_Tagebuch_88 2.0 0.0 0.0 0.0 0.0 1.0 \n", + "\n", + " geschütze englische januar deutschland ... \\\n", + "1844_Tagebuch_56 0.0 0.0 0.0 4.0 ... \n", + "1844_Tagebuch_70 0.0 3.0 0.0 1.0 ... \n", + "1844_Tagebuch_77 0.0 2.0 2.0 2.0 ... \n", + "1844_Tagebuch_82 0.0 0.0 0.0 2.0 ... \n", + "1844_Tagebuch_88 0.0 0.0 0.0 4.0 ... 
\n", + "\n", + " bankprojects ii> pflanzenausstellung unbekümmert \\\n", + "1844_Tagebuch_56 0.0 0.0 0.0 0.0 \n", + "1844_Tagebuch_70 0.0 0.0 0.0 2.0 \n", + "1844_Tagebuch_77 0.0 0.0 0.0 0.0 \n", + "1844_Tagebuch_82 0.0 0.0 0.0 0.0 \n", + "1844_Tagebuch_88 0.0 0.0 0.0 0.0 \n", + "\n", + " nischt thiaumont-walde kompositionen monarchischer \\\n", + "1844_Tagebuch_56 0.0 0.0 0.0 0.0 \n", + "1844_Tagebuch_70 0.0 0.0 0.0 0.0 \n", + "1844_Tagebuch_77 0.0 0.0 0.0 0.0 \n", + "1844_Tagebuch_82 0.0 0.0 0.0 0.0 \n", + "1844_Tagebuch_88 0.0 0.0 0.0 0.0 \n", + "\n", + " ehrenwerthen zaturcy \n", + "1844_Tagebuch_56 0.0 0.0 \n", + "1844_Tagebuch_70 0.0 0.0 \n", + "1844_Tagebuch_77 0.0 0.0 \n", + "1844_Tagebuch_82 0.0 0.0 \n", + "1844_Tagebuch_88 0.0 0.0 \n", + "\n", + "[5 rows x 4242 columns]" ] }, - "execution_count": 18, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "doc_terms" + "document_term_matrix[:5]" ] }, { @@ -2787,84 +981,84 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 2.1. Translate document-term matrix into array" + "### 2.1. Creating list of vocabulary" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In this step, all values of your document-term matrix will be translated into an [array](https://en.wikipedia.org/wiki/Array_data_structure)." + "To translate numbers back into words after the model creation, you have to set up a list of all unique tokens in the corpus." 
] }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "scrolled": true - }, + "execution_count": 17, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[ 0, 1, 0, ..., 0, 0, 0],\n", - " [ 4, 2, 0, ..., 0, 0, 0],\n", - " [12, 6, 9, ..., 0, 0, 0],\n", - " ..., \n", - " [ 4, 3, 2, ..., 0, 0, 0],\n", - " [ 1, 2, 0, ..., 0, 0, 0],\n", - " [ 4, 0, 0, ..., 0, 0, 0]])" + "Index(['franzosen', 'genommen', 'abgewiesen', 'südlich', 'berlin', 'lassen',\n", + " 'geschütze', 'englische', 'januar', 'deutschland',\n", + " ...\n", + " 'bankprojects', 'ii>', 'pflanzenausstellung', 'unbekümmert', 'nischt',\n", + " 'thiaumont-walde', 'kompositionen', 'monarchischer', 'ehrenwerthen',\n", + " 'zaturcy'],\n", + " dtype='object', length=4242)" ] }, - "execution_count": 19, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "doc_term_matrix = doc_terms.as_matrix().astype(int)\n", - "doc_term_matrix" + "vocabulary = document_term_matrix.columns\n", + "vocabulary" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 2.2. Creating list of vocabulary" + "### 2.2. Translate document-term matrix into an array" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To translate numbers back into words after the model creation, you have to set up a list of all unique tokens in the corpus." + "In this step, all values of your document-term matrix will be translated into an [array](https://en.wikipedia.org/wiki/Array_data_structure)." 
] }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, + "execution_count": 18, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { "text/plain": [ - "Index(['franzosen', 'genommen', 'abgewiesen', 'südlich', 'berlin', 'lassen',\n", - " 'geschütze', 'englische', 'januar', 'deutschland',\n", - " ...\n", - " 'tilemans', 'tausendmal', 'taten', 'geldinstitute', 'tatkraft',\n", - " 'gemeinem', 'tausenden', 'teilangriffe', 'tendenzstück',\n", - " 'gemeingefährlichkeitsmaßstab'],\n", - " dtype='object', length=4258)" + "array([[ 0, 1, 0, ..., 0, 0, 0],\n", + " [ 2, 1, 0, ..., 0, 0, 0],\n", + " [ 0, 1, 0, ..., 0, 0, 0],\n", + " ..., \n", + " [12, 17, 11, ..., 0, 0, 0],\n", + " [12, 6, 9, ..., 0, 0, 0],\n", + " [ 6, 6, 19, ..., 0, 0, 0]])" ] }, - "execution_count": 20, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "vocab = doc_terms.columns\n", - "vocab" + "document_term_matrix_arr = document_term_matrix.as_matrix().astype(int)\n", + "document_term_matrix_arr" ] }, { @@ -2873,22 +1067,56 @@ "source": [ "### 2.3. Generate LDA model\n", "\n", - "We can define the number of topics we want to calculate as an argument (`n_topics`) in the function. Furthermore, the number of iterations can be defined. A higher number of passes will probably yield a better model, but also increases processing time.\n", + "We use the class `LDA` from the library `lda` (the two names are distinct, because Python is case-sensitive) to generate an LDA topic model. To instantiate an `LDA` object, a couple of parameters have to be specified.\n", + "\n", + "But first, if you are curious about any library, module, class or function, try `help()`. This can be very useful, because (at least in a well-documented library) explanations of use and parameters will be printed.
We're interested in the class `LDA` of the library `lda`, so let's try:\n", + "\n", + "```\n", + "help(lda.LDA)\n", + "```\n", + "\n", + "This will print something like this (in fact even more):\n", + "\n", + "```\n", + "Help on class LDA in module lda.lda:\n", + "\n", + "class LDA(builtins.object)\n", + " | Latent Dirichlet allocation using collapsed Gibbs sampling\n", + " | \n", + " | Parameters\n", + " | ----------\n", + " | n_topics : int\n", + " | Number of topics\n", + " | \n", + " | n_iter : int, default 2000\n", + " | Number of sampling iterations\n", + " | \n", + " | alpha : float, default 0.1\n", + " | Dirichlet parameter for distribution over topics\n", + " | \n", + " | eta : float, default 0.01\n", + " | Dirichlet parameter for distribution over words\n", + " | \n", + " | random_state : int or RandomState, optional\n", + " | The generator used for the initial topics.\n", + "```\n", + "\n", + "So, now you know how to define the number of topics and the number of sampling iterations. A higher number of iterations will probably yield a better model, but also increases processing time. `alpha` and `eta` are so-called *hyperparameters*. They influence the model's performance, so feel free to play around with them. `random_state` controls the random number generator used to initialize the topics; setting it to a fixed integer should make runs repeatable. In the present example, we will keep the default values. Furthermore, there exist various methods for hyperparameter optimization, e.g. grid search or Bayesian optimization.\n", + "\n", - "**Warning: this step can take quite a while!** Meaning something between some seconds and some hours depending on corpus size and the number of iterations. Our example short stories corpus should be done within a minute or two at `n_iter=5000`." + "**Warning: This step can take quite a while!** Expect anything from a few seconds to a few hours, depending on corpus size and the number of iterations. Our example corpus should be done within a minute or two at `n_iter=5000`."
] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 47.4 s, sys: 1.96 ms, total: 47.4 s\n", - "Wall time: 47.6 s\n" + "CPU times: user 53.6 s, sys: 17.9 ms, total: 53.7 s\n", + "Wall time: 53.9 s\n" ] } ], @@ -2896,7 +1124,7 @@ "%%time\n", "\n", "model = lda.LDA(n_topics=10, n_iter=5000)\n", - "model.fit(doc_term_matrix)" + "model.fit(document_term_matrix_arr)" ] }, { @@ -2910,220 +1138,21 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "metadata": { "scrolled": true }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Key 1Key 2Key 3Key 4Key 5Key 6Key 7Key 8Key 9Key 10
Topic 1lassenweisefindenweltartnämlichsagenberlinlebenbringen
Topic 2kaiserberlinfriedrichtausendwienverfasserakademiegedichtelebenkarl
Topic 3hiesigenscheintstehtgewißgesellschaftstadtaltenseitezeitungindeß
Topic 4pressevolkeigentlichfrageregierungnamenjungebriefemännerstande
Topic 5geschützeabgewiesengenommenfranzosenjanuarabgeschlagenöstlichkriegstagebuchangriffverlusten
Topic 6septemberdezemberengländerösterreichertruppenenglischetürkenaugustgeschlagenengland
Topic 7frankreichfranzösischenlamennaisspracheanfangseinstseelekrakaubelgienbuch
Topic 8julimärzaprilsüdlichstellungenheftigegestürmtitalienerenglischemaas
Topic 9deutschlandoesterreichglaubenwissenwienerdeutscherberlinerpolitischenpublicumletzten
Topic 10leipzigfremdenwardsiehtständetheaterleipzigerzeitungdeutschstadt
\n", - "
" - ], - "text/plain": [ - " Key 1 Key 2 Key 3 Key 4 Key 5 \\\n", - "Topic 1 lassen weise finden welt art \n", - "Topic 2 kaiser berlin friedrich tausend wien \n", - "Topic 3 hiesigen scheint steht gewiß gesellschaft \n", - "Topic 4 presse volk eigentlich frage regierung \n", - "Topic 5 geschütze abgewiesen genommen franzosen januar \n", - "Topic 6 september dezember engländer österreicher truppen \n", - "Topic 7 frankreich französischen lamennais sprache anfangs \n", - "Topic 8 juli märz april südlich stellungen \n", - "Topic 9 deutschland oesterreich glauben wissen wiener \n", - "Topic 10 leipzig fremden ward sieht stände \n", - "\n", - " Key 6 Key 7 Key 8 Key 9 Key 10 \n", - "Topic 1 nämlich sagen berlin leben bringen \n", - "Topic 2 verfasser akademie gedichte leben karl \n", - "Topic 3 stadt alten seite zeitung indeß \n", - "Topic 4 namen junge briefe männer stande \n", - "Topic 5 abgeschlagen östlich kriegstagebuch angriff verlusten \n", - "Topic 6 englische türken august geschlagen england \n", - "Topic 7 einst seele krakau belgien buch \n", - "Topic 8 heftige gestürmt italiener englische maas \n", - "Topic 9 deutscher berliner politischen publicum letzten \n", - "Topic 10 theater leipziger zeitung deutsch stadt " - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "topics = preprocessing.lda2dataframe(model, vocab)\n", - "topics" + "#topics = postprocessing.show_topics(document_term_matrix, model)\n", + "#topics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. Model visualization and evaluation" + "## 3. Model visualization" ] }, { @@ -3135,563 +1164,12 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Grenzboten_1844_Tagebuch_56Grenzboten_1844_Tagebuch_70Grenzboten_1844_Tagebuch_77Grenzboten_1844_Tagebuch_82Grenzboten_1844_Tagebuch_88Grenzboten_1845_Tagebuch_52Grenzboten_1845_Tagebuch_62Grenzboten_1845_Tagebuch_81Grenzboten_1845_Tagebuch_85Grenzboten_1845_Tagebuch_93...Grenzboten_1915_Kriegstagebuch_33Grenzboten_1915_Kriegstagebuch_39Grenzboten_1915_Kriegstagebuch_73Grenzboten_1915_Kriegstagebuch_94Grenzboten_1915_Kriegstagebuch_99Grenzboten_1916_Kriegstagebuch_41Grenzboten_1916_Kriegstagebuch_48Grenzboten_1916_Kriegstagebuch_49Grenzboten_1916_Kriegstagebuch_69Grenzboten_1916_Kriegstagebuch_81
lassen weise finden0.2584790.2188520.2343220.2700000.1372580.2008400.2756770.1893880.2657100.588489...0.0098670.0001480.0147610.0003300.0001760.0049520.0250390.0001220.0028040.000134
kaiser berlin friedrich0.1966720.2667720.2580430.0706540.0810190.0511500.0409450.0196000.0976510.050868...0.0000890.0001480.0002080.0003300.0230230.0001600.0017110.0001220.0014690.000134
hiesigen scheint steht0.1475440.1513870.1223870.1693460.0704750.1672510.1354750.4590510.3424320.067588...0.0000890.0031160.0002080.0003300.0001760.0001600.0001560.0013450.0001340.001478
presse volk eigentlich0.0366090.0536570.0742030.1268630.0687170.1154070.0568770.1161460.0673800.052154...0.0045330.0075670.0022870.0003300.0001760.0001600.0001560.0001220.0001340.004167
geschütze abgewiesen genommen0.0001580.0000630.0000740.0000650.0001760.0000370.0000530.0042030.0005740.001994...0.7405330.8621660.6987530.7726070.6381370.2733230.3640750.3729830.1817090.234005
september dezember engländer0.0175910.0107820.0037810.0072550.0001760.0062430.0000530.0042030.0094470.012926...0.1787560.0698810.0688150.0795380.0845340.0544730.0328150.0331300.0869160.057930
frankreich französischen lamennais0.0144220.0095210.0304670.0203270.0001760.0270540.0308550.0262590.0271920.033505...0.0000890.0001480.0043660.0003300.0001760.0001600.0001560.0013450.0001340.000134
juli märz april0.0001580.0000630.0000740.0000650.0001760.0000370.0000530.0000420.0016180.001994...0.0632000.0535610.2081080.1389440.2532510.6551120.5755830.5905870.7264350.697715
deutschland oesterreich glauben0.2125200.2491170.1579690.1183660.5678380.2351590.1014870.1382020.1305320.140900...0.0027560.0001480.0002080.0003300.0001760.0001600.0001560.0001220.0001340.002823
leipzig fremden ward0.1158480.0397860.1186810.2170590.0739890.1968240.3585240.0429050.0574630.049582...0.0000890.0031160.0022870.0069310.0001760.0113420.0001560.0001220.0001340.001478
\n", - "

10 rows × 30 columns

\n", - "
" - ], - "text/plain": [ - " Grenzboten_1844_Tagebuch_56 \\\n", - "lassen weise finden 0.258479 \n", - "kaiser berlin friedrich 0.196672 \n", - "hiesigen scheint steht 0.147544 \n", - "presse volk eigentlich 0.036609 \n", - "geschütze abgewiesen genommen 0.000158 \n", - "september dezember engländer 0.017591 \n", - "frankreich französischen lamennais 0.014422 \n", - "juli märz april 0.000158 \n", - "deutschland oesterreich glauben 0.212520 \n", - "leipzig fremden ward 0.115848 \n", - "\n", - " Grenzboten_1844_Tagebuch_70 \\\n", - "lassen weise finden 0.218852 \n", - "kaiser berlin friedrich 0.266772 \n", - "hiesigen scheint steht 0.151387 \n", - "presse volk eigentlich 0.053657 \n", - "geschütze abgewiesen genommen 0.000063 \n", - "september dezember engländer 0.010782 \n", - "frankreich französischen lamennais 0.009521 \n", - "juli märz april 0.000063 \n", - "deutschland oesterreich glauben 0.249117 \n", - "leipzig fremden ward 0.039786 \n", - "\n", - " Grenzboten_1844_Tagebuch_77 \\\n", - "lassen weise finden 0.234322 \n", - "kaiser berlin friedrich 0.258043 \n", - "hiesigen scheint steht 0.122387 \n", - "presse volk eigentlich 0.074203 \n", - "geschütze abgewiesen genommen 0.000074 \n", - "september dezember engländer 0.003781 \n", - "frankreich französischen lamennais 0.030467 \n", - "juli märz april 0.000074 \n", - "deutschland oesterreich glauben 0.157969 \n", - "leipzig fremden ward 0.118681 \n", - "\n", - " Grenzboten_1844_Tagebuch_82 \\\n", - "lassen weise finden 0.270000 \n", - "kaiser berlin friedrich 0.070654 \n", - "hiesigen scheint steht 0.169346 \n", - "presse volk eigentlich 0.126863 \n", - "geschütze abgewiesen genommen 0.000065 \n", - "september dezember engländer 0.007255 \n", - "frankreich französischen lamennais 0.020327 \n", - "juli märz april 0.000065 \n", - "deutschland oesterreich glauben 0.118366 \n", - "leipzig fremden ward 0.217059 \n", - "\n", - " Grenzboten_1844_Tagebuch_88 \\\n", - "lassen weise finden 0.137258 \n", - "kaiser berlin 
friedrich 0.081019 \n", - "hiesigen scheint steht 0.070475 \n", - "presse volk eigentlich 0.068717 \n", - "geschütze abgewiesen genommen 0.000176 \n", - "september dezember engländer 0.000176 \n", - "frankreich französischen lamennais 0.000176 \n", - "juli märz april 0.000176 \n", - "deutschland oesterreich glauben 0.567838 \n", - "leipzig fremden ward 0.073989 \n", - "\n", - " Grenzboten_1845_Tagebuch_52 \\\n", - "lassen weise finden 0.200840 \n", - "kaiser berlin friedrich 0.051150 \n", - "hiesigen scheint steht 0.167251 \n", - "presse volk eigentlich 0.115407 \n", - "geschütze abgewiesen genommen 0.000037 \n", - "september dezember engländer 0.006243 \n", - "frankreich französischen lamennais 0.027054 \n", - "juli märz april 0.000037 \n", - "deutschland oesterreich glauben 0.235159 \n", - "leipzig fremden ward 0.196824 \n", - "\n", - " Grenzboten_1845_Tagebuch_62 \\\n", - "lassen weise finden 0.275677 \n", - "kaiser berlin friedrich 0.040945 \n", - "hiesigen scheint steht 0.135475 \n", - "presse volk eigentlich 0.056877 \n", - "geschütze abgewiesen genommen 0.000053 \n", - "september dezember engländer 0.000053 \n", - "frankreich französischen lamennais 0.030855 \n", - "juli märz april 0.000053 \n", - "deutschland oesterreich glauben 0.101487 \n", - "leipzig fremden ward 0.358524 \n", - "\n", - " Grenzboten_1845_Tagebuch_81 \\\n", - "lassen weise finden 0.189388 \n", - "kaiser berlin friedrich 0.019600 \n", - "hiesigen scheint steht 0.459051 \n", - "presse volk eigentlich 0.116146 \n", - "geschütze abgewiesen genommen 0.004203 \n", - "september dezember engländer 0.004203 \n", - "frankreich französischen lamennais 0.026259 \n", - "juli märz april 0.000042 \n", - "deutschland oesterreich glauben 0.138202 \n", - "leipzig fremden ward 0.042905 \n", - "\n", - " Grenzboten_1845_Tagebuch_85 \\\n", - "lassen weise finden 0.265710 \n", - "kaiser berlin friedrich 0.097651 \n", - "hiesigen scheint steht 0.342432 \n", - "presse volk eigentlich 0.067380 \n", - "geschütze 
abgewiesen genommen 0.000574 \n", - "september dezember engländer 0.009447 \n", - "frankreich französischen lamennais 0.027192 \n", - "juli märz april 0.001618 \n", - "deutschland oesterreich glauben 0.130532 \n", - "leipzig fremden ward 0.057463 \n", - "\n", - " Grenzboten_1845_Tagebuch_93 \\\n", - "lassen weise finden 0.588489 \n", - "kaiser berlin friedrich 0.050868 \n", - "hiesigen scheint steht 0.067588 \n", - "presse volk eigentlich 0.052154 \n", - "geschütze abgewiesen genommen 0.001994 \n", - "september dezember engländer 0.012926 \n", - "frankreich französischen lamennais 0.033505 \n", - "juli märz april 0.001994 \n", - "deutschland oesterreich glauben 0.140900 \n", - "leipzig fremden ward 0.049582 \n", - "\n", - " ... \\\n", - "lassen weise finden ... \n", - "kaiser berlin friedrich ... \n", - "hiesigen scheint steht ... \n", - "presse volk eigentlich ... \n", - "geschütze abgewiesen genommen ... \n", - "september dezember engländer ... \n", - "frankreich französischen lamennais ... \n", - "juli märz april ... \n", - "deutschland oesterreich glauben ... \n", - "leipzig fremden ward ... 
\n", - "\n", - " Grenzboten_1915_Kriegstagebuch_33 \\\n", - "lassen weise finden 0.009867 \n", - "kaiser berlin friedrich 0.000089 \n", - "hiesigen scheint steht 0.000089 \n", - "presse volk eigentlich 0.004533 \n", - "geschütze abgewiesen genommen 0.740533 \n", - "september dezember engländer 0.178756 \n", - "frankreich französischen lamennais 0.000089 \n", - "juli märz april 0.063200 \n", - "deutschland oesterreich glauben 0.002756 \n", - "leipzig fremden ward 0.000089 \n", - "\n", - " Grenzboten_1915_Kriegstagebuch_39 \\\n", - "lassen weise finden 0.000148 \n", - "kaiser berlin friedrich 0.000148 \n", - "hiesigen scheint steht 0.003116 \n", - "presse volk eigentlich 0.007567 \n", - "geschütze abgewiesen genommen 0.862166 \n", - "september dezember engländer 0.069881 \n", - "frankreich französischen lamennais 0.000148 \n", - "juli märz april 0.053561 \n", - "deutschland oesterreich glauben 0.000148 \n", - "leipzig fremden ward 0.003116 \n", - "\n", - " Grenzboten_1915_Kriegstagebuch_73 \\\n", - "lassen weise finden 0.014761 \n", - "kaiser berlin friedrich 0.000208 \n", - "hiesigen scheint steht 0.000208 \n", - "presse volk eigentlich 0.002287 \n", - "geschütze abgewiesen genommen 0.698753 \n", - "september dezember engländer 0.068815 \n", - "frankreich französischen lamennais 0.004366 \n", - "juli märz april 0.208108 \n", - "deutschland oesterreich glauben 0.000208 \n", - "leipzig fremden ward 0.002287 \n", - "\n", - " Grenzboten_1915_Kriegstagebuch_94 \\\n", - "lassen weise finden 0.000330 \n", - "kaiser berlin friedrich 0.000330 \n", - "hiesigen scheint steht 0.000330 \n", - "presse volk eigentlich 0.000330 \n", - "geschütze abgewiesen genommen 0.772607 \n", - "september dezember engländer 0.079538 \n", - "frankreich französischen lamennais 0.000330 \n", - "juli märz april 0.138944 \n", - "deutschland oesterreich glauben 0.000330 \n", - "leipzig fremden ward 0.006931 \n", - "\n", - " Grenzboten_1915_Kriegstagebuch_99 \\\n", - "lassen weise finden 0.000176 \n", 
- "kaiser berlin friedrich 0.023023 \n", - "hiesigen scheint steht 0.000176 \n", - "presse volk eigentlich 0.000176 \n", - "geschütze abgewiesen genommen 0.638137 \n", - "september dezember engländer 0.084534 \n", - "frankreich französischen lamennais 0.000176 \n", - "juli märz april 0.253251 \n", - "deutschland oesterreich glauben 0.000176 \n", - "leipzig fremden ward 0.000176 \n", - "\n", - " Grenzboten_1916_Kriegstagebuch_41 \\\n", - "lassen weise finden 0.004952 \n", - "kaiser berlin friedrich 0.000160 \n", - "hiesigen scheint steht 0.000160 \n", - "presse volk eigentlich 0.000160 \n", - "geschütze abgewiesen genommen 0.273323 \n", - "september dezember engländer 0.054473 \n", - "frankreich französischen lamennais 0.000160 \n", - "juli märz april 0.655112 \n", - "deutschland oesterreich glauben 0.000160 \n", - "leipzig fremden ward 0.011342 \n", - "\n", - " Grenzboten_1916_Kriegstagebuch_48 \\\n", - "lassen weise finden 0.025039 \n", - "kaiser berlin friedrich 0.001711 \n", - "hiesigen scheint steht 0.000156 \n", - "presse volk eigentlich 0.000156 \n", - "geschütze abgewiesen genommen 0.364075 \n", - "september dezember engländer 0.032815 \n", - "frankreich französischen lamennais 0.000156 \n", - "juli märz april 0.575583 \n", - "deutschland oesterreich glauben 0.000156 \n", - "leipzig fremden ward 0.000156 \n", - "\n", - " Grenzboten_1916_Kriegstagebuch_49 \\\n", - "lassen weise finden 0.000122 \n", - "kaiser berlin friedrich 0.000122 \n", - "hiesigen scheint steht 0.001345 \n", - "presse volk eigentlich 0.000122 \n", - "geschütze abgewiesen genommen 0.372983 \n", - "september dezember engländer 0.033130 \n", - "frankreich französischen lamennais 0.001345 \n", - "juli märz april 0.590587 \n", - "deutschland oesterreich glauben 0.000122 \n", - "leipzig fremden ward 0.000122 \n", - "\n", - " Grenzboten_1916_Kriegstagebuch_69 \\\n", - "lassen weise finden 0.002804 \n", - "kaiser berlin friedrich 0.001469 \n", - "hiesigen scheint steht 0.000134 \n", - "presse volk 
eigentlich 0.000134 \n", - "geschütze abgewiesen genommen 0.181709 \n", - "september dezember engländer 0.086916 \n", - "frankreich französischen lamennais 0.000134 \n", - "juli märz april 0.726435 \n", - "deutschland oesterreich glauben 0.000134 \n", - "leipzig fremden ward 0.000134 \n", - "\n", - " Grenzboten_1916_Kriegstagebuch_81 \n", - "lassen weise finden 0.000134 \n", - "kaiser berlin friedrich 0.000134 \n", - "hiesigen scheint steht 0.001478 \n", - "presse volk eigentlich 0.004167 \n", - "geschütze abgewiesen genommen 0.234005 \n", - "september dezember engländer 0.057930 \n", - "frankreich französischen lamennais 0.000134 \n", - "juli märz april 0.697715 \n", - "deutschland oesterreich glauben 0.002823 \n", - "leipzig fremden ward 0.001478 \n", - "\n", - "[10 rows x 30 columns]" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "doc_topics = preprocessing.lda_doc_topic(model, topics, document_labels)\n", - "doc_topics" + "#document_topics = postprocessing.show_document_topics(model, topics, metadata['title'])\n", + "#document_topics" ] }, { @@ -3707,369 +1185,17 @@ "source": [ "#### Distribution of topics over all documents\n", "\n", - "The distribution of topics over all documents can now be visualized in a heat map." + "The distribution of topics over all documents can now be visualized in a heatmap." ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " Loading BokehJS ...\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "\n", - "(function(root) {\n", - " function now() {\n", - " return new Date();\n", - " }\n", - "\n", - " var force = true;\n", - "\n", - " if (typeof (root._bokeh_onload_callbacks) === \"undefined\" || force === true) {\n", - " root._bokeh_onload_callbacks = [];\n", - " root._bokeh_is_loading = undefined;\n", - " }\n", - "\n", - " var JS_MIME_TYPE = 'application/javascript';\n", - " var HTML_MIME_TYPE = 'text/html';\n", - " var EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", - " var CLASS_NAME = 'output_bokeh rendered_html';\n", - "\n", - " /**\n", - " * Render data to the DOM node\n", - " */\n", - " function render(props, node) {\n", - " var script = document.createElement(\"script\");\n", - " node.appendChild(script);\n", - " }\n", - "\n", - " /**\n", - " * Handle when an output is cleared or removed\n", - " */\n", - " function handleClearOutput(event, handle) {\n", - " var cell = handle.cell;\n", - "\n", - " var id = cell.output_area._bokeh_element_id;\n", - " var server_id = cell.output_area._bokeh_server_id;\n", - " // Clean up Bokeh references\n", - " if (id !== undefined) {\n", - " Bokeh.index[id].model.document.clear();\n", - " delete Bokeh.index[id];\n", - " }\n", - "\n", - " if (server_id !== undefined) {\n", - " // Clean up Bokeh references\n", - " var cmd = \"from bokeh.io import _state; print(_state.uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", - " cell.notebook.kernel.execute(cmd, {\n", - " iopub: {\n", - " output: function(msg) {\n", - " var element_id = msg.content.text.trim();\n", - " Bokeh.index[element_id].model.document.clear();\n", - " delete Bokeh.index[element_id];\n", - " }\n", - " }\n", - " });\n", - " // Destroy server and session\n", - " var cmd = \"from bokeh import io; io._destroy_server('\" + server_id + \"')\";\n", - " cell.notebook.kernel.execute(cmd);\n", - " }\n", - " 
}\n", - "\n", - " /**\n", - " * Handle when a new output is added\n", - " */\n", - " function handleAddOutput(event, handle) {\n", - " var output_area = handle.output_area;\n", - " var output = handle.output;\n", - "\n", - " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", - " if ((output.output_type != \"display_data\") || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n", - " return\n", - " }\n", - "\n", - " var toinsert = output_area.element.find(`.${CLASS_NAME.split(' ')[0]}`);\n", - "\n", - " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", - " toinsert[0].firstChild.textContent = output.data[JS_MIME_TYPE];\n", - " // store reference to embed id on output_area\n", - " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", - " }\n", - " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", - " var bk_div = document.createElement(\"div\");\n", - " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", - " var script_attrs = bk_div.children[0].attributes;\n", - " for (var i = 0; i < script_attrs.length; i++) {\n", - " toinsert[0].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", - " }\n", - " // store reference to server id on output_area\n", - " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", - " }\n", - " }\n", - "\n", - " function register_renderer(events, OutputArea) {\n", - "\n", - " function append_mime(data, metadata, element) {\n", - " // create a DOM node to render to\n", - " var toinsert = this.create_output_subarea(\n", - " metadata,\n", - " CLASS_NAME,\n", - " EXEC_MIME_TYPE\n", - " );\n", - " this.keyboard_manager.register_events(toinsert);\n", - " // Render to node\n", - " var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", - " render(props, toinsert[0]);\n", - " element.append(toinsert);\n", - " return toinsert\n", - " }\n", - "\n", - " /* Handle when an output is cleared or removed */\n", - " 
events.on('clear_output.CodeCell', handleClearOutput);\n", - " events.on('delete.Cell', handleClearOutput);\n", - "\n", - " /* Handle when a new output is added */\n", - " events.on('output_added.OutputArea', handleAddOutput);\n", - "\n", - " /**\n", - " * Register the mime type and append_mime function with output_area\n", - " */\n", - " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", - " /* Is output safe? */\n", - " safe: true,\n", - " /* Index of renderer in `output_area.display_order` */\n", - " index: 0\n", - " });\n", - " }\n", - "\n", - " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", - " if (root.Jupyter !== undefined) {\n", - " var events = require('base/js/events');\n", - " var OutputArea = require('notebook/js/outputarea').OutputArea;\n", - "\n", - " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", - " register_renderer(events, OutputArea);\n", - " }\n", - " }\n", - "\n", - " \n", - " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", - " root._bokeh_timeout = Date.now() + 5000;\n", - " root._bokeh_failed_load = false;\n", - " }\n", - "\n", - " var NB_LOAD_WARNING = {'data': {'text/html':\n", - " \"
\\n\"+\n", - " \"

\\n\"+\n", - " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", - " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", - " \"

\\n\"+\n", - " \"\\n\"+\n", - " \"\\n\"+\n", - " \"from bokeh.resources import INLINE\\n\"+\n", - " \"output_notebook(resources=INLINE)\\n\"+\n", - " \"\\n\"+\n", - " \"
\"}};\n", - "\n", - " function display_loaded() {\n", - " var el = document.getElementById(\"ae5da312-f172-401e-83ce-2d4489192a95\");\n", - " if (el != null) {\n", - " el.textContent = \"BokehJS is loading...\";\n", - " }\n", - " if (root.Bokeh !== undefined) {\n", - " if (el != null) {\n", - " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", - " }\n", - " } else if (Date.now() < root._bokeh_timeout) {\n", - " setTimeout(display_loaded, 100)\n", - " }\n", - " }\n", - "\n", - "\n", - " function run_callbacks() {\n", - " try {\n", - " root._bokeh_onload_callbacks.forEach(function(callback) { callback() });\n", - " }\n", - " finally {\n", - " delete root._bokeh_onload_callbacks\n", - " }\n", - " console.info(\"Bokeh: all callbacks have finished\");\n", - " }\n", - "\n", - " function load_libs(js_urls, callback) {\n", - " root._bokeh_onload_callbacks.push(callback);\n", - " if (root._bokeh_is_loading > 0) {\n", - " console.log(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", - " return null;\n", - " }\n", - " if (js_urls == null || js_urls.length === 0) {\n", - " run_callbacks();\n", - " return null;\n", - " }\n", - " console.log(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", - " root._bokeh_is_loading = js_urls.length;\n", - " for (var i = 0; i < js_urls.length; i++) {\n", - " var url = js_urls[i];\n", - " var s = document.createElement('script');\n", - " s.src = url;\n", - " s.async = false;\n", - " s.onreadystatechange = s.onload = function() {\n", - " root._bokeh_is_loading--;\n", - " if (root._bokeh_is_loading === 0) {\n", - " console.log(\"Bokeh: all BokehJS libraries loaded\");\n", - " run_callbacks()\n", - " }\n", - " };\n", - " s.onerror = function() {\n", - " console.warn(\"failed to load library \" + url);\n", - " };\n", - " console.log(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", - " document.getElementsByTagName(\"head\")[0].appendChild(s);\n", - " 
}\n", - " };var element = document.getElementById(\"ae5da312-f172-401e-83ce-2d4489192a95\");\n", - " if (element == null) {\n", - " console.log(\"Bokeh: ERROR: autoload.js configured with elementid 'ae5da312-f172-401e-83ce-2d4489192a95' but no matching script tag was found. \")\n", - " return false;\n", - " }\n", - "\n", - " var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-0.12.9.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.9.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-0.12.9.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-0.12.9.min.js\"];\n", - "\n", - " var inline_js = [\n", - " function(Bokeh) {\n", - " Bokeh.set_log_level(\"info\");\n", - " },\n", - " \n", - " function(Bokeh) {\n", - " \n", - " },\n", - " function(Bokeh) {\n", - " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-0.12.9.min.css\");\n", - " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-0.12.9.min.css\");\n", - " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.9.min.css\");\n", - " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.9.min.css\");\n", - " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-tables-0.12.9.min.css\");\n", - " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-tables-0.12.9.min.css\");\n", - " }\n", - " ];\n", - "\n", - " function run_inline_js() {\n", - " \n", - " if ((root.Bokeh !== undefined) || (force === true)) {\n", - " for (var i = 0; i < inline_js.length; i++) {\n", - " inline_js[i].call(root, root.Bokeh);\n", - " }if (force === true) {\n", - " display_loaded();\n", - " }} else if (Date.now() < root._bokeh_timeout) {\n", - " setTimeout(run_inline_js, 100);\n", - " } else if (!root._bokeh_failed_load) {\n", - " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", - " root._bokeh_failed_load = true;\n", - " } 
else if (force !== true) {\n", - " var cell = $(document.getElementById(\"ae5da312-f172-401e-83ce-2d4489192a95\")).parents('.cell').data().cell;\n", - " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", - " }\n", - "\n", - " }\n", - "\n", - " if (root._bokeh_is_loading === 0) {\n", - " console.log(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", - " run_inline_js();\n", - " } else {\n", - " load_libs(js_urls, function() {\n", - " console.log(\"Bokeh: BokehJS plotting callback run at\", now());\n", - " run_inline_js();\n", - " });\n", - " }\n", - "}(window));" - ], - "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n function now() {\n return new Date();\n }\n\n var force = true;\n\n if (typeof (root._bokeh_onload_callbacks) === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n \n\n \n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n var NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n var el = document.getElementById(\"ae5da312-f172-401e-83ce-2d4489192a95\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) { callback() });\n }\n finally {\n delete root._bokeh_onload_callbacks\n }\n console.info(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(js_urls, callback) {\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.log(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.log(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = js_urls.length;\n for (var i = 0; i < js_urls.length; i++) {\n var url = js_urls[i];\n var s = document.createElement('script');\n s.src = url;\n s.async = false;\n s.onreadystatechange = s.onload = function() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.log(\"Bokeh: all BokehJS libraries loaded\");\n run_callbacks()\n }\n };\n s.onerror = function() {\n console.warn(\"failed to load library \" + url);\n };\n console.log(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.getElementsByTagName(\"head\")[0].appendChild(s);\n }\n };var element = document.getElementById(\"ae5da312-f172-401e-83ce-2d4489192a95\");\n if (element == null) {\n console.log(\"Bokeh: ERROR: autoload.js configured with elementid 'ae5da312-f172-401e-83ce-2d4489192a95' but no matching script tag was found. 
\")\n return false;\n }\n\n var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-0.12.9.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.9.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-0.12.9.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-0.12.9.min.js\"];\n\n var inline_js = [\n function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\n \n function(Bokeh) {\n \n },\n function(Bokeh) {\n console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-0.12.9.min.css\");\n Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-0.12.9.min.css\");\n console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.9.min.css\");\n Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.9.min.css\");\n console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-tables-0.12.9.min.css\");\n Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-tables-0.12.9.min.css\");\n }\n ];\n\n function run_inline_js() {\n \n if ((root.Bokeh !== undefined) || (force === true)) {\n for (var i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }if (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n var cell = $(document.getElementById(\"ae5da312-f172-401e-83ce-2d4489192a95\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n\n }\n\n if (root._bokeh_is_loading === 0) {\n console.log(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(js_urls, function() {\n console.log(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" - 
}, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "(function(root) {\n", - " function embed_document(root) {\n", - " var docs_json = {\"a52734d4-2460-4f3e-9980-98f4e66392be\":{\"roots\":{\"references\":[{\"attributes\":{},\"id\":\"d9f5dca5-a6f3-4fdc-a8cf-010a3ff7465e\",\"type\":\"CategoricalTicker\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":1},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"width\":{\"units\":\"data\",\"value\":1},\"x\":{\"field\":\"Document\"},\"y\":{\"field\":\"Topic\"}},\"id\":\"1e7ef23b-8e77-4a2a-9bdc-bad0757cb637\",\"type\":\"Rect\"},{\"attributes\":{\"dimension\":1,\"grid_line_color\":{\"value\":null},\"plot\":{\"id\":\"cc7132b0-9bb9-4798-abac-4d592927ac73\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"d9f5dca5-a6f3-4fdc-a8cf-010a3ff7465e\",\"type\":\"CategoricalTicker\"}},\"id\":\"f72ea21a-6831-48e4-9555-8b78e51bdf50\",\"type\":\"Grid\"},{\"attributes\":{\"high\":0.8621661721068248,\"low\":2.4521824423737128e-05,\"palette\":[\"#c6dbef\",\"#9ecae1\",\"#6baed6\",\"#4292c6\",\"#2171b5\",\"#08519c\",\"#08306b\"]},\"id\":\"8980aa08-a1db-4af6-baa8-bb1e48191338\",\"type\":\"LinearColorMapper\"},{\"attributes\":{\"callback\":null,\"column_names\":[\"Document\",\"Score\",\"Topic\",\"index\"],\"data\":{\"Document\":[\"Grenzboten_1844_Tagebuch_56\",\"Grenzboten_1844_Tagebuch_56\",\"Grenzboten_1844_Tagebuch_56\",\"Grenzboten_1844_Tagebuch_56\",\"Grenzboten_1844_Tagebuch_56\",\"Grenzboten_1844_Tagebuch_56\",\"Grenzboten_1844_Tagebuch_56\",\"Grenzboten_1844_Tagebuch_56\",\"Grenzboten_1844_Tagebuch_56\",\"Grenzboten_1844_Tagebuch_56\",\"Grenzboten_1844_Tagebuch_70\",\"Grenzboten_1844_Tagebuch_70\",\"Grenzboten_1844_Tagebuch_70\",\"Grenzboten_1844_Tagebuch_70\",\"Grenzboten_1844_Tagebuch_70\",\"Grenzboten_1844_Tagebuch_70\",\"Grenzboten_1844_Tagebuch_70\",\"Grenzboten_
1844_Tagebuch_70\",\"Grenzboten_1844_Tagebuch_70\",\"Grenzboten_1844_Tagebuch_70\",\"Grenzboten_1844_Tagebuch_77\",\"Grenzboten_1844_Tagebuch_77\",\"Grenzboten_1844_Tagebuch_77\",\"Grenzboten_1844_Tagebuch_77\",\"Grenzboten_1844_Tagebuch_77\",\"Grenzboten_1844_Tagebuch_77\",\"Grenzboten_1844_Tagebuch_77\",\"Grenzboten_1844_Tagebuch_77\",\"Grenzboten_1844_Tagebuch_77\",\"Grenzboten_1844_Tagebuch_77\",\"Grenzboten_1844_Tagebuch_82\",\"Grenzboten_1844_Tagebuch_82\",\"Grenzboten_1844_Tagebuch_82\",\"Grenzboten_1844_Tagebuch_82\",\"Grenzboten_1844_Tagebuch_82\",\"Grenzboten_1844_Tagebuch_82\",\"Grenzboten_1844_Tagebuch_82\",\"Grenzboten_1844_Tagebuch_82\",\"Grenzboten_1844_Tagebuch_82\",\"Grenzboten_1844_Tagebuch_82\",\"Grenzboten_1844_Tagebuch_88\",\"Grenzboten_1844_Tagebuch_88\",\"Grenzboten_1844_Tagebuch_88\",\"Grenzboten_1844_Tagebuch_88\",\"Grenzboten_1844_Tagebuch_88\",\"Grenzboten_1844_Tagebuch_88\",\"Grenzboten_1844_Tagebuch_88\",\"Grenzboten_1844_Tagebuch_88\",\"Grenzboten_1844_Tagebuch_88\",\"Grenzboten_1844_Tagebuch_88\",\"Grenzboten_1845_Tagebuch_52\",\"Grenzboten_1845_Tagebuch_52\",\"Grenzboten_1845_Tagebuch_52\",\"Grenzboten_1845_Tagebuch_52\",\"Grenzboten_1845_Tagebuch_52\",\"Grenzboten_1845_Tagebuch_52\",\"Grenzboten_1845_Tagebuch_52\",\"Grenzboten_1845_Tagebuch_52\",\"Grenzboten_1845_Tagebuch_52\",\"Grenzboten_1845_Tagebuch_52\",\"Grenzboten_1845_Tagebuch_62\",\"Grenzboten_1845_Tagebuch_62\",\"Grenzboten_1845_Tagebuch_62\",\"Grenzboten_1845_Tagebuch_62\",\"Grenzboten_1845_Tagebuch_62\",\"Grenzboten_1845_Tagebuch_62\",\"Grenzboten_1845_Tagebuch_62\",\"Grenzboten_1845_Tagebuch_62\",\"Grenzboten_1845_Tagebuch_62\",\"Grenzboten_1845_Tagebuch_62\",\"Grenzboten_1845_Tagebuch_81\",\"Grenzboten_1845_Tagebuch_81\",\"Grenzboten_1845_Tagebuch_81\",\"Grenzboten_1845_Tagebuch_81\",\"Grenzboten_1845_Tagebuch_81\",\"Grenzboten_1845_Tagebuch_81\",\"Grenzboten_1845_Tagebuch_81\",\"Grenzboten_1845_Tagebuch_81\",\"Grenzboten_1845_Tagebuch_81\",\"Grenzboten_1845_Tagebuch_81
\",\"Grenzboten_1845_Tagebuch_85\",\"Grenzboten_1845_Tagebuch_85\",\"Grenzboten_1845_Tagebuch_85\",\"Grenzboten_1845_Tagebuch_85\",\"Grenzboten_1845_Tagebuch_85\",\"Grenzboten_1845_Tagebuch_85\",\"Grenzboten_1845_Tagebuch_85\",\"Grenzboten_1845_Tagebuch_85\",\"Grenzboten_1845_Tagebuch_85\",\"Grenzboten_1845_Tagebuch_85\",\"Grenzboten_1845_Tagebuch_93\",\"Grenzboten_1845_Tagebuch_93\",\"Grenzboten_1845_Tagebuch_93\",\"Grenzboten_1845_Tagebuch_93\",\"Grenzboten_1845_Tagebuch_93\",\"Grenzboten_1845_Tagebuch_93\",\"Grenzboten_1845_Tagebuch_93\",\"Grenzboten_1845_Tagebuch_93\",\"Grenzboten_1845_Tagebuch_93\",\"Grenzboten_1845_Tagebuch_93\",\"Grenzboten_1846_Tagebuch_51\",\"Grenzboten_1846_Tagebuch_51\",\"Grenzboten_1846_Tagebuch_51\",\"Grenzboten_1846_Tagebuch_51\",\"Grenzboten_1846_Tagebuch_51\",\"Grenzboten_1846_Tagebuch_51\",\"Grenzboten_1846_Tagebuch_51\",\"Grenzboten_1846_Tagebuch_51\",\"Grenzboten_1846_Tagebuch_51\",\"Grenzboten_1846_Tagebuch_51\",\"Grenzboten_1846_Tagebuch_72\",\"Grenzboten_1846_Tagebuch_72\",\"Grenzboten_1846_Tagebuch_72\",\"Grenzboten_1846_Tagebuch_72\",\"Grenzboten_1846_Tagebuch_72\",\"Grenzboten_1846_Tagebuch_72\",\"Grenzboten_1846_Tagebuch_72\",\"Grenzboten_1846_Tagebuch_72\",\"Grenzboten_1846_Tagebuch_72\",\"Grenzboten_1846_Tagebuch_72\",\"Grenzboten_1846_Tagebuch_82\",\"Grenzboten_1846_Tagebuch_82\",\"Grenzboten_1846_Tagebuch_82\",\"Grenzboten_1846_Tagebuch_82\",\"Grenzboten_1846_Tagebuch_82\",\"Grenzboten_1846_Tagebuch_82\",\"Grenzboten_1846_Tagebuch_82\",\"Grenzboten_1846_Tagebuch_82\",\"Grenzboten_1846_Tagebuch_82\",\"Grenzboten_1846_Tagebuch_82\",\"Grenzboten_1846_Tagebuch_88\",\"Grenzboten_1846_Tagebuch_88\",\"Grenzboten_1846_Tagebuch_88\",\"Grenzboten_1846_Tagebuch_88\",\"Grenzboten_1846_Tagebuch_88\",\"Grenzboten_1846_Tagebuch_88\",\"Grenzboten_1846_Tagebuch_88\",\"Grenzboten_1846_Tagebuch_88\",\"Grenzboten_1846_Tagebuch_88\",\"Grenzboten_1846_Tagebuch_88\",\"Grenzboten_1846_Tagebuch_96\",\"Grenzboten_1846_Tagebuch_96\",\"Grenzboten_
1846_Tagebuch_96\",\"Grenzboten_1846_Tagebuch_96\",\"Grenzboten_1846_Tagebuch_96\",\"Grenzboten_1846_Tagebuch_96\",\"Grenzboten_1846_Tagebuch_96\",\"Grenzboten_1846_Tagebuch_96\",\"Grenzboten_1846_Tagebuch_96\",\"Grenzboten_1846_Tagebuch_96\",\"Grenzboten_1914_Kriegstagebuch_37\",\"Grenzboten_1914_Kriegstagebuch_37\",\"Grenzboten_1914_Kriegstagebuch_37\",\"Grenzboten_1914_Kriegstagebuch_37\",\"Grenzboten_1914_Kriegstagebuch_37\",\"Grenzboten_1914_Kriegstagebuch_37\",\"Grenzboten_1914_Kriegstagebuch_37\",\"Grenzboten_1914_Kriegstagebuch_37\",\"Grenzboten_1914_Kriegstagebuch_37\",\"Grenzboten_1914_Kriegstagebuch_37\",\"Grenzboten_1914_Kriegstagebuch_68\",\"Grenzboten_1914_Kriegstagebuch_68\",\"Grenzboten_1914_Kriegstagebuch_68\",\"Grenzboten_1914_Kriegstagebuch_68\",\"Grenzboten_1914_Kriegstagebuch_68\",\"Grenzboten_1914_Kriegstagebuch_68\",\"Grenzboten_1914_Kriegstagebuch_68\",\"Grenzboten_1914_Kriegstagebuch_68\",\"Grenzboten_1914_Kriegstagebuch_68\",\"Grenzboten_1914_Kriegstagebuch_68\",\"Grenzboten_1914_Kriegstagebuch_94\",\"Grenzboten_1914_Kriegstagebuch_94\",\"Grenzboten_1914_Kriegstagebuch_94\",\"Grenzboten_1914_Kriegstagebuch_94\",\"Grenzboten_1914_Kriegstagebuch_94\",\"Grenzboten_1914_Kriegstagebuch_94\",\"Grenzboten_1914_Kriegstagebuch_94\",\"Grenzboten_1914_Kriegstagebuch_94\",\"Grenzboten_1914_Kriegstagebuch_94\",\"Grenzboten_1914_Kriegstagebuch_94\",\"Grenzboten_1914_Kriegstagebuch_95\",\"Grenzboten_1914_Kriegstagebuch_95\",\"Grenzboten_1914_Kriegstagebuch_95\",\"Grenzboten_1914_Kriegstagebuch_95\",\"Grenzboten_1914_Kriegstagebuch_95\",\"Grenzboten_1914_Kriegstagebuch_95\",\"Grenzboten_1914_Kriegstagebuch_95\",\"Grenzboten_1914_Kriegstagebuch_95\",\"Grenzboten_1914_Kriegstagebuch_95\",\"Grenzboten_1914_Kriegstagebuch_95\",\"Grenzboten_1914_Kriegstagebuch_97\",\"Grenzboten_1914_Kriegstagebuch_97\",\"Grenzboten_1914_Kriegstagebuch_97\",\"Grenzboten_1914_Kriegstagebuch_97\",\"Grenzboten_1914_Kriegstagebuch_97\",\"Grenzboten_1914_Kriegstagebuch_97\",\"Grenzbo
ten_1914_Kriegstagebuch_97\",\"Grenzboten_1914_Kriegstagebuch_97\",\"Grenzboten_1914_Kriegstagebuch_97\",\"Grenzboten_1914_Kriegstagebuch_97\",\"Grenzboten_1915_Kriegstagebuch_33\",\"Grenzboten_1915_Kriegstagebuch_33\",\"Grenzboten_1915_Kriegstagebuch_33\",\"Grenzboten_1915_Kriegstagebuch_33\",\"Grenzboten_1915_Kriegstagebuch_33\",\"Grenzboten_1915_Kriegstagebuch_33\",\"Grenzboten_1915_Kriegstagebuch_33\",\"Grenzboten_1915_Kriegstagebuch_33\",\"Grenzboten_1915_Kriegstagebuch_33\",\"Grenzboten_1915_Kriegstagebuch_33\",\"Grenzboten_1915_Kriegstagebuch_39\",\"Grenzboten_1915_Kriegstagebuch_39\",\"Grenzboten_1915_Kriegstagebuch_39\",\"Grenzboten_1915_Kriegstagebuch_39\",\"Grenzboten_1915_Kriegstagebuch_39\",\"Grenzboten_1915_Kriegstagebuch_39\",\"Grenzboten_1915_Kriegstagebuch_39\",\"Grenzboten_1915_Kriegstagebuch_39\",\"Grenzboten_1915_Kriegstagebuch_39\",\"Grenzboten_1915_Kriegstagebuch_39\",\"Grenzboten_1915_Kriegstagebuch_73\",\"Grenzboten_1915_Kriegstagebuch_73\",\"Grenzboten_1915_Kriegstagebuch_73\",\"Grenzboten_1915_Kriegstagebuch_73\",\"Grenzboten_1915_Kriegstagebuch_73\",\"Grenzboten_1915_Kriegstagebuch_73\",\"Grenzboten_1915_Kriegstagebuch_73\",\"Grenzboten_1915_Kriegstagebuch_73\",\"Grenzboten_1915_Kriegstagebuch_73\",\"Grenzboten_1915_Kriegstagebuch_73\",\"Grenzboten_1915_Kriegstagebuch_94\",\"Grenzboten_1915_Kriegstagebuch_94\",\"Grenzboten_1915_Kriegstagebuch_94\",\"Grenzboten_1915_Kriegstagebuch_94\",\"Grenzboten_1915_Kriegstagebuch_94\",\"Grenzboten_1915_Kriegstagebuch_94\",\"Grenzboten_1915_Kriegstagebuch_94\",\"Grenzboten_1915_Kriegstagebuch_94\",\"Grenzboten_1915_Kriegstagebuch_94\",\"Grenzboten_1915_Kriegstagebuch_94\",\"Grenzboten_1915_Kriegstagebuch_99\",\"Grenzboten_1915_Kriegstagebuch_99\",\"Grenzboten_1915_Kriegstagebuch_99\",\"Grenzboten_1915_Kriegstagebuch_99\",\"Grenzboten_1915_Kriegstagebuch_99\",\"Grenzboten_1915_Kriegstagebuch_99\",\"Grenzboten_1915_Kriegstagebuch_99\",\"Grenzboten_1915_Kriegstagebuch_99\",\"Grenzboten_1915_Kriegstagebuch_
99\",\"Grenzboten_1915_Kriegstagebuch_99\",\"Grenzboten_1916_Kriegstagebuch_41\",\"Grenzboten_1916_Kriegstagebuch_41\",\"Grenzboten_1916_Kriegstagebuch_41\",\"Grenzboten_1916_Kriegstagebuch_41\",\"Grenzboten_1916_Kriegstagebuch_41\",\"Grenzboten_1916_Kriegstagebuch_41\",\"Grenzboten_1916_Kriegstagebuch_41\",\"Grenzboten_1916_Kriegstagebuch_41\",\"Grenzboten_1916_Kriegstagebuch_41\",\"Grenzboten_1916_Kriegstagebuch_41\",\"Grenzboten_1916_Kriegstagebuch_48\",\"Grenzboten_1916_Kriegstagebuch_48\",\"Grenzboten_1916_Kriegstagebuch_48\",\"Grenzboten_1916_Kriegstagebuch_48\",\"Grenzboten_1916_Kriegstagebuch_48\",\"Grenzboten_1916_Kriegstagebuch_48\",\"Grenzboten_1916_Kriegstagebuch_48\",\"Grenzboten_1916_Kriegstagebuch_48\",\"Grenzboten_1916_Kriegstagebuch_48\",\"Grenzboten_1916_Kriegstagebuch_48\",\"Grenzboten_1916_Kriegstagebuch_49\",\"Grenzboten_1916_Kriegstagebuch_49\",\"Grenzboten_1916_Kriegstagebuch_49\",\"Grenzboten_1916_Kriegstagebuch_49\",\"Grenzboten_1916_Kriegstagebuch_49\",\"Grenzboten_1916_Kriegstagebuch_49\",\"Grenzboten_1916_Kriegstagebuch_49\",\"Grenzboten_1916_Kriegstagebuch_49\",\"Grenzboten_1916_Kriegstagebuch_49\",\"Grenzboten_1916_Kriegstagebuch_49\",\"Grenzboten_1916_Kriegstagebuch_69\",\"Grenzboten_1916_Kriegstagebuch_69\",\"Grenzboten_1916_Kriegstagebuch_69\",\"Grenzboten_1916_Kriegstagebuch_69\",\"Grenzboten_1916_Kriegstagebuch_69\",\"Grenzboten_1916_Kriegstagebuch_69\",\"Grenzboten_1916_Kriegstagebuch_69\",\"Grenzboten_1916_Kriegstagebuch_69\",\"Grenzboten_1916_Kriegstagebuch_69\",\"Grenzboten_1916_Kriegstagebuch_69\",\"Grenzboten_1916_Kriegstagebuch_81\",\"Grenzboten_1916_Kriegstagebuch_81\",\"Grenzboten_1916_Kriegstagebuch_81\",\"Grenzboten_1916_Kriegstagebuch_81\",\"Grenzboten_1916_Kriegstagebuch_81\",\"Grenzboten_1916_Kriegstagebuch_81\",\"Grenzboten_1916_Kriegstagebuch_81\",\"Grenzboten_1916_Kriegstagebuch_81\",\"Grenzboten_1916_Kriegstagebuch_81\",\"Grenzboten_1916_Kriegstagebuch_81\"],\"Score\":{\"__ndarray__\":\"t+422emK0D8vIR3jiyzJP/2E40S
14sI/MLJkXVu+oj9AmsHaqMUkP8Pjs21oA5I/UkcLFwyJjT9AmsHaqMUkP0AJemDZM8s/uWlTlDSovT+T8g99WwPMP9m0e9TJEtE/Yn0yWqdgwz9HHJId7nirP7Divf9ThxA/2KynN8oUhj9sAbIXpX+DP7Divf9ThxA/tHMALRPjzz/ghK4FyF6kP8sgoA1B/s0/gTPrw8aD0D/uCuBZwFS/P66NY6n5/rI/+lpbQbZuEz/+mCl4cvhuPw+r7Zq+Mp8/+lpbQbZuEz/KbkHkUjjEP/0Ur3PYYb4/SOF6FK5H0T9rJ+OeWhayPwOLE5wkrcU/PXCj1gk9wD94ZlVEMyIRP7cdhOpQt30/e78DSIzQlD94ZlVEMyIRP/gIGis8Tb4/yfsuYpXIyz91EdZ5rpHBP81r+tWuvbQ/dxlpMJ4Ksj++i3u/cJexP/wk6ZUWCSc//CTplRYJJz/8JOmVFgknP/wk6ZUWCSc/nZjgQLsr4j/pNEQS+fCyP1CU/rMdtck/7OvNCFcwqj/qZzeUeWjFP73tro5Ri70/r5QS4j4kAz+i0gQCbJJ5Py/kLwL1s5s/r5QS4j4kAz/vV2AirxnOP5FUrIOEMck/qib+m7Gk0T8VVk5JyPakP6n9+TpBV8E/wEvyRwYfrT+P4znZ3dcLP4/jOdnd1ws/ujAAP3WYnz+P4znZ3dcLPzbkESIN+7k/wOwbGw3y1j8vCIDp3z3IP+yLNz8nEpQ/K6XIORhh3T/A8nGkxru9P4Wd0Y4/N3E/hZ3Rjj83cT89xZkhmeOaP88dB25s0QU/G6X1eZywwT9DqsfrmvelP2TiTrpjAdE/ELiy6K3/uD+caayJaOrVP1/rEh3QP7E/K5zRtQHQQj9s1p4201iDP98nkbY82Js/PBaQ9DCCWj/JhqnKSLXAP1ItiPLZa60/WaPBWebU4j+dwxV1ZAuqP67ei5h5TbE/q2B4SPmzqj8FNydH0VRgP7Sp4mT4eIo/V3tiT4snoT8FNydH0VRgP2gtg5AFCcI/jiazoc9iqT+Apk4n0gvLP1u4oT9BSKE/pjzIz0VFuD+5cutLQ3O4P0dX96xqS0k/BW7i2mRlEj/t2zuGyOepPwVu4tpkZRI/ZO3tmoiO3T8TSIJ+w0OqP+0TLrxZKNE/o4DAMRp01D/ymclQLq23P/htoI27Urw/fZ4fNS5uPj/m/hb4lSEGP5ZGJpH0XKc/5v4W+JUhBj86951XIs/AP3HdsrwhO6A/K5NhfJDfyj/OOdzbMlGcP+42ImQbvME/VEdJeddG3T9lT5aoqr1xP58S5ClBnoI/4k5DBseRkj+VIg398CdBP+hOCtEOoLg/tTcCvRSFoz8QN0++I8PQPxVT2q6zlqQ/I/zjdA9wrD8bxM2T78i4PzKl7TprSWQ/Ao5F+MfpHj82iJsn35GhPzKl7TprSWQ/PsCxCP84wT/vAY5F+MfXPxTN4zDnM88/liogpXBelT9iIyVDejm1PweajktlCrY/8C9p8Ya2+T7gPSCUFzd6P6jl4vwTDNU/8C9p8Ya2+T6ux1B1QIPHP5KN3nt5Pqg/v1cjCXw1sj/oUoehLnWYP3GgtkEHaos/fzcTAXgzMT/OTQ3t3NTAPxyobdCB2uY/6FKHoS51mD+QUTEhGRWTP09segHFpmc/fzcTAXgzMT+cjmfyknRpP/efm45n8qI/rDbXuF1MeD/9fgWZU4MyP5Y7KvZZhdc/pldtrnG74D9iwuA7TYGNP/1+BZlTgzI/CzKnBiWAoT9iwuA7TYGNPwN8yu/0GIU/Naa+aRfXlD8cMLdoQz6KPzbko+GRY48/hdQCRyQx2z96jAp9nojfPwN8yu/0GIU/HDC3aEM+ij+Gc/WCYXcwP4Zz9YJhdzA/h47hxXnomD9UHEFmTMWBP8XWQehxbI0/Lg2x8+rSoD+6GiN6qavRPyIsxq0qwuI/OXehFZVzlz/F1kHocWyNP7ECYaQdK5A//SCBieAPcj+
OE8k4kYyDP8wdoNwByj0/3Ry7zbHbjD/MHaDcAco9P97m2G2O3d4/ahWTVjFp3T/MHaDcAco9P8wdoNwByj0/3Ry7zbHbjD8Wk1YxaRWTP3IeO5X5NIQ/KFineztNFz8oWKd7O00XPz9aiWaLkXI/D3cI9nKy5z8wcotIduHGPyhYp3s7TRc/dXEbDeAtsD9nHcqf0ZJmPyhYp3s7TRc/dp75AWdyIz92nvkBZ3IjP/qflzInhmk/itQlK1T+fj9+Uh2D3ZbrP0UhUcK947E/dp75AWdyIz90AMlBU2yrP3ae+QFnciM/+p+XMieGaT+u40Fx+DqOP/+zATD5Pys//7MBMPk/Kz/AKwFR+7tiP8Nl9GguXOY/3RmJmNudsT8fHoGH++FxPzOqLVdJo8o//7MBMPk/Kz/AKwFR+7tiP1C1rfQIoTU/ULWt9AihNT9Qta30CKE1P1C1rfQIoTU/YA+v3DK56D+xh1dumVy0P1C1rfQIoTU/ltUw3enIwT9Qta30CKE1P/n9I8FbY3w/+yTplRYJJz/Ym2wdTZOXP/sk6ZUWCSc/+yTplRYJJz84XYR1nmvkPz2H1bcJpLU/+yTplRYJJz8kqzoERTXQP/sk6ZUWCSc/+yTplRYJJz/IjwX2oEh0PxlKNwYi8CQ/GUo3BiLwJD8ZSjcGIvAkPwdx8WweftE/s6VHUt3jqz8ZSjcGIvAkP2A72RCt9uQ/GUo3BiLwJD8zVuW+ZTqHP1tVMM/Ko5k/81aEXlMHXD8OhXcta2IkPw6Fdy1rYiQ/dr7Ywv9M1z+qgnlWHs2gPw6Fdy1rYiQ/aQ6Fdy1r4j8OhXcta2IkPw6Fdy1rYiQ/HlHYQAIGID8eUdhAAgYgP4lvKRlDCFY/HlHYQAIGID/YQVqb897XP9/9pGJc9qA/iW8pGUMIVj8sIltIFubiPx5R2EACBiA/HlHYQAIGID+yrTOM3fdmP64jNujbD1g/xF8nwOV/IT/EXyfA5X8hPwhVpBw9Qsc/QxDSnx5Atj/EXyfA5X8hP6dNkCH1Puc/xF8nwOV/IT/EXyfA5X8hP54R4BkBniE/nhHgGQGeIT85GJSDQTlYPxAREREREXE/8jU+X+PzzT+oPZDaA6mtP54R4BkBniE/U+g6ha5T5j8fF/ZxYR9nPzkYlINBOVg/\",\"dtype\":\"float64\",\"shape\":[300]},\"Topic\":[\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen 
genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen 
lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden 
ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse 
volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember 
engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\",\"lassen weise finden\",\"kaiser berlin friedrich\",\"hiesigen scheint steht\",\"presse volk eigentlich\",\"gesch\\u00fctze abgewiesen genommen\",\"september dezember engl\\u00e4nder\",\"frankreich franz\\u00f6sischen lamennais\",\"juli m\\u00e4rz april\",\"deutschland oesterreich glauben\",\"leipzig fremden ward\"],\"index\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299]}},\"id\":\"d4ff694d-aae2-47a4-a80f-79f970412430\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"0a37
3ec3-94e5-4ce7-9282-7d1a55abd2e0\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"above\":[{\"id\":\"5a612a5a-e55e-4515-99b2-8c95b6f70765\",\"type\":\"CategoricalAxis\"}],\"left\":[{\"id\":\"981fbe08-4554-416f-87aa-95a94201bf01\",\"type\":\"CategoricalAxis\"}],\"plot_height\":768,\"plot_width\":1024,\"renderers\":[{\"id\":\"5a612a5a-e55e-4515-99b2-8c95b6f70765\",\"type\":\"CategoricalAxis\"},{\"id\":\"3ecc04df-28f9-4d43-a7db-6fb5818ab815\",\"type\":\"Grid\"},{\"id\":\"981fbe08-4554-416f-87aa-95a94201bf01\",\"type\":\"CategoricalAxis\"},{\"id\":\"f72ea21a-6831-48e4-9555-8b78e51bdf50\",\"type\":\"Grid\"},{\"id\":\"44daf9f4-3462-43e4-bcab-1742d6c977c9\",\"type\":\"BoxAnnotation\"},{\"id\":\"03b5f777-5a00-4168-8aac-8b3d2518ae2e\",\"type\":\"GlyphRenderer\"},{\"id\":\"931f6e70-3635-47fe-878d-485a37cfa79e\",\"type\":\"ColorBar\"}],\"right\":[{\"id\":\"931f6e70-3635-47fe-878d-485a37cfa79e\",\"type\":\"ColorBar\"}],\"sizing_mode\":\"scale_width\",\"title\":{\"id\":\"5018e278-06ca-40a1-a93e-3a91b8ae2656\",\"type\":\"Title\"},\"toolbar\":{\"id\":\"1f66849c-bc1d-423d-882d-41d1fc128fd5\",\"type\":\"Toolbar\"},\"toolbar_location\":\"below\",\"x_range\":{\"id\":\"f336331d-83c8-4d8a-a20a-8ecc553de8b0\",\"type\":\"FactorRange\"},\"x_scale\":{\"id\":\"08e96640-f779-4b9b-9be9-2cc97b1af99c\",\"type\":\"CategoricalScale\"},\"y_range\":{\"id\":\"a6f29578-79f8-4e38-8d89-398d383d832e\",\"type\":\"FactorRange\"},\"y_scale\":{\"id\":\"b42d5943-26fe-497a-a544-55732d4ef5fc\",\"type\":\"CategoricalScale\"}},\"id\":\"cc7132b0-9bb9-4798-abac-4d592927ac73\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"callback\":null,\"tooltips\":[[\"Document\",\"@Document\"],[\"Topic\",\"@Topic\"],[\"Score\",\"@Score\"]]},\"id\":\"11cc72f7-7275-45bd-81fb-6d947bb36c82\",\"type\":\"HoverTool\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"11cc72f7-7275-45bd-81fb-6d947bb36c82\",\"type\":\"HoverTool\
"},{\"id\":\"2b461b9d-b9a0-4f55-802c-ea31e041f6ce\",\"type\":\"SaveTool\"},{\"id\":\"3ac9c38b-0ac7-411e-ac6c-14c266496fd8\",\"type\":\"PanTool\"},{\"id\":\"a83eaa2d-9ac2-4a8b-b472-210b9965c2ef\",\"type\":\"BoxZoomTool\"},{\"id\":\"1b72514c-cb22-4c03-800d-09ee3371cf66\",\"type\":\"ResetTool\"},{\"id\":\"b0ef7ce5-b856-4224-a451-3bf98392c884\",\"type\":\"WheelZoomTool\"}]},\"id\":\"1f66849c-bc1d-423d-882d-41d1fc128fd5\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"fb1a1301-ad36-4ac5-b8f2-36f146823982\",\"type\":\"CategoricalTickFormatter\"},{\"attributes\":{\"source\":{\"id\":\"d4ff694d-aae2-47a4-a80f-79f970412430\",\"type\":\"ColumnDataSource\"}},\"id\":\"8410a135-3f94-4c59-a1e5-7ee2d6115c87\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2b461b9d-b9a0-4f55-802c-ea31e041f6ce\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"3ac9c38b-0ac7-411e-ac6c-14c266496fd8\",\"type\":\"PanTool\"},{\"attributes\":{\"callback\":null,\"factors\":[\"Grenzboten_1844_Tagebuch_56\",\"Grenzboten_1844_Tagebuch_70\",\"Grenzboten_1844_Tagebuch_77\",\"Grenzboten_1844_Tagebuch_82\",\"Grenzboten_1844_Tagebuch_88\",\"Grenzboten_1845_Tagebuch_52\",\"Grenzboten_1845_Tagebuch_62\",\"Grenzboten_1845_Tagebuch_81\",\"Grenzboten_1845_Tagebuch_85\",\"Grenzboten_1845_Tagebuch_93\",\"Grenzboten_1846_Tagebuch_51\",\"Grenzboten_1846_Tagebuch_72\",\"Grenzboten_1846_Tagebuch_82\",\"Grenzboten_1846_Tagebuch_88\",\"Grenzboten_1846_Tagebuch_96\",\"Grenzboten_1914_Kriegstagebuch_37\",\"Grenzboten_1914_Kriegstagebuch_68\",\"Grenzboten_1914_Kriegstagebuch_94\",\"Grenzboten_1914_Kriegstagebuch_95\",\"Grenzboten_1914_Kriegstagebuch_97\",\"Grenzboten_1915_Kriegstagebuch_33\",\"Grenzboten_1915_Kriegstagebuch_39\",\"Grenzboten_1915_Kriegstagebuch_73\",\"Grenzboten_1915_Kriegstagebuch_94\",\"Grenzboten_1915_Kriegstagebuch_99\",\"Grenzboten_1916_Kriegstagebuch_41\",\"Grenzboten_1916_Kriegstagebuch_48\",\"Grenzboten_1916_Kriegstagebuch_49\",\"Grenzboten_1916_Kriegstagebuch_69\",\"Grenzboten_1916_Kriegstagebuch_
81\"]},\"id\":\"f336331d-83c8-4d8a-a20a-8ecc553de8b0\",\"type\":\"FactorRange\"},{\"attributes\":{\"overlay\":{\"id\":\"44daf9f4-3462-43e4-bcab-1742d6c977c9\",\"type\":\"BoxAnnotation\"}},\"id\":\"a83eaa2d-9ac2-4a8b-b472-210b9965c2ef\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"08e96640-f779-4b9b-9be9-2cc97b1af99c\",\"type\":\"CategoricalScale\"},{\"attributes\":{},\"id\":\"1b72514c-cb22-4c03-800d-09ee3371cf66\",\"type\":\"ResetTool\"},{\"attributes\":{\"callback\":null,\"factors\":[\"leipzig fremden ward\",\"deutschland oesterreich glauben\",\"juli m\\u00e4rz april\",\"frankreich franz\\u00f6sischen lamennais\",\"september dezember engl\\u00e4nder\",\"gesch\\u00fctze abgewiesen genommen\",\"presse volk eigentlich\",\"hiesigen scheint steht\",\"kaiser berlin friedrich\",\"lassen weise finden\"]},\"id\":\"a6f29578-79f8-4e38-8d89-398d383d832e\",\"type\":\"FactorRange\"},{\"attributes\":{},\"id\":\"14df591d-55f9-49b1-860c-6f2f5b895dae\",\"type\":\"CategoricalTickFormatter\"},{\"attributes\":{},\"id\":\"b42d5943-26fe-497a-a544-55732d4ef5fc\",\"type\":\"CategoricalScale\"},{\"attributes\":{},\"id\":\"b0ef7ce5-b856-4224-a451-3bf98392c884\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"data_source\":{\"id\":\"d4ff694d-aae2-47a4-a80f-79f970412430\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"4b300400-4e37-46c9-8356-df0ea290ff6a\",\"type\":\"Rect\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1e7ef23b-8e77-4a2a-9bdc-bad0757cb637\",\"type\":\"Rect\"},\"selection_glyph\":null,\"view\":{\"id\":\"8410a135-3f94-4c59-a1e5-7ee2d6115c87\",\"type\":\"CDSView\"}},\"id\":\"03b5f777-5a00-4168-8aac-8b3d2518ae2e\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis_line_color\":{\"value\":null},\"formatter\":{\"id\":\"fb1a1301-ad36-4ac5-b8f2-36f146823982\",\"type\":\"CategoricalTickFormatter\"},\"major_label_standoff\":0,\"major_label_text_font_size\":{\"value\":\"9pt\"},\"major_tick_line_color\":{\"value\":null},\"plot\":{\"id\":\"cc7132b
0-9bb9-4798-abac-4d592927ac73\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"d9f5dca5-a6f3-4fdc-a8cf-010a3ff7465e\",\"type\":\"CategoricalTicker\"}},\"id\":\"981fbe08-4554-416f-87aa-95a94201bf01\",\"type\":\"CategoricalAxis\"},{\"attributes\":{\"axis_line_color\":{\"value\":null},\"formatter\":{\"id\":\"14df591d-55f9-49b1-860c-6f2f5b895dae\",\"type\":\"CategoricalTickFormatter\"},\"major_label_orientation\":1.0471975511965976,\"major_label_standoff\":0,\"major_label_text_font_size\":{\"value\":\"9pt\"},\"major_tick_line_color\":{\"value\":null},\"plot\":{\"id\":\"cc7132b0-9bb9-4798-abac-4d592927ac73\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"97b549bb-7bbc-4a08-b311-71fbf3774e3f\",\"type\":\"CategoricalTicker\"}},\"id\":\"5a612a5a-e55e-4515-99b2-8c95b6f70765\",\"type\":\"CategoricalAxis\"},{\"attributes\":{\"plot\":null,\"text\":\"Grenzbote\"},\"id\":\"5018e278-06ca-40a1-a93e-3a91b8ae2656\",\"type\":\"Title\"},{\"attributes\":{\"desired_num_ticks\":7},\"id\":\"8dea744a-1cf6-4c25-b5f2-d60082049c5a\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"97b549bb-7bbc-4a08-b311-71fbf3774e3f\",\"type\":\"CategoricalTicker\"},{\"attributes\":{\"color_mapper\":{\"id\":\"8980aa08-a1db-4af6-baa8-bb1e48191338\",\"type\":\"LinearColorMapper\"},\"formatter\":{\"id\":\"0a373ec3-94e5-4ce7-9282-7d1a55abd2e0\",\"type\":\"BasicTickFormatter\"},\"label_standoff\":6,\"location\":[0,0],\"major_label_text_font_size\":{\"value\":\"10pt\"},\"plot\":{\"id\":\"cc7132b0-9bb9-4798-abac-4d592927ac73\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"8dea744a-1cf6-4c25-b5f2-d60082049c5a\",\"type\":\"BasicTicker\"}},\"id\":\"931f6e70-3635-47fe-878d-485a37cfa79e\",\"type\":\"ColorBar\"},{\"attributes\":{\"grid_line_color\":{\"value\":null},\"plot\":{\"id\":\"cc7132b0-9bb9-4798-abac-4d592927ac73\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"97b549bb-7bbc-4a08-b311-71fbf3774e3f\",\"type\":\"CategoricalTicker\"}},\"id\":\"3ecc0
4df-28f9-4d43-a7db-6fb5818ab815\",\"type\":\"Grid\"},{\"attributes\":{\"fill_color\":{\"field\":\"Score\",\"transform\":{\"id\":\"8980aa08-a1db-4af6-baa8-bb1e48191338\",\"type\":\"LinearColorMapper\"}},\"height\":{\"units\":\"data\",\"value\":1},\"line_color\":{\"value\":null},\"width\":{\"units\":\"data\",\"value\":1},\"x\":{\"field\":\"Document\"},\"y\":{\"field\":\"Topic\"}},\"id\":\"4b300400-4e37-46c9-8356-df0ea290ff6a\",\"type\":\"Rect\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"lightgrey\"},\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":{\"value\":1.0},\"line_color\":{\"value\":\"black\"},\"line_dash\":[4,4],\"line_width\":{\"value\":2},\"plot\":null,\"render_mode\":\"css\",\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"44daf9f4-3462-43e4-bcab-1742d6c977c9\",\"type\":\"BoxAnnotation\"}],\"root_ids\":[\"cc7132b0-9bb9-4798-abac-4d592927ac73\"]},\"title\":\"Bokeh Application\",\"version\":\"0.12.9\"}};\n", - " var render_items = [{\"docid\":\"a52734d4-2460-4f3e-9980-98f4e66392be\",\"elementid\":\"8f296c22-f954-43e7-999f-483734edeac1\",\"modelid\":\"cc7132b0-9bb9-4798-abac-4d592927ac73\",\"notebook_comms_target\":\"ad9a8eb6-48ba-45f8-aa53-9b480cca0f1c\"}];\n", - "\n", - " root.Bokeh.embed.embed_items(docs_json, render_items);\n", - " }\n", - "\n", - " if (root.Bokeh !== undefined) {\n", - " embed_document(root);\n", - " } else {\n", - " var attempts = 0;\n", - " var timer = setInterval(function(root) {\n", - " if (root.Bokeh !== undefined) {\n", - " embed_document(root);\n", - " clearInterval(timer);\n", - " }\n", - " attempts++;\n", - " if (attempts > 100) {\n", - " console.log(\"Bokeh: ERROR: Unable to embed document because BokehJS library is missing\")\n", - " clearInterval(timer);\n", - " }\n", - " }, 10, root)\n", - " }\n", - "})(window);" - ], - "application/vnd.bokehjs_exec.v0+json": "" - }, - "metadata": { - "application/vnd.bokehjs_exec.v0+json": { - "id": 
"cc7132b0-9bb9-4798-abac-4d592927ac73" - } - }, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "

<Bokeh Notebook handle for In[24]>

" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "plot = visualization.doc_topic_heatmap_interactive(doc_topics, title=\"Grenzbote\")\n", - "show(plot, notebook_handle=True)" + "#plot = visualization.plot_document_topics_heatmap(doc_topics, interactive=True)\n", + "#show(plot, notebook_handle=True)" ] }, { @@ -4083,24 +1209,13 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 23, "metadata": { "scrolled": false }, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "visualization.plot_doc_topics(doc_topics, 0)" + "#visualization.plot_doc_topics(doc_topics, 0)" ] } ],