diff --git a/IntegrationTest_v01.ipynb b/IntegrationTest_v01.ipynb index bfc112a..4a61a4a 100644 --- a/IntegrationTest_v01.ipynb +++ b/IntegrationTest_v01.ipynb @@ -11,21 +11,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "31-Jan-2017 15:42:24 DEBUG gensim.models.doc2vec: Fast version of gensim.models.doc2vec is being used\n", - "31-Jan-2017 15:42:24 INFO summa.preprocessing.cleaner: 'pattern' package not found; tag filters are not available for English\n", - "31-Jan-2017 15:42:24 INFO root: Generating grammar tables from /usr/lib/python3.5/lib2to3/Grammar.txt\n", - "31-Jan-2017 15:42:24 INFO root: Generating grammar tables from /usr/lib/python3.5/lib2to3/PatternGrammar.txt\n", - "/usr/local/lib/python3.5/dist-packages/funcy/decorators.py:56: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead\n", + "c:\\users\\philip\\appdata\\local\\programs\\python\\python35-32\\lib\\site-packages\\gensim\\utils.py:855: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n", + " warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n", + "c:\\users\\philip\\appdata\\local\\programs\\python\\python35-32\\lib\\site-packages\\funcy\\decorators.py:56: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead\n", " spec = inspect.getargspec(func)\n", - "/usr/local/lib/python3.5/dist-packages/funcy/decorators.py:56: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead\n", + "c:\\users\\philip\\appdata\\local\\programs\\python\\python35-32\\lib\\site-packages\\funcy\\decorators.py:56: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead\n", " spec = inspect.getargspec(func)\n", - "/usr/local/lib/python3.5/dist-packages/funcy/decorators.py:56: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead\n", + "c:\\users\\philip\\appdata\\local\\programs\\python\\python35-32\\lib\\site-packages\\funcy\\decorators.py:56: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead\n", " spec = inspect.getargspec(func)\n", - "/usr/local/lib/python3.5/dist-packages/funcy/decorators.py:56: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead\n", + "c:\\users\\philip\\appdata\\local\\programs\\python\\python35-32\\lib\\site-packages\\funcy\\decorators.py:56: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead\n", " spec = inspect.getargspec(func)\n", - "/usr/local/lib/python3.5/dist-packages/funcy/decorators.py:56: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead\n", + "c:\\users\\philip\\appdata\\local\\programs\\python\\python35-32\\lib\\site-packages\\funcy\\decorators.py:56: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead\n", " spec = inspect.getargspec(func)\n", - "/usr/local/lib/python3.5/dist-packages/funcy/decorators.py:56: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead\n", + "c:\\users\\philip\\appdata\\local\\programs\\python\\python35-32\\lib\\site-packages\\funcy\\decorators.py:56: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead\n", " spec = inspect.getargspec(func)\n" ] } @@ -62,18 +60,32 @@ "name": "stderr", "output_type": "stream", "text": [ - "31-Jan-2017 14:49:17 INFO preprocessing: Creating document list from TXT files ...\n", - 
"31-Jan-2017 14:49:17 DEBUG preprocessing: 17 entries in document list.\n" + "c:\\users\\philip\\appdata\\local\\programs\\python\\python35-32\\lib\\site-packages\\IPython\\core\\formatters.py:92: DeprecationWarning: DisplayFormatter._ipython_display_formatter_default is deprecated: use @default decorator instead.\n", + " def _ipython_display_formatter_default(self):\n", + "c:\\users\\philip\\appdata\\local\\programs\\python\\python35-32\\lib\\site-packages\\IPython\\core\\formatters.py:98: DeprecationWarning: DisplayFormatter._formatters_default is deprecated: use @default decorator instead.\n", + " def _formatters_default(self):\n", + "c:\\users\\philip\\appdata\\local\\programs\\python\\python35-32\\lib\\site-packages\\IPython\\core\\formatters.py:677: DeprecationWarning: PlainTextFormatter._deferred_printers_default is deprecated: use @default decorator instead.\n", + " def _deferred_printers_default(self):\n", + "c:\\users\\philip\\appdata\\local\\programs\\python\\python35-32\\lib\\site-packages\\IPython\\core\\formatters.py:669: DeprecationWarning: PlainTextFormatter._singleton_printers_default is deprecated: use @default decorator instead.\n", + " def _singleton_printers_default(self):\n", + "c:\\users\\philip\\appdata\\local\\programs\\python\\python35-32\\lib\\site-packages\\IPython\\core\\formatters.py:672: DeprecationWarning: PlainTextFormatter._type_printers_default is deprecated: use @default decorator instead.\n", + " def _type_printers_default(self):\n", + "c:\\users\\philip\\appdata\\local\\programs\\python\\python35-32\\lib\\site-packages\\IPython\\core\\formatters.py:669: DeprecationWarning: PlainTextFormatter._singleton_printers_default is deprecated: use @default decorator instead.\n", + " def _singleton_printers_default(self):\n", + "c:\\users\\philip\\appdata\\local\\programs\\python\\python35-32\\lib\\site-packages\\IPython\\core\\formatters.py:672: DeprecationWarning: PlainTextFormatter._type_printers_default is deprecated: use @default decorator instead.\n", + " def _type_printers_default(self):\n", + "c:\\users\\philip\\appdata\\local\\programs\\python\\python35-32\\lib\\site-packages\\IPython\\core\\formatters.py:677: DeprecationWarning: PlainTextFormatter._deferred_printers_default is deprecated: use @default decorator instead.\n", + " def _deferred_printers_default(self):\n" ] }, { "data": { "text/plain": [ - "['corpus_txt/Poe_EurekaAProsePoem.txt',\n", - " 'corpus_txt/Howard_TheDevilinIron.txt',\n", - " 'corpus_txt/Lovecraft_TheShunnedHouse.txt',\n", - " 'corpus_txt/Howard_SchadowsinZamboula.txt',\n", - " 'corpus_txt/Doyle_AStudyinScarlet.txt']" + "['corpus_txt\\\\Doyle_AScandalinBohemia.txt',\n", + " 'corpus_txt\\\\Doyle_AStudyinScarlet.txt',\n", + " 'corpus_txt\\\\Doyle_TheHoundoftheBaskervilles.txt',\n", + " 'corpus_txt\\\\Doyle_TheSignoftheFour.txt',\n", + " 'corpus_txt\\\\Howard_GodsoftheNorth.txt']" ] }, "execution_count": 2, @@ -97,22 +109,14 @@ "collapsed": false }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "31-Jan-2017 14:49:17 INFO preprocessing: Creating document list from CSV files ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: 16 entries in document list.\n" - ] - }, { "data": { "text/plain": [ - "['corpus_csv/Howard_GodsoftheNorth.txt.csv',\n", - " 'corpus_csv/Poe_EurekaAProsePoem.txt.csv',\n", - " 'corpus_csv/Poe_TheMasqueoftheRedDeath.txt.csv',\n", - " 'corpus_csv/Poe_ThePurloinedLetter.txt.csv',\n", - " 'corpus_csv/Howard_ShadowsintheMoonlight.txt.csv']" + 
"['corpus_csv\\\\Doyle_AStudyinScarlet.txt.csv',\n", + " 'corpus_csv\\\\Doyle_TheHoundoftheBaskervilles.txt.csv',\n", + " 'corpus_csv\\\\Doyle_TheSignoftheFour.txt.csv',\n", + " 'corpus_csv\\\\Howard_GodsoftheNorth.txt.csv',\n", + " 'corpus_csv\\\\Howard_SchadowsinZamboula.txt.csv']" ] }, "execution_count": 3, @@ -141,22 +145,14 @@ "collapsed": false }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "31-Jan-2017 14:49:17 INFO preprocessing: Creating document labels ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Document labels available\n" - ] - }, { "data": { "text/plain": [ - "['corpus_txt/Poe_EurekaAProsePoem.txt',\n", - " 'corpus_txt/Howard_TheDevilinIron.txt',\n", - " 'corpus_txt/Lovecraft_TheShunnedHouse.txt',\n", - " 'corpus_txt/Howard_SchadowsinZamboula.txt',\n", - " 'corpus_txt/Doyle_AStudyinScarlet.txt']" + "['corpus_txt\\\\Doyle_AScandalinBohemia.txt',\n", + " 'corpus_txt\\\\Doyle_AStudyinScarlet.txt',\n", + " 'corpus_txt\\\\Doyle_TheHoundoftheBaskervilles.txt',\n", + " 'corpus_txt\\\\Doyle_TheSignoftheFour.txt',\n", + " 'corpus_txt\\\\Howard_GodsoftheNorth.txt']" ] }, "execution_count": 4, @@ -212,1032 +208,1009 @@ "collapsed": false }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Poe_EurekaAProsePoem.txt ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Howard_TheDevilinIron.txt ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Lovecraft_TheShunnedHouse.txt ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Howard_SchadowsinZamboula.txt ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Doyle_AStudyinScarlet.txt ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Poe_TheCaskofAmontillado.txt ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Poe_TheMasqueoftheRedDeath.txt ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Howard_GodsoftheNorth.txt ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Kipling_TheEndofthePassage.txt ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Doyle_TheSignoftheFour.txt ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Kipling_TheJungleBook.txt ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Doyle_AScandalinBohemia.txt ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Poe_ThePurloinedLetter.txt ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Lovecraft_AttheMountainofMadness.txt ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Kipling_ThyServantaDog.txt ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Howard_ShadowsintheMoonlight.txt ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: Accessing TXT document corpus_txt/Doyle_TheHoundoftheBaskervilles.txt ...\n" - ] - }, { "data": { "text/plain": [ - "[['eureka',\n", - " 'a',\n", - " 'prose',\n", - " 'poem',\n", - " 'by',\n", - " 'edgar',\n", - " 'a',\n", - " 'poe',\n", - " 'new-york',\n", - " 'geo',\n", - " 'p',\n", - " 'putnam',\n", - " 'of',\n", - " 'late',\n", - " 'firm',\n", - " 'of',\n", - " 
'wiley',\n", - " 'putnam',\n", - " 'broadway',\n", - " 'mdcccxlviii',\n", - " 'entered',\n", - " 'according',\n", - " 'to',\n", - " 'act',\n", - " 'of',\n", - " 'congress',\n", + "[['a',\n", + " 'scandal',\n", " 'in',\n", - " 'the',\n", - " 'year',\n", - " 'by',\n", - " 'edgar',\n", + " 'bohemia',\n", " 'a',\n", - " 'poe',\n", - " 'in',\n", - " 'the',\n", - " 'clerk’s',\n", - " 'office',\n", - " 'of',\n", - " 'the',\n", - " 'district',\n", - " 'court',\n", - " 'for',\n", - " 'the',\n", - " 'southern',\n", - " 'district',\n", - " 'of',\n", - " 'new-york',\n", - " 'leavitt',\n", - " 'trow',\n", - " 'co',\n", - " 'prs',\n", - " 'ann-street',\n", - " 'with',\n", - " 'very',\n", - " 'profound',\n", - " 'respect',\n", - " 'this',\n", - " 'work',\n", - " 'is',\n", - " 'dedicated',\n", - " 'to',\n", - " 'alexander',\n", - " 'von',\n", - " 'humboldt',\n", - " 'preface',\n", - " 'to',\n", - " 'the',\n", - " 'few',\n", - " 'who',\n", - " 'love',\n", - " 'me',\n", - " 'and',\n", - " 'whom',\n", + " 'conan',\n", + " 'doyle',\n", " 'i',\n", - " 'love—to',\n", - " 'those',\n", - " 'who',\n", - " 'feel',\n", - " 'rather',\n", - " 'than',\n", " 'to',\n", - " 'those',\n", - " 'who',\n", - " 'think—to',\n", + " 'sherlock',\n", + " 'holmes',\n", + " 'she',\n", + " 'is',\n", + " 'always',\n", " 'the',\n", - " 'dreamers',\n", - " 'and',\n", - " 'those',\n", - " 'who',\n", - " 'put',\n", - " 'faith',\n", - " 'in',\n", - " 'dreams',\n", - " 'as',\n", + " 'woman',\n", + " 'i',\n", + " 'have',\n", + " 'seldom',\n", + " 'heard',\n", + " 'him',\n", + " 'mention',\n", + " 'her',\n", + " 'under',\n", + " 'any',\n", + " 'other',\n", + " 'name',\n", " 'in',\n", + " 'his',\n", + " 'eyes',\n", + " 'she',\n", + " 'eclipses',\n", + " 'and',\n", + " 'predominates',\n", " 'the',\n", - " 'only',\n", - " 'realities—i',\n", - " 'offer',\n", - " 'this',\n", - " 'book',\n", + " 'whole',\n", " 'of',\n", - " 'truths',\n", + " 'her',\n", + " 'sex',\n", + " 'it',\n", + " 'was',\n", " 'not',\n", - " 'in',\n", - " 'its',\n", - " 'character',\n", - " 'of',\n", - " 'truth-teller',\n", - " 'but',\n", + " 'that',\n", + " 'he',\n", + " 'felt',\n", + " 'any',\n", + " 'emotion',\n", + " 'akin',\n", + " 'to',\n", + " 'love',\n", " 'for',\n", - " 'the',\n", - " 'beauty',\n", + " 'irene',\n", + " 'adler',\n", + " 'all',\n", + " 'emotions',\n", + " 'and',\n", " 'that',\n", - " 'abounds',\n", - " 'in',\n", - " 'its',\n", - " 'truth',\n", - " 'constituting',\n", - " 'it',\n", - " 'true',\n", + " 'one',\n", + " 'particularly',\n", + " 'were',\n", + " 'abhorrent',\n", " 'to',\n", - " 'these',\n", - " 'i',\n", - " 'present',\n", - " 'the',\n", - " 'composition',\n", - " 'as',\n", - " 'an',\n", - " 'art-product',\n", - " 'alone:—let',\n", - " 'us',\n", - " 'say',\n", - " 'as',\n", - " 'a',\n", - " 'romance',\n", - " 'or',\n", - " 'if',\n", - " 'i',\n", - " 'be',\n", - " 'not',\n", - " 'urging',\n", - " 'too',\n", - " 'lofty',\n", - " 'a',\n", - " 'claim',\n", - " 'as',\n", - " 'a',\n", - " 'poem',\n", - " 'what',\n", + " 'his',\n", + " 'cold',\n", + " 'precise',\n", + " 'but',\n", + " 'admirably',\n", + " 'balanced',\n", + " 'mind',\n", + " 'he',\n", + " 'was',\n", " 'i',\n", - " 'here',\n", - " 'propound',\n", - " 'is',\n", - " 'true_:—therefore',\n", - " 'it',\n", - " 'cannot',\n", - " 'die:—or',\n", - " 'if',\n", - " 'by',\n", - " 'any',\n", - " 'means',\n", + " 'take',\n", " 'it',\n", - " 'be',\n", - " 'now',\n", - " 'trodden',\n", - " 'down',\n", - " 'so',\n", + " 'the',\n", + " 'most',\n", + " 'perfect',\n", + " 'reasoning',\n", + " 'and',\n", + " 
'observing',\n", + " 'machine',\n", " 'that',\n", - " 'it',\n", - " 'die',\n", - " 'it',\n", - " 'will',\n", - " 'rise',\n", - " 'again',\n", - " 'to',\n", " 'the',\n", - " 'life',\n", - " 'everlasting',\n", - " 'nevertheless',\n", - " 'it',\n", - " 'is',\n", + " 'world',\n", + " 'has',\n", + " 'seen',\n", + " 'but',\n", " 'as',\n", " 'a',\n", - " 'poem',\n", - " 'only',\n", - " 'that',\n", - " 'i',\n", - " 'wish',\n", - " 'this',\n", - " 'work',\n", - " 'to',\n", - " 'be',\n", - " 'judged',\n", - " 'after',\n", - " 'i',\n", - " 'am',\n", - " 'dead',\n", - " 'e',\n", + " 'lover',\n", + " 'he',\n", + " 'would',\n", + " 'have',\n", + " 'placed',\n", + " 'himself',\n", + " 'in',\n", " 'a',\n", - " 'p',\n", - " 'eureka',\n", - " 'an',\n", - " 'essay',\n", - " 'on',\n", + " 'false',\n", + " 'position',\n", + " 'he',\n", + " 'never',\n", + " 'spoke',\n", + " 'of',\n", " 'the',\n", - " 'material',\n", - " 'and',\n", - " 'spiritual',\n", - " 'universe',\n", - " 'it',\n", - " 'is',\n", - " 'with',\n", - " 'humility',\n", - " 'really',\n", - " 'unassumed—it',\n", - " 'is',\n", + " 'softer',\n", + " 'passions',\n", + " 'save',\n", " 'with',\n", " 'a',\n", - " 'sentiment',\n", - " 'even',\n", - " 'of',\n", - " 'awe—that',\n", - " 'i',\n", - " 'pen',\n", + " 'gibe',\n", + " 'and',\n", + " 'a',\n", + " 'sneer',\n", + " 'they',\n", + " 'were',\n", + " 'admirable',\n", + " 'things',\n", + " 'for',\n", " 'the',\n", - " 'opening',\n", - " 'sentence',\n", - " 'of',\n", - " 'this',\n", - " 'work',\n", + " 'observer--excellent',\n", " 'for',\n", - " 'of',\n", - " 'all',\n", - " 'conceivable',\n", - " 'subjects',\n", - " 'i',\n", - " 'approach',\n", + " 'drawing',\n", " 'the',\n", - " 'reader',\n", - " 'with',\n", + " 'veil',\n", + " 'from',\n", + " \"men's\",\n", + " 'motives',\n", + " 'and',\n", + " 'actions',\n", + " 'but',\n", + " 'for',\n", " 'the',\n", - " 'most',\n", - " 'solemn—the',\n", - " 'most',\n", - " 'comprehensive—the',\n", - " 'most',\n", - " 'difficult—the',\n", - " 'most',\n", - " 'august',\n", - " 'what',\n", - " 'terms',\n", - " 'shall',\n", - " 'i',\n", - " 'find',\n", - " 'sufficiently',\n", - " 'simple',\n", + " 'trained',\n", + " 'reasoner',\n", + " 'to',\n", + " 'admit',\n", + " 'such',\n", + " 'intrusions',\n", + " 'into',\n", + " 'his',\n", + " 'own',\n", + " 'delicate',\n", + " 'and',\n", + " 'finely',\n", + " 'adjusted',\n", + " 'temperament',\n", + " 'was',\n", + " 'to',\n", + " 'introduce',\n", + " 'a',\n", + " 'distracting',\n", + " 'factor',\n", + " 'which',\n", + " 'might',\n", + " 'throw',\n", + " 'a',\n", + " 'doubt',\n", + " 'upon',\n", + " 'all',\n", + " 'his',\n", + " 'mental',\n", + " 'results',\n", + " 'grit',\n", " 'in',\n", - " 'their',\n", - " 'sublimity—sufficiently',\n", - " 'sublime',\n", + " 'a',\n", + " 'sensitive',\n", + " 'instrument',\n", + " 'or',\n", + " 'a',\n", + " 'crack',\n", " 'in',\n", - " 'their',\n", - " 'simplicity—for',\n", - " 'the',\n", - " 'mere',\n", - " 'enunciation',\n", - " 'of',\n", - " 'my',\n", - " 'theme',\n", - " 'i',\n", - " 'design',\n", - " 'to',\n", - " 'speak',\n", + " 'one',\n", " 'of',\n", - " 'the',\n", - " 'physical',\n", - " 'metaphysical',\n", - " 'and',\n", - " 'mathematical—of',\n", - " 'the',\n", - " 'material',\n", - " 'and',\n", - " 'spiritual',\n", - " 'universe:—of',\n", - " 'its',\n", - " 'essence',\n", - " 'its',\n", - " 'origin',\n", - " 'its',\n", - " 'creation',\n", - " 'its',\n", - " 'present',\n", - " 'condition',\n", - " 'and',\n", - " 'its',\n", - " 'destiny',\n", - " 'i',\n", - " 'shall',\n", + " 'his',\n", 
+ " 'own',\n", + " 'high-power',\n", + " 'lenses',\n", + " 'would',\n", + " 'not',\n", " 'be',\n", - " 'so',\n", - " 'rash',\n", - " 'moreover',\n", + " 'more',\n", + " 'disturbing',\n", + " 'than',\n", + " 'a',\n", + " 'strong',\n", + " 'emotion',\n", + " 'in',\n", + " 'a',\n", + " 'nature',\n", + " 'such',\n", " 'as',\n", - " 'to',\n", - " 'challenge',\n", - " 'the',\n", - " 'conclusions',\n", + " 'his',\n", " 'and',\n", - " 'thus',\n", - " 'in',\n", - " 'effect',\n", + " 'yet',\n", + " 'there',\n", + " 'was',\n", + " 'but',\n", + " 'one',\n", + " 'woman',\n", " 'to',\n", - " 'question',\n", + " 'him',\n", + " 'and',\n", + " 'that',\n", + " 'woman',\n", + " 'was',\n", " 'the',\n", - " 'sagacity',\n", - " 'of',\n", - " 'many',\n", + " 'late',\n", + " 'irene',\n", + " 'adler',\n", " 'of',\n", - " 'the',\n", - " 'greatest',\n", + " 'dubious',\n", " 'and',\n", - " 'most',\n", - " 'justly',\n", - " 'reverenced',\n", + " 'questionable',\n", + " 'memory',\n", + " 'i',\n", + " 'had',\n", + " 'seen',\n", + " 'little',\n", " 'of',\n", - " 'men',\n", - " 'in',\n", - " 'the',\n", - " 'beginning',\n", - " 'let',\n", - " 'me',\n", - " 'as',\n", - " 'distinctly',\n", - " 'as',\n", - " 'possible',\n", - " 'announce—not',\n", + " 'holmes',\n", + " 'lately',\n", + " 'my',\n", + " 'marriage',\n", + " 'had',\n", + " 'drifted',\n", + " 'us',\n", + " 'away',\n", + " 'from',\n", + " 'each',\n", + " 'other',\n", + " 'my',\n", + " 'own',\n", + " 'complete',\n", + " 'happiness',\n", + " 'and',\n", " 'the',\n", - " 'theorem',\n", + " 'home-centred',\n", + " 'interests',\n", " 'which',\n", - " 'i',\n", - " 'hope',\n", - " 'to',\n", - " 'demonstrate—for',\n", - " 'whatever',\n", - " 'the',\n", - " 'mathematicians',\n", - " 'may',\n", - " 'assert',\n", - " 'there',\n", - " 'is',\n", - " 'in',\n", - " 'this',\n", - " 'world',\n", - " 'at',\n", - " 'least',\n", - " 'no',\n", - " 'such',\n", - " 'thing',\n", - " 'as',\n", - " 'demonstration—but',\n", - " 'the',\n", - " 'ruling',\n", - " 'idea',\n", - " 'which',\n", - " 'throughout',\n", - " 'this',\n", - " 'volume',\n", - " 'i',\n", - " 'shall',\n", - " 'be',\n", - " 'continually',\n", - " 'endeavoring',\n", - " 'to',\n", - " 'suggest',\n", - " 'my',\n", - " 'general',\n", - " 'proposition',\n", - " 'then',\n", - " 'is',\n", - " 'this:—_in',\n", - " 'the',\n", - " 'original',\n", - " 'unity',\n", - " 'of',\n", + " 'rise',\n", + " 'up',\n", + " 'around',\n", " 'the',\n", + " 'man',\n", + " 'who',\n", " 'first',\n", - " 'thing',\n", - " 'lies',\n", - " 'the',\n", - " 'secondary',\n", - " 'cause',\n", + " 'finds',\n", + " 'himself',\n", + " 'master',\n", " 'of',\n", + " 'his',\n", + " 'own',\n", + " 'establishment',\n", + " 'were',\n", + " 'sufficient',\n", + " 'to',\n", + " 'absorb',\n", " 'all',\n", - " 'things',\n", - " 'with',\n", - " 'the',\n", - " 'germ',\n", + " 'my',\n", + " 'attention',\n", + " 'while',\n", + " 'holmes',\n", + " 'who',\n", + " 'loathed',\n", + " 'every',\n", + " 'form',\n", " 'of',\n", - " 'their',\n", - " 'inevitable',\n", - " 'annihilation',\n", + " 'society',\n", + " 'with',\n", + " 'his',\n", + " 'whole',\n", + " 'bohemian',\n", + " 'soul',\n", + " 'remained',\n", " 'in',\n", - " 'illustration',\n", - " 'of',\n", - " 'this',\n", - " 'idea',\n", - " 'i',\n", - " 'propose',\n", + " 'our',\n", + " 'lodgings',\n", + " 'in',\n", + " 'baker',\n", + " 'street',\n", + " 'buried',\n", + " 'among',\n", + " 'his',\n", + " 'old',\n", + " 'books',\n", + " 'and',\n", + " 'alternating',\n", + " 'from',\n", + " 'week',\n", " 'to',\n", - " 'take',\n", - " 
'such',\n", - " 'a',\n", - " 'survey',\n", - " 'of',\n", + " 'week',\n", + " 'between',\n", + " 'cocaine',\n", + " 'and',\n", + " 'ambition',\n", " 'the',\n", - " 'universe',\n", - " 'that',\n", + " 'drowsiness',\n", + " 'of',\n", " 'the',\n", - " 'mind',\n", - " 'may',\n", - " 'be',\n", - " 'able',\n", - " 'really',\n", - " 'to',\n", - " 'receive',\n", + " 'drug',\n", " 'and',\n", - " 'to',\n", - " 'perceive',\n", - " 'an',\n", - " 'individual',\n", - " 'impression',\n", - " 'he',\n", - " 'who',\n", - " 'from',\n", " 'the',\n", - " 'top',\n", + " 'fierce',\n", + " 'energy',\n", " 'of',\n", - " 'ætna',\n", - " 'casts',\n", " 'his',\n", - " 'eyes',\n", - " 'leisurely',\n", - " 'around',\n", - " 'is',\n", - " 'affected',\n", - " 'chiefly',\n", + " 'own',\n", + " 'keen',\n", + " 'nature',\n", + " 'he',\n", + " 'was',\n", + " 'still',\n", + " 'as',\n", + " 'ever',\n", + " 'deeply',\n", + " 'attracted',\n", " 'by',\n", " 'the',\n", - " 'extent',\n", + " 'study',\n", + " 'of',\n", + " 'crime',\n", " 'and',\n", - " 'diversity',\n", + " 'occupied',\n", + " 'his',\n", + " 'immense',\n", + " 'faculties',\n", + " 'and',\n", + " 'extraordinary',\n", + " 'powers',\n", " 'of',\n", - " 'the',\n", - " 'scene',\n", - " 'only',\n", + " 'observation',\n", + " 'in',\n", + " 'following',\n", + " 'out',\n", + " 'those',\n", + " 'clews',\n", + " 'and',\n", + " 'clearing',\n", + " 'up',\n", + " 'those',\n", + " 'mysteries',\n", + " 'which',\n", + " 'had',\n", + " 'been',\n", + " 'abandoned',\n", + " 'as',\n", + " 'hopeless',\n", " 'by',\n", - " 'a',\n", - " 'rapid',\n", - " 'whirling',\n", - " 'on',\n", + " 'the',\n", + " 'official',\n", + " 'police',\n", + " 'from',\n", + " 'time',\n", + " 'to',\n", + " 'time',\n", + " 'i',\n", + " 'heard',\n", + " 'some',\n", + " 'vague',\n", + " 'account',\n", + " 'of',\n", " 'his',\n", - " 'heel',\n", - " 'could',\n", - " 'he',\n", - " 'hope',\n", + " 'doings',\n", + " 'of',\n", + " 'his',\n", + " 'summons',\n", " 'to',\n", - " 'comprehend',\n", - " 'the',\n", - " 'panorama',\n", + " 'odessa',\n", " 'in',\n", " 'the',\n", - " 'sublimity',\n", + " 'case',\n", " 'of',\n", - " 'its',\n", - " 'oneness',\n", - " 'but',\n", - " 'as',\n", - " 'on',\n", " 'the',\n", - " 'summit',\n", - " 'of',\n", - " 'ætna',\n", - " 'no',\n", - " 'man',\n", - " 'has',\n", - " 'thought',\n", + " 'trepoff',\n", + " 'murder',\n", " 'of',\n", - " 'whirling',\n", - " 'on',\n", - " 'his',\n", - " 'heel',\n", - " 'so',\n", - " 'no',\n", - " 'man',\n", - " 'has',\n", - " 'ever',\n", - " 'taken',\n", - " 'into',\n", " 'his',\n", - " 'brain',\n", + " 'clearing',\n", + " 'up',\n", + " 'of',\n", " 'the',\n", - " 'full',\n", - " 'uniqueness',\n", + " 'singular',\n", + " 'tragedy',\n", " 'of',\n", " 'the',\n", - " 'prospect',\n", + " 'atkinson',\n", + " 'brothers',\n", + " 'at',\n", + " 'trincomalee',\n", " 'and',\n", - " 'so',\n", - " 'again',\n", - " 'whatever',\n", - " 'considerations',\n", - " 'lie',\n", - " 'involved',\n", - " 'in',\n", - " 'this',\n", - " 'uniqueness',\n", - " 'have',\n", - " 'as',\n", - " 'yet',\n", - " 'no',\n", - " 'practical',\n", - " 'existence',\n", - " 'for',\n", - " 'mankind',\n", - " 'i',\n", - " 'do',\n", - " 'not',\n", - " 'know',\n", - " 'a',\n", - " 'treatise',\n", - " 'in',\n", - " 'which',\n", - " 'a',\n", - " 'survey',\n", + " 'finally',\n", " 'of',\n", " 'the',\n", - " 'universe_—using',\n", - " 'the',\n", - " 'word',\n", - " 'in',\n", - " 'its',\n", - " 'most',\n", - " 'comprehensive',\n", + " 'mission',\n", + " 'which',\n", + " 'he',\n", + " 'had',\n", + " 
'accomplished',\n", + " 'so',\n", + " 'delicately',\n", " 'and',\n", - " 'only',\n", - " 'legitimate',\n", - " 'acceptation—is',\n", - " 'taken',\n", - " 'at',\n", - " 'all:—and',\n", - " 'it',\n", - " 'may',\n", - " 'be',\n", - " 'as',\n", - " 'well',\n", - " 'here',\n", - " 'to',\n", - " 'mention',\n", - " 'that',\n", - " 'by',\n", - " 'the',\n", - " 'term',\n", - " 'universe',\n", - " 'wherever',\n", - " 'employed',\n", - " 'without',\n", - " 'qualification',\n", - " 'in',\n", - " 'this',\n", - " 'essay',\n", - " 'i',\n", - " 'mean',\n", - " 'to',\n", - " 'designate',\n", + " 'successfully',\n", + " 'for',\n", " 'the',\n", - " 'utmost',\n", - " 'conceivable',\n", - " 'expanse',\n", + " 'reigning',\n", + " 'family',\n", " 'of',\n", - " 'space',\n", + " 'holland',\n", + " 'beyond',\n", + " 'these',\n", + " 'signs',\n", + " 'of',\n", + " 'his',\n", + " 'activity',\n", + " 'however',\n", + " 'which',\n", + " 'i',\n", + " 'merely',\n", + " 'shared',\n", " 'with',\n", " 'all',\n", - " 'things',\n", - " 'spiritual',\n", - " 'and',\n", - " 'material',\n", - " 'that',\n", - " 'can',\n", - " 'be',\n", - " 'imagined',\n", - " 'to',\n", - " 'exist',\n", - " 'within',\n", " 'the',\n", - " 'compass',\n", + " 'readers',\n", " 'of',\n", - " 'that',\n", - " 'expanse',\n", - " 'in',\n", - " 'speaking',\n", - " 'of',\n", - " 'what',\n", - " 'is',\n", - " 'ordinarily',\n", - " 'implied',\n", - " 'by',\n", " 'the',\n", - " 'expression',\n", - " 'universe',\n", + " 'daily',\n", + " 'press',\n", " 'i',\n", - " 'shall',\n", - " 'take',\n", - " 'a',\n", - " 'phrase',\n", - " 'of',\n", - " 'limitation—“the',\n", - " 'universe',\n", - " 'of',\n", - " 'stars',\n", - " 'why',\n", - " 'this',\n", - " 'distinction',\n", - " 'is',\n", - " 'considered',\n", - " 'necessary',\n", - " 'will',\n", - " 'be',\n", - " 'seen',\n", - " 'in',\n", - " 'the',\n", - " 'sequel',\n", - " 'but',\n", - " 'even',\n", + " 'knew',\n", + " 'little',\n", " 'of',\n", - " 'treatises',\n", + " 'my',\n", + " 'former',\n", + " 'friend',\n", + " 'and',\n", + " 'companion',\n", + " 'one',\n", + " 'night--it',\n", + " 'was',\n", " 'on',\n", " 'the',\n", - " 'really',\n", - " 'limited',\n", - " 'although',\n", - " 'always',\n", - " 'assumed',\n", - " 'as',\n", - " 'the',\n", - " 'un_limited',\n", - " 'universe',\n", + " 'th',\n", " 'of',\n", - " 'stars',\n", + " 'march',\n", " 'i',\n", - " 'know',\n", - " 'none',\n", - " 'in',\n", - " 'which',\n", - " 'a',\n", - " 'survey',\n", - " 'even',\n", - " 'of',\n", - " 'this',\n", - " 'limited',\n", - " 'universe',\n", - " 'is',\n", - " 'so',\n", - " 'taken',\n", - " 'as',\n", - " 'to',\n", - " 'warrant',\n", - " 'deductions',\n", + " 'was',\n", + " 'returning',\n", " 'from',\n", - " 'its',\n", - " 'individuality',\n", - " 'the',\n", - " 'nearest',\n", - " 'approach',\n", + " 'a',\n", + " 'journey',\n", " 'to',\n", - " 'such',\n", " 'a',\n", - " 'work',\n", - " 'is',\n", - " 'made',\n", + " 'patient',\n", + " 'for',\n", + " 'i',\n", + " 'had',\n", + " 'now',\n", + " 'returned',\n", + " 'to',\n", + " 'civil',\n", + " 'practice',\n", + " 'when',\n", + " 'my',\n", + " 'way',\n", + " 'led',\n", + " 'me',\n", + " 'through',\n", + " 'baker',\n", + " 'street',\n", + " 'as',\n", + " 'i',\n", + " 'passed',\n", + " 'the',\n", + " 'well-remembered',\n", + " 'door',\n", + " 'which',\n", + " 'must',\n", + " 'always',\n", + " 'be',\n", + " 'associated',\n", " 'in',\n", + " 'my',\n", + " 'mind',\n", + " 'with',\n", + " 'my',\n", + " 'wooing',\n", + " 'and',\n", + " 'with',\n", " 'the',\n", - " 'cosmos',\n", + " 
'dark',\n", + " 'incidents',\n", " 'of',\n", - " 'alexander',\n", - " 'von',\n", - " 'humboldt',\n", - " 'he',\n", - " 'presents',\n", " 'the',\n", - " 'subject',\n", - " 'however',\n", - " 'not',\n", - " 'in',\n", - " 'its',\n", - " 'individuality',\n", - " 'but',\n", + " 'study',\n", " 'in',\n", - " 'its',\n", - " 'generality',\n", + " 'scarlet',\n", + " 'i',\n", + " 'was',\n", + " 'seized',\n", + " 'with',\n", + " 'a',\n", + " 'keen',\n", + " 'desire',\n", + " 'to',\n", + " 'see',\n", + " 'holmes',\n", + " 'again',\n", + " 'and',\n", + " 'to',\n", + " 'know',\n", + " 'how',\n", + " 'he',\n", + " 'was',\n", + " 'employing',\n", " 'his',\n", - " 'theme',\n", - " 'in',\n", - " 'its',\n", - " 'last',\n", - " 'result',\n", - " 'is',\n", - " 'the',\n", - " 'law',\n", - " 'of',\n", - " 'each',\n", - " 'portion',\n", - " 'of',\n", - " 'the',\n", - " 'merely',\n", - " 'physical',\n", - " 'universe',\n", + " 'extraordinary',\n", + " 'powers',\n", + " 'his',\n", + " 'rooms',\n", + " 'were',\n", + " 'brilliantly',\n", + " 'lighted',\n", + " 'and',\n", + " 'even',\n", " 'as',\n", - " 'this',\n", - " 'law',\n", - " 'is',\n", - " 'related',\n", - " 'to',\n", - " 'the',\n", - " 'laws',\n", - " 'of',\n", - " 'every',\n", - " 'other',\n", - " 'portion',\n", - " 'of',\n", - " 'this',\n", - " 'merely',\n", - " 'physical',\n", - " 'universe',\n", + " 'i',\n", + " 'looked',\n", + " 'up',\n", + " 'i',\n", + " 'saw',\n", " 'his',\n", - " 'design',\n", - " 'is',\n", - " 'simply',\n", - " 'synœretical',\n", + " 'tall',\n", + " 'spare',\n", + " 'figure',\n", + " 'pass',\n", + " 'twice',\n", " 'in',\n", " 'a',\n", - " 'word',\n", + " 'dark',\n", + " 'silhouette',\n", + " 'against',\n", + " 'the',\n", + " 'blind',\n", " 'he',\n", - " 'discusses',\n", + " 'was',\n", + " 'pacing',\n", " 'the',\n", - " 'universality',\n", - " 'of',\n", - " 'material',\n", - " 'relation',\n", + " 'room',\n", + " 'swiftly',\n", + " 'eagerly',\n", + " 'with',\n", + " 'his',\n", + " 'head',\n", + " 'sunk',\n", + " 'upon',\n", + " 'his',\n", + " 'chest',\n", " 'and',\n", - " 'discloses',\n", - " 'to',\n", - " 'the',\n", - " 'eye',\n", - " 'of',\n", - " 'philosophy',\n", - " 'whatever',\n", - " 'inferences',\n", - " 'have',\n", - " 'hitherto',\n", - " 'lain',\n", - " 'hidden',\n", + " 'his',\n", + " 'hands',\n", + " 'clasped',\n", " 'behind',\n", - " 'this',\n", - " 'universality',\n", - " 'but',\n", - " 'however',\n", - " 'admirable',\n", - " 'be',\n", - " 'the',\n", - " 'succinctness',\n", - " 'with',\n", - " 'which',\n", + " 'him',\n", + " 'to',\n", + " 'me',\n", + " 'who',\n", + " 'knew',\n", + " 'his',\n", + " 'every',\n", + " 'mood',\n", + " 'and',\n", + " 'habit',\n", + " 'his',\n", + " 'attitude',\n", + " 'and',\n", + " 'manner',\n", + " 'told',\n", + " 'their',\n", + " 'own',\n", + " 'story',\n", " 'he',\n", - " 'has',\n", - " 'treated',\n", - " 'each',\n", - " 'particular',\n", - " 'point',\n", + " 'was',\n", + " 'at',\n", + " 'work',\n", + " 'again',\n", + " 'he',\n", + " 'had',\n", + " 'risen',\n", + " 'out',\n", " 'of',\n", " 'his',\n", - " 'topic',\n", + " 'drug-created',\n", + " 'dreams',\n", + " 'and',\n", + " 'was',\n", + " 'hot',\n", + " 'upon',\n", " 'the',\n", - " 'mere',\n", - " 'multiplicity',\n", - " 'of',\n", - " 'these',\n", - " 'points',\n", - " 'occasions',\n", - " 'necessarily',\n", - " 'an',\n", - " 'amount',\n", + " 'scent',\n", " 'of',\n", - " 'detail',\n", + " 'some',\n", + " 'new',\n", + " 'problem',\n", + " 'i',\n", + " 'rang',\n", + " 'the',\n", + " 'bell',\n", " 'and',\n", - " 'thus',\n", - " 'an',\n", - 
" 'involution',\n", - " 'of',\n", - " 'idea',\n", + " 'was',\n", + " 'shown',\n", + " 'up',\n", + " 'to',\n", + " 'the',\n", + " 'chamber',\n", " 'which',\n", - " 'precludes',\n", - " 'all',\n", - " 'individuality',\n", - " 'of',\n", - " 'impression',\n", + " 'had',\n", + " 'formerly',\n", + " 'been',\n", + " 'in',\n", + " 'part',\n", + " 'my',\n", + " 'own',\n", + " 'his',\n", + " 'manner',\n", + " 'was',\n", + " 'not',\n", + " 'effusive',\n", " 'it',\n", - " 'seems',\n", + " 'seldom',\n", + " 'was',\n", + " 'but',\n", + " 'he',\n", + " 'was',\n", + " 'glad',\n", + " 'i',\n", + " 'think',\n", " 'to',\n", + " 'see',\n", " 'me',\n", - " 'that',\n", - " 'in',\n", - " 'aiming',\n", - " 'at',\n", - " 'this',\n", - " 'latter',\n", - " 'effect',\n", - " 'and',\n", - " 'through',\n", - " 'it',\n", - " 'at',\n", - " 'the',\n", - " 'consequences—the',\n", - " 'conclusions—the',\n", - " 'suggestions—the',\n", - " 'speculations—or',\n", - " 'if',\n", - " 'nothing',\n", - " 'better',\n", - " 'offer',\n", - " 'itself',\n", - " 'the',\n", - " 'mere',\n", - " 'guesses',\n", - " 'which',\n", - " 'may',\n", - " 'result',\n", - " 'from',\n", - " 'it—we',\n", - " 'require',\n", - " 'something',\n", - " 'like',\n", + " 'with',\n", + " 'hardly',\n", " 'a',\n", - " 'mental',\n", - " 'gyration',\n", - " 'on',\n", - " 'the',\n", - " 'heel',\n", - " 'we',\n", - " 'need',\n", - " 'so',\n", - " 'rapid',\n", + " 'word',\n", + " 'spoken',\n", + " 'but',\n", + " 'with',\n", " 'a',\n", - " 'revolution',\n", - " 'of',\n", - " 'all',\n", - " 'things',\n", - " 'about',\n", - " 'the',\n", - " 'central',\n", - " 'point',\n", + " 'kindly',\n", + " 'eye',\n", + " 'he',\n", + " 'waved',\n", + " 'me',\n", + " 'to',\n", + " 'an',\n", + " 'armchair',\n", + " 'threw',\n", + " 'across',\n", + " 'his',\n", + " 'case',\n", " 'of',\n", - " 'sight',\n", - " 'that',\n", - " 'while',\n", - " 'the',\n", - " 'minutiæ',\n", - " 'vanish',\n", - " 'altogether',\n", - " 'even',\n", + " 'cigars',\n", + " 'and',\n", + " 'indicated',\n", + " 'a',\n", + " 'spirit',\n", + " 'case',\n", + " 'and',\n", + " 'a',\n", + " 'gasogene',\n", + " 'in',\n", " 'the',\n", - " 'more',\n", - " 'conspicuous',\n", - " 'objects',\n", - " 'become',\n", - " 'blended',\n", - " 'into',\n", - " 'one',\n", - " 'among',\n", + " 'corner',\n", + " 'then',\n", + " 'he',\n", + " 'stood',\n", + " 'before',\n", " 'the',\n", - " 'vanishing',\n", - " 'minutiæ',\n", + " 'fire',\n", + " 'and',\n", + " 'looked',\n", + " 'me',\n", + " 'over',\n", " 'in',\n", + " 'his',\n", + " 'singular',\n", + " 'introspective',\n", + " 'fashion',\n", + " 'wedlock',\n", + " 'suits',\n", + " 'you',\n", + " 'he',\n", + " 'remarked',\n", + " 'i',\n", + " 'think',\n", + " 'watson',\n", + " 'that',\n", + " 'you',\n", + " 'have',\n", + " 'put',\n", + " 'on',\n", + " 'seven',\n", + " 'and',\n", " 'a',\n", - " 'survey',\n", - " 'of',\n", - " 'this',\n", - " 'kind',\n", - " 'would',\n", - " 'be',\n", - " 'all',\n", - " 'exclusively',\n", - " 'terrestrial',\n", - " 'matters',\n", - " 'the',\n", - " 'earth',\n", - " 'would',\n", - " 'be',\n", - " 'considered',\n", - " 'in',\n", - " 'its',\n", - " 'planetary',\n", - " 'relations',\n", - " 'alone',\n", + " 'half',\n", + " 'pounds',\n", + " 'since',\n", + " 'i',\n", + " 'saw',\n", + " 'you',\n", + " 'seven',\n", + " 'i',\n", + " 'answered',\n", + " 'indeed',\n", + " 'i',\n", + " 'should',\n", + " 'have',\n", + " 'thought',\n", " 'a',\n", - " 'man',\n", - " 'in',\n", - " 'this',\n", - " 'view',\n", - " 'becomes',\n", - " 'mankind',\n", - " 'mankind',\n", + " 
'little',\n", + " 'more',\n", + " 'just',\n", " 'a',\n", - " 'member',\n", - " 'of',\n", - " 'the',\n", - " 'cosmical',\n", - " 'family',\n", - " 'of',\n", - " 'intelligences',\n", + " 'trifle',\n", + " 'more',\n", + " 'i',\n", + " 'fancy',\n", + " 'watson',\n", " 'and',\n", - " 'now',\n", - " 'before',\n", - " 'proceeding',\n", - " 'to',\n", - " 'our',\n", - " 'subject',\n", - " 'proper',\n", - " 'let',\n", + " 'in',\n", + " 'practice',\n", + " 'again',\n", + " 'i',\n", + " 'observe',\n", + " 'you',\n", + " 'did',\n", + " 'not',\n", + " 'tell',\n", " 'me',\n", - " 'beg',\n", - " 'the',\n", - " 'reader’s',\n", - " 'attention',\n", + " 'that',\n", + " 'you',\n", + " 'intended',\n", " 'to',\n", - " 'an',\n", - " 'extract',\n", - " 'or',\n", - " 'two',\n", - " 'from',\n", + " 'go',\n", + " 'into',\n", + " 'harness',\n", + " 'then',\n", + " 'how',\n", + " 'do',\n", + " 'you',\n", + " 'know',\n", + " 'i',\n", + " 'see',\n", + " 'it',\n", + " 'i',\n", + " 'deduce',\n", + " 'it',\n", + " 'how',\n", + " 'do',\n", + " 'i',\n", + " 'know',\n", + " 'that',\n", + " 'you',\n", + " 'have',\n", + " 'been',\n", + " 'getting',\n", + " 'yourself',\n", + " 'very',\n", + " 'wet',\n", + " 'lately',\n", + " 'and',\n", + " 'that',\n", + " 'you',\n", + " 'have',\n", " 'a',\n", - " 'somewhat',\n", - " 'remarkable',\n", - " 'letter',\n", - " 'which',\n", - " 'appears',\n", - " 'to',\n", + " 'most',\n", + " 'clumsy',\n", + " 'and',\n", + " 'careless',\n", + " 'servant',\n", + " 'girl',\n", + " 'my',\n", + " 'dear',\n", + " 'holmes',\n", + " 'said',\n", + " 'i',\n", + " 'this',\n", + " 'is',\n", + " 'too',\n", + " 'much',\n", + " 'you',\n", + " 'would',\n", + " 'certainly',\n", " 'have',\n", " 'been',\n", - " 'found',\n", - " 'corked',\n", + " 'burned',\n", + " 'had',\n", + " 'you',\n", + " 'lived',\n", + " 'a',\n", + " 'few',\n", + " 'centuries',\n", + " 'ago',\n", + " 'it',\n", + " 'is',\n", + " 'true',\n", + " 'that',\n", + " 'i',\n", + " 'had',\n", + " 'a',\n", + " 'country',\n", + " 'walk',\n", + " 'on',\n", + " 'thursday',\n", + " 'and',\n", + " 'came',\n", + " 'home',\n", " 'in',\n", " 'a',\n", - " 'bottle',\n", + " 'dreadful',\n", + " 'mess',\n", + " 'but',\n", + " 'as',\n", + " 'i',\n", + " 'have',\n", + " 'changed',\n", + " 'my',\n", + " 'clothes',\n", + " 'i',\n", + " \"can't\",\n", + " 'imagine',\n", + " 'how',\n", + " 'you',\n", + " 'deduce',\n", + " 'it',\n", + " 'as',\n", + " 'to',\n", + " 'mary',\n", + " 'jane',\n", + " 'she',\n", + " 'is',\n", + " 'incorrigible',\n", " 'and',\n", - " 'floating',\n", + " 'my',\n", + " 'wife',\n", + " 'has',\n", + " 'given',\n", + " 'her',\n", + " 'notice',\n", + " 'but',\n", + " 'there',\n", + " 'again',\n", + " 'i',\n", + " 'fail',\n", + " 'to',\n", + " 'see',\n", + " 'how',\n", + " 'you',\n", + " 'work',\n", + " 'it',\n", + " 'out',\n", + " 'he',\n", + " 'chuckled',\n", + " 'to',\n", + " 'himself',\n", + " 'and',\n", + " 'rubbed',\n", + " 'his',\n", + " 'long',\n", + " 'nervous',\n", + " 'hands',\n", + " 'together',\n", + " 'it',\n", + " 'is',\n", + " 'simplicity',\n", + " 'itself',\n", + " 'said',\n", + " 'he',\n", + " 'my',\n", + " 'eyes',\n", + " 'tell',\n", + " 'me',\n", + " 'that',\n", " 'on',\n", " 'the',\n", - " 'mare',\n", - " 'tenebrarum_—an',\n", - " 'ocean',\n", - " 'well',\n", - " 'described',\n", - " 'by',\n", + " 'inside',\n", + " 'of',\n", + " 'your',\n", + " 'left',\n", + " 'shoe',\n", + " 'just',\n", + " 'where',\n", + " 'the',\n", + " 'firelight',\n", + " 'strikes',\n", + " 'it',\n", " 'the',\n", - " 'nubian',\n", - " 'geographer',\n", - " 
'ptolemy',\n", - " 'hephestion',\n", - " 'but',\n", - " 'little',\n", - " 'frequented',\n", - " 'in',\n", - " 'modern',\n", - " 'days',\n", - " 'unless',\n", + " 'leather',\n", + " 'is',\n", + " 'scored',\n", + " 'by',\n", + " 'six',\n", + " 'almost',\n", + " 'parallel',\n", + " 'cuts',\n", + " 'obviously',\n", + " 'they',\n", + " 'have',\n", + " 'been',\n", + " 'caused',\n", " 'by',\n", + " 'someone',\n", + " 'who',\n", + " 'has',\n", + " 'very',\n", + " 'carelessly',\n", + " 'scraped',\n", + " 'round',\n", + " 'the',\n", + " 'edges',\n", + " 'of',\n", " 'the',\n", - " 'transcendentalists',\n", + " 'sole',\n", + " 'in',\n", + " 'order',\n", + " 'to',\n", + " 'remove',\n", + " 'crusted',\n", + " 'mud',\n", + " 'from',\n", + " 'it',\n", + " 'hence',\n", + " 'you',\n", + " 'see',\n", + " 'my',\n", + " 'double',\n", + " 'deduction',\n", + " 'that',\n", + " 'you',\n", + " 'had',\n", + " 'been',\n", + " 'out',\n", + " 'in',\n", + " 'vile',\n", + " 'weather',\n", " 'and',\n", - " 'some',\n", - " 'other',\n", - " 'divers',\n", - " 'for',\n", - " 'crotchets',\n", + " 'that',\n", + " 'you',\n", + " 'had',\n", + " 'a',\n", + " 'particularly',\n", + " 'malignant',\n", + " 'boot-slicking',\n", + " 'specimen',\n", + " 'of',\n", + " 'the',\n", + " 'london',\n", + " 'slavey',\n", + " 'as',\n", + " 'to',\n", + " 'your',\n", + " 'practice',\n", + " 'if',\n", + " 'a',\n", + " 'gentleman',\n", + " 'walks',\n", + " 'into',\n", + " 'my',\n", + " 'rooms',\n", + " 'smelling',\n", + " 'of',\n", + " 'iodoform',\n", + " 'with',\n", + " 'a',\n", + " 'black',\n", + " 'mark',\n", + " 'of',\n", + " 'nitrate',\n", + " 'of',\n", + " 'silver',\n", + " 'upon',\n", + " 'his',\n", + " 'right',\n", + " 'forefinger',\n", + " 'and',\n", + " 'a',\n", + " 'bulge',\n", + " 'on',\n", " 'the',\n", - " 'date',\n", + " 'side',\n", " 'of',\n", - " 'this',\n", - " 'letter',\n", - " 'i',\n", - " 'confess',\n", - " 'surprises',\n", - " 'me',\n", - " 'even',\n", - " 'more',\n", - " 'particularly',\n", - " 'than',\n", - " 'its',\n", - " 'contents',\n", + " 'his',\n", + " 'top',\n", + " 'hat',\n", + " 'to',\n", + " 'show',\n", + " 'where',\n", + " 'he',\n", + " 'has',\n", + " 'secreted',\n", + " 'his',\n", " ...]]" ] }, @@ -1314,123 +1287,123 @@ " \n", " \n", " 1\n", - " 18658\n", - " 2\n", - " \n", - " \n", - " 8193\n", - " 7\n", + " 4097\n", + " 1\n", " \n", " \n", " 8194\n", - " 2\n", - " \n", - " \n", - " 8195\n", - " 17\n", - " \n", - " \n", - " 8197\n", - " 5\n", + " 1\n", " \n", " \n", - " 1\n", - " 2\n", + " 12291\n", + " 14\n", " \n", " \n", - " 19116\n", + " 20484\n", " 1\n", " \n", " \n", - " 16394\n", + " 20485\n", " 1\n", " \n", " \n", - " 16395\n", + " 8199\n", " 1\n", " \n", " \n", - " 2\n", + " 12296\n", " 1\n", " \n", " \n", - " 15\n", + " 12305\n", " 3\n", " \n", " \n", - " 16400\n", + " 4099\n", " 1\n", " \n", " \n", - " 8210\n", + " 16404\n", + " 2\n", + " \n", + " \n", + " 12313\n", " 3\n", " \n", " \n", - " 19\n", + " 4638\n", " 1\n", " \n", " \n", - " 16404\n", + " 4127\n", " 2\n", " \n", " \n", - " 16405\n", + " 32\n", " 1\n", " \n", " \n", - " 25\n", + " 8227\n", " 1\n", " \n", " \n", - " 16410\n", + " 20517\n", " 1\n", " \n", " \n", - " 16412\n", + " 38\n", " 1\n", " \n", " \n", - " 29\n", - " 1\n", + " 12328\n", + " 2\n", " \n", " \n", - " 4101\n", + " 4103\n", " 1\n", " \n", " \n", - " 6832\n", + " 16428\n", " 2\n", " \n", " \n", - " 16418\n", - " 2\n", + " 16429\n", + " 1\n", " \n", " \n", - " 35\n", - " 10\n", + " 4142\n", + " 1\n", " \n", " \n", - " 38\n", + " 4144\n", " 1\n", " \n", " \n", - " 39\n", - " 
14\n", + " 4147\n", + " 1\n", " \n", " \n", - " 8234\n", - " 13\n", + " 4148\n", + " 3\n", " \n", " \n", - " 8235\n", - " 1\n", + " 12341\n", + " 2\n", + " \n", + " \n", + " 16438\n", + " 2\n", " \n", " \n", - " 8236\n", - " 13\n", + " 16441\n", + " 3\n", + " \n", + " \n", + " 6154\n", + " 1\n", " \n", " \n", - " 8237\n", + " 20543\n", " 1\n", " \n", " \n", @@ -1440,196 +1413,196 @@ " \n", " \n", " 16\n", - " 16294\n", - " 0\n", + " 18376\n", + " 1\n", " \n", " \n", - " 23196\n", - " 0\n", + " 18381\n", + " 32\n", " \n", " \n", - " 16301\n", - " 0\n", + " 12238\n", + " 1\n", " \n", " \n", - " 24503\n", - " 0\n", + " 17741\n", + " 1\n", " \n", " \n", - " 16315\n", - " 0\n", + " 4048\n", + " 1\n", " \n", " \n", - " 16319\n", - " 0\n", + " 8149\n", + " 9\n", " \n", " \n", - " 10912\n", - " 0\n", + " 17401\n", + " 1\n", " \n", " \n", - " 8130\n", - " 0\n", + " 12964\n", + " 1\n", " \n", " \n", - " 8132\n", - " 0\n", + " 14298\n", + " 1\n", " \n", " \n", - " 24517\n", - " 0\n", + " 24539\n", + " 1\n", " \n", " \n", - " 24520\n", - " 0\n", + " 12252\n", + " 1\n", " \n", " \n", - " 24524\n", - " 0\n", + " 22497\n", + " 1\n", " \n", " \n", - " 6818\n", - " 0\n", + " 4067\n", + " 1\n", " \n", " \n", - " 24532\n", - " 0\n", + " 10216\n", + " 1\n", " \n", " \n", - " 24549\n", - " 0\n", + " 16364\n", + " 1\n", " \n", " \n", - " 16348\n", - " 0\n", + " 2029\n", + " 1\n", " \n", " \n", - " 8160\n", - " 0\n", + " 1500\n", + " 1\n", " \n", " \n", - " 24545\n", - " 0\n", + " 23549\n", + " 1\n", " \n", " \n", - " 24548\n", - " 0\n", + " 16370\n", + " 2\n", " \n", " \n", - " 2726\n", - " 0\n", + " 6131\n", + " 1\n", " \n", " \n", - " 24550\n", - " 0\n", + " 14324\n", + " 1\n", " \n", " \n", - " 24551\n", - " 0\n", + " 12278\n", + " 1\n", " \n", " \n", - " 13911\n", - " 0\n", + " 17065\n", + " 1\n", " \n", " \n", - " 11456\n", - " 0\n", + " 18424\n", + " 1\n", " \n", " \n", - " 8174\n", - " 0\n", + " 15700\n", + " 1\n", " \n", " \n", - " 16369\n", - " 0\n", + " 136\n", + " 1\n", " \n", " \n", - " 8182\n", - " 0\n", + " 14331\n", + " 1\n", " \n", " \n", - " 16377\n", - " 0\n", + " 12284\n", + " 1\n", " \n", " \n", - " 8188\n", - " 0\n", + " 12285\n", + " 2\n", " \n", " \n", - " 8191\n", - " 0\n", + " 8533\n", + " 2\n", " \n", " \n", "\n", - "

<p>52926 rows × 1 columns</p>

\n", + "

<p>56956 rows × 1 columns</p>

\n", "" ], "text/plain": [ " 0\n", "doc_id token_id \n", - "1 18658 2\n", - " 8193 7\n", - " 8194 2\n", - " 8195 17\n", - " 8197 5\n", - " 1 2\n", - " 19116 1\n", - " 16394 1\n", - " 16395 1\n", - " 2 1\n", - " 15 3\n", - " 16400 1\n", - " 8210 3\n", - " 19 1\n", + "1 4097 1\n", + " 8194 1\n", + " 12291 14\n", + " 20484 1\n", + " 20485 1\n", + " 8199 1\n", + " 12296 1\n", + " 12305 3\n", + " 4099 1\n", " 16404 2\n", - " 16405 1\n", - " 25 1\n", - " 16410 1\n", - " 16412 1\n", - " 29 1\n", - " 4101 1\n", - " 6832 2\n", - " 16418 2\n", - " 35 10\n", + " 12313 3\n", + " 4638 1\n", + " 4127 2\n", + " 32 1\n", + " 8227 1\n", + " 20517 1\n", " 38 1\n", - " 39 14\n", - " 8234 13\n", - " 8235 1\n", - " 8236 13\n", - " 8237 1\n", + " 12328 2\n", + " 4103 1\n", + " 16428 2\n", + " 16429 1\n", + " 4142 1\n", + " 4144 1\n", + " 4147 1\n", + " 4148 3\n", + " 12341 2\n", + " 16438 2\n", + " 16441 3\n", + " 6154 1\n", + " 20543 1\n", "... ..\n", - "16 16294 0\n", - " 23196 0\n", - " 16301 0\n", - " 24503 0\n", - " 16315 0\n", - " 16319 0\n", - " 10912 0\n", - " 8130 0\n", - " 8132 0\n", - " 24517 0\n", - " 24520 0\n", - " 24524 0\n", - " 6818 0\n", - " 24532 0\n", - " 24549 0\n", - " 16348 0\n", - " 8160 0\n", - " 24545 0\n", - " 24548 0\n", - " 2726 0\n", - " 24550 0\n", - " 24551 0\n", - " 13911 0\n", - " 11456 0\n", - " 8174 0\n", - " 16369 0\n", - " 8182 0\n", - " 16377 0\n", - " 8188 0\n", - " 8191 0\n", + "16 18376 1\n", + " 18381 32\n", + " 12238 1\n", + " 17741 1\n", + " 4048 1\n", + " 8149 9\n", + " 17401 1\n", + " 12964 1\n", + " 14298 1\n", + " 24539 1\n", + " 12252 1\n", + " 22497 1\n", + " 4067 1\n", + " 10216 1\n", + " 16364 1\n", + " 2029 1\n", + " 1500 1\n", + " 23549 1\n", + " 16370 2\n", + " 6131 1\n", + " 14324 1\n", + " 12278 1\n", + " 17065 1\n", + " 18424 1\n", + " 15700 1\n", + " 136 1\n", + " 14331 1\n", + " 12284 1\n", + " 12285 2\n", + " 8533 2\n", "\n", - "[52926 rows x 1 columns]" + "[56956 rows x 1 columns]" ] }, "execution_count": 10, @@ -1650,22 +1623,11 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/plain": [ - "'import pandas as pd\\n\\ndoc2id = {value : key for key, value in doc_ids.items()}\\ntype2id = {value : key for key, value in id_types.items()}\\n\\ncols = [doc2id[key] for key in set(sparse_bow.index.get_level_values(\"doc_id\"))]\\n#idx = [type2id[key] for key in set(sparse_bow.index.get_level_values(\"token_id\"))]\\n\\nset(sparse_bow.index.get_level_values(\"token_id\"))\\n\\n#doctopic_matrix = pd.DataFrame(columns=cols, index=idx)'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "'''import pandas as pd\n", "\n", @@ -1689,7 +1651,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "collapsed": true }, @@ -1706,40 +1668,22 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "31-Jan-2017 14:49:17 INFO preprocessing: Removing features ...\n", - "31-Jan-2017 14:49:17 DEBUG preprocessing: 672 features removed.\n" - ] - } - ], + "outputs": [], "source": [ "sparse_df_stopwords_removed = pre.remove_features(sparse_bow, id_types, stopword_list)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "52926\n", - "47660\n" - ] - } - ], + "outputs": [], "source": [ "print(len(sparse_bow))\n", "print(len(sparse_df_stopwords_removed))" @@ -1754,30 +1698,11 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "31-Jan-2017 14:49:18 INFO preprocessing: Finding stopwords ...\n", - "31-Jan-2017 14:49:18 DEBUG preprocessing: 100 stopwords found.\n" - ] - }, - { - "data": { - "text/plain": [ - "100" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "stopword_list = pre.find_stopwords(sparse_bow, id_types, 100)\n", "len(stopword_list)" @@ -1785,30 +1710,11 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "31-Jan-2017 14:49:18 INFO preprocessing: Find hapax legomena ...\n", - "31-Jan-2017 14:49:18 DEBUG preprocessing: 19478 hapax legomena found.\n" - ] - }, - { - "data": { - "text/plain": [ - "19478" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "hapax_list = pre.find_hapax(sparse_bow, id_types)\n", "len(hapax_list)" @@ -1823,20 +1729,11 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "31-Jan-2017 14:49:18 INFO preprocessing: Removing features ...\n", - "31-Jan-2017 14:49:18 DEBUG preprocessing: 19542 features removed.\n" - ] - } - ], + "outputs": [], "source": [ "feature_list = set(stopword_list).union(hapax_list)\n", "clean_term_frequency = pre.remove_features(sparse_bow, id_types, feature_list)" @@ -1844,22 +1741,11 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/plain": [ - "5685" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(clean_term_frequency)" ] @@ -1880,7 +1766,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": { "collapsed": false }, @@ -1903,7 +1789,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": { "collapsed": true }, @@ -1915,27 +1801,18 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "31-Jan-2017 14:49:18 INFO gensim.matutils: initializing corpus reader from gb_plain.mm\n", - "31-Jan-2017 14:49:18 INFO gensim.matutils: accepted corpus with 16 documents, 24552 features, 310670 non-zero entries\n" - ] - } - ], + "outputs": [], "source": [ "mm = MmCorpus(\"gb_plain.mm\")" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": { "collapsed": true }, @@ -1950,7 +1827,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": { "collapsed": false }, @@ -1961,7 +1838,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": { "collapsed": true }, @@ -1972,7 +1849,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 
null, "metadata": { "collapsed": true }, @@ -1983,7 +1860,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": { "collapsed": true }, @@ -1995,35 +1872,11 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "31-Jan-2017 14:49:18 INFO gensim.models.ldamodel: using symmetric alpha at 0.01\n", - "31-Jan-2017 14:49:18 INFO gensim.models.ldamodel: using symmetric eta at 4.0728220584042684e-05\n", - "31-Jan-2017 14:49:18 INFO gensim.models.ldamodel: using serial LDA version on this node\n", - "31-Jan-2017 14:49:43 INFO gensim.models.ldamodel: running online LDA training, 100 topics, 1 passes over the supplied corpus of 16 documents, updating model once every 16 documents, evaluating perplexity every 16 documents, iterating 50x with a convergence threshold of 0.001000\n", - "31-Jan-2017 14:49:43 WARNING gensim.models.ldamodel: too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", - "31-Jan-2017 14:50:09 DEBUG gensim.models.ldamodel: bound: at document #0\n", - "31-Jan-2017 14:50:12 INFO gensim.models.ldamodel: -35.088 per-word bound, 36521290197.7 perplexity estimate based on a held-out corpus of 16 documents with 310670 words\n", - "31-Jan-2017 14:50:12 INFO gensim.models.ldamodel: PROGRESS: pass 0, at document #16/16\n", - "31-Jan-2017 14:50:12 DEBUG gensim.models.ldamodel: performing inference on a chunk of 16 documents\n", - "31-Jan-2017 14:50:12 DEBUG gensim.models.ldamodel: 1/16 documents converged within 50 iterations\n", - "31-Jan-2017 14:50:12 DEBUG gensim.models.ldamodel: updating topics\n", - "31-Jan-2017 14:50:13 INFO gensim.models.ldamodel: topic #0 (0.010): 0.040*\"ample\" + 0.040*\"pack\" + 0.020*\"bruyère\" + 0.019*\"fallacies\" + 0.015*\"traced\" + 0.014*\"forty\" + 0.013*\"demonstrate\" + 0.010*\"imperious\" + 0.010*\"step—we\" + 0.008*\"dishonour\"\n", - "31-Jan-2017 14:50:13 INFO gensim.models.ldamodel: topic #66 (0.010): 0.034*\"ample\" + 0.032*\"bruyère\" + 0.018*\"fallacies\" + 0.017*\"pack\" + 0.016*\"imperious\" + 0.014*\"traced\" + 0.013*\"dishonour\" + 0.012*\"demonstrate\" + 0.011*\"step—we\" + 0.009*\"forfeit\"\n", - "31-Jan-2017 14:50:13 INFO gensim.models.ldamodel: topic #38 (0.010): 0.034*\"ample\" + 0.034*\"bruyère\" + 0.033*\"pack\" + 0.015*\"traced\" + 0.013*\"fallacies\" + 0.013*\"step—we\" + 0.012*\"forty\" + 0.012*\"demonstrate\" + 0.010*\"forfeit\" + 0.010*\"imperious\"\n", - "31-Jan-2017 14:50:13 INFO gensim.models.ldamodel: topic #79 (0.010): 0.051*\"ample\" + 0.027*\"pack\" + 0.022*\"bruyère\" + 0.022*\"traced\" + 0.019*\"fallacies\" + 0.012*\"imperious\" + 0.012*\"forty\" + 0.012*\"step—we\" + 0.012*\"demonstrate\" + 0.011*\"forfeit\"\n", - "31-Jan-2017 14:50:13 INFO gensim.models.ldamodel: topic #69 (0.010): 0.063*\"ample\" + 0.033*\"bruyère\" + 0.023*\"traced\" + 0.023*\"pack\" + 0.022*\"fallacies\" + 0.017*\"imperious\" + 0.015*\"step—we\" + 0.013*\"demonstrate\" + 0.012*\"forfeit\" + 0.011*\"forty\"\n", - "31-Jan-2017 14:50:13 INFO gensim.models.ldamodel: topic diff=44.185255, rho=1.000000\n" - ] - } - ], + "outputs": [], "source": [ "#model = LdaModel(corpus=mm, id2word=type2id, num_topics=60, alpha = \"symmetric\", passes = 10) #import currently in visual \n", "# -> since I am not yet sure which functions should go into tm_gensim.py\n", "model = LdaModel(corpus=mm, id2word=type2id, num_topics=100)" ] }, { "cell_type": "code",
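The cell above fits gensim's LdaModel on the serialized Matrix Market corpus, and the removed training log shows gensim warning that a single pass over 16 documents is too few updates to converge. A minimal, self-contained sketch of the same step on a toy corpus, with passes raised as that warning suggests (the names docs, dictionary and toy.mm are illustrative stand-ins, not the notebook's sparse_bow pipeline, type2id mapping or gb_plain.mm file):

# Minimal sketch of the LdaModel step above, run on a toy corpus.
# Assumptions: docs, dictionary and "toy.mm" are illustrative stand-ins
# for the notebook's tokenized corpus, type2id mapping and gb_plain.mm.
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaModel

docs = [["holmes", "watson", "scandal", "bohemia"],
        ["eureka", "universe", "prose", "poem"]]

dictionary = Dictionary(docs)                    # token <-> id mapping
bow = [dictionary.doc2bow(doc) for doc in docs]  # sparse bag-of-words vectors

MmCorpus.serialize("toy.mm", bow)                # same on-disk format as gb_plain.mm
mm = MmCorpus("toy.mm")

# passes > 1 addresses the "too few updates" warning in the log above.
model = LdaModel(corpus=mm, id2word=dictionary, num_topics=2, passes=10)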
- "execution_count": 28, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/plain": [ - "['ample',\n", - " 'bruyère',\n", - " 'pack',\n", - " 'traced',\n", - " 'fallacies',\n", - " 'forfeit',\n", - " 'demonstrate',\n", - " 'forty',\n", - " 'hypothetical',\n", - " 'imperious']" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "topic_nr_x = model.get_topic_terms(10)\n", "\n", @@ -2065,7 +1898,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": { "collapsed": false }, @@ -2076,141 +1909,11 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/plain": [ - "[(13,\n", - " '0.066*\"ample\" + 0.033*\"pack\" + 0.030*\"bruyère\" + 0.023*\"traced\" + 0.021*\"fallacies\" + 0.018*\"imperious\" + 0.014*\"demonstrate\" + 0.011*\"forfeit\" + 0.010*\"hypothetical\" + 0.010*\"animation\"'),\n", - " (37,\n", - " '0.041*\"ample\" + 0.021*\"pack\" + 0.020*\"bruyère\" + 0.016*\"imperious\" + 0.014*\"traced\" + 0.013*\"fallacies\" + 0.009*\"forfeit\" + 0.009*\"step—we\" + 0.009*\"forty\" + 0.008*\"demonstrate\"'),\n", - " (58,\n", - " '0.045*\"ample\" + 0.028*\"pack\" + 0.028*\"bruyère\" + 0.024*\"traced\" + 0.020*\"fallacies\" + 0.017*\"imperious\" + 0.013*\"forty\" + 0.012*\"dishonour\" + 0.011*\"step—we\" + 0.010*\"artillery\"'),\n", - " (22,\n", - " '0.053*\"ample\" + 0.030*\"pack\" + 0.026*\"bruyère\" + 0.024*\"traced\" + 0.019*\"demonstrate\" + 0.019*\"fallacies\" + 0.017*\"imperious\" + 0.015*\"forfeit\" + 0.014*\"step—we\" + 0.014*\"forty\"'),\n", - " (3,\n", - " '0.058*\"ample\" + 0.034*\"bruyère\" + 0.029*\"pack\" + 0.023*\"traced\" + 0.019*\"imperious\" + 0.013*\"fallacies\" + 0.013*\"forfeit\" + 0.011*\"demonstrate\" + 0.011*\"forty\" + 0.010*\"step—we\"'),\n", - " (95,\n", - " '0.043*\"ample\" + 0.032*\"bruyère\" + 0.022*\"demonstrate\" + 0.021*\"fallacies\" + 0.018*\"pack\" + 0.017*\"traced\" + 0.014*\"imperious\" + 0.014*\"dishonour\" + 0.012*\"step—we\" + 0.011*\"forfeit\"'),\n", - " (33,\n", - " '0.063*\"ample\" + 0.031*\"bruyère\" + 0.029*\"pack\" + 0.024*\"traced\" + 0.019*\"fallacies\" + 0.018*\"imperious\" + 0.018*\"demonstrate\" + 0.016*\"forfeit\" + 0.014*\"forty\" + 0.011*\"step—we\"'),\n", - " (99,\n", - " '0.046*\"ample\" + 0.025*\"pack\" + 0.025*\"bruyère\" + 0.022*\"traced\" + 0.022*\"fallacies\" + 0.015*\"imperious\" + 0.012*\"dishonour\" + 0.012*\"demonstrate\" + 0.012*\"forfeit\" + 0.012*\"forty\"'),\n", - " (27,\n", - " '0.059*\"ample\" + 0.034*\"pack\" + 0.032*\"bruyère\" + 0.023*\"traced\" + 0.020*\"demonstrate\" + 0.019*\"fallacies\" + 0.014*\"imperious\" + 0.011*\"forty\" + 0.011*\"forfeit\" + 0.011*\"hypothetical\"'),\n", - " (40,\n", - " '0.040*\"ample\" + 0.033*\"pack\" + 0.029*\"bruyère\" + 0.019*\"fallacies\" + 0.019*\"traced\" + 0.013*\"imperious\" + 0.013*\"demonstrate\" + 0.011*\"forty\" + 0.009*\"step—we\" + 0.009*\"forfeit\"'),\n", - " (38,\n", - " '0.034*\"ample\" + 0.034*\"bruyère\" + 0.033*\"pack\" + 0.015*\"traced\" + 0.013*\"fallacies\" + 0.013*\"step—we\" + 0.012*\"forty\" + 0.012*\"demonstrate\" + 0.010*\"forfeit\" + 0.010*\"imperious\"'),\n", - " (5,\n", - " '0.058*\"ample\" + 0.026*\"bruyère\" + 0.025*\"demonstrate\" + 0.024*\"pack\" + 0.023*\"traced\" + 0.022*\"fallacies\" + 0.014*\"imperious\" + 0.013*\"dishonour\" + 0.012*\"forty\" + 0.011*\"forfeit\"'),\n", - " (64,\n", - " 
'0.055*\"ample\" + 0.032*\"pack\" + 0.027*\"bruyère\" + 0.022*\"traced\" + 0.022*\"fallacies\" + 0.017*\"imperious\" + 0.013*\"demonstrate\" + 0.012*\"forfeit\" + 0.012*\"forty\" + 0.011*\"dishonour\"'),\n", - " (44,\n", - " '0.049*\"ample\" + 0.029*\"bruyère\" + 0.024*\"fallacies\" + 0.022*\"pack\" + 0.020*\"traced\" + 0.016*\"demonstrate\" + 0.014*\"forty\" + 0.012*\"imperious\" + 0.012*\"step—we\" + 0.011*\"forfeit\"'),\n", - " (88,\n", - " '0.051*\"ample\" + 0.031*\"pack\" + 0.027*\"bruyère\" + 0.026*\"traced\" + 0.021*\"fallacies\" + 0.019*\"demonstrate\" + 0.016*\"forty\" + 0.016*\"imperious\" + 0.012*\"dishonour\" + 0.012*\"forfeit\"'),\n", - " (78,\n", - " '0.047*\"ample\" + 0.032*\"pack\" + 0.029*\"traced\" + 0.020*\"bruyère\" + 0.020*\"fallacies\" + 0.019*\"forty\" + 0.017*\"imperious\" + 0.016*\"demonstrate\" + 0.012*\"step—we\" + 0.011*\"forfeit\"'),\n", - " (42,\n", - " '0.055*\"ample\" + 0.029*\"bruyère\" + 0.024*\"fallacies\" + 0.022*\"pack\" + 0.019*\"forty\" + 0.019*\"traced\" + 0.013*\"step—we\" + 0.013*\"imperious\" + 0.012*\"forfeit\" + 0.012*\"hypothetical\"'),\n", - " (75,\n", - " '0.072*\"ample\" + 0.029*\"pack\" + 0.028*\"bruyère\" + 0.021*\"fallacies\" + 0.019*\"traced\" + 0.016*\"imperious\" + 0.013*\"demonstrate\" + 0.011*\"forty\" + 0.011*\"artillery\" + 0.010*\"forfeit\"'),\n", - " (57,\n", - " '0.034*\"ample\" + 0.023*\"bruyère\" + 0.017*\"pack\" + 0.016*\"imperious\" + 0.014*\"traced\" + 0.014*\"fallacies\" + 0.010*\"forfeit\" + 0.010*\"demonstrate\" + 0.009*\"step—we\" + 0.008*\"forty\"'),\n", - " (87,\n", - " '0.057*\"ample\" + 0.031*\"pack\" + 0.021*\"bruyère\" + 0.016*\"fallacies\" + 0.016*\"imperious\" + 0.015*\"forty\" + 0.014*\"traced\" + 0.012*\"demonstrate\" + 0.011*\"forfeit\" + 0.010*\"animation\"'),\n", - " (45,\n", - " '0.040*\"ample\" + 0.030*\"pack\" + 0.026*\"bruyère\" + 0.023*\"fallacies\" + 0.021*\"traced\" + 0.018*\"demonstrate\" + 0.018*\"imperious\" + 0.013*\"forty\" + 0.012*\"hypothetical\" + 0.011*\"step—we\"'),\n", - " (56,\n", - " '0.034*\"ample\" + 0.026*\"pack\" + 0.024*\"bruyère\" + 0.013*\"fallacies\" + 0.013*\"traced\" + 0.011*\"imperious\" + 0.009*\"forfeit\" + 0.008*\"dishonour\" + 0.008*\"demonstrate\" + 0.007*\"forty\"'),\n", - " (52,\n", - " '0.046*\"ample\" + 0.037*\"bruyère\" + 0.030*\"pack\" + 0.017*\"traced\" + 0.014*\"fallacies\" + 0.014*\"imperious\" + 0.014*\"demonstrate\" + 0.013*\"forty\" + 0.012*\"step—we\" + 0.010*\"forfeit\"'),\n", - " (25,\n", - " '0.039*\"ample\" + 0.034*\"pack\" + 0.026*\"bruyère\" + 0.024*\"traced\" + 0.017*\"fallacies\" + 0.017*\"imperious\" + 0.014*\"demonstrate\" + 0.013*\"step—we\" + 0.013*\"forty\" + 0.012*\"dishonour\"'),\n", - " (18,\n", - " '0.045*\"ample\" + 0.038*\"bruyère\" + 0.033*\"pack\" + 0.026*\"traced\" + 0.018*\"imperious\" + 0.014*\"fallacies\" + 0.013*\"forfeit\" + 0.013*\"demonstrate\" + 0.011*\"step—we\" + 0.010*\"horrible\"'),\n", - " (89,\n", - " '0.063*\"ample\" + 0.024*\"bruyère\" + 0.023*\"pack\" + 0.022*\"fallacies\" + 0.019*\"traced\" + 0.016*\"demonstrate\" + 0.016*\"forty\" + 0.014*\"forfeit\" + 0.014*\"imperious\" + 0.012*\"hypothetical\"'),\n", - " (51,\n", - " '0.065*\"ample\" + 0.032*\"pack\" + 0.030*\"bruyère\" + 0.024*\"traced\" + 0.019*\"fallacies\" + 0.017*\"imperious\" + 0.013*\"forfeit\" + 0.011*\"demonstrate\" + 0.010*\"dishonour\" + 0.009*\"step—we\"'),\n", - " (0,\n", - " '0.040*\"ample\" + 0.040*\"pack\" + 0.020*\"bruyère\" + 0.019*\"fallacies\" + 0.015*\"traced\" + 0.014*\"forty\" + 0.013*\"demonstrate\" + 0.010*\"imperious\" + 0.010*\"step—we\" + 
0.008*\"dishonour\"'),\n", - " (50,\n", - " '0.046*\"ample\" + 0.040*\"bruyère\" + 0.030*\"pack\" + 0.019*\"traced\" + 0.017*\"imperious\" + 0.014*\"fallacies\" + 0.013*\"dishonour\" + 0.011*\"forfeit\" + 0.011*\"forty\" + 0.009*\"step—we\"'),\n", - " (97,\n", - " '0.045*\"ample\" + 0.030*\"pack\" + 0.021*\"traced\" + 0.020*\"bruyère\" + 0.019*\"demonstrate\" + 0.016*\"forty\" + 0.014*\"fallacies\" + 0.011*\"hypothetical\" + 0.011*\"imperious\" + 0.010*\"dishonour\"'),\n", - " (14,\n", - " '0.035*\"ample\" + 0.030*\"bruyère\" + 0.022*\"pack\" + 0.018*\"traced\" + 0.015*\"fallacies\" + 0.014*\"demonstrate\" + 0.012*\"forty\" + 0.011*\"forfeit\" + 0.010*\"imperious\" + 0.010*\"dishonour\"'),\n", - " (83,\n", - " '0.052*\"ample\" + 0.025*\"pack\" + 0.022*\"bruyère\" + 0.021*\"traced\" + 0.015*\"imperious\" + 0.011*\"demonstrate\" + 0.010*\"forty\" + 0.010*\"fallacies\" + 0.009*\"artillery\" + 0.009*\"forfeit\"'),\n", - " (9,\n", - " '0.057*\"ample\" + 0.024*\"bruyère\" + 0.021*\"pack\" + 0.020*\"traced\" + 0.020*\"fallacies\" + 0.015*\"imperious\" + 0.014*\"forty\" + 0.011*\"demonstrate\" + 0.010*\"forfeit\" + 0.009*\"artillery\"'),\n", - " (77,\n", - " '0.042*\"ample\" + 0.030*\"pack\" + 0.028*\"bruyère\" + 0.024*\"traced\" + 0.018*\"demonstrate\" + 0.013*\"imperious\" + 0.013*\"forty\" + 0.012*\"fallacies\" + 0.011*\"step—we\" + 0.010*\"forfeit\"'),\n", - " (65,\n", - " '0.052*\"ample\" + 0.033*\"pack\" + 0.021*\"bruyère\" + 0.018*\"fallacies\" + 0.017*\"demonstrate\" + 0.015*\"forty\" + 0.014*\"imperious\" + 0.013*\"traced\" + 0.012*\"hypothetical\" + 0.011*\"step—we\"'),\n", - " (15,\n", - " '0.034*\"ample\" + 0.020*\"bruyère\" + 0.019*\"pack\" + 0.014*\"fallacies\" + 0.013*\"forty\" + 0.012*\"traced\" + 0.010*\"demonstrate\" + 0.009*\"imperious\" + 0.008*\"forfeit\" + 0.008*\"dishonour\"'),\n", - " (86,\n", - " '0.034*\"ample\" + 0.023*\"pack\" + 0.022*\"bruyère\" + 0.019*\"traced\" + 0.015*\"fallacies\" + 0.011*\"demonstrate\" + 0.010*\"dishonour\" + 0.010*\"imperious\" + 0.010*\"forfeit\" + 0.009*\"step—we\"'),\n", - " (59,\n", - " '0.046*\"ample\" + 0.027*\"pack\" + 0.022*\"bruyère\" + 0.019*\"traced\" + 0.018*\"demonstrate\" + 0.016*\"fallacies\" + 0.013*\"imperious\" + 0.012*\"forfeit\" + 0.012*\"forty\" + 0.011*\"step—we\"'),\n", - " (32,\n", - " '0.049*\"ample\" + 0.027*\"pack\" + 0.025*\"bruyère\" + 0.021*\"traced\" + 0.017*\"fallacies\" + 0.017*\"demonstrate\" + 0.015*\"step—we\" + 0.013*\"forty\" + 0.012*\"imperious\" + 0.009*\"artillery\"'),\n", - " (85,\n", - " '0.065*\"ample\" + 0.036*\"bruyère\" + 0.027*\"pack\" + 0.019*\"fallacies\" + 0.018*\"traced\" + 0.014*\"forfeit\" + 0.014*\"imperious\" + 0.010*\"dishonour\" + 0.010*\"step—we\" + 0.009*\"artillery\"'),\n", - " (55,\n", - " '0.038*\"ample\" + 0.023*\"bruyère\" + 0.020*\"pack\" + 0.018*\"traced\" + 0.017*\"imperious\" + 0.016*\"fallacies\" + 0.015*\"demonstrate\" + 0.011*\"step—we\" + 0.009*\"forty\" + 0.009*\"hypothetical\"'),\n", - " (2,\n", - " '0.057*\"ample\" + 0.030*\"pack\" + 0.022*\"bruyère\" + 0.018*\"fallacies\" + 0.017*\"traced\" + 0.015*\"imperious\" + 0.013*\"forty\" + 0.012*\"demonstrate\" + 0.011*\"forfeit\" + 0.011*\"step—we\"'),\n", - " (74,\n", - " '0.049*\"ample\" + 0.029*\"bruyère\" + 0.025*\"pack\" + 0.019*\"traced\" + 0.019*\"fallacies\" + 0.016*\"imperious\" + 0.011*\"forty\" + 0.011*\"forfeit\" + 0.010*\"demonstrate\" + 0.010*\"step—we\"'),\n", - " (48,\n", - " '0.038*\"ample\" + 0.032*\"pack\" + 0.030*\"traced\" + 0.024*\"bruyère\" + 0.020*\"fallacies\" + 0.016*\"demonstrate\" + 0.016*\"forty\" 
+ 0.016*\"imperious\" + 0.015*\"step—we\" + 0.012*\"dishonour\"'),\n", - " (54,\n", - " '0.053*\"ample\" + 0.031*\"pack\" + 0.031*\"bruyère\" + 0.019*\"traced\" + 0.018*\"fallacies\" + 0.016*\"demonstrate\" + 0.015*\"imperious\" + 0.013*\"forty\" + 0.013*\"step—we\" + 0.011*\"forfeit\"'),\n", - " (1,\n", - " '0.059*\"ample\" + 0.031*\"pack\" + 0.030*\"bruyère\" + 0.019*\"traced\" + 0.018*\"fallacies\" + 0.013*\"forty\" + 0.013*\"demonstrate\" + 0.011*\"imperious\" + 0.010*\"dishonour\" + 0.010*\"hypothetical\"'),\n", - " (93,\n", - " '0.080*\"ample\" + 0.040*\"bruyère\" + 0.039*\"pack\" + 0.023*\"fallacies\" + 0.019*\"traced\" + 0.014*\"imperious\" + 0.014*\"forfeit\" + 0.011*\"step—we\" + 0.011*\"demonstrate\" + 0.011*\"forty\"'),\n", - " (7,\n", - " '0.057*\"ample\" + 0.033*\"pack\" + 0.027*\"bruyère\" + 0.023*\"traced\" + 0.020*\"imperious\" + 0.016*\"fallacies\" + 0.014*\"demonstrate\" + 0.013*\"forfeit\" + 0.011*\"step—we\" + 0.010*\"dishonour\"'),\n", - " (41,\n", - " '0.040*\"ample\" + 0.031*\"bruyère\" + 0.021*\"pack\" + 0.021*\"traced\" + 0.018*\"fallacies\" + 0.018*\"demonstrate\" + 0.013*\"imperious\" + 0.013*\"forty\" + 0.012*\"dishonour\" + 0.010*\"step—we\"'),\n", - " (81,\n", - " '0.045*\"ample\" + 0.028*\"pack\" + 0.027*\"bruyère\" + 0.025*\"fallacies\" + 0.024*\"traced\" + 0.016*\"demonstrate\" + 0.015*\"imperious\" + 0.013*\"forfeit\" + 0.013*\"step—we\" + 0.012*\"forty\"'),\n", - " (68,\n", - " '0.060*\"ample\" + 0.029*\"bruyère\" + 0.027*\"pack\" + 0.021*\"traced\" + 0.020*\"fallacies\" + 0.017*\"imperious\" + 0.014*\"forty\" + 0.013*\"demonstrate\" + 0.011*\"step—we\" + 0.011*\"animation\"'),\n", - " (61,\n", - " '0.063*\"ample\" + 0.039*\"bruyère\" + 0.028*\"traced\" + 0.027*\"pack\" + 0.018*\"fallacies\" + 0.016*\"forfeit\" + 0.014*\"demonstrate\" + 0.013*\"imperious\" + 0.011*\"step—we\" + 0.010*\"dishonour\"'),\n", - " (20,\n", - " '0.046*\"ample\" + 0.027*\"pack\" + 0.024*\"traced\" + 0.021*\"fallacies\" + 0.020*\"bruyère\" + 0.014*\"forty\" + 0.014*\"imperious\" + 0.013*\"demonstrate\" + 0.010*\"step—we\" + 0.010*\"forfeit\"'),\n", - " (17,\n", - " '0.054*\"ample\" + 0.032*\"bruyère\" + 0.031*\"traced\" + 0.028*\"pack\" + 0.017*\"fallacies\" + 0.016*\"demonstrate\" + 0.015*\"imperious\" + 0.012*\"forty\" + 0.010*\"forfeit\" + 0.010*\"dishonour\"'),\n", - " (94,\n", - " '0.041*\"ample\" + 0.036*\"pack\" + 0.025*\"bruyère\" + 0.021*\"traced\" + 0.020*\"demonstrate\" + 0.017*\"fallacies\" + 0.015*\"imperious\" + 0.013*\"forfeit\" + 0.012*\"dishonour\" + 0.010*\"forty\"'),\n", - " (47,\n", - " '0.044*\"ample\" + 0.031*\"bruyère\" + 0.029*\"pack\" + 0.021*\"traced\" + 0.017*\"fallacies\" + 0.016*\"demonstrate\" + 0.014*\"dishonour\" + 0.014*\"forty\" + 0.013*\"hypothetical\" + 0.012*\"imperious\"'),\n", - " (80,\n", - " '0.043*\"ample\" + 0.029*\"bruyère\" + 0.027*\"pack\" + 0.020*\"fallacies\" + 0.018*\"traced\" + 0.015*\"forty\" + 0.015*\"dishonour\" + 0.015*\"demonstrate\" + 0.013*\"imperious\" + 0.013*\"forfeit\"'),\n", - " (24,\n", - " '0.038*\"ample\" + 0.031*\"pack\" + 0.019*\"traced\" + 0.017*\"bruyère\" + 0.017*\"fallacies\" + 0.017*\"demonstrate\" + 0.014*\"forty\" + 0.012*\"step—we\" + 0.011*\"imperious\" + 0.010*\"forfeit\"'),\n", - " (66,\n", - " '0.034*\"ample\" + 0.032*\"bruyère\" + 0.018*\"fallacies\" + 0.017*\"pack\" + 0.016*\"imperious\" + 0.014*\"traced\" + 0.013*\"dishonour\" + 0.012*\"demonstrate\" + 0.011*\"step—we\" + 0.009*\"forfeit\"'),\n", - " (73,\n", - " '0.046*\"ample\" + 0.029*\"pack\" + 0.022*\"bruyère\" + 0.019*\"fallacies\" + 
0.014*\"imperious\" + 0.013*\"traced\" + 0.013*\"forty\" + 0.012*\"demonstrate\" + 0.010*\"forfeit\" + 0.009*\"dishonour\"')]" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "topics" ] @@ -2231,21 +1934,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "31-Jan-2017 15:42:36 DEBUG mallet: tutorial_supplementals/mallet_output/malletBinary.mallet\n", - "31-Jan-2017 15:42:36 INFO mallet: Accessing Mallet ...\n", - "31-Jan-2017 15:42:37 DEBUG mallet: Mallet file available.\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "\n", @@ -2268,22 +1961,12 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "collapsed": false, "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "31-Jan-2017 15:42:40 DEBUG mallet: /home/sina/Uni/Dariah/DARIAH-Topics/Topics/tutorial_supplementals/mallet_output\n", - "31-Jan-2017 15:42:40 INFO mallet: Accessing Mallet ...\n", - "31-Jan-2017 15:42:40 DEBUG mallet: Mallet file available.\n" - ] - } - ], + "outputs": [], "source": [ "outfolder = os.path.join(os.path.abspath('.'), \"tutorial_supplementals/mallet_output\")\n", "mal.create_mallet_model(malletBinary,outfolder)" @@ -2298,7 +1981,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "collapsed": false }, @@ -2310,103 +1993,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[ 1.84700883e-02 1.22589081e-02 6.97940503e-02 8.33605754e-03\n", - " 2.33736515e-02 3.80353057e-01 7.02844067e-03 8.87544949e-02\n", - " 3.75939850e-03 5.83524027e-02 8.33605754e-03 3.77574371e-02\n", - " 9.97057862e-03 8.17260543e-04 4.74011115e-03 1.78162798e-02\n", - " 1.45635829e-01 9.26773455e-02 3.43249428e-03 8.33605754e-03]\n", - " [ 2.27659172e-02 2.05617482e-02 9.19138485e-02 1.10523333e-02\n", - " 1.52087663e-02 1.58290824e-01 4.12494490e-03 9.82744505e-02\n", - " 1.99319856e-02 7.82480005e-02 6.14018515e-03 2.69223503e-02\n", - " 6.26613767e-03 1.98375213e-03 1.66887084e-03 2.09301593e-01\n", - " 1.42924617e-01 7.52251401e-02 4.75470748e-03 4.43982619e-03]\n", - " [ 7.41623304e-03 1.90876818e-02 1.06866702e-01 2.84491563e-03\n", - " 2.79409619e-01 1.15717551e-01 3.33122599e-03 8.66361912e-02\n", - " 1.67047610e-02 5.32266693e-02 7.51349511e-03 2.39021544e-02\n", - " 5.76277780e-03 5.59256918e-04 1.09419832e-03 1.33978505e-02\n", - " 1.70621991e-01 7.99737392e-02 3.03943977e-03 2.89354666e-03]\n", - " [ 1.53721157e-02 2.64218395e-02 8.01104972e-02 2.17192070e-01\n", - " 3.12967176e-02 1.38739032e-01 3.08742281e-03 8.27104322e-02\n", - " 1.42021449e-02 7.99805005e-02 6.20734482e-03 2.21969451e-02\n", - " 5.88235294e-03 1.72245694e-03 1.85245369e-03 1.67370816e-02\n", - " 1.79948001e-01 6.81507962e-02 4.06239844e-03 4.12739682e-03]\n", - " [ 2.20680958e-03 5.36885246e-01 1.31462799e-01 4.09836066e-03\n", - " 5.98991173e-03 2.20680958e-03 7.21941992e-02 1.29255990e-02\n", - " 3.05800757e-02 5.39092055e-02 5.98991173e-03 6.52585120e-02\n", - " 2.20680958e-03 5.35939470e-03 6.62042875e-03 7.25094578e-03\n", - " 4.12988651e-02 6.62042875e-03 1.57629256e-03 5.35939470e-03]\n", - " [ 9.87193170e-03 1.07346140e-01 1.68356457e-01 
5.60298826e-03\n", - " 1.51191747e-03 1.14727855e-02 3.77534685e-01 1.69868374e-02\n", - " 1.84098186e-02 7.94201352e-02 5.42511562e-03 1.12326574e-01\n", - " 7.55958734e-03 2.93489861e-03 4.35787976e-03 5.06937033e-03\n", - " 5.32728566e-02 1.11170402e-02 4.44681608e-04 9.78299538e-04]\n", - " [ 5.49450549e-03 2.37704918e-01 1.69789227e-01 3.51288056e-03\n", - " 3.87317600e-03 8.37686903e-03 2.49954963e-01 1.84651414e-02\n", - " 2.13475050e-02 7.05278328e-02 7.83642587e-03 1.08719150e-01\n", - " 2.25184651e-03 9.63790308e-03 8.55701675e-03 3.33273284e-03\n", - " 5.68366060e-02 1.01783462e-02 2.07169879e-03 1.53125563e-03]\n", - " [ 3.60576923e-03 1.35262574e-01 1.64108728e-01 2.31139053e-03\n", - " 4.62278107e-04 1.54400888e-02 3.28309911e-01 2.98631657e-02\n", - " 4.72448225e-02 5.59356509e-02 1.26664201e-02 1.00314349e-01\n", - " 1.94156805e-03 6.74926036e-03 1.93232249e-02 1.38683432e-03\n", - " 5.70451183e-02 1.48853550e-02 1.75665680e-03 1.38683432e-03]\n", - " [ 4.04976620e-01 2.35470942e-02 8.60053440e-02 7.51503006e-03\n", - " 7.84903140e-03 4.25851703e-02 8.18303273e-03 4.99331997e-02\n", - " 1.35270541e-02 1.22077488e-01 6.84702739e-03 2.28790915e-02\n", - " 7.84903140e-03 1.61990648e-02 6.84702739e-03 3.84101536e-03\n", - " 1.36773547e-01 2.68871075e-02 3.17301269e-03 2.50501002e-03]\n", - " [ 1.11689702e-02 5.88135098e-02 6.35828304e-02 2.11699435e-03\n", - " 3.64999027e-04 1.08769710e-02 3.23632470e-03 9.90364026e-03\n", - " 3.04165856e-03 2.32115048e-01 2.16566089e-03 5.81565116e-03\n", - " 5.10998637e-04 4.19773214e-01 1.19233015e-03 1.67899552e-03\n", - " 1.64857894e-01 5.08565310e-03 2.79832587e-03 9.00330932e-04]\n", - " [ 1.11578780e-02 2.09259848e-02 3.83973952e-02 3.05749682e-03\n", - " 9.92693774e-04 2.71998094e-02 1.94567980e-03 2.57703304e-02\n", - " 6.63119441e-03 1.62523825e-01 2.66041931e-03 8.14008895e-03\n", - " 7.34593393e-03 6.23411690e-03 4.36785260e-04 1.94567980e-03\n", - " 1.39413914e-01 1.63198856e-02 5.15208069e-01 3.69282084e-03]\n", - " [ 1.59848280e-03 9.56380385e-03 3.87157952e-02 9.48252506e-04\n", - " 1.59848280e-03 4.52451910e-03 1.76104037e-03 1.14684367e-01\n", - " 2.17583311e-01 1.74207532e-02 6.10945543e-02 6.20157139e-02\n", - " 5.33730696e-03 1.11081008e-03 3.38797074e-01 7.31509076e-04\n", - " 6.81387158e-02 4.61934435e-02 1.00243836e-03 7.17962612e-03]\n", - " [ 1.28627290e-02 2.41819304e-02 4.72319407e-02 8.54085203e-03\n", - " 8.54085203e-03 4.45564931e-02 1.74933114e-03 7.13109693e-02\n", - " 1.82239144e-01 2.70631817e-02 5.48466763e-02 1.03416341e-01\n", - " 2.68470879e-01 1.95513480e-03 7.30603005e-03 9.98147767e-03\n", - " 7.06935583e-02 4.47622968e-02 3.39576044e-03 6.89442272e-03]\n", - " [ 1.20440468e-03 1.68616655e-03 8.91259463e-03 1.06675843e-03\n", - " 5.84996559e-04 1.54852030e-03 6.53819683e-04 3.95388851e-02\n", - " 2.36407433e-02 3.20027529e-03 2.69270475e-01 6.22849277e-03\n", - " 2.16792842e-03 5.16173434e-04 5.16173434e-04 1.20440468e-03\n", - " 2.29525120e-02 2.03888507e-01 2.40880936e-04 4.10977288e-01]\n", - " [ 1.02813853e-02 2.65151515e-02 6.54761905e-02 1.35281385e-02\n", - " 1.13636364e-02 3.95021645e-02 7.03463203e-03 5.35714286e-02\n", - " 2.86796537e-02 6.22294372e-02 7.03463203e-03 7.62987013e-02\n", - " 2.92748918e-01 8.11688312e-03 1.78571429e-02 2.32683983e-02\n", - " 1.26082251e-01 1.10930736e-01 5.95238095e-03 1.35281385e-02]\n", - " [ 1.03652517e-02 7.15695953e-02 8.93385982e-02 2.46791708e-03\n", - " 5.42941757e-03 6.36722606e-02 1.23395854e-02 2.22112537e-02\n", - " 2.91214215e-02 1.92497532e-02 
1.13524186e-02 4.34846989e-01\n", - " 1.82625864e-02 5.42941757e-03 4.44225074e-03 7.40375123e-03\n", - " 8.24284304e-02 6.66337611e-02 9.37808490e-03 3.40572557e-02]\n", - " [ 7.47986191e-03 3.64403529e-03 2.93440736e-02 5.56194860e-03\n", - " 3.26045263e-03 9.49367089e-02 5.17836594e-03 1.21403913e-01\n", - " 1.20828539e-02 2.81933257e-02 1.70694285e-02 1.74530111e-02\n", - " 2.92481780e-01 2.87686997e-03 4.79478328e-03 5.56194860e-03\n", - " 7.57575758e-02 1.91983122e-01 4.02761795e-03 7.69083237e-02]]\n" - ] - } - ], + "outputs": [], "source": [ "mal.create_MalletMatrix(doc_topics)" ] @@ -2537,7 +2128,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.5.1" } }, "nbformat": 4, diff --git a/dariah_topics/preprocessing.py b/dariah_topics/preprocessing.py index b787b3f..a2ba37a 100644 --- a/dariah_topics/preprocessing.py +++ b/dariah_topics/preprocessing.py @@ -386,7 +386,7 @@ def create_mm(doc_labels, doc_tokens, type_dictionary, doc_ids): index_iterator = sparse_index.groupby(sparse_index.get_level_values('doc_id')) - for doc_id in range(1, len(sparse_index.levels[0])): + for doc_id in range(1, len(sparse_index.levels[0])+1): for token_id in [val[1] for val in index_iterator[doc_id]]: sparse_df_filled.set_value((doc_id, token_id), 0, int(largecounter[doc_id][token_id]))
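
Note on the `create_mm` hunk above: the notebook's doc_ids are 1-based, so the old bound `range(1, len(sparse_index.levels[0]))` stopped one short and silently dropped the last document's token counts from the sparse matrix; the `+1` makes the loop visit every doc_id. A minimal sketch of the off-by-one, assuming a hypothetical three-document MultiIndex shaped like the `sparse_index` the hunk's context shows (the example tuples are made up for illustration):

    import pandas as pd

    # Hypothetical (doc_id, token_id) index with 1-based doc_ids,
    # mirroring the sparse_index that create_mm groups by 'doc_id'.
    sparse_index = pd.MultiIndex.from_tuples(
        [(1, 0), (1, 1), (2, 0), (3, 0), (3, 2)],
        names=['doc_id', 'token_id'])

    n_docs = len(sparse_index.levels[0])   # 3 distinct documents

    print(list(range(1, n_docs)))          # [1, 2]    -> doc_id 3 was skipped
    print(list(range(1, n_docs + 1)))      # [1, 2, 3] -> all documents visited

With the old bound, the final doc_id never reached the inner `set_value` loop, so the last document was missing from the resulting Matrix Market data.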