# Testing `collection.py`

In [6]:
import collection
# Note: the warning emitted by this import originates from Gensim.

In [7]:
# Relative corpus directories: one holding the plain-text documents,
# the other holding the CSV versions of those documents.
path_txt, path_csv = "corpus_txt", "corpus_csv"

##### Liste mit Dateinamen erzeugen

In [8]:
# Build the list of document paths found under the plain-text corpus folder
# and preview the first five entries.
doclist_txt = collection.create_document_list(path_txt)
doclist_txt[:5]

['corpus_txt\\Doyle_AScandalinBohemia.txt',
 'corpus_txt\\Doyle_AStudyinScarlet.txt',
 'corpus_txt\\Doyle_TheHoundoftheBaskervilles.txt',
 'corpus_txt\\Doyle_TheSignoftheFour.txt',
 'corpus_txt\\Howard_GodsoftheNorth.txt']

In [9]:
# Same for the CSV corpus folder; the second argument 'csv' presumably selects
# the file type to collect (confirm in collection.py). Preview the first five.
doclist_csv = collection.create_document_list(path_csv, 'csv')
doclist_csv[:5]

['corpus_csv\\Doyle_AStudyinScarlet.txt.csv',
 'corpus_csv\\Doyle_TheHoundoftheBaskervilles.txt.csv',
 'corpus_csv\\Doyle_TheSignoftheFour.txt.csv',
 'corpus_csv\\Howard_GodsoftheNorth.txt.csv',
 'corpus_csv\\Howard_SchadowsinZamboula.txt.csv']

##### Corpus laden

In [10]:
# Load the plain-text documents named in doclist_txt.
# NOTE(review): may be a lazy one-shot iterator rather than a list — confirm,
# since several later cells iterate over corpus_txt.
corpus_txt = collection.read_from_txt(doclist_txt)

In [11]:
# Load the CSV documents named in doclist_csv.
corpus_csv = collection.read_from_csv(doclist_csv)

##### Segmenter

In [12]:
# Split the corpus into segments (1000 is presumably the segment length —
# confirm the unit in collection.py) and display the first segment.
# NOTE(review): if corpus_txt is a one-shot iterator, next() here may consume
# part of it before the Counter cell below iterates it — confirm.
segments = collection.segmenter(corpus_txt, 1000)
next(segments)

"A SCANDAL IN BOHEMIA\n\nA. CONAN DOYLE\n\n\nI\n\nTo Sherlock Holmes she is always _the_ woman. I have seldom heard him\nmention her under any other name. In his eyes she eclipses and\npredominates the whole of her sex. It was not that he felt any emotion\nakin to love for Irene Adler. All emotions, and that one particularly,\nwere abhorrent to his cold, precise but admirably balanced mind. He was,\nI take it, the most perfect reasoning and observing machine that the\nworld has seen; but as a lover, he would have placed himself in a false\nposition. He never spoke of the softer passions, save with a gibe and a\nsneer. They were admirable things for the observer--excellent for\ndrawing the veil from men's motives and actions. But for the trained\nreasoner to admit such intrusions into his own delicate and finely\nadjusted temperament was to introduce a distracting factor which might\nthrow a doubt upon all his mental results. Grit in a sensitive\ninstrument, or a crack in one of his own

##### Counter erstellen um removeStopwords und removeHapax zu verwenden

In [13]:
from collections import Counter

# Tally whitespace-separated tokens across every document of the corpus.
# str.split() is still used because no tokenizer is available yet; this is
# temporary and only meant for testing.
counter = Counter(token for doc in corpus_txt for token in doc.split())

#
# Grenzbote as a Counter is roughly 100 MB in size
#

In [43]:
#
# Copy of the original counter, used to reset the counter while testing.
# This way the counter does not have to be rebuilt every time;
# it can simply be reset here.
#

countercopy = counter.copy()

##### Remove Stopwords

In [40]:
# Remove the 20 most frequent words from the word counts.
# A fresh copy of `counter` is passed so this cell is idempotent and does not
# depend on (or alter) the state of a shared copy. Presumably removeStopwords
# modifies its argument in place — confirm in collection.py.
dict_without_stopwords = collection.removeStopwords(counter.copy(), 20)

In [42]:
# Report the vocabulary size before and after stopword removal, then show the
# 25 most frequent words of each so the effect of the removal is visible.
print("Länge des counters: ", len(counter))
print("Länge des dicts nachdem Stopwords entfernt wurden: ", len(dict_without_stopwords))
print("25 MFWs von counter:\n", counter.most_common(25), "\n")
print("25 MFWs nachdem stopwords entfernt wurden:\n", dict_without_stopwords.most_common(25))

Länge des counters:  43989
Länge des dicts nachdem Stopwords entfernt wurden:  43969
25 MFWs von counter:
 [('the', 21357), ('of', 11614), ('and', 11040), ('to', 8516), ('a', 7652), ('in', 5585), ('I', 5393), ('that', 4479), ('was', 4211), ('he', 3610), ('his', 3503), ('had', 2816), ('is', 2768), ('with', 2767), ('as', 2733), ('it', 2457), ('for', 2282), ('at', 2272), ('have', 2171), ('which', 2063), ('we', 1963), ('you', 1905), ('not', 1902), ('my', 1833), ('be', 1738)] 

25 MFWs nachdem stopwords entfernt wurden:
 [('we', 1963), ('you', 1905), ('not', 1902), ('my', 1833), ('be', 1738), ('on', 1729), ('from', 1670), ('but', 1462), ('were', 1449), ('The', 1424), ('all', 1400), ('by', 1351), ('this', 1284), ('said', 1251), ('He', 1250), ('him', 1082), ('or', 1068), ('an', 1053), ('are', 1042), ('been', 1037), ('our', 1018), ('one', 1018), ('no', 1006), ('me', 978), ('upon', 957)]


##### Remove Hapax

In [44]:
# Remove hapax legomena (words that occur exactly once) from the word counts.
# A fresh copy of `counter` is passed so the result does not depend on whether
# an earlier cell already modified a shared copy (e.g. stopword removal) when
# the notebook is run top to bottom. Presumably removeHapax modifies its
# argument in place — confirm in collection.py.
dict_without_hapax = collection.removeHapax(counter.copy())

In [45]:
# Vocabulary size before and after hapax removal.
print("Länge des counters: ", len(counter))
print("Länge des dicts nachdem Hapax entfernt wurden: ", len(dict_without_hapax))

# Sanity check: count the remaining words by frequency class; after the
# removal no word with a frequency of exactly one should be left.
print("Anzahl der Wörter, die öfter als ein Mal vorkommen: ", sum(1 for freq in dict_without_hapax.values() if freq > 1))
print("Anzahl der Wörter, die genau ein Mal vorkommen: ", sum(1 for freq in dict_without_hapax.values() if freq == 1))

Länge des counters:  43989
Länge des dicts nachdem Hapax entfernt wurden:  17893
Anzahl der Wörter, die öfter als ein Mal vorkommen:  17893
Anzahl der Wörter, die genau ein Mal vorkommen:  0


##### Lemmas anhand POS-Tags auswählen

In [8]:
# Select lemmas from the CSV corpus by POS tag. No tags are passed here; the
# log output below shows ['ADJ', 'V', 'NN'] being accessed.
# next() displays the first item yielded for inspection.
lemmas = collection.filter_POS_tags(corpus_csv)
next(lemmas)

31-Oct-2016 15:19:26 INFO collection: Accessing ['ADJ', 'V', 'NN'] lemmas ...
31-Oct-2016 15:19:26 INFO collection: Accessing CSV documents ...


37       typographical
56             textual
59              square
72                 old
75                such
80             present
112           original
122           original
139              ascii
147            latin-1
154            present
161             french
163            spanish
169             proper
294             second
320               deep
334              other
340               same
365                new
406              fatal
430         subclavian
442          murderous
458            orderly
475            british
483               weak
486          prolonged
499              great
525               able
537             little
548            enteric
             ...      
50767     irresistible
50813           sudden
50817           likely
50833            least
50870         original
50878             arab
50879        detective
50918            fresh
50933       unexpected
50973            whole
50979          logical
50991        wonderful
51095      

In [9]:
# Derive document labels from the plain-text file paths and materialize them
# as a list for display.
labels = collection.get_labels(doclist_txt)
list(labels)

31-Oct-2016 15:19:26 INFO collection: Creating document labels ...
31-Oct-2016 15:19:26 DEBUG collection: Document labels available.


['Doyle_AScandalinBohemia.txt',
 'Doyle_AStudyinScarlet.txt',
 'Doyle_TheHoundoftheBaskervilles.txt',
 'Doyle_TheSignoftheFour.txt',
 'Howard_GodsoftheNorth.txt',
 'Howard_SchadowsinZamboula.txt',
 'Howard_ShadowsintheMoonlight.txt',
 'Howard_TheDevilinIron.txt',
 'Kipling_TheEndofthePassage.txt',
 'Kipling_TheJungleBook.txt',
 'Kipling_ThyServantaDog.txt',
 'Lovecraft_AttheMountainofMadness.txt',
 'Lovecraft_TheShunnedHouse.txt',
 'Poe_EurekaAProsePoem.txt',
 'Poe_TheCaskofAmontillado.txt',
 'Poe_TheMasqueoftheRedDeath.txt',
 'Poe_ThePurloinedLetter.txt']

##### Visualisierung

In [10]:
# `out_grenzboten` is currently broken, so the model from `out_easy`
# (which comes from IntroductionTopics.ipynb) is used for the time being.

# Paths to the saved model artifacts.
lda_model = 'out_easy/corpus.lda'
corpus = 'out_easy/corpus.mm'
dictionary = 'out_easy/corpus.dict'
doc_labels = 'out_easy/corpus_doclabels.txt'

# Start with the non-interactive visualization.
interactive = False

vis = collection.Visualization(
    lda_model,
    corpus,
    dictionary,
    doc_labels,
    interactive,
)

31-Oct-2016 15:19:26 INFO collection: Accessing corpus ...
31-Oct-2016 15:19:26 INFO gensim.corpora.indexedcorpus: loaded corpus index from out_easy/corpus.mm.index
31-Oct-2016 15:19:26 INFO gensim.matutils: initializing corpus reader from out_easy/corpus.mm
31-Oct-2016 15:19:26 INFO gensim.matutils: accepted corpus with 17 documents, 514 features, 4585 non-zero entries
31-Oct-2016 15:19:26 DEBUG collection: Corpus available.
31-Oct-2016 15:19:26 INFO collection: Accessing model ...
31-Oct-2016 15:19:26 INFO gensim.utils: loading LdaModel object from out_easy/corpus.lda
31-Oct-2016 15:19:26 INFO gensim.utils: loading id2word recursively from out_easy/corpus.lda.id2word.* with mmap=None
31-Oct-2016 15:19:26 INFO gensim.utils: setting ignored attribute state to None
31-Oct-2016 15:19:26 INFO gensim.utils: setting ignored attribute dispatcher to None
31-Oct-2016 15:19:26 INFO gensim.utils: loading LdaModel object from out_easy/corpus.lda.state
31-Oct-2016 15:19:26 DEBUG collection: Model 

In [11]:
# Build the heatmap figure from the loaded model and keep a reference to it.
heatmap = vis.make_heatmap()

31-Oct-2016 15:19:26 INFO collection: Accessing topic distribution ...
31-Oct-2016 15:19:26 DEBUG collection: Topic distribution available.
31-Oct-2016 15:19:26 INFO collection: Accessing topic probability ...
31-Oct-2016 15:19:26 DEBUG collection: Topic probability available.
31-Oct-2016 15:19:26 INFO collection: Accessing plot labels ...
31-Oct-2016 15:19:26 DEBUG collection: 10 plot labels available.
31-Oct-2016 15:19:26 INFO collection: Creating heatmap figure ...
31-Oct-2016 15:19:27 DEBUG collection: Heatmap figure available.


In [12]:
# Save the heatmap figure into the given folder (the log below reports
# heatmap.png being written there).
vis.save_heatmap("./visualizations/heatmap")

31-Oct-2016 15:19:27 INFO collection: Saving heatmap figure...
31-Oct-2016 15:19:27 DEBUG collection: Heatmap figure available at ./visualizations/heatmap/heatmap.png


In [13]:
# Re-create the visualization object in interactive mode from the same
# artifacts. Note that this rebinds `vis`, replacing the non-interactive one.
vis = collection.Visualization(lda_model, corpus, dictionary, doc_labels, interactive=True)

31-Oct-2016 15:19:27 INFO collection: Accessing corpus ...
31-Oct-2016 15:19:27 INFO gensim.corpora.indexedcorpus: loaded corpus index from out_easy/corpus.mm.index
31-Oct-2016 15:19:27 INFO gensim.matutils: initializing corpus reader from out_easy/corpus.mm
31-Oct-2016 15:19:27 INFO gensim.matutils: accepted corpus with 17 documents, 514 features, 4585 non-zero entries
31-Oct-2016 15:19:27 DEBUG collection: Corpus available.
31-Oct-2016 15:19:27 INFO collection: Accessing model ...
31-Oct-2016 15:19:27 INFO gensim.utils: loading LdaModel object from out_easy/corpus.lda
31-Oct-2016 15:19:27 INFO gensim.utils: loading id2word recursively from out_easy/corpus.lda.id2word.* with mmap=None
31-Oct-2016 15:19:27 INFO gensim.utils: setting ignored attribute state to None
31-Oct-2016 15:19:27 INFO gensim.utils: setting ignored attribute dispatcher to None
31-Oct-2016 15:19:27 INFO gensim.utils: loading LdaModel object from out_easy/corpus.lda.state
31-Oct-2016 15:19:27 DEBUG collection: Model 

In [14]:
# Prepare the interactive visualization; the returned object is displayed as
# the cell output.
vis.make_interactive()

31-Oct-2016 15:19:27 INFO collection: Accessing model, corpus and dictionary ...
31-Oct-2016 15:19:27 DEBUG gensim.models.ldamodel: performing inference on a chunk of 17 documents
31-Oct-2016 15:19:27 DEBUG gensim.models.ldamodel: 6/17 documents converged within 50 iterations
31-Oct-2016 15:19:28 DEBUG collection: Interactive visualization available.


PreparedData(topic_coordinates=            Freq  cluster  topics         x         y
topic                                                
2      44.980606        1       1  0.004107 -0.022531
5      13.908246        1       2  0.037390  0.080774
8      10.953661        1       3  0.089735  0.062741
9       7.385339        1       4  0.027546  0.065944
6       7.324317        1       5 -0.069999  0.046055
1       6.010591        1       6 -0.229902  0.005372
4       4.780237        1       7  0.032937 -0.138387
3       2.421148        1       8  0.003393 -0.009874
0       2.231058        1       9  0.079083  0.018599
7       0.004798        1      10  0.025711 -0.108691, topic_info=     Category        Freq     Term       Total  loglift  logprob
term                                                            
474   Default  484.000000   holmes  484.000000  30.0000  30.0000
119   Default  986.000000      man  986.000000  29.0000  29.0000
38    Default  303.000000    house  303.000000  2

In [15]:
# Save the interactive visualization into the given folder (the log below
# reports an HTML and a JSON file being written there).
vis.save_interactive("./visualizations/interactive")

31-Oct-2016 15:19:28 INFO collection: Saving interactive visualization ...
31-Oct-2016 15:19:28 DEBUG collection: Interactive visualization available at ./visualizations/interactive/corpus_interactive.html and ./visualizations/interactive/corpus_interactive.json


![success](http://cdn2.hubspot.net/hub/128506/file-446943132-jpg/images/computer_woman_success.jpg)