# Testing `collection.py`

In [2]:
import collection

In [3]:
# Relative folder names: plain-text corpus and its CSV counterpart.
path_txt, path_csv = "corpus", "corpus_csv"

##### Liste mit Dateinamen erzeugen

In [4]:
# Collect the file paths found under `path_txt` (the `ext` argument is left
# at its default) and preview the first five entries.
doclist_txt = collection.create_document_list(path_txt)
doclist_txt[:5]

['corpus/Doyle_AScandalinBohemia.txt',
 'corpus/Doyle_AStudyinScarlet.txt',
 'corpus/Doyle_TheHoundoftheBaskervilles.txt',
 'corpus/Doyle_TheSignoftheFour.txt',
 'corpus/Howard_GodsoftheNorth.txt']

In [5]:
# Print the docstring of create_document_list.
# NOTE(review): the printed docstring documents the second argument as
# `suffix`, while the signature shown by help() names it `ext`.
help(collection.create_document_list)

Help on function create_document_list in module collection:

create_document_list(path, ext='.txt')
    Creates a list of files with their full path.
    
    Args:
        path (str): Path to folder, e.g. '/tmp/corpus'.
        suffix (str): File extension, e.g. '.csv'. Defaults to '.txt'.
    
    Returns:
        list[str]: List of files with full path.



##### Corpus laden

In [6]:
# Build the TXT reader from the document list and inspect its attributes.
corpus = collection.ReadFromTXT(doclist_txt)
vars(corpus)

{'doclist': ['corpus/Doyle_AScandalinBohemia.txt',
  'corpus/Doyle_AStudyinScarlet.txt',
  'corpus/Doyle_TheHoundoftheBaskervilles.txt',
  'corpus/Doyle_TheSignoftheFour.txt',
  'corpus/Howard_GodsoftheNorth.txt',
  'corpus/Howard_SchadowsinZamboula.txt',
  'corpus/Howard_ShadowsintheMoonlight.txt',
  'corpus/Howard_TheDevilinIron.txt',
  'corpus/Kipling_TheEndofthePassage.txt',
  'corpus/Kipling_TheJungleBook.txt',
  'corpus/Kipling_ThyServantaDog.txt',
  'corpus/Lovecraft_AttheMountainofMadness.txt',
  'corpus/Lovecraft_TheShunnedHouse.txt',
  'corpus/Poe_EurekaAProsePoem.txt',
  'corpus/Poe_TheCaskofAmontillado.txt',
  'corpus/Poe_TheMasqueoftheRedDeath.txt',
  'corpus/Poe_ThePurloinedLetter.txt']}

##### Segmenter

In [7]:
# One Segmenter per document; the second argument (1000) is presumably the
# segment length in tokens -- the preview below prints a length for each segment.
segments = [collection.Segmenter(document, 1000) for document in list(corpus)]

# Preview the first five segments of the second document:
# the leading seven tokens followed by the segment length.
for segment in list(segments[1])[:5]:
    print(segment[:7])
    print(len(segment))

['A', 'STUDY', 'IN', 'SCARLET.', 'By', 'A.', 'Conan']
1000
['to', 'go', 'halves', 'with', 'him', 'in', 'some']
1000
['small', 'quantity', 'of', 'blood', 'to', 'a', 'litre']
1000
['large', 'airy', 'sitting-room,', 'cheerfully', 'furnished,', 'and', 'illuminated']
1000
['the', 'moon', 'it', 'would', 'not', 'make', 'a']
1000


##### Liste mit CSV-Dateien erzeugen

In [8]:
# List the files under `path_csv`, this time passing ext='.csv' explicitly,
# and preview the first five entries.
doclist_csv = collection.create_document_list(path_csv, ext='.csv')
doclist_csv[:5]

['corpus_csv/Doyle_AStudyinScarlet.txt.csv',
 'corpus_csv/Doyle_TheHoundoftheBaskervilles.txt.csv',
 'corpus_csv/Doyle_TheSignoftheFour.txt.csv',
 'corpus_csv/Howard_GodsoftheNorth.txt.csv',
 'corpus_csv/Howard_SchadowsinZamboula.txt.csv']

##### POS-Tags auswählen

In [9]:
# Part-of-speech tags handed to the filter.
pos_tags = ['ADJ', 'V', 'NN']

# Build the POS filter over the CSV documents and inspect its attributes.
corpusCSV = collection.FilterPOS(doclist_csv, pos_tags)
vars(corpusCSV)

{'columns': ['ParagraphId', 'TokenId', 'Lemma', 'CPOS', 'NamedEntity'],
 'doc': Empty DataFrame
 Columns: []
 Index: [],
 'files': ['corpus_csv/Doyle_AStudyinScarlet.txt.csv',
  'corpus_csv/Doyle_TheHoundoftheBaskervilles.txt.csv',
  'corpus_csv/Doyle_TheSignoftheFour.txt.csv',
  'corpus_csv/Howard_GodsoftheNorth.txt.csv',
  'corpus_csv/Howard_SchadowsinZamboula.txt.csv',
  'corpus_csv/Howard_ShadowsintheMoonlight.txt.csv',
  'corpus_csv/Howard_TheDevilinIron.txt.csv',
  'corpus_csv/Kipling_TheEndofthePassage.txt.csv',
  'corpus_csv/Kipling_TheJungleBook.txt.csv',
  'corpus_csv/Kipling_ThyServantaDog.txt.csv',
  'corpus_csv/Lovecraft_AttheMountainofMadness.txt.csv',
  'corpus_csv/Lovecraft_TheShunnedHouse.txt.csv',
  'corpus_csv/Poe_EurekaAProsePoem.txt.csv',
  'corpus_csv/Poe_TheCaskofAmontillado.txt.csv',
  'corpus_csv/Poe_TheMasqueoftheRedDeath.txt.csv',
  'corpus_csv/Poe_ThePurloinedLetter.txt.csv'],
 'labels': [],
 'pos_tags': ['ADJ', 'V', 'NN']}

In [10]:
# Show the column names stored on the FilterPOS object.
corpusCSV.columns

['ParagraphId', 'TokenId', 'Lemma', 'CPOS', 'NamedEntity']

In [11]:
# Fetch the document labels and materialize them as a list for display.
labels = corpusCSV.get_labels()
list(labels)

['Doyle_AStudyinScarlet.txt.csv',
 'Doyle_TheHoundoftheBaskervilles.txt.csv',
 'Doyle_TheSignoftheFour.txt.csv',
 'Howard_GodsoftheNorth.txt.csv',
 'Howard_SchadowsinZamboula.txt.csv',
 'Howard_ShadowsintheMoonlight.txt.csv',
 'Howard_TheDevilinIron.txt.csv',
 'Kipling_TheEndofthePassage.txt.csv',
 'Kipling_TheJungleBook.txt.csv',
 'Kipling_ThyServantaDog.txt.csv',
 'Lovecraft_AttheMountainofMadness.txt.csv',
 'Lovecraft_TheShunnedHouse.txt.csv',
 'Poe_EurekaAProsePoem.txt.csv',
 'Poe_TheCaskofAmontillado.txt.csv',
 'Poe_TheMasqueoftheRedDeath.txt.csv',
 'Poe_ThePurloinedLetter.txt.csv']

##### Lemma

In [12]:
# Fetch the filtered lemmas, materialize them, and display the first item
# (presumably the lemmas of the first document -- TODO confirm).
lemma = corpusCSV.get_lemma()
list(lemma)[0]

37       typographical
56             textual
59              square
72                 old
75                such
80             present
112           original
122           original
139              ascii
147            latin-1
154            present
161             french
163            spanish
169             proper
294             second
320               deep
334              other
340               same
365                new
406              fatal
430         subclavian
442          murderous
458            orderly
475            british
483               weak
486          prolonged
499              great
525               able
537             little
548            enteric
             ...      
50767     irresistible
50813           sudden
50817           likely
50833            least
50870         original
50878             arab
50879        detective
50918            fresh
50933       unexpected
50973            whole
50979          logical
50991        wonderful
51095      

##### Visualisierung

In [13]:
# Paths to the Gensim artifacts handed to the visualization.
lda_model = 'out_off/corpus_off.lda'
# Named `corpus_mm` rather than `corpus` so that the ReadFromTXT object bound
# to `corpus` earlier in this notebook is not overwritten by a path string
# (which would break re-running the segmenter cell).
corpus_mm = 'out_off/corpus_off.mm'
dictionary = 'out_off/corpus_off.dict'
doc_labels = 'out_off/corpus_off_doclabels.txt'
interactive = False

vis = collection.Visualization(lda_model, corpus_mm, dictionary, doc_labels, interactive)

17-Oct-2016 17:43:56 INFO collection: Accessing corpus ...
17-Oct-2016 17:43:56 INFO gensim.corpora.indexedcorpus: loaded corpus index from out_off/corpus_off.mm.index
17-Oct-2016 17:43:56 INFO gensim.matutils: initializing corpus reader from out_off/corpus_off.mm
17-Oct-2016 17:43:56 INFO gensim.matutils: accepted corpus with 6 documents, 1950 features, 1839 non-zero entries
17-Oct-2016 17:43:56 INFO collection: Accessing model ...
17-Oct-2016 17:43:56 INFO gensim.utils: loading LdaModel object from out_off/corpus_off.lda
17-Oct-2016 17:43:56 INFO gensim.utils: loading id2word recursively from out_off/corpus_off.lda.id2word.* with mmap=None
17-Oct-2016 17:43:56 INFO gensim.utils: setting ignored attribute dispatcher to None
17-Oct-2016 17:43:56 INFO gensim.utils: setting ignored attribute state to None
17-Oct-2016 17:43:56 INFO gensim.utils: loading LdaModel object from out_off/corpus_off.lda.state
17-Oct-2016 17:43:56 INFO collection: Accessing doc_labels ...
17-Oct-2016 17:43:57 INF

In [16]:
# I have a feeling that the error
# is actually quite obvious...
# NOTE(review): this call raises
# "AttributeError: 'LdaModel' object has no attribute 'random_state'".
# Presumably the saved model was written by a different gensim version than
# the one loading it -- confirm against the installed gensim.

vis.make_heatmap()

17-Oct-2016 17:47:55 INFO collection: Loading topic distribution from model ...


AttributeError: 'LdaModel' object has no attribute 'random_state'