# Testing `collection.py`

In [10]:
import collection1

In [11]:
# Input folders handed to create_document_list in the cells below:
# one for the plain-text files, one for the CSV files.
path_txt = "corpus"
path_csv = "corpus_csv"

##### Liste mit Dateinamen erzeugen

In [12]:
# Collect the paths of the files under path_txt (default extension '.txt')
# and preview the first five entries.
doclist_txt = collection1.create_document_list(path_txt)
doclist_txt[:5]

25-Oct-2016 16:03:49 INFO collection: Creating document list from .txt files ...


['corpus/Poe_EurekaAProsePoem.txt',
 'corpus/Howard_TheDevilinIron.txt',
 'corpus/Lovecraft_TheShunnedHouse.txt',
 'corpus/Howard_SchadowsinZamboula.txt',
 'corpus/Doyle_AStudyinScarlet.txt']

In [13]:
# Print the docstring of create_document_list (signature, arguments, return value).
help(collection1.create_document_list)

Help on function create_document_list in module collection1:

create_document_list(path, ext='.txt')
    Creates a list of files with their full path.
    Args:
        path (str): Path to folder, e.g. '/tmp/corpus'.
        ext (str): File extension, e.g. '.csv'. Defaults to '.txt'.
    Returns:
        list[str]: List of files with full path.



##### Corpus laden

In [14]:
# `corpus` is consumed with next() below, so read_from_txt returns an iterator
# rather than a list.
corpus = collection1.read_from_txt(doclist_txt)

# NOTE(review): next() advances `corpus`; any later cell that reuses `corpus`
# (e.g. the segmenter cell) no longer sees this first document.
testdoc = next(corpus)
print(len(testdoc))

25-Oct-2016 16:03:51 INFO collection: Accessing documents ...


230868


##### Segmenter

In [15]:
# Segment the documents from `corpus`; 1000 is presumably the segment length
# (the first segment printed below has length 1000) — confirm against collection1.
# NOTE(review): `corpus` was already advanced by one document in the previous cell.
segments = collection1.segmenter(corpus, 1000)
# `testdoc` is rebound here from the first document to the first segment.
testdoc = next(segments)
print(len(testdoc))

25-Oct-2016 16:04:03 INFO collection: Segmenting documents ...


1000


##### Liste mit CSV-Dateien erzeugen

In [16]:
# Same helper as for the .txt files, but restricted to '.csv' files under path_csv;
# preview the first five entries.
doclist_csv = collection1.create_document_list(path_csv, ext='.csv')
doclist_csv[:5]

25-Oct-2016 16:04:12 INFO collection: Creating document list from .csv files ...


['corpus_csv/Howard_GodsoftheNorth.txt.csv',
 'corpus_csv/Poe_EurekaAProsePoem.txt.csv',
 'corpus_csv/Poe_TheMasqueoftheRedDeath.txt.csv',
 'corpus_csv/Poe_ThePurloinedLetter.txt.csv',
 'corpus_csv/Howard_ShadowsintheMoonlight.txt.csv']

##### CSV-Dateien in einen Dataframe einlesen

In [17]:
# Column names passed to read_from_csv; they match the columns of the table printed below.
columns = ['ParagraphId', 'TokenId', 'Lemma', 'CPOS', 'NamedEntity']

# `csv` is consumed with next(), so it is an iterator over the documents.
# NOTE(review): the name `csv` would shadow the stdlib module if that were ever imported.
csv = collection1.read_from_csv(doclist_csv, columns)
# NOTE(review): next() consumes the first document, so the POS-filter cell that
# reuses `csv` starts at the second one.
testdoc = next(csv)
# NOTE(review): this prints the whole table; a short preview would keep the output small.
print(testdoc)

25-Oct-2016 16:04:14 INFO collection: Accessing documents ...


      ParagraphId  TokenId        Lemma  CPOS NamedEntity
0               0        0         gods    NN           _
1               0        1           of    PP           _
2               0        2          the   ART           _
3               0        3        north    NP           _
4               1        4           by    PP           _
5               1        5       robert    NP           _
6               1        6           e.    NP           _
7               1        7       howard    NP           _
8               2        8            [  CONJ           _
9               2        9  transcriber    NP           _
10              2       10           's     O           _
11              2       11         note    NP           _
12              2       12            :  PUNC           _
13              2       13   originally   ADV           _
14              2       14      publish     V           _
15              2       15           in    PP           _
16            

##### POS-Tags auswählen

In [18]:
# POS tags to keep when filtering the CSV documents.
pos_tags = ['ADJ', 'V', 'NN']

# `csv` is the iterator from the previous cell (its first document was already consumed there).
corpusCSV = collection1.filter_POS_tags(csv, pos_tags)

# NOTE(review): the recorded output of this cell is
# "AttributeError: 'generator' object has no attribute 'next'". This cell only uses
# the builtin next(), so the failing `.next` call is presumably inside
# collection1.filter_POS_tags (Python 2 style iterator call) — verify there.
testdoc = next(corpusCSV)
print(len(testdoc))

25-Oct-2016 16:04:28 INFO collection: Accessing selected POS-tags ...


AttributeError: 'generator' object has no attribute 'next'

In [20]:
# Build document labels from the .txt paths; list() materialises the result for display.
# The recorded output shows the bare file names without the folder prefix.
labels = collection1.get_labels(doclist_txt)
list(labels)

25-Oct-2016 16:04:44 INFO collection: Creating document labels ...


['Poe_EurekaAProsePoem.txt',
 'Howard_TheDevilinIron.txt',
 'Lovecraft_TheShunnedHouse.txt',
 'Howard_SchadowsinZamboula.txt',
 'Doyle_AStudyinScarlet.txt',
 'Poe_TheCaskofAmontillado.txt',
 'Poe_TheMasqueoftheRedDeath.txt',
 'Howard_GodsoftheNorth.txt',
 'Kipling_TheEndofthePassage.txt',
 'Doyle_TheSignoftheFour.txt',
 'Kipling_TheJungleBook.txt',
 'Doyle_AScandalinBohemia.txt',
 'Poe_ThePurloinedLetter.txt',
 'Lovecraft_AttheMountainofMadness.txt',
 'Kipling_ThyServantaDog.txt',
 'Howard_ShadowsintheMoonlight.txt',
 'Doyle_TheHoundoftheBaskervilles.txt']

In [None]:
# NOTE(review): neither `makeCounter` nor `doclist` is defined in any visible cell
# (the notebook only defines `doclist_txt` / `doclist_csv` and imports `collection1`),
# so this cell raises NameError on a fresh kernel — confirm the intended call.
mycounter = makeCounter(doclist)

##### Visualisierung

In [None]:
# Paths to the saved Gensim artifacts the visualization is built from.
# The `_path` suffix keeps these strings from overwriting `corpus`, which earlier
# cells bind to the document iterator.
lda_model_path = 'out_off/corpus_off.lda'
corpus_path = 'out_off/corpus_off.mm'
dictionary_path = 'out_off/corpus_off.dict'
doc_labels_path = 'out_off/corpus_off_doclabels.txt'
interactive = False  # build the static (non-interactive) variant

# The notebook imports `collection1`; a bare `collection` is never imported and
# would raise NameError on a fresh kernel.
# NOTE(review): assumes Visualization is defined in collection1 — confirm.
vis = collection1.Visualization(
    lda_model_path, corpus_path, dictionary_path, doc_labels_path, interactive
)

##### Heatmap

In [None]:
# Draw the heatmap from the Visualization object created above (interactive=False).
vis.make_heatmap()

##### Interactive

In [None]:
# Paths to the saved Gensim artifacts the visualization is built from.
# The `_path` suffix keeps these strings from overwriting `corpus`, which earlier
# cells bind to the document iterator.
lda_model_path = 'out_off/corpus_off.lda'
corpus_path = 'out_off/corpus_off.mm'
dictionary_path = 'out_off/corpus_off.dict'
doc_labels_path = 'out_off/corpus_off_doclabels.txt'
interactive = True  # build the interactive variant this time

# The notebook imports `collection1`; a bare `collection` is never imported and
# would raise NameError on a fresh kernel.
# NOTE(review): assumes Visualization is defined in collection1 — confirm.
vis = collection1.Visualization(
    lda_model_path, corpus_path, dictionary_path, doc_labels_path, interactive
)

In [None]:
# Keep the result under its own name: `interactive` is the boolean flag passed to
# Visualization in the previous cell and should not be rebound to another kind of object.
interactive_vis = vis.make_interactive()

In [None]:
# NOTE(review): `path` is an empty string — presumably meaning the current working
# directory; confirm how save_interactive interprets it.
path = ""
vis.save_interactive(path)