# Testing `collection.py`

In [2]:
import collection

In [3]:
# Corpus locations: the plain-text documents and their CSV counterparts.
# NOTE(review): these are absolute, machine-specific paths; adjust them to your checkout.
path_txt, path_csv = (
    "/users/severin/git/Topics/corpus",
    "/users/severin/git/Topics/corpus_csv",
)

##### Create a list of file names

In [4]:
# Collect the full paths of the files found under `path_txt`; according to the
# function's help text the extension filter defaults to '.txt'.
doclist_txt = collection.create_document_list(path_txt)
# Show only the first five entries to keep the output short.
doclist_txt[:5]

['/users/severin/git/Topics/corpus/Doyle_AScandalinBohemia.txt',
 '/users/severin/git/Topics/corpus/Doyle_AStudyinScarlet.txt',
 '/users/severin/git/Topics/corpus/Doyle_TheHoundoftheBaskervilles.txt',
 '/users/severin/git/Topics/corpus/Doyle_TheSignoftheFour.txt',
 '/users/severin/git/Topics/corpus/Howard_GodsoftheNorth.txt']

In [5]:
# Print the signature and docstring of create_document_list for reference.
# NOTE(review): the printed docstring documents a `suffix` argument, while the
# signature names that parameter `ext` — the docstring in collection.py looks stale.
help(collection.create_document_list)

Help on function create_document_list in module collection:

create_document_list(path, ext='.txt')
    Creates a list of files with their full path.
    
    Args:
        path (str): Path to folder, e.g. '/tmp/corpus'.
        suffix (str): File extension, e.g. '.csv'. Defaults to '.txt'.
    
    Returns:
        list[str]: List of files with full path.



##### Load the corpus

In [6]:
# Build a ReadFromTXT object from the list of .txt paths.
corpus = collection.ReadFromTXT(doclist_txt)
# Inspect the instance attributes; the recorded output shows the file list stored as `doclist`.
corpus.__dict__

{'doclist': ['/users/severin/git/Topics/corpus/Doyle_AScandalinBohemia.txt',
  '/users/severin/git/Topics/corpus/Doyle_AStudyinScarlet.txt',
  '/users/severin/git/Topics/corpus/Doyle_TheHoundoftheBaskervilles.txt',
  '/users/severin/git/Topics/corpus/Doyle_TheSignoftheFour.txt',
  '/users/severin/git/Topics/corpus/Howard_GodsoftheNorth.txt',
  '/users/severin/git/Topics/corpus/Howard_SchadowsinZamboula.txt',
  '/users/severin/git/Topics/corpus/Howard_ShadowsintheMoonlight.txt',
  '/users/severin/git/Topics/corpus/Howard_TheDevilinIron.txt',
  '/users/severin/git/Topics/corpus/Kipling_TheEndofthePassage.txt',
  '/users/severin/git/Topics/corpus/Kipling_TheJungleBook.txt',
  '/users/severin/git/Topics/corpus/Kipling_ThyServantaDog.txt',
  '/users/severin/git/Topics/corpus/Lovecraft_AttheMountainofMadness.txt',
  '/users/severin/git/Topics/corpus/Lovecraft_TheShunnedHouse.txt',
  '/users/severin/git/Topics/corpus/Poe_EurekaAProsePoem.txt',
  '/users/severin/git/Topics/corpus/Poe_TheCaskof

##### Segmenter

In [7]:
# One Segmenter per corpus document, created with 1000 as its second argument
# (presumably the segment size — confirm in collection.py).
segments = [collection.Segmenter(text, 1000) for text in list(corpus)]

# Preview of the second document: the first 7 items of each of its first 8 segments.
for segment in list(segments[1])[:8]:
    print(segment[:7])

['A', 'STUDY', 'IN', 'SCARLET.', 'By', 'A.', 'Conan']
['to', 'go', 'halves', 'with', 'him', 'in', 'some']
['small', 'quantity', 'of', 'blood', 'to', 'a', 'litre']
['large', 'airy', 'sitting-room,', 'cheerfully', 'furnished,', 'and', 'illuminated']
['the', 'moon', 'it', 'would', 'not', 'make', 'a']
['well', 'consider', 'him', 'as', 'a', 'necromancer.', '"From']
["hour's", 'silence', 'is', 'really', 'very', 'showy', 'and']
['are', 'marks', 'of', 'blood', 'in', 'the', 'room,']


##### Create a list of CSV files

In [8]:
# Same listing as for the text files, but restricted to '.csv' files under `path_csv`.
doclist_csv = collection.create_document_list(path_csv, ext='.csv')
# Preview the first five paths.
doclist_csv[:5]

['/users/severin/git/Topics/corpus_csv/Doyle_AStudyinScarlet.txt.csv',
 '/users/severin/git/Topics/corpus_csv/Doyle_TheHoundoftheBaskervilles.txt.csv',
 '/users/severin/git/Topics/corpus_csv/Doyle_TheSignoftheFour.txt.csv',
 '/users/severin/git/Topics/corpus_csv/Howard_GodsoftheNorth.txt.csv',
 '/users/severin/git/Topics/corpus_csv/Howard_SchadowsinZamboula.txt.csv']

##### Select POS tags

In [9]:
# POS tags handed to FilterPOS — presumably the tags to keep (confirm in collection.py).
pos_tags = ['ADJ', 'V', 'NN']

# Create the POS filter over the CSV files.
corpusCSV = collection.FilterPOS(doclist_csv, pos_tags)
# Inspect the instance attributes; the recorded output shows `columns`, `doc` and `files`.
corpusCSV.__dict__

{'columns': ['ParagraphId', 'TokenId', 'Lemma', 'CPOS', 'NamedEntity'],
 'doc': Empty DataFrame
 Columns: []
 Index: [],
 'files': ['/users/severin/git/Topics/corpus_csv/Doyle_AStudyinScarlet.txt.csv',
  '/users/severin/git/Topics/corpus_csv/Doyle_TheHoundoftheBaskervilles.txt.csv',
  '/users/severin/git/Topics/corpus_csv/Doyle_TheSignoftheFour.txt.csv',
  '/users/severin/git/Topics/corpus_csv/Howard_GodsoftheNorth.txt.csv',
  '/users/severin/git/Topics/corpus_csv/Howard_SchadowsinZamboula.txt.csv',
  '/users/severin/git/Topics/corpus_csv/Howard_ShadowsintheMoonlight.txt.csv',
  '/users/severin/git/Topics/corpus_csv/Howard_TheDevilinIron.txt.csv',
  '/users/severin/git/Topics/corpus_csv/Kipling_TheEndofthePassage.txt.csv',
  '/users/severin/git/Topics/corpus_csv/Kipling_TheJungleBook.txt.csv',
  '/users/severin/git/Topics/corpus_csv/Kipling_ThyServantaDog.txt.csv',
  '/users/severin/git/Topics/corpus_csv/Lovecraft_AttheMountainofMadness.txt.csv',
  '/users/severin/git/Topics/corpus_csv

In [10]:
# Fetch the document labels; in the recorded output each label is a CSV file
# name without its directory.
labels = corpusCSV.get_labels()
# Wrapped in list() so that all labels are displayed at once
# (presumably get_labels() yields them lazily — TODO confirm).
list(labels)

['Doyle_AStudyinScarlet.txt.csv',
 'Doyle_TheHoundoftheBaskervilles.txt.csv',
 'Doyle_TheSignoftheFour.txt.csv',
 'Howard_GodsoftheNorth.txt.csv',
 'Howard_SchadowsinZamboula.txt.csv',
 'Howard_ShadowsintheMoonlight.txt.csv',
 'Howard_TheDevilinIron.txt.csv',
 'Kipling_TheEndofthePassage.txt.csv',
 'Kipling_TheJungleBook.txt.csv',
 'Kipling_ThyServantaDog.txt.csv',
 'Lovecraft_AttheMountainofMadness.txt.csv',
 'Lovecraft_TheShunnedHouse.txt.csv',
 'Poe_EurekaAProsePoem.txt.csv',
 'Poe_TheCaskofAmontillado.txt.csv',
 'Poe_TheMasqueoftheRedDeath.txt.csv',
 'Poe_ThePurloinedLetter.txt.csv']

##### Lemma

In [11]:
# Obtain the lemma results for the CSV files.
lemma = corpusCSV.get_lemma()
# Display only the first element.
# NOTE(review): list(lemma) materializes every element before slicing; this may be
# slow on a large corpus.
list(lemma)[:1]

[37       typographical
 56             textual
 59              square
 72                 old
 75                such
 80             present
 112           original
 122           original
 139              ascii
 147            latin-1
 154            present
 161             french
 163            spanish
 169             proper
 294             second
 320               deep
 334              other
 340               same
 365                new
 406              fatal
 430         subclavian
 442          murderous
 458            orderly
 475            british
 483               weak
 486          prolonged
 499              great
 525               able
 537             little
 548            enteric
              ...      
 50767     irresistible
 50813           sudden
 50817           likely
 50833            least
 50870         original
 50878             arab
 50879        detective
 50918            fresh
 50933       unexpected
 50973            whole
 50979          

##### Visualization

In [12]:
# Locations of the saved Gensim output that the visualization reads: the LDA
# model, the serialized corpus (.mm), the dictionary and the document labels.
# The names carry a `_path` suffix because they hold file paths, not loaded
# objects, and so that the `corpus` collection created earlier in the notebook
# is not overwritten by a string.
lda_model_path = 'out_off/corpus_off.lda'
corpus_path = 'out_off/corpus_off.mm'
dictionary_path = 'out_off/corpus_off.dict'
doc_labels_path = 'out_off/corpus_off_doclabels.txt'

# Boolean flag handed to Visualization as its last argument; False here.
interactive = False

vis = collection.Visualization(
    lda_model_path,
    corpus_path,
    dictionary_path,
    doc_labels_path,
    interactive,
)

17-Oct-2016 16:25:51 INFO collection: Accessing corpus ...
17-Oct-2016 16:25:51 INFO gensim.corpora.indexedcorpus: loaded corpus index from out_off/corpus_off.mm.index
17-Oct-2016 16:25:51 INFO gensim.matutils: initializing corpus reader from out_off/corpus_off.mm
17-Oct-2016 16:25:51 INFO gensim.matutils: accepted corpus with 6 documents, 1950 features, 1839 non-zero entries
17-Oct-2016 16:25:51 INFO collection: Accessing model ...
17-Oct-2016 16:25:51 INFO gensim.utils: loading LdaModel object from out_off/corpus_off.lda
17-Oct-2016 16:25:51 INFO gensim.utils: loading id2word recursively from out_off/corpus_off.lda.id2word.* with mmap=None
17-Oct-2016 16:25:51 INFO gensim.utils: setting ignored attribute dispatcher to None
17-Oct-2016 16:25:51 INFO gensim.utils: setting ignored attribute state to None
17-Oct-2016 16:25:51 INFO gensim.utils: loading LdaModel object from out_off/corpus_off.lda.state
17-Oct-2016 16:25:51 INFO collection: Accessing doc_labels ...
17-Oct-2016 16:25:51 INF

##### Heatmap

##### Interactive

In [None]:
# Build the visualization with the interactive flag switched on.
# The flag is a real boolean (the non-interactive example passes `False`, not a
# string), and the dictionary path uses the `.dict` extension of the saved file.
interactive = True
visual = collection.Visualization(
    "out_off/corpus_off.lda",
    "out_off/corpus_off.mm",
    "out_off/corpus_off.dict",
    "out_off/corpus_off_doclabels.txt",
    interactive,
)

In [None]:
# Unpack the three values returned by loadGensimOutput(). Distinct names are used
# so that the `corpus` collection loaded earlier in the notebook is not overwritten
# by an object of a different kind.
gensim_corpus, gensim_model, gensim_dictionary = visual.loadGensimOutput()
# Display the loaded dictionary.
gensim_dictionary