In [59]:
from dariah_topics import preprocessing as pre
from dariah_topics import visualization as visual
from dariah_topics import evaluation

## Preprocessing

#### Liste mit Dateinamen erzeugen

In [60]:
path_txt = "corpus_txt"
#path_txt = "grenzbote_plain/*/"
#path_txt = "wiki/"

doclist_txt = pre.create_document_list(path_txt)
assert doclist_txt, "No documents found"
doclist_txt[:5]

INFO preprocessing: Creating document list from TXT files ...


['corpus_txt/Lovecraft_AttheMountainofMadness.txt',
 'corpus_txt/Howard_TheDevilinIron.txt',
 'corpus_txt/Poe_ThePurloinedLetter.txt',
 'corpus_txt/Lovecraft_TheShunnedHouse.txt',
 'corpus_txt/Poe_TheMasqueoftheRedDeath.txt']

#####  Liste mit Dokumentenlabels erzeugen - (Funktion wird durch Thorsten's generischere Funktion ersetzt)

In [61]:
doc_labels = list(pre.get_labels(doclist_txt))
doc_labels[:5]

INFO preprocessing: Creating document labels ...


['Lovecraft_AttheMountainofMadness',
 'Howard_TheDevilinIron',
 'Poe_ThePurloinedLetter',
 'Lovecraft_TheShunnedHouse',
 'Poe_TheMasqueoftheRedDeath']

#### Corpus laden

In [62]:
corpus_txt = pre.read_from_txt(doclist_txt)

#### Segmentieren

In [70]:
segments = pre.segment(corpus_txt, 1000)
segments


<generator object segment_fuzzy at 0x7f77c36095c8>

#### Tokenisieren

In [5]:
doc_tokens = [list(pre.tokenize(txt)) for txt in list(corpus_txt)]

INFO preprocessing: Accessing TXT documents ...


#### Create Dictionaries

In [51]:
id_types = pre.create_dictionary(doc_tokens)
doc_ids = pre.create_dictionary(doc_labels)
id_types

{'afeared': 1,
 'hones': 4198,
 'men-folk': 2,
 'shuts': 4,
 'slaty-blue': 5,
 'creep': 6,
 'feat': 7,
 'brumby': 10,
 'muscled': 11,
 'ax': 12,
 'mention': 13,
 'finger-tips': 14,
 'sloppy': 11143,
 'calf': 16,
 'rascals': 17,
 'formless': 11142,
 'beast': 18,
 'stone-still': 19,
 'pyramids': 21,
 'globe': 22,
 'datum': 23,
 'donna': 24,
 'saddles': 25,
 'indulged': 26,
 'convolvuluses': 27,
 'fitful': 28,
 'unhurt': 29,
 'ugh': 30,
 'scandalous': 31,
 'wented-past': 32,
 'tinted': 33,
 'sill': 34,
 'represented': 18920,
 'bartholomew': 7413,
 'jewels': 35,
 'stenches': 36,
 'tarry': 37,
 'dot': 38,
 'half-luminous': 39,
 'much-discussed': 40,
 'fog-wreaths': 41,
 'acquiring': 42,
 'starving': 43,
 'habitual': 44,
 'uniformly': 45,
 'ratlook': 48,
 'abruptness': 47,
 'employer': 49,
 'glad': 50,
 'ran': 51,
 'satellites': 52,
 'vapor': 53,
 'half-strangling': 54,
 'chemical': 56,
 'wedding': 57,
 'selfish': 58,
 'misfortunes': 59,
 'tending': 60,
 'wah': 61,
 'borders': 62,
 'visage':

#### Sparse BOW Model

In [8]:
sparse_bow = pre.create_sparse_bow(doc_labels, doc_tokens, id_types, doc_ids)

In [50]:
sparse_bow

Unnamed: 0_level_0,Unnamed: 1_level_0,0
doc_id,token_id,Unnamed: 2_level_1
1,13639,2
1,8195,1
1,16388,1
1,6,1
1,12289,1
1,16392,1
1,9,1
1,5463,1
1,8204,1
1,16397,1


## Topic Modeling with Gensim

##### Saving Sparse BOW

In [10]:
pre.save_sparse_bow(sparse_bow, "gensim_txt")

#### Import Market Matrix

In [11]:
from gensim.models import LdaModel
from gensim.corpora import MmCorpus
from gensim.corpora import SvmLightCorpus

INFO summa.preprocessing.cleaner: 'pattern' package not found; tag filters are not available for English


In [12]:
mm = MmCorpus("gensim_txt.mm")

INFO gensim.matutils: initializing corpus reader from gensim_txt.mm
INFO gensim.matutils: accepted corpus with 17 documents, 22232 features, 369906 non-zero entries


In [13]:
corpus = MmCorpus('gensim_txt.mm')
SvmLightCorpus.serialize('corpus.svmlight', corpus)

INFO gensim.matutils: initializing corpus reader from gensim_txt.mm
INFO gensim.matutils: accepted corpus with 17 documents, 22232 features, 369906 non-zero entries
INFO gensim.corpora.svmlightcorpus: converting corpus to SVMlight format: corpus.svmlight
INFO gensim.corpora.indexedcorpus: saving SvmLightCorpus index to corpus.svmlight.index


In [14]:
doc2id = {value : key for key, value in doc_ids.items()}
type2id = {value : key for key, value in id_types.items()}

##### Remove Features

type2id = {value : key for key, value in id_types.items()}
sparse_bow_collapsed = sparse_bow.groupby(sparse_bow.index.get_level_values('token_id')).sum()

sparse_bow_hapax = sparse_bow_collapsed.loc[sparse_bow_collapsed[0] == 1]
hapax = [type2id[key] for key in sparse_bow_hapax.index.get_level_values('token_id')]

len(hapax)

In [15]:
import os.path
basepath = os.path.abspath('.')

with open(os.path.join(basepath, "tutorial_supplementals", "stopwords", "de.txt"), 'r', encoding = 'utf-8') as f: 
    stopword_list = f.read().split('\n')
    
stopword_list = set(stopword_list)

In [16]:
hapax_from_remove = pre.find_hapax(sparse_bow, id_types)
stopwords_from_remove = pre.find_stopwords(sparse_bow, id_types, mfw=75)

#features_to_be_removed = set(hapax_from_remove + stopwords_from_remove)
features_to_be_removed = stopwords_from_remove

sparse_bow_short = pre.remove_features(sparse_bow, id_types, features_to_be_removed)

INFO preprocessing: Finding hapax legomena ...
INFO preprocessing: Finding stopwords ...
INFO preprocessing: Removing features ...


In [17]:
len(features_to_be_removed)

75

In [18]:
pre.save_sparse_bow(sparse_bow_short, "gensim_txt")

In [19]:
mm = MmCorpus("gensim_txt.mm")

INFO gensim.matutils: initializing corpus reader from gensim_txt.mm
INFO gensim.matutils: accepted corpus with 17 documents, 22232 features, 192053 non-zero entries


#### Convert sparse_bow to list of (doc, tokens) tuples (like doc2bow)

In [20]:
doc2bow_list = []

for doc in sparse_bow_short.index.groupby(sparse_bow_short.index.get_level_values('doc_id')):
    temp = [(token, count) for token, count in zip(sparse_bow_short.loc[doc].index, sparse_bow_short.loc[doc][0])]
    doc2bow_list.append(temp)

In [57]:
doc2bow_list[0]

for

Series([], Name: 0, dtype: int64)

In [22]:
model = LdaModel(corpus=mm, id2word=type2id, num_topics=20, passes = 10, iterations = 20)

INFO gensim.models.ldamodel: using symmetric alpha at 0.05
INFO gensim.models.ldamodel: using symmetric eta at 4.49781855799937e-05
INFO gensim.models.ldamodel: using serial LDA version on this node
INFO gensim.models.ldamodel: running online LDA training, 20 topics, 10 passes over the supplied corpus of 17 documents, updating model once every 17 documents, evaluating perplexity every 17 documents, iterating 20x with a convergence threshold of 0.001000
INFO gensim.models.ldamodel: -14.378 per-word bound, 21297.8 perplexity estimate based on a held-out corpus of 17 documents with 192053 words
INFO gensim.models.ldamodel: PROGRESS: pass 0, at document #17/17
INFO gensim.models.ldamodel: topic #2 (0.050): 0.003*"wayfarers" + 0.003*"step-daughter" + 0.003*"grouping" + 0.003*"faster" + 0.003*"ape" + 0.003*"frenzied" + 0.003*"spies" + 0.003*"streaked" + 0.003*"loves" + 0.003*"rigging"
INFO gensim.models.ldamodel: topic #5 (0.050): 0.003*"scuffling" + 0.003*"envisage" + 0.003*"ape" + 0.003*"a

KeyboardInterrupt: 

In [None]:
model.get_document_topics(doc2bow_list[1])

In [None]:
#Anzeige der keywords für topic n
n = 1
topic_nr_x = model.get_topic_terms(n)

topicTerms = [type2id[i[0]] for i in topic_nr_x]
topicTerms

In [None]:
#Topic probabilities - zum Testen der Funktion
topics = model.show_topics(num_topics = model.num_topics, formatted=False)
keywords = []

num_topics = model.num_topics
topics_df = pre.pd.DataFrame(index = range(num_topics), columns= range(10))

for topic, values in topics:
    keyword = [value[0] for value in values]
    topics_df.loc[topic] = keyword

topics_df

In [None]:
topics_df = pre.gensim2dataframe(model)
topics_df

In [None]:
doc_topic = topics_df.T
doc_topic

## Visualisierung 

In [None]:
'''lda_model = 'out_easy/corpus.lda'
corpus = 'out_easy/corpus.mm'
dictionary = 'out_easy/corpus.dict'
doc_labels = 'out_easy/corpus_doclabels.txt'
interactive  = False

vis = visual.Visualization(lda_model, corpus, dictionary, doc_labels, interactive)'''

##### Create a document-topic matrix (that is a pandas data frame actually)

In [None]:
doc_topic = visual.create_doc_topic(mm, model, doc_labels)

##### Visualize document-topic matrix

In [None]:
%matplotlib inline
visual.doc_topic_heatmap(doc_topic)

##### Visualize topic distribution in a single document

In [None]:
visual.plot_doc_topics(doc_topic, 0)

In [None]:
items = model.show_topic(1)
keywords = [item[0] for item in items]
keywords
items = model.show_topic(1)
propabilities = [item[1] for item in items]
propabilities
items

In [None]:
topic_nr = 3
number_of_keywords = 10
visual.show_wordle_for_topic(model, topic_nr, number_of_keywords)

![success](http://cdn2.hubspot.net/hub/128506/file-446943132-jpg/images/computer_woman_success.jpg)