In [None]:
from dariah_topics import preprocessing
from dariah_topics import postprocessing
from dariah_topics import visualization
import metadata_toolbox.utils as metadata
from pathlib import Path
import pandas as pd
import lda

In [None]:
# File name scheme of the corpus; handed to `fname2metadata` as `pattern` below.
pattern = '{author}_{year}_{title}'
# Directory containing the sample corpus (relative to the notebook).
path_to_corpus = Path('grenzboten_sample')

In [None]:
# Derive one metadata record per '*.txt' file from its file name (according to
# `pattern`) and stack the records into a single table.
# The paths are sorted because directory listings come back in arbitrary,
# platform-dependent order; a fixed order keeps the rows of `meta`, and every
# table built from it further down, identical between runs and machines.
meta = pd.concat(
    [metadata.fname2metadata(text_file, pattern=pattern)
     for text_file in sorted(path_to_corpus.glob('*.txt'))]
)
meta[:5]  # by adding '[:5]' to the variable, only the first 5 elements will be printed

In [None]:
# Materialize everything `read_from_pathlist` yields for the metadata index
# into a list, so the documents can be indexed and iterated repeatedly.
corpus = [document for document in preprocessing.read_from_pathlist(meta.index)]
corpus[0][:255]  # show only the first 255 characters of the first document

In [None]:
# Tokenize each document; the result is one list of tokens per document.
tokenized_corpus = [
    [token for token in preprocessing.tokenize(text)]
    for text in corpus
]
tokenized_corpus[0][:13]  # first 13 tokens of the first document

In [None]:
# Build the document-term matrix from the tokenized documents. The titles from
# the metadata table are passed as the second argument (presumably used as
# document labels -- confirm against `create_document_term_matrix`).
document_term_matrix = preprocessing.create_document_term_matrix(
    tokenized_corpus,
    meta['title'],
)
document_term_matrix[:5]  # preview limited to the first 5 entries

In [None]:
# Corpus-specific stopword candidates, requested with a cut-off of 100
# most frequent tokens.
stopwords = preprocessing.find_stopwords(
    document_term_matrix,
    most_frequent_tokens=100,
)

In [None]:
# Collect the hapax legomena of the corpus and report how many there are
# compared to the total number of types (columns of the matrix).
hapax_legomena = preprocessing.find_hapax_legomena(document_term_matrix)
print(
    "Total number of types in corpus:",
    len(document_term_matrix.columns),
)
print(
    "Total number of hapax legomena:",
    len(hapax_legomena),
)

In [None]:
# External stopword list: one entry per line, UTF-8 encoded.
path_to_stopwordlist = 'tutorial_supplementals/stopwords/de.txt'
# Read inside a `with` block so the file handle is closed as soon as the list
# is built, instead of staying open for the lifetime of the kernel.
with open(path_to_stopwordlist, 'r', encoding='utf-8') as stopword_file:
    # Strip the trailing newline (and any surrounding whitespace) of each line.
    external_stopwords = [line.strip() for line in stopword_file]

In [None]:
# Everything that is to be removed from the matrix: frequent tokens, hapax
# legomena and the entries of the external stopword list.
features = (
    stopwords
    + hapax_legomena
    + external_stopwords
)
# NOTE: this rebinds `document_term_matrix`, so running the cell a second time
# operates on the already reduced matrix rather than on the original one.
document_term_matrix = preprocessing.remove_features(
    features,
    document_term_matrix=document_term_matrix,
)

In [None]:
# Column labels of the document-term matrix; passed as `vocabulary` to the
# postprocessing functions further down.
vocabulary = document_term_matrix.columns
vocabulary

In [None]:
# Convert the document-term matrix to a plain integer NumPy array for the
# model fit. `.values` is used instead of `DataFrame.as_matrix()`, which was
# deprecated in pandas 0.23 and removed in pandas 1.0; `.values` yields the
# same array and is available in old and new pandas versions alike.
document_term_matrix_arr = document_term_matrix.values.astype(int)
document_term_matrix_arr

In [None]:
%%time

model = lda.LDA(n_topics=10, n_iter=1000)
model.fit(document_term_matrix_arr)

In [None]:
# Summarize the fitted model's topics using the matrix vocabulary.
topics = postprocessing.show_topics(
    model=model,
    vocabulary=vocabulary,
)
topics

In [None]:
# Relate documents to topics; the document titles from the metadata table
# serve as document labels.
document_topics = postprocessing.show_document_topics(
    model=model,
    topics=topics,
    document_labels=meta['title'],
)
document_topics

In [None]:
# Request 30 keys with their weights for topic number 1, not sorted in
# ascending order; the result feeds the word cloud below.
topic_key_weights = postprocessing.show_topic_key_weights(
    topic_no=1,
    num_keys=30,
    model=model,
    vocabulary=vocabulary,
    sort_ascending=False,
)
topic_key_weights[:5]  # preview limited to the first 5 entries

# Visualizations

The `visualization` module provides one function and one class:
* `plot_wordcloud()` draws a word cloud from the weighted keys of a topic.
* `PlotDocumentTopics` visualizes everything related to document-topic proportions.

### `plot_wordcloud()`

In [None]:
# Called once before the plotting cells below.
# NOTE(review): presumably prepares the notebook for inline rendering of the
# figures -- confirm against `dariah_topics.visualization`.
visualization.notebook_handling()

In [None]:
# Word cloud of the topic keys computed above, on a white 1500x500 canvas,
# with notebook output enabled.
wordcloud = visualization.plot_wordcloud(
    weights=topic_key_weights,
    background_color='white',
    width=1500,
    height=500,
    enable_notebook=True,
)

### `PlotDocumentTopics`

In [None]:
# Plotting object wrapping the document-topic table. Note that the instance is
# bound to the name `PlotDocumentTopics`; all plotting cells below call
# methods on this instance.
PlotDocumentTopics = visualization.PlotDocumentTopics(
    document_topics,
    enable_notebook=True,
)

### `static_heatmap`

In [None]:
# Static heatmap of the document-topic table, drawn without a colorbar.
static_heatmap = PlotDocumentTopics.static_heatmap(
    colorbar=False,
)

### `static_barchart_per_topic`

In [None]:
# Static bar chart for a single topic, selected by position.
# Instead of a position, `index` may also be a topic label,
# e.g. index='abgewiesen südlich genommen'.
static_barchart_per_topic = PlotDocumentTopics.static_barchart_per_topic(
    index=0,
    describer='Topic',
    alpha=None,
    figsize=(11, 7),
)

### `static_barchart_per_document`

In [None]:
# Static bar chart for a single document, selected by position.
static_barchart_per_document = PlotDocumentTopics.static_barchart_per_document(
    index=0,
    describer='Document',
)

### `interactive_heatmap`

In [None]:
# Interactive heatmap of the document-topic table (800x550, no colorbar).
interactive_heatmap = PlotDocumentTopics.interactive_heatmap(
    width=800,
    height=550,
    colorbar=False,
)

### `interactive_barchart_per_topic`

In [None]:
# Interactive bar chart for a single topic, selected by position.
interactive_barchart_per_topic = PlotDocumentTopics.interactive_barchart_per_topic(
    index=0,
    describer='Topic',
    width=800,
)

### `interactive_barchart_per_document`

In [None]:
# Interactive bar chart for a single document, selected by position.
interactive_barchart_per_document = PlotDocumentTopics.interactive_barchart_per_document(
    index=0,
    describer='Document',
    width=800,
)

### `to_file`

```python
@staticmethod
def to_file(fig, filename):
    """Write a figure to ``filename``.

    Bokeh figures are dispatched on the extension of ``filename``:
    ``.png`` is passed to ``export_png``, ``.svg`` switches the figure's
    output backend to ``'svg'`` and is passed to ``export_svgs``, and
    ``.html`` calls ``output_file``. A Bokeh figure with any other
    extension is silently left unwritten. Matplotlib figures are written
    with ``fig.savefig``. Objects of any other type are ignored.

    Args:
        fig: A Bokeh or matplotlib figure object.
        filename: Target file name; for Bokeh figures its extension selects
            the export format.
    """
    # NOTE(review): `os`, `export_png`, `export_svgs` and `output_file` are not
    # imported in this excerpt -- presumably they are module-level imports in
    # the library; confirm before copying this snippet elsewhere.
    import matplotlib
    import bokeh
    if isinstance(fig, bokeh.plotting.figure.Figure):
        ext = os.path.splitext(filename)[1]
        if ext == '.png':
            export_png(fig, filename)
        elif ext == '.svg':
            fig.output_backend = 'svg'
            export_svgs(fig, filename)
        elif ext == '.html':
            output_file(filename)
    elif isinstance(fig, matplotlib.figure.Figure):
         fig.savefig(filename)
```