In [None]:
from datenguidepy.query_execution import QueryExecutioner

In [None]:
# Instantiate the datenguidepy QueryExecutioner and request the type info for
# 'Region'. The result's `fields` attribute is read later in this notebook as
# a mapping from field name to a record exposing .get('description').
qe = QueryExecutioner()
rs = qe.get_type_info('Region')

In [None]:
# Map every field of the type info to its description, leaving out the
# 'id' and 'name' fields.
excluded_fields = ('id', 'name')
descr = {
    field_name: rs.fields[field_name].get('description')
    for field_name in rs.fields
    if field_name not in excluded_fields
}

In [None]:
# Load the German stop-word list (read line by line, one entry per line).
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, and any non-ASCII characters (German stop words
# presumably contain umlauts) would be decoded differently per machine and
# silently stop matching tokens.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open('stopwords_german', encoding='utf-8') as f:
    sw_de_raw = f.readlines()

# Strip surrounding whitespace/newlines and drop blank lines so that no empty
# string ends up in the stop-word list.
sw_de = [w.strip() for w in sw_de_raw if w.strip()]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of each topic of a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (each must support ``argsort()``).
        feature_names: Sequence indexed by term position that yields the
            term's string.
        no_top_words: Number of terms to print per topic.

    For every topic a header line ``Topic <index>:`` is printed, followed by
    one line with the top terms separated by spaces, heaviest term first.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        # argsort() is ascending; slicing backwards from the end yields the
        # indices of the `no_top_words` largest weights, largest first.
        ranked_term_ids = weights.argsort()[:-no_top_words - 1:-1]
        top_terms = (feature_names[term_id] for term_id in ranked_term_ids)
        print(" ".join(top_terms))

# Corpus: one document per field description.
# Alternative corpus kept for experiments:
# dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
# documents = dataset.data
documents = list(descr.values())

# NOTE(review): custom_stop_words is defined here but is not passed to either
# vectorizer below; only sw_de is used as the stop-word list.
custom_stop_words = [
    'personen', 'bevölkerung', 'indikator', 'anteil', '2011',
    'begriffsinhalt', 'erläuterung', 'alter', 'daten', 'mai',
]

no_features = 1000

# Settings common to both vectorizers.
shared_vectorizer_kwargs = dict(
    max_df=0.95,
    min_df=2,
    max_features=no_features,
    stop_words=sw_de,
)

# NMF is able to work on tf-idf weights.
tfidf_vectorizer = TfidfVectorizer(**shared_vectorizer_kwargs)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA is a probabilistic graphical model and therefore needs raw term counts.
# The token pattern only keeps tokens made of four or more word characters.
tf_vectorizer = CountVectorizer(
    token_pattern='(?u)\\b\\w\\w\\w\\w+\\b',
    **shared_vectorizer_kwargs
)
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 20

# Fit NMF on the tf-idf matrix.
nmf = NMF(
    n_components=no_topics,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
    init='nndsvd',
).fit(tfidf)

# Fit LDA on the raw count matrix.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=10,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf)

no_top_words = 10
# Uncomment to print the top words of each topic:
# display_topics(nmf, tfidf_feature_names, no_top_words)
# display_topics(lda, tf_feature_names, no_top_words)

In [None]:
# Pair every statistic key with its most probable LDA topic and its
# description. Rows of `tf` were built from list(descr.values()), so they line
# up with descr.keys() / descr.values().
# The ndarray method argmax is used so this cell does not depend on numpy
# having been imported as `np`.
# The result is materialised as a list (not a one-shot zip iterator) so that
# cells consuming `gen` can be re-run without getting an empty result.
gen = list(zip(descr.keys(), lda.transform(tf).argmax(axis=1), descr.values()))

In [None]:
import pandas as pd
# Collect the (statistic, topic, description) triples into a DataFrame.
# list(gen) drains `gen` if it is a one-shot iterator.
topic_rows = list(gen)
df = pd.DataFrame(topic_rows, columns=['statistic', 'topic', 'description'])

In [None]:
# Show the rows whose assigned topic is 1.
df.query('topic == 1')

In [None]:
# Total occurrences of each vocabulary term across all documents:
# sum the count matrix over its rows and flatten the result to a 1-D array.
term_totals = tf.sum(axis=0).getA().squeeze()
# One (count, word) row per vocabulary term.
word_rows = list(zip(term_totals, tf_feature_names))
words = pd.DataFrame(word_rows, columns=['count', 'word'])

In [None]:
# Show the ten words with the highest total count.
words.sort_values('count',ascending=False).head(10)

In [None]:
import re
# [(v[:30],k) for k,v in  descr.items() if k[:1] == 'A']