In [9]:
from datenguide_python.query_execution import QueryExecutioner

In [11]:
qe = QueryExecutioner()
rs = qe.get_type_info('Region')

In [35]:
descr = {f : rs.fields[f].get('description') for f in rs.fields if f not in ['id','name']}

In [20]:
with open('stopwords_german') as f:
    sw_de_raw = f.readlines()
    
sw_de = list(w.strip() for w in sw_de_raw)

In [144]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

# dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = list(descr.values()) #dataset.data
custom_stop_words = ['personen', 'bevölkerung', 'indikator', 'anteil', '2011',
       'begriffsinhalt', 'erläuterung', 'alter', 'daten', 'mai']

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words=sw_de)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words=sw_de,
                               token_pattern='(?u)\\b\\w\\w\\w\\w+\\b')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 20

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=10, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10
# display_topics(nmf, tfidf_feature_names, no_top_words)
# display_topics(lda, tf_feature_names, no_top_words)

  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


In [91]:
gen = zip(descr.keys(),np.argmax(lda.transform(tf[:,:]),axis=1),descr.values())

In [80]:
import pandas as pd
df = pd.DataFrame(list(gen),columns = ['statistic','topic','description'])

In [125]:
df.query('topic == 1')

Unnamed: 0,statistic,topic,description
76,AI0402,1,**Unternehmensinsolvenzen je 10.000 st.-pfl. U...
89,AI0701,1,**Arbeitsplatzdichte**\n*aus GENESIS-Statistik...
90,AI0702,1,"**Anteil Erwerbst. Land- u. Forstwirtsch., Fis..."
91,AI0703,1,**Anteil Erwerbstätige Produzierendes Gewerbe*...
92,AI0704,1,**Anteil Erwerbstätige Verarbeitendes Gewerbe*...
93,AI0705,1,**Anteil Erwerbstätige Baugewerbe**\n*aus GENE...
94,AI0706,1,**Anteil Erwerbstätige Dienstleistungsbereiche...
95,AI0707,1,"**Ant ET Handel, Verkehr, Gastgew., Inform., K..."
96,AI0708,1,"**Ant ET Finanz-, Vers., Unt-dl., Grundst-, Wo..."
97,AI0709,1,"**Ant ET Öffentl. u. sonst. DL, Erzieh., Gesun..."


In [92]:
words = pd.DataFrame(list(zip(tf.sum(axis=0).getA().squeeze(),tf_feature_names)),columns=['count','word' ])

In [93]:
words.sort_values('count',ascending=False).head(10)

Unnamed: 0,count,word
248,345,bevölkerung
90,226,alter
658,216,mai
963,197,zahl
599,189,kinder
334,184,einwohner
950,178,wohnung
569,166,insgesamt
47,149,31
118,149,arbeit


In [124]:
import re
# [(v[:30],k) for k,v in  descr.items() if k[:1] == 'A']