In [4]:
from pprint import pprint

import gensim
import pandas as pd
from gensim import corpora, models

In [2]:
# Load the dataset from a Feather file into `df`.
# Later cells read two of its columns: `lemmatized` (one token sequence per
# document — presumably produced by an earlier preprocessing step, TODO confirm)
# and `Kommentar` (the original comment text).
filelocation = 'data/DataText'
df = pd.read_feather(filelocation)

### Bag of Words on the Data set

Filter out tokens that appear in

- fewer than 15 documents (absolute number), or
- more than 50% of the documents (`no_above=0.5` is a fraction of the total corpus size, not an absolute number).

After these two filters, keep only the 100000 most frequent tokens.

In [6]:
# Build the token <-> integer-id vocabulary from the lemmatized documents.
dictionary = gensim.corpora.Dictionary(df.lemmatized)

# Print the first 11 (id, token) pairs as a quick sanity check.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

0 abteils
1 eingangsbereich
2 mal
3 reisend
4 stauen
5 stosszeit
6 verkürzen
7 vieler
8 weshalb
9 öfters
10 besitzer


In [8]:
# Prune the vocabulary. The return value is not used: `dictionary` itself is modified.
#   no_below=15    -> drop tokens found in fewer than 15 documents (absolute count)
#   no_above=0.5   -> drop tokens found in more than 50% of all documents (fraction)
#   keep_n=100000  -> of the remainder, keep at most the 100000 most frequent tokens
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

### Gensim doc2bow

For each document we create a list of `(token id, count)` pairs reporting which
words appear and how many times each of them occurs. We save this to `bow_corpus`, then inspect one of the documents.

In [11]:
# Convert every lemmatized document into its bag-of-words vector,
# i.e. a list of (token id, count) pairs for the tokens kept in `dictionary`.
bow_corpus = list(map(dictionary.doc2bow, df.lemmatized))
bow_corpus[1]

[(9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)]

### Preview Bag Of Words for our sample preprocessed document.

In [25]:
# Row label of the comment used as the running example
num = 22
df["Kommentar"][num]

'Der Zug von Interlaken Ost bis Grindelwald-Terminal war überaus gut besetzt.'

In [26]:
# Bag-of-words vector of the sample document selected via `num`
bow_doc = bow_corpus[num]

# Print each token id, the token it maps to, and its count in the document
for token_id, token_count in bow_doc:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], token_count))

Word 46 ("zug") appears 1 time.
Word 136 ("besetzen") appears 1 time.
Word 168 ("gut") appears 1 time.
Word 245 ("grindelwald") appears 1 time.
Word 246 ("interlaken") appears 1 time.
Word 247 ("ost") appears 1 time.
Word 248 ("terminal") appears 1 time.
Word 249 ("überaus") appears 1 time.


### TF-IDF

Create tf-idf model object using models.TfidfModel on ‘bow_corpus’ and save it to ‘tfidf’, then apply transformation to the entire corpus and call it ‘corpus_tfidf’. Finally we preview TF-IDF scores for our first document.

In [27]:
# `models` and `pprint` come from the notebook's top import cell, so this cell
# carries no imports of its own.

# Fit the TF-IDF weighting on the bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus)
# Indexing the fitted model with a corpus yields the TF-IDF-weighted corpus.
corpus_tfidf = tfidf[bow_corpus]

# Preview the TF-IDF vector of the first document only.
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.4951180607895303),
 (1, 0.4532408944177848),
 (2, 0.21465712401511586),
 (3, 0.26508829295469494),
 (4, 0.24827020170784475),
 (5, 0.3536584442733165),
 (6, 0.1924419793653419),
 (7, 0.3398480518993795),
 (8, 0.3063303959641878)]


### Running LDA using Bag of Words

Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

In [30]:
# Fit a 12-topic LDA model on the raw bag-of-words counts.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=12,
    id2word=dictionary,
    passes=2,
    workers=2,
    # LDA training is stochastic; pin the seed so a fresh kernel run can
    # regenerate comparable topics instead of a new random set each time.
    random_state=42,
)

For each topic, we will explore the words occurring in that topic and their relative weights.

In [31]:
# List the topics of the bag-of-words model with their highest-weighted words
# (num_topics=-1 requests all topics).
for topic_id, top_words in lda_model.print_topics(num_topics=-1):
    print(f'Topic: {topic_id} \nWords: {top_words}')

Topic: 0 
Words: 0.022*"zug" + 0.017*"laut" + 0.014*"oft" + 0.010*"leute" + 0.010*"bitte" + 0.010*"maske" + 0.009*"person" + 0.009*"darauf" + 0.009*"mehr" + 0.008*"durchsag"
Topic: 1 
Words: 0.085*"preis" + 0.033*"teuer" + 0.031*"hoch" + 0.016*"gut" + 0.016*"günstig" + 0.012*"strecke" + 0.012*"verbessern" + 0.011*"stimmen" + 0.011*"verspätung" + 0.011*"senken"
Topic: 2 
Words: 0.035*"zug" + 0.031*"zürich" + 0.028*"verspätung" + 0.019*"verbindung" + 0.015*"minute" + 0.014*"fahrt" + 0.014*"fahren" + 0.013*"min" + 0.013*"lang" + 0.012*"bern"
Topic: 3 
Words: 0.035*"mehr" + 0.016*"ja" + 0.014*"finden" + 0.013*"fahren" + 0.012*"sparbillett" + 0.011*"bezahlen" + 0.011*"chf" + 0.010*"geben" + 0.009*"sehen" + 0.009*"teuer"
Topic: 4 
Words: 0.065*"zug" + 0.025*"fahren" + 0.013*"bus" + 0.012*"kommen" + 0.012*"bahnhof" + 0.012*"stehen" + 0.010*"gut" + 0.010*"geben" + 0.008*"nehmen" + 0.008*"spät"
Topic: 5 
Words: 0.070*"app" + 0.043*"sbb" + 0.029*"lösen" + 0.029*"ticket" + 0.022*"kaufen" + 0.018*

### Running LDA using TF-IDF

In [33]:
# Fit a 12-topic LDA model on the TF-IDF-weighted corpus.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=12,
    id2word=dictionary,
    passes=2,
    workers=4,
    # LDA training is stochastic; pin the seed so a fresh kernel run can
    # regenerate comparable topics instead of a new random set each time.
    random_state=42,
)

In [34]:
# List the topics of the TF-IDF model with their highest-weighted words
# (num_topics=-1 requests all topics).
for topic_id, top_words in lda_model_tfidf.print_topics(num_topics=-1):
    print(f'Topic: {topic_id} Word: {top_words}')

Topic: 0 Word: 0.008*"maskenpflicht" + 0.007*"steckdose" + 0.007*"mehr" + 0.007*"preis" + 0.007*"ga" + 0.006*"sbb" + 0.006*"app" + 0.006*"zug" + 0.006*"angebot" + 0.006*"sparbillet"
Topic: 1 Word: 0.017*"mehr" + 0.016*"sauberkeit" + 0.013*"preis" + 0.013*"teuer" + 0.012*"senken" + 0.011*"stosszeit" + 0.009*"einsetzen" + 0.008*"zug" + 0.007*"gut" + 0.007*"strecke"
Topic: 2 Word: 0.029*"platz" + 0.028*"mehr" + 0.017*"gepäck" + 0.015*"wlan" + 0.013*"koffer" + 0.012*"zug" + 0.009*"wenig" + 0.008*"züge" + 0.007*"velos" + 0.007*"kinderwagen"
Topic: 3 Word: 0.011*"wc" + 0.011*"klasse" + 0.009*"toilette" + 0.009*"mehr" + 0.008*"zug" + 0.007*"wagen" + 0.007*"defekt" + 0.006*"oft" + 0.006*"gut" + 0.006*"bitte"
Topic: 4 Word: 0.014*"zug" + 0.007*"verspätung" + 0.006*"verbindung" + 0.006*"zürich" + 0.005*"goldau" + 0.005*"wifi" + 0.005*"sauber" + 0.005*"arth" + 0.005*"mehr" + 0.005*"min"
Topic: 5 Word: 0.012*"preis" + 0.011*"sitzplatz" + 0.011*"danken" + 0.010*"pünktlich" + 0.009*"hoch" + 0.008*"g