# Topic Visualization

In [1]:
import pymongo
from pprint import pprint
import pandas as pd
import requests 
import json 
import squarify 
import plotly.express as px
import warnings
import numpy as np
from functools import reduce
from sqlalchemy import create_engine, text, inspect, types
from sqlalchemy_utils import database_exists, create_database
import sqlalchemy
import os
from dotenv import load_dotenv
import datetime 
import decimal
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

## Load Data

### Mongo DB

In [2]:
## Load .env file
load_dotenv('../config/.env')

True

In [3]:
## Database details
## Import credentials for mongo DB from .env file
USERNAME_MONGO = os.environ.get('USERNAME_MONGO', "xxxxxx")
PASSWORD_MONGO = os.environ.get('PASSWORD_MONGO', "xxxxxx")
ENDPOINT_MONGO = os.environ.get('ENDPOINT_MONGO', "xxxxx")

connection_string = f'mongodb+srv://{USERNAME_MONGO}:{PASSWORD_MONGO}@{ENDPOINT_MONGO}'
database = "kuzu"
collection = "warehouse"

## Connection to MongoDB
client = pymongo.MongoClient(connection_string)
db = client[database]
collection = db[collection]

In [4]:
## Count number of documents inserted
collection.count_documents({})

131825

In [5]:
## Convert to dataframe
df = pd.DataFrame(list(collection.find()))
df = df.sort_values("Date",ascending="False")[["Date","Value","S_alter","Language"]]

In [12]:
## Filter to German
df = df[df["Language"]=="Deutsch"]
len(df)

93507

### Local csv file

In [72]:
## Import CSVs
df = pd.read_csv('../data/Data.csv') #load table
df = df.sort_values("Date",ascending="False")[["Date","Value","S_alter","Language"]]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [73]:
## Filter to German
df = df[df["Language"]=="Deutsch"]
len(df)

93507

## Topic modeling and visualization

Now that the text of all participant, we can proceed to light preprocessing before applying Latent Dirichlet Allocation.

In [74]:
%matplotlib inline  
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel, LsiModel
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from gensim import matutils
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from collections import defaultdict
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import pandas as pd
import numpy as np
import pickle
import lda

In [75]:
# Import german stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = list(stopwords.words('german'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dominik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [76]:
def tokenize(text):
    return [token for token in simple_preprocess(text)if token not in stopwords]

In [77]:
feedback = df.Value
documents = feedback.tolist()

In [78]:
texts = [tokenize(document) for document in documents]

In [79]:
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

In [86]:
min_token_frequency = 100

In [87]:
texts = [[token for token in text if frequency[token] > min_token_frequency] for text in texts]

In [88]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

## Finding the Optimum Number of Topics

Now that the data is ready, we can run a batch LDA (because of the small size of the dataset that we are working with) to discover the main topics in our document.

In [89]:
# fit LDA model
feedback_topics = LdaModel(corpus=corpus,
                           id2word=dictionary,
                           num_topics=12,
                           passes=1)

In [90]:
# print out first 10 topics
for i, topic in enumerate(feedback_topics.print_topics(10)):
    print(i, topic)

0 (3, '0.034*"zug" + 0.023*"bus" + 0.018*"stunde" + 0.018*"beim" + 0.014*"aussteigen" + 0.012*"pro" + 0.011*"bahnhof" + 0.011*"fährt" + 0.010*"fahrräder" + 0.010*"kinder"')
1 (9, '0.047*"mehr" + 0.036*"bitte" + 0.028*"maskenpflicht" + 0.024*"plätze" + 0.022*"besser" + 0.020*"anbieten" + 0.019*"geben" + 0.017*"kontrolle" + 0.016*"bessere" + 0.015*"kunden"')
2 (4, '0.080*"zürich" + 0.030*"verbindungen" + 0.029*"hb" + 0.028*"strecke" + 0.025*"mehr" + 0.024*"umsteigen" + 0.023*"basel" + 0.023*"verbindung" + 0.022*"wäre" + 0.021*"st"')
3 (5, '0.034*"wc" + 0.030*"velo" + 0.025*"oft" + 0.024*"kinderwagen" + 0.023*"unterwegs" + 0.022*"vielen" + 0.020*"bahnhof" + 0.018*"zug" + 0.017*"zugbegleiter" + 0.017*"verbessern"')
4 (2, '0.036*"bahnhof" + 0.035*"zug" + 0.027*"bus" + 0.023*"min" + 0.020*"minuten" + 0.017*"ab" + 0.015*"lange" + 0.014*"postauto" + 0.013*"anschluss" + 0.013*"abend"')
5 (1, '0.033*"zug" + 0.029*"bern" + 0.021*"teuer" + 0.020*"fahrt" + 0.018*"preis" + 0.016*"öv" + 0.015*"hoch" 

In [91]:
vis_data = gensimvis.prepare(feedback_topics, corpus, dictionary)
pyLDAvis.display(vis_data)

  from imp import reload
  from imp import reload
  from imp import reload
