# Bokeh 2D Scatterplot der Reden der Top 10 Redner

## Import

In [None]:
import pandas as pd

from bokeh.plotting import figure, output_file, output_notebook, show
from bokeh.models import ColumnDataSource, CDSView, Legend 
from bokeh.models import CustomJS, Slider, OpenURL, TapTool, CustomJSFilter
from bokeh.models import DatetimeTickFormatter
from bokeh.models.tools import HoverTool, BoxZoomTool, ResetTool, PanTool
from bokeh.transform import factor_cmap, factor_mark
from bokeh.layouts import column, row 
from bokeh.io import show 

from umap import UMAP

import plotly.express as px

# Configure Bokeh to render its output inline in the notebook.
output_notebook()

## Daten einlesen

In [None]:
# Load the speeches together with their document embeddings into a DataFrame.
df = pd.read_json('../data/reden-bundestag-doc-embeddings.json')

In [None]:
# Convert the 'datum' column to datetime.
# Assign through df[...] rather than df.loc[:, ...]: since pandas 2.0 the
# .loc form writes into the existing column in place and can keep its old
# dtype, so the column would not actually become datetime64.
df['datum'] = pd.to_datetime(df['datum'])

In [None]:
# Inspect the Python type of the embedding stored in the row labelled 0.
type(df.loc[0, 'doc_embedding'])

## Erstellen der Personenliste

In [None]:
# Frequency table: number of rows in df per value of 'person',
# stored in a single column named 'count'.
person_counts = pd.crosstab(index=df.loc[:, 'person'], columns='count')

In [None]:
# Show the ten persons with the highest counts.
person_counts.sort_values(by='count', ascending=False).head(10)

In [None]:
# Show only the names (the index) of those ten persons.
person_counts.sort_values(by='count', ascending=False).head(10).index

In [None]:
# One tuple per speaker: (person name, ressort, partei).
# The name is matched against the 'person' column; the other two values
# are written to the 'ressort' and 'partei' columns further down.
personen_liste = [('Angela Merkel', 'Kanzler:in', 'CDU'),
                  ('Gerhard Schröder', 'Kanzler:in', 'SPD'), 
                  ('Helmut Kohl', 'Kanzler:in', 'CDU'),
                  ('Bernd Neumann', 'Kulturstaatsminister:in', 'CDU'),
                  ('Monika Grütters', 'Kulturstaatsminister:in', 'CDU'),
                  ('Christina Weiss', 'Kulturstaatsminister:in', 'parteilos'),
                  ('Michael Naumann', 'Kulturstaatsminister:in', 'SPD'),
                  ('Julian Nida-Rümelin', 'Kulturstaatsminister:in', 'SPD'), 
                  ('Thomas de Maizière', 'Inneres', 'CDU'),
                  ('Hans Martin Bury', 'Bundeskanzleramt', 'SPD')]

In [None]:
# Keep only the rows whose 'person' is one of the speakers in personen_liste.
# The names are taken from personen_liste instead of being repeated here, so
# the filter and the later ressort/partei annotation cannot drift apart.
top10_names = [entry[0] for entry in personen_liste]
df_top10 = df[df['person'].isin(top10_names)]

In [None]:
# Sanity check: (rows, columns) of the filtered frame.
df_top10.shape

## Dimensionsreduktion

In [None]:
# UMAP reducer projecting to 2 components with the cosine metric.
# NOTE(review): no random_state is passed, so the layout may differ between
# runs — confirm whether reproducibility matters for the saved output.
reducer = UMAP(n_components=2, metric='cosine', n_neighbors=15, min_dist=0.1)

In [None]:
# Fit the reducer on the embeddings of the filtered rows (passed as a plain
# list) and keep the transformed result.
reduced_matrix = reducer.fit_transform(df_top10.loc[:, 'doc_embedding'].to_list())

In [None]:
# Sanity check: shape of the reduced matrix.
reduced_matrix.shape

In [None]:
# Display the reduced coordinates.
reduced_matrix

In [None]:
# Wrap the reduced matrix in a DataFrame with the columns 'x' and 'y'.
# No index is passed, so the frame gets a default index.
df_2d = pd.DataFrame.from_records(reduced_matrix, columns=['x', 'y'])

In [None]:
# Preview the first rows of the 2D coordinates.
df_2d.head()

## Zusammenführen aller Spalten

In [None]:
# Attach the 2D coordinates to the filtered speeches.
# DataFrame.join aligns on the index. df_top10 still carries the index labels
# of the unfiltered frame, whereas df_2d was built with a fresh default index,
# so a plain join would pair coordinates with the wrong rows or leave NaN.
# df_2d's rows are in the same order as df_top10's (it was computed from
# them), so give it df_top10's index before joining.
df_all = df_top10.join(df_2d.set_index(df_top10.index))

In [None]:
# Sanity check: shape and first rows of the combined frame.
print(df_all.shape)
df_all.head()

### Ergänzen der Features Ressort und Partei

In [None]:
# Write each speaker's ressort and partei into the rows of that speaker.
for name, ressort, partei in personen_liste:
    is_speaker = df_all['person'] == name
    df_all.loc[is_speaker, 'ressort'] = ressort
    df_all.loc[is_speaker, 'partei'] = partei

In [None]:
# Sanity check after adding 'ressort' and 'partei': shape, column names
# and first rows.
print(df_all.shape)
print(df_all.columns)
df_all.head()

In [None]:
# Persist the prepared frame as JSON.
df_all.to_json('../data/reden-2d-aufbereitet-bokeh.json')

## Visualisieren mit bokeh

In [None]:
# Wrap the frame in a ColumnDataSource; the glyph below references its
# columns by name.
source = ColumnDataSource(df_all)

In [None]:
# Factor lists for the categorical colour and marker mappings of the scatter.
# PARTEI is paired position-wise with the colour palette given to factor_cmap.
PARTEI = ['SPD', 'CDU', 'parteilos']
# RESSORT and MARKERS are paired position-wise by factor_mark. All four
# ressort values that occur in personen_liste are listed so that 'Inneres' and
# 'Bundeskanzleramt' get their own marker instead of sharing a fallback one.
RESSORT = ['Kanzler:in', 'Kulturstaatsminister:in', 'Inneres', 'Bundeskanzleramt']
MARKERS = ['hex', 'circle_x', 'triangle', 'square']

In [None]:
# Square 875 x 875 figure with the toolbar above the plot.
# (sizing_mode="stretch_both" would instead grow the figure to the browser
# width; it is intentionally left disabled.)
p = figure(
    width=875,
    height=875,
    toolbar_location="above",
    tools=['pan', 'wheel_zoom', 'box_zoom', 'save', 'reset', 'tap'],
)

In [None]:
# Colour encodes the party, marker shape encodes the ressort.
partei_colors = factor_cmap(field_name='partei',
                            palette=['red', 'black', 'blue'],
                            factors=PARTEI)
ressort_markers = factor_mark(field_name='ressort',
                              markers=MARKERS,
                              factors=RESSORT)

p.scatter(
    x='x',
    y='y',
    source=source,
    size=5,
    fill_alpha=0.8,
    color=partei_colors,
    marker=ressort_markers,
)

In [None]:
# https://stackoverflow.com/questions/49601196/use-of-a-categorical-variable-to-define-colors-and-markers-in-bokeh-scatter-plot

In [None]:
# Open the URL from the 'url' column when a point is tapped.
# The linking itself works, but the links do not open correctly.
# TODO: check the URLs.
taptool = p.select(type=TapTool)
taptool.callback = OpenURL(url='@url')

In [None]:
# Hover tool: label / column pairs shown in the tooltip of each point.
tooltips = [
    ('Name', '@person'),
    ('Datum', '@datum{%F %H:%M}'),
    ('Ressort', '@ressort'),
    ('Partei', '@partei'),
    ('Ort', '@ort'),
    ('Titel', '@titel'),
]

# The '@datum' field is rendered with the datetime formatter so that the
# {%F %H:%M} format spec above is applied.
hover = HoverTool(tooltips=tooltips, formatters={'@datum': 'datetime'})
p.add_tools(hover)

In [None]:
# Also write the plot to a standalone HTML file at this path.
output_file('../img/reden-2d-aufbereitet-bokeh.html')

In [None]:
# Render the figure to the configured outputs.
show(p)