# Topics Over Time

In [2]:
import os
import sys

sys.path.append(os.path.abspath('../')) ## needed to import the function.py file
# The star import stays ahead of the explicit imports so that the explicit
# names below always win over anything `functions` happens to re-export.
from functions import *

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

2023-05-25 15:54:19.472972: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
## plot settings
# Colour palette handed to the plotly express figure below.
color_discrete_sequence = color_discrete_kuzu #imported from globalvars.py
# `template` (also from globalvars.py) is used directly by the figure cell; the
# former `template = template` self-assignment was a no-op and has been dropped.

In [33]:
## Import docs
filelocation = '../../data/DataTextTrain'
data = pd.read_feather(filelocation)
docs = data["Kommentar"].values
timestamps = data.yearquarter.to_list()
# Same values as the former element-by-element lookup, without one Series
# access per row.
gender = data["S_sex"].to_list()

# Load the original topic model and fetch its per-document topic information
topic_model = BERTopic.load("../../models/BERTTopic_paraphrase-multilingual-MiniLM-L12-v2_merged_oulier_reduced.model")
df_topic = topic_model.get_document_info(docs)


# Create full dataframe with all information.
# The join aligns on the row index, so it presumably relies on `df_topic`
# keeping the row order of `docs` -- TODO confirm against BERTopic.
df = data.join(df_topic)

# Add grouped age column. Values that cannot be parsed as numbers become NaN
# and therefore end up without an age group.
df['S_alter'] = pd.to_numeric(df['S_alter'], errors='coerce')
# pd.cut uses right-closed intervals by default: (0, 24], (24, 44], (44, 64], (64, 100]
bins = [0, 24, 44, 64, 100]
labels = ["1-24 Jahre", "25-44 Jahre", "45-64 Jahre", "65-100 Jahre"]
df['S_alter_grouped'] = pd.cut(df['S_alter'], bins=bins, labels=labels)

# Create dataframe for viz: topic figures per quarter (helper from functions.py)
df_topics_by_quarter = get_topic_ratios(df, timeframe_col='yearquarter', name_col='CustomName', topic_col='Topic')

In [34]:
# Line chart of the quarterly share of every topic, one coloured line per topic.
fig = px.line(
    df_topics_by_quarter,
    x="yearquarter",
    y="Topic_Ratio",
    color="CustomName",
    hover_data=["yearquarter", "CustomName", "count_x", "Topic_Ratio"],
    color_discrete_sequence=color_discrete_sequence,
    template=template,
)

fig.update_layout(
    title="Themenrelevanz nach Quartal (2019 - 2022)",
    yaxis_title='Häufigkeit',
    xaxis_title='',
    legend_title="",
    width=900,
    height=450,
)

# Only the line width is set here. The previous `text=<whole CustomName column>`
# attached the full, unfiltered column to every single trace, so the per-point
# text did not line up with the points of the trace it was attached to.
fig.update_traces(line_width=2)

#fig.update_yaxes(range=(0,0.2))
# Saved under its own file name: the selected-topics figure further down writes
# to "Themencluster_Zeitachse_1.svg", which used to overwrite this export.
pio.write_image(fig, "../../exports/images/Themencluster_Zeitachse_alle_Themen.svg") #save fig as svg
fig.show()

In [35]:

# Keep only the rows of the four topics that are plotted in the next figure.
df_count_satisfaction = df_topics_by_quarter.loc[
    df_topics_by_quarter['CustomName'].isin([
        'Corona',
        'Internet, Wlan und Mobilfunk',
        'Platzangebot Velo, Gepäck, Kinderwagen',
        'Pünktlichkeit',
    ])
]


In [36]:
# Line chart of the quarterly topic share for the selected topics only.
# (`go` and `pio` come from the import cell at the top of the notebook.)

# Colour palette for the lines
line_colors = color_discrete_kuzu

fig = go.Figure()

# One trace per topic; the palette is cycled if there are more topics than colours.
for i, custom_name in enumerate(df_count_satisfaction['CustomName'].unique()):
    df_custom = df_count_satisfaction[df_count_satisfaction['CustomName'] == custom_name]
    fig.add_trace(go.Scatter(
        x=df_custom['yearquarter'],
        y=df_custom['Topic_Ratio'],
        mode='lines',
        name=custom_name,
        # Per-trace text: previously the complete CustomName column was attached
        # to every trace, so the hover text showed names of other topics.
        text=df_custom['CustomName'],
        line=dict(color=line_colors[i % len(line_colors)], width=2),
    ))

# Layout: horizontal legend placed above the plot area
fig.update_layout(
    title="Themenrelevanz nach Quartal (2019 - 2022)",
    xaxis_title='',
    showlegend=True,
    legend_title="",
    width=900,
    height=450,
    legend=dict(orientation='h', yanchor='top', y=1.1, xanchor='center', x=0.5),
)

fig.update_yaxes(range=(0, 0.25), title=dict(text='Relative Häufigkeit', font=dict(size=12)))

pio.write_image(fig, "../../exports/images/Themencluster_Zeitachse_1.svg") #save fig as svg
fig.show()

## Comparison with Satisfaction Scores

In [37]:
## Dissatisfaction scores per quarter, to compare with the comment-based shares

# All score columns share the `wime_` prefix.
score_cols = [column for column in df.columns if column.startswith('wime_')]

# Quarterly mean of every score column ...
grouped_df = df.groupby('yearquarter')[score_cols].mean()
# ... turned into a dissatisfaction value: (100 - mean) / 100
grouped_df = (100 - grouped_df) / 100
grouped_df = grouped_df.rename(columns=lambda column: column.replace('wime_', 'wime_unzufriedenheit_'))
grouped_df = grouped_df.reset_index()

# Long format: one row per (quarter, score name)
grouped_df_long = grouped_df.melt(id_vars=['yearquarter'], var_name='name', value_name='value')

In [38]:
# Sanity check: number of rows per dissatisfaction score
grouped_df_long['name'].value_counts()

wime_unzufriedenheit_personal          16
wime_unzufriedenheit_komfort           16
wime_unzufriedenheit_sauberkeit        16
wime_unzufriedenheit_puenktlich        16
wime_unzufriedenheit_platzangebot      16
wime_unzufriedenheit_gesamtzuf         16
wime_unzufriedenheit_preis_leistung    16
wime_unzufriedenheit_fahrplan          16
wime_unzufriedenheit_oes_fahrt         16
Name: name, dtype: int64

In [39]:
# Bring the topic shares into the same (yearquarter, name, value) layout as the
# long dissatisfaction table so that both can be stacked.
df_topics_by_quarter_mod = (
    df_topics_by_quarter
    .loc[:, ['yearquarter', 'CustomName', 'Topic_Ratio']]
    .rename(columns={'CustomName': 'name', 'Topic_Ratio': 'value'})
)

In [40]:
# Four dissatisfaction scores ...
grouped_df_long_reduced = grouped_df_long.loc[
    grouped_df_long['name'].isin([
        'wime_unzufriedenheit_sauberkeit',
        'wime_unzufriedenheit_puenktlich',
        'wime_unzufriedenheit_platzangebot',
        'wime_unzufriedenheit_preis_leistung',
    ])
]
# ... and the four topics they are compared with.
df_topics_by_quarter_mod_reduced = df_topics_by_quarter_mod.loc[
    df_topics_by_quarter_mod['name'].isin([
        'Sauberkeit Zug und Bahnhof',
        'Pünktlichkeit',
        'Platzangebot Zug',
        'Preis-Leistung',
    ])
]
# Stack both tables and renumber the rows from 0.
df_count_satisfaction_selected = pd.concat(
    [grouped_df_long_reduced, df_topics_by_quarter_mod_reduced],
    ignore_index=True,
)

In [41]:
# Punctuality only: dissatisfaction score and matching topic share
df_count_satisfaction_selected_puenktlich = df_count_satisfaction_selected.loc[
    df_count_satisfaction_selected['name'].isin([
        'wime_unzufriedenheit_puenktlich',
        'Pünktlichkeit',
    ])
]

In [42]:
# Punctuality: comment share and dissatisfaction score per quarter on a log-scaled y-axis.
# (`go` and `pio` come from the import cell at the top of the notebook.)

# Colour palette for the lines
line_colors = color_discrete_kuzu

fig = go.Figure()

# One trace per series; the palette is cycled if there are more series than colours.
for i, custom_name in enumerate(df_count_satisfaction_selected_puenktlich['name'].unique()):
    df_custom = df_count_satisfaction_selected_puenktlich[df_count_satisfaction_selected_puenktlich['name'] == custom_name]
    fig.add_trace(go.Scatter(
        x=df_custom['yearquarter'],
        y=df_custom['value'],
        mode='lines',
        name=custom_name,
        # Per-trace text: previously the complete `name` column was attached to
        # every trace, so the hover text did not match the points of the trace.
        text=df_custom['name'],
        line=dict(color=line_colors[i % len(line_colors)], width=2),
    ))

# Layout: horizontal legend placed above the plot area
fig.update_layout(
    title="Kommentarhäufigkeit und Unzufriedenheit mit Pünktlichkeit",
    xaxis_title='',
    showlegend=True,
    legend_title="",
    width=900,
    height=450,
    legend=dict(orientation='h', yanchor='top', y=1.1, xanchor='center', x=0.5),
)

# The y-axis title is set once here; the earlier `yaxis_title` in the layout
# call was overwritten by this call anyway.
fig.update_yaxes(
    type='log',
    title=dict(text='Relative Häufigkeit & Unzufriedenheit [log]', font=dict(size=12)),
)

pio.write_image(fig, "../../exports/images/Themencluster_Zeitachse_Pünktlichkeit.svg") #save fig as svg
fig.show()

In [44]:
# Display the topic-share rows that were selected for the selected-topics line chart.
df_count_satisfaction

Unnamed: 0,yearquarter,CustomName,Topic,count_x,count_y,Topic_Ratio
1,2019Q1,Corona,7,18,3694,0.004873
4,2019Q1,"Internet, Wlan und Mobilfunk",12,63,3694,0.017055
9,2019Q1,"Platzangebot Velo, Gepäck, Kinderwagen",9,157,3694,0.042501
12,2019Q1,Pünktlichkeit,0,728,3694,0.197076
20,2019Q2,Corona,7,19,3816,0.004979
...,...,...,...,...,...,...
278,2022Q3,Pünktlichkeit,0,924,4957,0.186403
286,2022Q4,Corona,7,43,4166,0.010322
289,2022Q4,"Internet, Wlan und Mobilfunk",12,133,4166,0.031925
294,2022Q4,"Platzangebot Velo, Gepäck, Kinderwagen",9,203,4166,0.048728


In [47]:
# Display the stacked long table of dissatisfaction scores and topic shares.
df_count_satisfaction_selected

Unnamed: 0,yearquarter,name,value
0,2019Q1,wime_unzufriedenheit_sauberkeit,0.254778
1,2019Q2,wime_unzufriedenheit_sauberkeit,0.259890
2,2019Q3,wime_unzufriedenheit_sauberkeit,0.261778
3,2019Q4,wime_unzufriedenheit_sauberkeit,0.268405
4,2020Q1,wime_unzufriedenheit_sauberkeit,0.249089
...,...,...,...
123,2022Q3,Sauberkeit Zug und Bahnhof,0.021989
124,2022Q4,Platzangebot Zug,0.134181
125,2022Q4,Preis-Leistung,0.066011
126,2022Q4,Pünktlichkeit,0.182909


In [55]:
# Seat availability only: dissatisfaction score and matching topic share
df_count_satisfaction_selected_platz = df_count_satisfaction_selected.loc[
    df_count_satisfaction_selected['name'].isin([
        'wime_unzufriedenheit_platzangebot',
        'Platzangebot Zug',
    ])
]

In [57]:
# Seat availability: comment share and dissatisfaction score per quarter on a log-scaled y-axis.
# (`go` and `pio` come from the import cell at the top of the notebook.)

# Colour palette for the lines
line_colors = color_discrete_kuzu

fig = go.Figure()

# One trace per series; the palette is cycled if there are more series than colours.
for i, custom_name in enumerate(df_count_satisfaction_selected_platz['name'].unique()):
    df_custom = df_count_satisfaction_selected_platz[df_count_satisfaction_selected_platz['name'] == custom_name]
    fig.add_trace(go.Scatter(
        x=df_custom['yearquarter'],
        y=df_custom['value'],
        mode='lines',
        name=custom_name,
        # Per-trace text: previously the complete `name` column was attached to
        # every trace, so the hover text did not match the points of the trace.
        text=df_custom['name'],
        line=dict(color=line_colors[i % len(line_colors)], width=2),
    ))

# Layout: horizontal legend placed above the plot area
fig.update_layout(
    title="Kommentarhäufigkeit und Unzufriedenheit mit Platzangebot",
    xaxis_title='',
    showlegend=True,
    legend_title="",
    width=900,
    height=450,
    legend=dict(orientation='h', yanchor='top', y=1.1, xanchor='center', x=0.5),
)

# The y-axis title is set once here; the earlier `yaxis_title` in the layout
# call was overwritten by this call anyway.
fig.update_yaxes(
    type='log',
    title=dict(text='Relative Häufigkeit & Unzufriedenheit [log]', font=dict(size=12)),
)

pio.write_image(fig, "../../exports/images/Themencluster_Zeitachse_Platzangebot.svg") #save fig as svg
fig.show()

## Calculate the correlations between comment-based counts and satisfaction scores

In [60]:
# Mean satisfaction score per quarter for every `wime_*` column. Unlike the
# dissatisfaction cell further up, the means are kept as they are (no
# `(100 - x) / 100` conversion).
# NOTE: this cell rebinds `score_cols` and `grouped_df`.

score_cols = [col for col in df.columns if col.startswith('wime_')]

# The former `.rename(...)` step replaced 'wime_' with 'wime_' and therefore
# changed nothing; it has been removed.
grouped_df = (
    df.groupby('yearquarter')
    [score_cols]
    .mean()
    .reset_index()
)

# Long format: one row per (quarter, score name). Despite the "_by_month"
# suffix the values are grouped by `yearquarter`.
df_satisfaction_by_month = pd.melt(grouped_df, id_vars=['yearquarter'], var_name='name', value_name='value')

In [61]:
# Topic figures for the correlation analysis. The variable is called
# "_by_month", but the time frame column passed in is `yearquarter`.
df_topics_by_month = get_topic_ratios(
    df,
    timeframe_col='yearquarter',
    name_col='CustomName',
    topic_col='Topic',
)

In [62]:
# Satisfaction side: the four scores that have a matching topic
df_temp_1 = df_satisfaction_by_month.loc[
    df_satisfaction_by_month['name'].isin([
        'wime_sauberkeit',
        'wime_puenktlich',
        'wime_platzangebot',
        'wime_preis_leistung',
    ])
]
# Comment side: rows of the four matching topics
df_temp_2 = df_topics_by_month.loc[
    df_topics_by_month['CustomName'].isin([
        'Sauberkeit Zug und Bahnhof',
        'Pünktlichkeit',
        'Platzangebot Zug',
        'Preis-Leistung',
    ])
]
# Reduce to the shared (yearquarter, name, value) layout
df_temp_2 = df_temp_2.loc[:, ['yearquarter', 'CustomName', 'Topic_Ratio']]
df_temp_2 = df_temp_2.rename(columns={'CustomName': 'name', 'Topic_Ratio': 'value'})
# Stack both sides and renumber the rows from 0
df_comparison = pd.concat([df_temp_1, df_temp_2], ignore_index=True)

In [63]:
# Display the stacked long table of satisfaction scores and topic shares.
df_comparison

Unnamed: 0,yearquarter,name,value
0,2019Q1,wime_sauberkeit,74.522176
1,2019Q2,wime_sauberkeit,74.011049
2,2019Q3,wime_sauberkeit,73.822235
3,2019Q4,wime_sauberkeit,73.159493
4,2020Q1,wime_sauberkeit,75.091117
...,...,...,...
123,2022Q3,Sauberkeit Zug und Bahnhof,0.021989
124,2022Q4,Platzangebot Zug,0.134181
125,2022Q4,Preis-Leistung,0.066011
126,2022Q4,Pünktlichkeit,0.182909


In [64]:
# Wide format: one row per quarter, one column per score / topic
# (pivot_table aggregates duplicate cells with the mean by default)
df_count_satisfaction_pivot = pd.pivot_table(
    df_comparison,
    index='yearquarter',
    columns='name',
    values='value',
)

# Pairwise correlations between all columns, shown as the cell output
corr_matrix = df_count_satisfaction_pivot.corr()
corr_matrix

name,Platzangebot Zug,Preis-Leistung,Pünktlichkeit,Sauberkeit Zug und Bahnhof,wime_platzangebot,wime_preis_leistung,wime_puenktlich,wime_sauberkeit
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Platzangebot Zug,1.0,-0.013901,0.735549,0.237312,-0.94399,-0.626703,-0.759952,-0.875544
Preis-Leistung,-0.013901,1.0,0.126526,0.472409,-0.078771,-0.53071,-0.138331,-0.164276
Pünktlichkeit,0.735549,0.126526,1.0,0.417177,-0.840585,-0.688798,-0.840326,-0.83059
Sauberkeit Zug und Bahnhof,0.237312,0.472409,0.417177,1.0,-0.293356,-0.606633,-0.341574,-0.436558
wime_platzangebot,-0.94399,-0.078771,-0.840585,-0.293356,1.0,0.75656,0.85191,0.9601
wime_preis_leistung,-0.626703,-0.53071,-0.688798,-0.606633,0.75656,1.0,0.611992,0.806294
wime_puenktlich,-0.759952,-0.138331,-0.840326,-0.341574,0.85191,0.611992,1.0,0.851463
wime_sauberkeit,-0.875544,-0.164276,-0.83059,-0.436558,0.9601,0.806294,0.851463,1.0


In [72]:
# Heatmap of the correlation matrix, showing only the cells below the diagonal.
# (`np`, `px` and `pio` come from the import cell at the top of the notebook.)

# Diverging colours: first colour at -1, white at 0, last colour at +1
custom_colors = ['#F06969', '#ffffff', '#49787F']  # Replace with your desired color codes

# Spread the colours evenly over [0, 1] as plotly expects for a colour scale
colorscale = [[i / (len(custom_colors) - 1), color] for i, color in enumerate(custom_colors)]


# Mask the upper triangle *including the diagonal*, so that only the cells
# strictly below the diagonal are drawn.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
corr_matrix_masked = corr_matrix.mask(mask)
fig = px.imshow(corr_matrix_masked,
                color_continuous_scale=colorscale,
                zmin=-1,
                zmax=1,
                labels=dict(x='', y=''),
                title='Korrelationen - Zufriedenheitsbewertungen & Anzahl Kommentare nach Themencluster')

# Write the rounded coefficient into every visible cell. The diagonal is
# skipped: it is masked above, so its label used to be white text on an empty
# white cell.
for i in range(len(corr_matrix)):
    for j in range(i):  # strictly below the diagonal
        corr_value = corr_matrix.iloc[i, j]
        fig.add_annotation(x=j, y=i,
                           text=str(round(corr_value, 2)),
                           # white text on strongly coloured cells, black otherwise
                           font=dict(color='white' if abs(corr_value) > 0.5 else 'black'),
                           showarrow=False)

# update layout and save/show figure
fig.update_layout(width=800,
                  height=600,
                  xaxis=dict(tickangle=-90, tickfont=dict(size=12)),
                  template='plotly_white')
# NOTE(review): "Satisfactiom" in the file name looks like a typo; left
# unchanged because other material may reference this exact path.
pio.write_image(fig, '../../exports/images/correlation_plot_Satisfactiom_Comments.svg')
fig.show()