# Topics by Customer Groups

In [2]:
import sys
import os
sys.path.append(os.path.abspath('../')) ## needed to import the function.py file
from functions import *
import pandas as pd
import plotly.express as px
import plotly.io as pio

2023-05-26 07:34:02.995928: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Colour palette shared by all charts in this notebook.
# NOTE(review): color_discrete_kuzu is not defined in this notebook; presumably
# it is provided by `from functions import *` - confirm.
color_discrete_sequence = color_discrete_kuzu

In [4]:
## Import docs
filelocation = '../../data/DataTextTrain'
data = pd.read_feather(filelocation)
docs = data["Kommentar"].values
timestamps = data["yearquarter"].to_list()
# Plain list of the S_sex values (vectorised instead of one label lookup per row)
gender = data["S_sex"].to_list()

# Load the original topic model and get one row of topic information per document
topic_model = BERTopic.load("../../models/BERTTopic_paraphrase-multilingual-MiniLM-L12-v2_merged_oulier_reduced.model")
df_topic = topic_model.get_document_info(docs)

# Create the full dataframe with all information.
# join() aligns on the index, so df_topic must be indexed like `data`.
df = data.join(df_topic)

#### Add grouped age column #####
df['S_alter'] = pd.to_numeric(df['S_alter'], errors='coerce')  # non-numeric ages become NaN
bins = [0, 24, 44, 64, 100]
labels = ["15-24 Jahre", "25-44 Jahre", "45-64 Jahre", "65-100 Jahre"]
# pd.cut uses right-closed intervals: (0, 24], (24, 44], (44, 64], (64, 100].
# Ages outside (0, 100] and NaN end up without a group.
df['S_alter_grouped'] = pd.cut(df['S_alter'], bins=bins, labels=labels)

# Create dataframe for viz
df_topics_by_quarter = get_topic_ratios(df, timeframe_col='yearquarter', name_col='CustomName', topic_col='Topic')

###### Convert the two time columns to datetime format #####
df['fg_abfahrt'] = pd.to_datetime(df['fg_abfahrt'], format='%H:%M:%S')
df['fg_ankunft'] = pd.to_datetime(df['fg_ankunft'], format='%H:%M:%S')

# Time difference between the two columns.
# NOTE(review): both columns are parsed as time of day without a date, so a row
# whose fg_ankunft is earlier in the day than fg_abfahrt (presumably a trip across
# midnight) yields a negative time_diff - confirm how these rows should be treated.
df['time_diff'] = df['fg_ankunft'] - df['fg_abfahrt']

# Time difference groups. Intervals are right-closed, so a difference of exactly
# 0 minutes and every negative difference get no group (NaN).
time_groups = pd.cut(
    df['time_diff'],
    bins=[
        pd.Timedelta(minutes=0),
        pd.Timedelta(minutes=15),
        pd.Timedelta(minutes=60),
        pd.Timedelta(minutes=180),
        pd.Timedelta.max,
    ],
    labels=['bis 15 Minuten', '16 bis 60 Minuten', '61 bis 180 Minuten',
            'Mehr als 180 Minuten'],
)

# Add the time difference groups as a new column to the DataFrame
df['time_diff_grouped'] = time_groups

In [5]:
# Frequency table of the raw ft_vm values
ft_vm = df["ft_vm"].value_counts()

In [6]:
# Extract a short line label (e.g. "IC 1") from the raw ft_vm value
def extract_linie(x):
    """Reduce a raw ``ft_vm`` value to a short line / train-category label.

    The label is a fixed-width prefix of the input:

    * ``"ICE..."`` -> first 3 characters (``"ICE"``)
    * ``"IC..."``  -> first 4 characters (e.g. ``"IC 1"`` / ``"IC-1"``)
    * ``"S..."``   -> first 4 characters
    * ``"IR..."``  -> first 5 characters
    * ``"EC..."``  -> first 2 characters (``"EC"``)
    * anything else is returned unchanged

    Parameters
    ----------
    x : str or any
        Raw value. Non-string values (e.g. NaN / None for missing data) are
        returned unchanged instead of raising ``AttributeError``.

    Returns
    -------
    The shortened label, or ``x`` itself when no rule applies.
    """
    # Missing values have no prefix to inspect; pass them through untouched.
    if not isinstance(x, str):
        return x
    # "ICE" has to be tested before "IC": every ICE string also starts with "IC",
    # so the ICE rule could never fire when it was checked afterwards.
    if x.startswith("ICE"):
        return x[:3]
    # NOTE(review): the fixed-width slices keep only one digit after "IC " / "S ",
    # so a value such as "IC 21" would be shortened to "IC 2" - confirm that no
    # two-digit line numbers occur in ft_vm.
    if x.startswith(("IC", "S")):
        return x[:4]
    if x.startswith("IR"):
        return x[:5]
    if x.startswith("EC"):
        return x[:2]
    return x  # default case

# Line label per row; hyphens are replaced by spaces so "IC-1" and "IC 1" coincide
df['linie'] = df['ft_vm'].apply(extract_linie).str.replace('-', ' ')

# Keep only the rows that belong to one of the listed IC lines
ic_lines = ['IC 1', 'IC 2', 'IC 3', 'IC 5', 'IC 6', 'IC 8']
df_linie = df[df['linie'].isin(ic_lines)]

## Prepare aggregated DataFrames for visualization

In [7]:
def _topic_counts(frame, categorical_col):
    """Topic counts per value of ``categorical_col`` without the 'Outlier' topic.

    Returns an independent copy, so later column assignments never write into
    a filtered slice (which would trigger pandas' SettingWithCopyWarning).
    """
    counts = compute_categorical_counts(frame, categorical_col=categorical_col, name_col="CustomName")
    return counts[counts["CustomName"] != 'Outlier'].copy()


def _drop_category(counts, categorical_col, category):
    """Remove ``category`` from a categorical column, including its rows.

    ``remove_categories`` deletes the level itself (so it no longer shows up in
    the column's categories) and turns its values into NaN; those rows, and any
    rows that were already NaN in that column, are then dropped.
    """
    cleaned = counts.assign(**{categorical_col: counts[categorical_col].cat.remove_categories([category])})
    return cleaned.dropna(subset=[categorical_col])


def _keep_categories(counts, categorical_col, categories):
    """Restrict ``categorical_col`` to ``categories`` (in this order) and drop all other rows."""
    # set_categories replaces the category list; values outside it become NaN
    restricted_col = counts[categorical_col].astype('category').cat.set_categories(categories)
    restricted = counts.assign(**{categorical_col: restricted_col})
    return restricted[restricted[categorical_col].isin(categories)]


# 'divers' is removed from the values and from the category list
df_gender = _drop_category(_topic_counts(df, "S_sex"), "S_sex", 'divers')

df_agegroup = _topic_counts(df, "S_alter_grouped")

df_u_klassencode = _topic_counts(df, "u_klassencode")

# 'Sonstige' is removed from the values and from the category list
df_R_zweck = _drop_category(_topic_counts(df, "R_zweck"), "R_zweck", 'Sonstige')

df_reisezeit = _topic_counts(df, "time_diff_grouped")

df_season = _topic_counts(df, "season")

df_wohnsitz = _topic_counts(df, "S_wohnsitz")

df_ft_tu = _keep_categories(_topic_counts(df, "ft_tu"), "ft_tu", ['SBB', 'BLS', 'SOB'])

df_ft_vm_kurz = _keep_categories(_topic_counts(df, 'ft_vm_kurz'), 'ft_vm_kurz', ['IC', 'IR', 'S', 'RE', 'EC'])

df_u_ticket = _keep_categories(_topic_counts(df, "u_ticket"), "u_ticket", ['Mobile-Ticket', 'Online-Ticket', 'Easy Ride'])

# Counts per IC line, based on the IC-only subset df_linie
df_ft_vm = _topic_counts(df_linie, "linie")


## Plot Charts

In [85]:
import plotly.io as pio

def _wrap_tick_label(label, max_len=40):
    """Return ``label`` as text, broken at every space when longer than ``max_len`` characters.

    Plotly renders ``<br>`` inside tick text as a line break.
    """
    text = str(label)
    return text.replace(' ', '<br>') if len(text) > max_len else text


def create_grouped_barchart(df, x_col, y_col, color_col, color_discrete_sequence, ignore_group=None, title='', xaxis_title='', yaxis_title='', legend_title='', template='', path='', filename='', yaxis_range=(0, 0.29)):
    """Draw a grouped bar chart, optionally export it as SVG, and display it.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with one row per bar.
    x_col, y_col, color_col : str
        Columns used for the x position, the bar height and the bar group/colour.
    color_discrete_sequence : sequence
        Colours handed to ``px.bar``.
    ignore_group : optional
        Value of ``color_col`` whose rows are left out. ``None`` keeps all rows.
    title, xaxis_title, legend_title : str
        Figure, x-axis and legend titles.
    yaxis_title : str
        Y-axis title; an empty value falls back to ``'Relative Häufigkeit'``.
    template : str or plotly template, optional
        Plotly template for the figure; ``''`` or ``None`` uses Plotly's default.
    path, filename : str
        When both are non-empty the figure is written to ``<path>/<filename>.svg``.
    yaxis_range : sequence of two numbers or None
        Fixed y-axis range. Bars above the upper bound are cut off; pass
        ``None`` to let Plotly choose the range automatically.

    Returns
    -------
    None
        The figure is shown via ``fig.show()``; nothing is returned, so a name
        bound to the call result holds ``None``.
    """
    if ignore_group is not None:
        df = df[df[color_col] != ignore_group]

    # An empty string means "no template given"; px.bar then falls back to its default.
    chart_template = None if (isinstance(template, str) and not template) else template

    fig = px.bar(df,
                 x=x_col,
                 y=y_col,
                 color=color_col,
                 hover_data=[color_col, y_col, x_col],
                 color_discrete_sequence=color_discrete_sequence,
                 barmode='group',
                 template=chart_template)

    fig.update_layout(
        width=900,
        height=600,
        title=title,
        xaxis_title=xaxis_title,
        legend_title=legend_title,
    )

    # One tick per distinct x value, addressed by the value itself. Positioning the
    # labels by row number (0..len(df)-1) is only correct when the first rows of df
    # happen to list every category exactly once and in axis order.
    categories = [value for value in dict.fromkeys(df[x_col]) if pd.notna(value)]
    fig.update_xaxes(
        showgrid=False,
        tickwidth=1,
        tickmode='array',
        tickvals=categories,
        # Long labels are broken at their spaces so they need less horizontal room
        ticktext=[_wrap_tick_label(value) for value in categories],
        automargin=True,
        tickangle=45,
        tickfont=dict(size=12),
    )

    fig.update_yaxes(
        showgrid=True,
        gridwidth=1,
        gridcolor='rgba(0,0,0,0.1)',
        title=dict(text=yaxis_title or 'Relative Häufigkeit', font=dict(size=12)),
    )
    if yaxis_range is not None:
        fig.update_yaxes(range=list(yaxis_range))

    if path and filename:
        # os.path.join avoids a doubled separator when `path` already ends with '/'
        full_path = os.path.join(path, f"{filename}.svg")
        pio.write_image(fig, full_path, format='svg')

    fig.show()


In [86]:
# Directory that receives the exported chart images
path = "../../exports/images/"

In [87]:
# Topic relevance by gender; shown inline and exported as SVG
create_grouped_barchart(
    df_gender,
    x_col="CustomName",
    y_col="relative_count",
    color_col="S_sex",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Geschlecht (2019-2022)",
    xaxis_title='',
    yaxis_title='',
    legend_title='',
    template=template,
    path=path,
    filename="Themenrelevanz_Geschlecht",
)

In [80]:
# create_grouped_radar(df_gender, x_col="CustomName", y_col="relative_count", color_col="S_sex", color_discrete_sequence=color_discrete_sequence, title="Themenrelevanz nach Geschlecht (2019-2022)", xaxis_title='', yaxis_title='Häufigkeit', legend_title='', template=template)

In [88]:
# Topic relevance by age group; shown inline and exported as SVG.
# NOTE: create_grouped_barchart, as defined in this notebook, has no return
# statement, so barchart_agegroup is bound to None.
barchart_agegroup = create_grouped_barchart(
    df_agegroup,
    x_col="CustomName",
    y_col="relative_count",
    color_col="S_alter_grouped",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Altersgruppe (2019-2022)",
    xaxis_title='',
    yaxis_title='',
    legend_title='',
    template=template,
    path=path,
    filename="Themenrelevanz_Altersgruppe",
)

In [123]:
# create_grouped_radar(df_agegroup, x_col="CustomName", y_col="relative_count", color_col="S_alter_grouped", color_discrete_sequence=color_discrete_sequence_mixed, title="Themenrelevanz nach Altersgruppe (2019-2022)", xaxis_title='', yaxis_title='Häufigkeit', legend_title='', template=template)

In [89]:
# Topic relevance by travel class; shown inline and exported as SVG.
# NOTE: create_grouped_barchart, as defined in this notebook, has no return
# statement, so barchart_klasse is bound to None.
barchart_klasse = create_grouped_barchart(
    df_u_klassencode,
    x_col="CustomName",
    y_col="relative_count",
    color_col="u_klassencode",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Reiseklasse (2019-2022)",
    xaxis_title='',
    yaxis_title='Relative Häufigkeit',
    legend_title='',
    template=template,
    path=path,
    filename="Themenrelevanz_Reiseklasse",
)

In [125]:
# create_grouped_radar(df_u_klassencode, x_col="CustomName", y_col="relative_count", color_col="u_klassencode", color_discrete_sequence=color_discrete_sequence, title="Themenrelevanz nach Klasse (2019-2022)", xaxis_title='', yaxis_title='Häufigkeit', legend_title='', template=template)

In [25]:
# Topic relevance by travel purpose; shown inline and exported as SVG.
# NOTE: create_grouped_barchart, as defined in this notebook, has no return
# statement, so barchart_Reisezweck is bound to None.
barchart_Reisezweck = create_grouped_barchart(
    df_R_zweck,
    x_col="CustomName",
    y_col="relative_count",
    color_col="R_zweck",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Reisezweck (2019-2022)",
    xaxis_title='',
    yaxis_title='Relative Häufigkeit',
    legend_title='',
    template=template,
    path=path,
    filename="Themenrelevanz_Reisezweck",
)

In [127]:
# create_grouped_radar(df_R_zweck, x_col="CustomName", y_col="relative_count", color_col="R_zweck", color_discrete_sequence=color_discrete_sequence, title="Themenrelevanz nach Reisezweck (2019-2022)", xaxis_title='', yaxis_title='Häufigkeit', legend_title='', template=template)

In [27]:
# Topic relevance by travel duration group; shown inline and exported as SVG.
# NOTE: create_grouped_barchart, as defined in this notebook, has no return
# statement, so barchart_reisezeit is bound to None.
barchart_reisezeit = create_grouped_barchart(
    df_reisezeit,
    x_col="CustomName",
    y_col="relative_count",
    color_col="time_diff_grouped",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Reisedauer (2019-2022)",
    xaxis_title='',
    yaxis_title='Relative Häufigkeit',
    legend_title='',
    template=template,
    path=path,
    filename="Themenrelevanz_Reisedauer",
)

In [28]:
# create_grouped_radar(df_reisezeit, x_col="CustomName", y_col="relative_count", color_col="time_diff_grouped", color_discrete_sequence=color_discrete_sequence, title="Themenrelevanz nach Reisedauer (2019-2022)", xaxis_title='', yaxis_title='Häufigkeit', legend_title='Gruppe', template=template)

In [29]:
# Topic relevance by place of residence; shown inline and exported as SVG.
# NOTE: create_grouped_barchart, as defined in this notebook, has no return
# statement, so barchart_wohnsitz is bound to None.
barchart_wohnsitz = create_grouped_barchart(
    df_wohnsitz,
    x_col="CustomName",
    y_col="relative_count",
    color_col="S_wohnsitz",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Wohnsitz (2019-2022)",
    xaxis_title='',
    yaxis_title='Relative Häufigkeit',
    legend_title='',
    template=template,
    path=path,
    filename="Themenrelevanz_Wohnsitz",
)


In [131]:
# create_grouped_radar(df_wohnsitz, x_col="CustomName", y_col="relative_count", color_col="S_wohnsitz", color_discrete_sequence=color_discrete_sequence, title="Themenrelevanz nach Reisedauer (2019-2022)", xaxis_title='', yaxis_title='Häufigkeit', legend_title='Gruppe', template=template)

In [41]:
# Topic relevance by transport company (SBB / BLS / SOB); shown inline and exported as SVG.
# NOTE: create_grouped_barchart, as defined in this notebook, has no return
# statement, so barchart_ft_tu is bound to None.
barchart_ft_tu = create_grouped_barchart(
    df_ft_tu,
    x_col="CustomName",
    y_col="relative_count",
    color_col="ft_tu",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Transportunternehmen (2019-2022)",
    xaxis_title='',
    yaxis_title='Relative Häufigkeit',
    legend_title='',
    template=template,
    path=path,
    filename="Themenrelevanz_Tu",
)

In [42]:
# Topic relevance by train category (IC / IR / S / RE / EC); shown inline and exported as SVG
create_grouped_barchart(
    df_ft_vm_kurz,
    x_col="CustomName",
    y_col="relative_count",
    color_col="ft_vm_kurz",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Zugkategorie (2019-2022)",
    xaxis_title='',
    yaxis_title='Relative Häufigkeit',
    legend_title='',
    template=template,
    path=path,
    filename="Themenrelevanz_Verkehrsmittel",
)

In [43]:
# Topic relevance by IC line; shown inline and exported as SVG.
# The title previously read "nacc" instead of "nach"; the typo ended up in the exported chart.
create_grouped_barchart(
    df_ft_vm,
    x_col="CustomName",
    y_col="relative_count",
    color_col="linie",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach IC Verbindungen (2019-2022)",
    xaxis_title='',
    yaxis_title='Relative Häufigkeit',
    legend_title='',
    template=template,
    path=path,
    filename="Themenrelevanz_ICs",
)

In [33]:
# Comments assigned to the topic 'Fahrplanangebot' on line IC 2
is_ic2 = df_linie['linie'] == 'IC 2'
is_fahrplanangebot = df_linie['CustomName'] == 'Fahrplanangebot'
fahrplanangebot_kommentare = df_linie.loc[is_ic2 & is_fahrplanangebot, 'Kommentar']

In [35]:
# Mean wime_gesamtzuf (1 decimal) and number of ratings per IC line, lowest mean first
(
    df_linie
    .groupby('linie')
    .agg(
        avg_value=('wime_gesamtzuf', lambda ratings: round(ratings.mean(), 1)),
        count=('wime_gesamtzuf', 'count'),
    )
    .reset_index()
    .sort_values(by='avg_value')
)

Unnamed: 0,linie,avg_value,count
0,IC 1,75.5,1976
5,IC 8,77.3,2654
4,IC 6,77.7,1717
1,IC 2,77.9,960
3,IC 5,78.5,2759
2,IC 3,79.4,1804


In [36]:
# Show the computed time differences
df["time_diff"]

0       0 days 01:22:00
1       0 days 00:47:00
2       0 days 00:56:00
3       0 days 00:41:00
4       0 days 00:47:00
              ...      
45036   0 days 02:17:00
45037   0 days 00:53:00
45038   0 days 00:27:00
45039   0 days 00:28:00
45040   0 days 00:19:00
Name: time_diff, Length: 45041, dtype: timedelta64[ns]

In [37]:
# Median time_diff per IC line in whole minutes; negative differences are left out
(
    df_linie
    .groupby('linie')
    .agg(
        avg_value=(
            'time_diff',
            lambda durations: round(
                durations[durations >= pd.Timedelta(0)].median().total_seconds() / 60
            ),
        )
    )
)

Unnamed: 0_level_0,avg_value
linie,Unnamed: 1_level_1
IC 1,93
IC 2,132
IC 3,111
IC 5,77
IC 6,78
IC 8,96


In [38]:
# Median u_preis per IC line, rounded to an integer; values above 500 are ignored
(
    df_linie
    .groupby('linie')
    .agg(avg_value=('u_preis', lambda prices: round(prices[prices <= 500].median())))
)
#df_linie.groupby('linie').agg(avg_value=('S_alter', lambda x: round(x[x <= 98].median())))

Unnamed: 0_level_0,avg_value
linie,Unnamed: 1_level_1
IC 1,32
IC 2,32
IC 3,31
IC 5,23
IC 6,26
IC 8,33


In [44]:
# Topic relevance by ticket type; shown inline and exported as SVG
create_grouped_barchart(
    df_u_ticket,
    x_col="CustomName",
    y_col="relative_count",
    color_col="u_ticket",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Ticket (2019-2022)",
    xaxis_title='',
    yaxis_title='',
    legend_title='',
    template=template,
    path=path,
    filename="Themenrelevanz_Ticket",
)

## Topic and Satisfaction

In [45]:
# Per topic (CustomName): mean of wime_gesamtzuf rounded to 1 decimal place and the
# number of ratings, sorted by ascending mean
df_grouped = (
    df
    .groupby('CustomName')
    .agg(
        avg_value=('wime_gesamtzuf', lambda ratings: round(ratings.mean(), 1)),
        count=('wime_gesamtzuf', 'count'),
    )
    .reset_index()
    .sort_values(by='avg_value')
)
# The 'Outlier' topic is not reported
df_grouped = df_grouped[df_grouped['CustomName'] != 'Outlier']

In [46]:
# Overall satisfaction across all records, with and without a comment
filelocation = '../../data/DataClean'
datafull = pd.read_feather(filelocation)
datafull["wime_gesamtzuf"].mean()

84.59906038864929

In [47]:
# Mean time_diff of the rows whose ft_vm_kurz is 'S'
df.loc[df["ft_vm_kurz"] == 'S', "time_diff"].mean()

Timedelta('0 days 00:31:59.549575899')

In [48]:
# Mean S_alter of the rows whose u_ticket is 'Mobile-Ticket'
df.loc[df["u_ticket"] == 'Mobile-Ticket', "S_alter"].mean()

50.8675996778091

In [58]:
# Mean wime_gesamtzuf of the records loaded into `data`
data["wime_gesamtzuf"].mean()

78.8783601916629

In [49]:
# Difference between each topic's mean satisfaction and the overall mean of
# datafull (rounded to 1 decimal).
# df_grouped is a filtered slice, so writing a column into it raises pandas'
# SettingWithCopyWarning; assign() builds a new frame instead and keeps the
# cell safe to re-run.
overall_mean = round(datafull["wime_gesamtzuf"].mean(), 1)
df_grouped = df_grouped.assign(delta=df_grouped['avg_value'] - overall_mean)

## Personas

In [30]:
def _persona_topic_counts(frame, mask, persona):
    """Topic counts for the rows of ``frame`` selected by ``mask``, labelled ``persona``.

    The selected rows get a constant 'Persona' column, are counted per topic
    with compute_categorical_counts, and the 'Outlier' topic is removed.
    """
    subset = frame[mask].reset_index()  # reset_index returns a new frame, safe to extend
    subset['Persona'] = persona
    counts = compute_categorical_counts(subset, categorical_col='Persona', name_col="CustomName")
    return counts[counts["CustomName"] != 'Outlier']


## Christina: 45-64 Jahre, 2. Klasse, S_AB3_HTA == 'ja', weiblich
df_Christina = _persona_topic_counts(
    df,
    (df['S_alter_grouped'] == '45-64 Jahre')
    & (df['u_klassencode'] == '2. Klasse')
    & (df['S_AB3_HTA'] == 'ja')
    & (df['S_sex'] == 'weiblich'),
    'Christina',
)

## Lea: 45-64 Jahre, 2. Klasse, u_fahrausweis == 'GA', weiblich
df_Lea = _persona_topic_counts(
    df,
    (df['S_alter_grouped'] == '45-64 Jahre')
    & (df['u_klassencode'] == '2. Klasse')
    & (df['u_fahrausweis'] == 'GA')
    & (df['S_sex'] == 'weiblich'),
    'Lea',
)

## Andreas: 45-64 Jahre, 1. Klasse, u_fahrausweis == 'Normales Billett', männlich
df_Andreas = _persona_topic_counts(
    df,
    (df['S_alter_grouped'] == '45-64 Jahre')
    & (df['u_klassencode'] == '1. Klasse')
    & (df['u_fahrausweis'] == 'Normales Billett')
    & (df['S_sex'] == 'männlich'),
    'Andreas',
)

## Francesco: 25-44 Jahre, 2. Klasse, S_AB3_HTA == 'ja', männlich
df_Francesco = _persona_topic_counts(
    df,
    (df['S_alter_grouped'] == '25-44 Jahre')
    & (df['u_klassencode'] == '2. Klasse')
    & (df['S_AB3_HTA'] == 'ja')
    & (df['S_sex'] == 'männlich'),
    'Francesco',
)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way (row order and original index labels are kept).
df_persona = pd.concat([df_Christina, df_Lea, df_Andreas, df_Francesco])

In [105]:
# Topic relevance by persona (no path/filename given, so the chart is only displayed).
# The title previously said "Reisedauer", copied from the travel-duration chart.
create_grouped_barchart(
    df_persona,
    x_col="CustomName",
    y_col="relative_count",
    color_col='Persona',
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Persona (2019-2022)",
    xaxis_title='',
    yaxis_title='Häufigkeit',
    legend_title='Gruppe',
    template=template,
)

In [31]:
# Radar variant of the persona comparison.
# The title previously said "Reisedauer", copied from the travel-duration chart.
create_grouped_radar(
    df_persona,
    x_col="CustomName",
    y_col="relative_count",
    color_col='Persona',
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Persona (2019-2022)",
    xaxis_title='',
    yaxis_title='Häufigkeit',
    legend_title='Gruppe',
    template=template,
)