# Topics by Customer Groups

In [1]:
import sys
import os
sys.path.append(os.path.abspath('../')) ## needed to import the function.py file
from functions import *
import pandas as pd
import plotly.express as px
import plotly.io as pio

2023-03-14 15:26:35.248033: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
## Import docs
filelocation = '../../data/DataTextTrain'
data = pd.read_feather(filelocation)
docs = data["Kommentar"].values
timestamps = data.yearquarter.to_list()
# Plain list of the S_sex values in row order (replaces a per-row label lookup).
gender = data["S_sex"].tolist()

# Load the original topic model and fetch the per-document topic assignment
topic_model = BERTopic.load("../../models/BERTTopic_paraphrase-multilingual-MiniLM-L12-v2_merged_oulier_reduced.model")
df_topic = topic_model.get_document_info(docs)


# Create full dataframe with all information.
# join() aligns on the index, so this relies on `data` and `df_topic` sharing the same row index.
df = data.join(df_topic)



#### Add grouped age column #####
# Values that cannot be parsed as numbers become NaN and therefore get no age group.
df['S_alter'] = pd.to_numeric(df['S_alter'], errors='coerce')
# pd.cut uses right-closed intervals by default: (0, 24], (24, 44], (44, 64], (64, 100]
bins = [0, 24, 44, 64, 100]
labels = ["1-24 Jahre", "25-44 Jahre", "45-64 Jahre", "65-100 Jahre"]
df['S_alter_grouped'] = pd.cut(df['S_alter'], bins=bins, labels=labels)

# Create dataframe for viz
df_topics_by_quarter = get_topic_ratios(df, timeframe_col='yearquarter', name_col='CustomName', topic_col='Topic')



###### Convert the two columns to datetime format #####
# Only a time of day is parsed (no date), so departure and arrival end up on the same default date.
df['fg_abfahrt'] = pd.to_datetime(df['fg_abfahrt'], format='%H:%M:%S')
df['fg_ankunft'] = pd.to_datetime(df['fg_ankunft'], format='%H:%M:%S')

# calculate the time difference between the two columns
df['time_diff'] = df['fg_ankunft'] - df['fg_abfahrt']

# An arrival after midnight yields a negative difference because both times share one date.
# Negative values would fall outside every bin below and silently become NaN,
# so they are shifted by one day (e.g. 23:50 -> 00:10 becomes 20 minutes).
# NOTE(review): assumes no trip lasts 24 hours or longer — confirm against the data.
df['time_diff'] = df['time_diff'].mask(
    df['time_diff'] < pd.Timedelta(0),
    df['time_diff'] + pd.Timedelta(days=1),
)

# define the time difference groups
# Intervals are right-closed, so a difference of exactly 0 minutes is not assigned to any group.
time_groups = pd.cut(df['time_diff'], bins=[pd.Timedelta(minutes=0), pd.Timedelta(minutes=15), 
                                             pd.Timedelta(minutes=60), pd.Timedelta(minutes=180), 
                                             pd.Timedelta.max],
                    labels=['bis 15 Minuten', '16 bis 60 Minuten', '60 bis 180 Minuten', 
                            'Mehr als 180 Minuten'])

# add the time difference groups as a new column to the DataFrame
df['time_diff_grouped'] = time_groups

In [3]:
import pandas as pd

def compute_categorical_counts(df, categorical_col, name_col):
    """
    Count rows per (categorical value, name) pair and express each count as a
    share of all rows that carry the same categorical value.

    Parameters:
    -----------
    df : pandas.DataFrame
        The input data containing the columns named by categorical_col and name_col.
    categorical_col : str
        The name of the grouping column (e.g. gender, age group, travel class).
    name_col : str
        The name of the column whose values are counted within each group
        (e.g. 'CustomName').

    Returns:
    --------
    pandas.DataFrame
        One row per (categorical_col, name_col) combination with the columns
        categorical_col, name_col, 'count' and 'relative_count', where
        relative_count = count / number of rows with that categorical value.
        For a categorical-dtype column, unused categories are kept with
        count 0 (and relative_count NaN when the whole category is empty).
    """
    # observed=False is spelled out because the pandas default differs between
    # versions; it keeps one row per category even when a combination has no rows.
    pair_counts = (
        df.groupby([categorical_col, name_col], observed=False)
        .size()
        .reset_index(name='count')
    )

    # Number of rows per categorical value: the denominator of the share.
    group_totals = (
        df.groupby([categorical_col], observed=False)
        .size()
        .reset_index(name='total_count')
    )

    # Attach the group total to every (group, name) row.
    merged = pair_counts.merge(group_totals, on=categorical_col)
    merged['relative_count'] = merged['count'] / merged['total_count']

    return merged[[categorical_col, name_col, 'count', 'relative_count']]

In [48]:
def create_grouped_barchart(df, x_col, y_col, color_col, color_discrete_sequence, ignore_group=None, title='', xaxis_title='', yaxis_title='', legend_title='', template=''):
    """
    Draw and display a grouped bar chart with one bar colour per value of color_col.

    Parameters:
    -----------
    df : pandas.DataFrame
        Data to plot; must contain x_col, y_col and color_col.
    x_col, y_col : str
        Columns shown on the x and y axis.
    color_col : str
        Column whose values define the bar groups (one colour each).
    color_discrete_sequence : list
        Colours handed to plotly express for the groups.
    ignore_group : optional
        A value of color_col whose rows are left out of the chart.
        None (the default) keeps every group.
    title, xaxis_title, yaxis_title, legend_title : str
        Texts applied to the figure layout.
    template : str
        Plotly template passed to px.bar.

    Returns:
    --------
    None
        The figure is displayed with fig.show() and not returned.
    """
    # Compare against None explicitly so that falsy labels such as 0 or ''
    # can be ignored as well (the radar helper behaves the same way).
    if ignore_group is not None:
        df = df[df[color_col] != ignore_group]

    fig = px.bar(
        df,
        x=x_col,
        y=y_col,
        color=color_col,
        hover_data=[color_col, y_col, x_col],
        color_discrete_sequence=color_discrete_sequence,
        template=template,
        barmode='group',
    )

    fig.update_layout(
        width=900,
        height=600,
        title=title,
        yaxis_title=yaxis_title,
        xaxis_title=xaxis_title,
        legend_title=legend_title,
        xaxis_tickangle=270,
        # horizontal legend centred above the plot area
        legend=dict(orientation='h', yanchor='top', y=1.1, xanchor='center', x=0.5),
    )

    # Only horizontal grid lines, drawn in a faint grey.
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='rgba(0,0,0,0.1)')

    fig.show()

In [36]:
import plotly.graph_objects as go

def create_grouped_radar(df, x_col, y_col, color_col, color_discrete_sequence, ignore_group=None, title='', xaxis_title='', yaxis_title='', legend_title='', template=''):
    """
    Draw and display a radar (polar line) chart with one trace per value of color_col.

    Parameters:
    -----------
    df : pandas.DataFrame
        Data to plot; must contain x_col, y_col and color_col.
    x_col : str
        Column used for the angular axis (theta).
    y_col : str
        Column used for the radial axis (r).
    color_col : str
        Column whose distinct values each become one trace.
    color_discrete_sequence : list
        Line colours. A group's colour is chosen by its position among the
        distinct values of color_col; colours repeat when there are more
        groups than colours.
    ignore_group : optional
        A value of color_col that gets no trace. It still occupies its colour slot.
    title, xaxis_title, yaxis_title, legend_title : str
        Texts forwarded to the figure layout.
    template : str
        Plotly template forwarded to the figure layout.

    Returns:
    --------
    None
        The figure is displayed with fig.show() and not returned.
    """
    fig = go.Figure()
    n_colors = len(color_discrete_sequence)

    # unique() is evaluated once; the enumerate position equals the index the
    # value has in that list, which is what selects the colour.
    for position, group in enumerate(df[color_col].unique()):
        # A missing label never compares equal to any row, so it would only
        # produce an empty trace.
        if pd.isna(group):
            continue
        if group != ignore_group:
            subset = df[df[color_col] == group]
            fig.add_trace(go.Scatterpolar(
                r=subset[y_col].values.tolist(),
                theta=subset[x_col].values.tolist(),
                fill='none',
                name=group,
                # wrap around instead of raising IndexError when groups outnumber colours
                line=dict(color=color_discrete_sequence[position % n_colors]),
                showlegend=True,
                marker=dict(size=4)
            ))

    fig.update_layout(
        width=900, 
        height=650,
        polar=dict(
            radialaxis=dict(
                visible=True,
                # radial axis spans 0 .. largest y value of the whole frame
                range=[0, df[y_col].max()],
                showgrid=True,
                gridwidth=1,
                gridcolor='rgba(0,0,0,0.1)'
            ),
            angularaxis=dict(
                visible=True,
                tickmode='linear',
                tickfont=dict(size=10),
                showticklabels=True,
                gridcolor='rgba(0,0,0,0.1)'
            )
        ),
        showlegend=True,
        title=title,
        legend_title=legend_title,
        xaxis_title=xaxis_title,
        yaxis_title=yaxis_title,
        template=template
    )

    fig.show()

## Prepare aggregated DataFrames for visualization

In [146]:
def _topic_shares(categorical_col, keep_groups=None):
    """
    Topic counts and shares of the global `df` per value of categorical_col,
    without the 'Outlier' topic.

    If keep_groups is given, only rows whose categorical value is in that list
    are kept and the column becomes a categorical with exactly those categories
    (in the given order).
    """
    shares = compute_categorical_counts(df, categorical_col=categorical_col, name_col="CustomName")
    # .copy() gives an independent frame, so columns can be reassigned afterwards.
    shares = shares[shares["CustomName"] != 'Outlier'].copy()
    if keep_groups is not None:
        shares = shares[shares[categorical_col].isin(keep_groups)].copy()
        shares[categorical_col] = shares[categorical_col].astype('category').cat.set_categories(keep_groups)
    return shares


df_gender = _topic_shares("S_sex")
# Remove 'divers' from the categories as well, not only from the values
# (rows that carried it end up with a missing S_sex).
df_gender['S_sex'] = df_gender['S_sex'].cat.remove_categories(['divers'])

df_agegroup = _topic_shares("S_alter_grouped")

df_u_klassencode = _topic_shares("u_klassencode")

df_R_zweck = _topic_shares("R_zweck")
# Remove 'Sonstige' from the categories as well, not only from the values.
df_R_zweck['R_zweck'] = df_R_zweck['R_zweck'].cat.remove_categories(['Sonstige'])

df_reisezeit = _topic_shares("time_diff_grouped")

df_season = _topic_shares("season")

df_wohnsitz = _topic_shares("S_wohnsitz")

# The two frames below previously kept the 'Outlier' topic, unlike every other
# frame in this cell; they now go through the same filter.
df_ft_tu = _topic_shares("ft_tu", keep_groups=['SBB', 'BLS', 'SOB'])


df_ft_vm_kurz = _topic_shares('ft_vm_kurz', keep_groups=['IC', 'IR', 'S','RE','EC'])


## Plot Charts

In [49]:
# Grouped bar chart: topic share by gender. The helper renders the figure itself.
barchart_gender = create_grouped_barchart(
    df_gender,
    x_col="CustomName",
    y_col="relative_count",
    color_col="S_sex",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Geschlecht (2019-2022)",
    xaxis_title='',
    yaxis_title='Häufigkeit',
    legend_title='',
    template=template,
)

In [37]:
# Radar chart: topic share by gender.
create_grouped_radar(
    df_gender,
    x_col="CustomName",
    y_col="relative_count",
    color_col="S_sex",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Geschlecht (2019-2022)",
    xaxis_title='',
    yaxis_title='Häufigkeit',
    legend_title='',
    template=template,
)

In [50]:
# Grouped bar chart: topic share by age group. The helper renders the figure itself.
barchart_agegroup = create_grouped_barchart(
    df_agegroup,
    x_col="CustomName",
    y_col="relative_count",
    color_col="S_alter_grouped",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Altersgruppe (2019-2022)",
    xaxis_title='',
    yaxis_title='Häufigkeit',
    legend_title='',
    template=template,
)

In [51]:
# Radar chart: topic share by age group (uses the mixed colour sequence).
create_grouped_radar(
    df_agegroup,
    x_col="CustomName",
    y_col="relative_count",
    color_col="S_alter_grouped",
    color_discrete_sequence=color_discrete_sequence_mixed,
    title="Themenrelevanz nach Altersgruppe (2019-2022)",
    xaxis_title='',
    yaxis_title='Häufigkeit',
    legend_title='',
    template=template,
)

In [58]:
# Grouped bar chart: topic share by travel class. The helper renders the figure itself.
barchart_klasse = create_grouped_barchart(
    df_u_klassencode,
    x_col="CustomName",
    y_col="relative_count",
    color_col="u_klassencode",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Klasse (2019-2022)",
    xaxis_title='',
    yaxis_title='Häufigkeit',
    legend_title='',
    template=template,
)

In [23]:
# Radar chart: topic share by travel class.
create_grouped_radar(
    df_u_klassencode,
    x_col="CustomName",
    y_col="relative_count",
    color_col="u_klassencode",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Klasse (2019-2022)",
    xaxis_title='',
    yaxis_title='Häufigkeit',
    legend_title='',
    template=template,
)

In [57]:
# Grouped bar chart: topic share by travel purpose. The helper renders the figure itself.
barchart_Reisezweck = create_grouped_barchart(
    df_R_zweck,
    x_col="CustomName",
    y_col="relative_count",
    color_col="R_zweck",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Reisezweck (2019-2022)",
    xaxis_title='',
    yaxis_title='Häufigkeit',
    legend_title='',
    template=template,
)

In [55]:
# Radar chart: topic share by travel purpose.
create_grouped_radar(
    df_R_zweck,
    x_col="CustomName",
    y_col="relative_count",
    color_col="R_zweck",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Reisezweck (2019-2022)",
    xaxis_title='',
    yaxis_title='Häufigkeit',
    legend_title='',
    template=template,
)

In [54]:
# Grouped bar chart: topic share by travel duration. The helper renders the figure itself.
# The legend lists the duration groups (color_col), not the topics, so it is
# titled 'Gruppe' like the matching radar chart instead of 'Thema'.
barchart_reisezeit = create_grouped_barchart(
    df_reisezeit,
    x_col="CustomName",
    y_col="relative_count",
    color_col="time_diff_grouped",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Reisedauer (2019-2022)",
    xaxis_title='',
    yaxis_title='Häufigkeit',
    legend_title='Gruppe',
    template=template,
)

In [16]:
# Radar chart: topic share by travel duration.
create_grouped_radar(
    df_reisezeit,
    x_col="CustomName",
    y_col="relative_count",
    color_col="time_diff_grouped",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Reisedauer (2019-2022)",
    xaxis_title='',
    yaxis_title='Häufigkeit',
    legend_title='Gruppe',
    template=template,
)

In [59]:
# Grouped bar chart: topic share by place of residence.
# The legend lists the residence groups (color_col), not the topics, so it is
# titled 'Gruppe' like the matching radar chart instead of 'Thema'.
create_grouped_barchart(
    df_wohnsitz,
    x_col="CustomName",
    y_col="relative_count",
    color_col="S_wohnsitz",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Wohnsitz (2019-2022)",
    xaxis_title='',
    yaxis_title='Häufigkeit',
    legend_title='Gruppe',
    template=template,
)

In [60]:
# Radar chart: topic share by place of residence.
# The title previously read "Reisedauer" (copied from the travel-duration chart)
# although the data is grouped by S_wohnsitz.
create_grouped_radar(
    df_wohnsitz,
    x_col="CustomName",
    y_col="relative_count",
    color_col="S_wohnsitz",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Wohnsitz (2019-2022)",
    xaxis_title='',
    yaxis_title='Häufigkeit',
    legend_title='Gruppe',
    template=template,
)

In [148]:
# Grouped bar chart: topic share by transport company. The helper renders the figure itself.
# NOTE(review): the variable name looks like a typo for "barchart_ft_tu"; it is
# left unchanged in case other cells reference it.
archart_ft_tu = create_grouped_barchart(
    df_ft_tu,
    x_col="CustomName",
    y_col="relative_count",
    color_col="ft_tu",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Transportunternehmen (2019-2022)",
    xaxis_title='',
    yaxis_title='Häufigkeit',
    legend_title='',
    template=template,
)

In [150]:
# Grouped bar chart: topic share by train category. The helper renders the figure itself.
# NOTE(review): the variable name looks like a typo for "barchart_ft_vm_kurz"; it is
# left unchanged in case other cells reference it.
archart_ft_vm_Kurz = create_grouped_barchart(
    df_ft_vm_kurz,
    x_col="CustomName",
    y_col="relative_count",
    color_col="ft_vm_kurz",
    color_discrete_sequence=color_discrete_sequence,
    title="Themenrelevanz nach Zugkategorie (2019-2022)",
    xaxis_title='',
    yaxis_title='Häufigkeit',
    legend_title='',
    template=template,
)

## Topic and Satisfaction

In [20]:
# Per topic: mean of wime_gesamtzuf rounded to one decimal and the number of
# non-null ratings, sorted ascending by the mean; the 'Outlier' topic is dropped.
df_grouped = (
    df.groupby('CustomName')['wime_gesamtzuf']
    .agg(avg_value='mean', count='count')
    .round({'avg_value': 1})
    .reset_index()
    .sort_values(by='avg_value')
)
df_grouped = df_grouped.loc[df_grouped['CustomName'].ne('Outlier')]

In [21]:
# Overall satisfaction across all data, with and without a comment
filelocation = '../../data/DataClean'
datafull = pd.read_feather(filelocation)
datafull['wime_gesamtzuf'].mean()

84.59906038864929

In [22]:
# Mean overall satisfaction in `data`, the frame read from DataTextTrain
data['wime_gesamtzuf'].mean()

78.8783601916629

In [26]:
# Gap between each topic's mean and the overall mean of datafull (rounded to one decimal)
df_grouped['delta'] = df_grouped['avg_value'].sub(datafull['wime_gesamtzuf'].mean().round(1))

In [33]:
# Display the per-topic satisfaction table, including the delta column.
df_grouped

Unnamed: 0,CustomName,avg_value,count,delta
10,Platzangebot Zug,72.6,4442,-12.0
3,Feedback Umfrage,74.9,717,-9.7
0,Ausstattung Qualität Rollmaterial,76.5,1080,-8.1
11,Preis-Leistung,76.6,3418,-8.0
12,Pünktlichkeit,76.9,6972,-7.7
14,Sauberkeit Zug und Bahnhof,77.4,1249,-7.2
13,Raumtemperatur Zug,77.6,1292,-7.0
1,Corona,77.8,1579,-6.8
6,Kundeninformation,78.1,993,-6.5
2,Fahrplanangebot,79.5,2206,-5.1
