# EDA

In this EDA, the study dataset are examined in more detail. The goal is to understand the data and the structure of it better.


In [33]:
import pandas as pd
import os
import plotly.express as px
import re
import numpy as np
import plotly.graph_objs as go
from IPython.display import display
from IPython.display import Markdown as md
import plotly.io as pio
from pandas_profiling import ProfileReport
from globalvars import *
from functions import *
import kaleido

### Load Data

In [2]:
## Import dataframe
filelocation = 'data/DataClean'
df = pd.read_feather(filelocation)

## load config file
config = pd.read_excel('config/config.xlsx',sheet_name='fragecodes')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230892 entries, 0 to 230891
Data columns (total 42 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   participant_id       230892 non-null  int64         
 1   u_date               230892 non-null  datetime64[ns]
 2   Kommentar            62303 non-null   object        
 3   wime_personal        79308 non-null   float64       
 4   wime_komfort         179566 non-null  float64       
 5   wime_sauberkeit      182781 non-null  float64       
 6   wime_puenktlich      183408 non-null  float64       
 7   wime_platzangebot    184204 non-null  float64       
 8   wime_gesamtzuf       191561 non-null  float64       
 9   wime_preis_leistung  215356 non-null  float64       
 10  wime_fahrplan        222515 non-null  float64       
 11  wime_oes_fahrt       175410 non-null  float64       
 12  S_sprache            230892 non-null  category      
 13  S_alter       

## EDA

Possiblity to filter on a specific timerange. The filter is applied the whole EDA.

In [7]:
startdate = '2019-01-01'
enddate = '2022-10-31'

# Apply time selection for all DataFrames
df= df[(df['u_date'] > startdate) & (df['u_date'] < enddate)]

### Overview

#### General DataFrame Overview

In [8]:
df = df.sort_values("u_date",ascending=False)

In [9]:
df.head(5)

Unnamed: 0,participant_id,u_date,Kommentar,wime_personal,wime_komfort,wime_sauberkeit,wime_puenktlich,wime_platzangebot,wime_gesamtzuf,wime_preis_leistung,...,ft_vm_kurz,ft_zielort_uic,fg_abfahrt,fg_ankunft,fg_startort_uic,fg_zielort_uic,fg_startort,fg_zielort,ft_startort,ft_zielort
226335,583619,2022-10-30,,,25.0,75.0,75.0,100.0,100.0,100.0,...,IC,8500113.0,2022-11-22 16:49:00,2022-11-22 18:10:00,8504300.0,8500117.0,Biel/Bienne,Aesch BL,Biel/Bienne,Laufen
228498,586617,2022-10-30,,100.0,75.0,75.0,75.0,75.0,100.0,100.0,...,IC,8507483.0,2022-11-22 07:56:00,2022-11-22 10:02:00,8500010.0,8507483.0,Basel SBB,Spiez,Basel SBB,Spiez
228374,586445,2022-10-30,,75.0,100.0,100.0,100.0,100.0,100.0,100.0,...,IR,8505113.0,2022-11-22 15:28:00,2022-11-22 17:27:00,8503102.0,8505892.0,Erlenbach ZH,"Altdorf UR, Telldenkmal",Zürich HB,Altdorf UR
228373,586444,2022-10-30,,,100.0,100.0,75.0,100.0,100.0,100.0,...,IR,8505000.0,2022-11-22 15:11:00,2022-11-22 17:30:00,8502007.0,8505300.0,Sursee,Lugano,Sursee,Luzern
226237,583484,2022-10-30,,,,,,,,100.0,...,,,NaT,NaT,,,Solothurn,Olten,Solothurn,Olten


In [10]:
df.shape

(226537, 42)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 226537 entries, 226335 to 913
Data columns (total 42 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   participant_id       226537 non-null  int64         
 1   u_date               226537 non-null  datetime64[ns]
 2   Kommentar            61160 non-null   object        
 3   wime_personal        77463 non-null   float64       
 4   wime_komfort         176142 non-null  float64       
 5   wime_sauberkeit      179305 non-null  float64       
 6   wime_puenktlich      179916 non-null  float64       
 7   wime_platzangebot    180701 non-null  float64       
 8   wime_gesamtzuf       188063 non-null  float64       
 9   wime_preis_leistung  211236 non-null  float64       
 10  wime_fahrplan        218347 non-null  float64       
 11  wime_oes_fahrt       172092 non-null  float64       
 12  S_sprache            226537 non-null  category      
 13  S_alter     

In [12]:
## Add additional date columns for easier vizualization
df.insert(loc=2, column = 'year', value=df['u_date'].dt.year) #create additional year col for viz
df.insert(loc=3, column = 'month', value=df['u_date'].dt.month) #create additional month col for viz
df['yearmonth'] = pd.to_datetime(df[['year', 'month']].assign(DAY=1)) #create additional yearmonth col for viz

### TextData

In [13]:

df_text = df.dropna(subset=["Kommentar"])

#### Number of comments per month

In [14]:
df_text_counts = pd.DataFrame(df_text.groupby(['yearmonth'])['participant_id'].count()).reset_index()
df_participantstotals_counts = pd.DataFrame(df.groupby(['yearmonth'])['participant_id'].count()).reset_index()

In [15]:
# Merge Dataframes
df_counts_month = df_text_counts.merge(df_participantstotals_counts, on='yearmonth', how='left')
df_counts_month.rename(columns = {'participant_id_x':'Anzahl Kommentare','participant_id_y':'Anzahl Befragte'}, inplace = True)
df_counts_month['Kommentar Ratio in %'] = (df_counts_month['Anzahl Kommentare']/df_counts_month['Anzahl Befragte'])*100

In [16]:
## plot settings
color_discrete_sequence = color_discrete_sequence #imported from globalvars.py
template = template ##imported from globalvars.py

In [34]:
fig = px.line(df_counts_month,
              x="yearmonth",
              y=['Anzahl Kommentare', 'Anzahl Befragte'],
              color_discrete_sequence=color_discrete_sequence,
              template=template)

fig.update_layout(
    title="Anzahl Befragte und Anzahl Kommentare im Vergleich",
    yaxis_title='Anzahl',
    xaxis_title='Monat',
    height=300,
    legend_title="")

fig.update_traces(line_width=2)
fig.update_traces(patch={"line": {"dash": 'dot'}}, selector={"legendgroup": "Anzahl Befragte"}) 

fig.show()

pio.write_image(fig,"images/fig-anteil_befragte_und_anzahl_kommentare.svg") #save fig as png

ValueError: Failed to start Kaleido subprocess. Error stream:

/Users/dominik/Documents/Masterarbeit Code/nlp-satisfaction/venv/lib/python3.9/site-packages/kaleido/executable/kaleido: line 4: cd: /Users/dominik/Documents/Masterarbeit: No such file or directory
/Users/dominik/Documents/Masterarbeit Code/nlp-satisfaction/venv/lib/python3.9/site-packages/kaleido/executable/kaleido: line 5: ./bin/kaleido: No such file or directory


In [32]:
fig = px.line(df_counts_month,
              x="yearmonth",
              y='Kommentar Ratio in %',
              color_discrete_sequence=color_discrete_sequence,
              template=template)
fig.update_layout(
    title="Anteil Befragte mit Kommentar",
    yaxis_title='Anteil in %',
    xaxis_title='Monat',
    height=300
    )
fig.update_traces(line_width=2)
#fig.update_yaxes(range=(0,40))

# show legend for singel line plots
fig['data'][0]['showlegend'] = True
fig['data'][0]['name'] = 'Anteil Kommentare in %'

fig.show()

pio.write_image(fig,"images/fig-anteil-befragte-mit-kommentar.svg") #save fig as png

ValueError: Failed to start Kaleido subprocess. Error stream:

/Users/dominik/Documents/Masterarbeit Code/nlp-satisfaction/venv/lib/python3.9/site-packages/kaleido/executable/kaleido: line 4: cd: /Users/dominik/Documents/Masterarbeit: No such file or directory
/Users/dominik/Documents/Masterarbeit Code/nlp-satisfaction/venv/lib/python3.9/site-packages/kaleido/executable/kaleido: line 5: ./bin/kaleido: No such file or directory


In [17]:
df_counts_month.describe()

Unnamed: 0,Anzahl Kommentare,Anzahl Befragte,Kommentar Ratio in %
count,45.0,45.0,45.0
mean,1359.111111,5034.155556,26.800624
std,524.873501,1801.099903,2.518035
min,125.0,456.0,22.506454
25%,990.0,3952.0,24.677835
50%,1325.0,5097.0,26.476602
75%,1703.0,5875.0,28.435465
max,2698.0,9898.0,31.907954


#### Minimal, maximal and average length of Kommentar

In [18]:
## Add basic text features
df_text["Kommentar"] = remove_redundant_whitespaces(df_text["Kommentar"]) #note: imported function "remove_redundant_whitespaces"
df_text = add_basic_textfeatures(df_text,"Kommentar")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [19]:
df_text_average_tokens_median_per_month = pd.DataFrame(df_text.groupby(['yearmonth'])['Kommentar_Tokens'].median()).reset_index()
df_text_average_tokens_mean_per_month = pd.DataFrame(df_text.groupby(['yearmonth'])['Kommentar_Tokens'].mean()).reset_index()

In [20]:
# Merge Dataframes
df_text_tokens_counts_month = df_text_average_tokens_median_per_month.merge(df_text_average_tokens_mean_per_month, on='yearmonth', how='left')
df_text_tokens_counts_month.rename(columns = {'Kommentar_Tokens_x':'Median Tokens','Kommentar_Tokens_y':'Mittelwert Tokens'}, inplace = True)

In [19]:
fig = px.line(df_text_tokens_counts_month,
              x="yearmonth",
              y=['Median Tokens','Mittelwert Tokens'],
              color_discrete_sequence=color_discrete_sequence,
              template=template)
fig.update_layout(
    title="Freitextkommentare: Token-Median & Token-Mittelwert in Vergleich",
    yaxis_title='Anzahl',
    xaxis_title='Monat',
    height=300,
    legend_title="")
fig.update_traces(line_width=2)
fig.update_yaxes(range=(0,35))


fig.show()

pio.write_image(fig,"images/fig-token-median-mittelwert.svg") #save fig as png

NameError: name 'df_text_tokens_counts_month' is not defined

In [22]:
fig = px.histogram(
    df_text,
    x="Kommentar_Tokens",
    range_x = [0,150],
    title="Freitextkommentare: Häufigkeitsverteilung Tokens",
    #color = "year",
    labels={'Kommentar_Tokens':'Anzahl Tokens'},
    color_discrete_sequence=color_discrete_sequence,
    height=300,
    template=template)
    
fig.update_layout(yaxis_title="Anzahl Kommentare")

# Overlay both histograms
#fig.update_layout(barmode='overlay')

# Reduce opacity to see both histograms
#fig.update_traces(opacity=0.5)

fig.show()

pio.write_image(fig,"images/fig-token-hist.svg") #save fig as png

In [23]:
df_text.describe()

Unnamed: 0,participant_id,year,month,wime_personal,wime_komfort,wime_sauberkeit,wime_puenktlich,wime_platzangebot,wime_gesamtzuf,wime_preis_leistung,wime_fahrplan,S_alter,u_preis,Kommentar_Character,Kommentar_Tokens,Kommentar_Types,Kommentar_TTR
count,61160.0,61160.0,61160.0,22619.0,48970.0,49734.0,49907.0,50202.0,52247.0,57243.0,60302.0,61008.0,53669.0,61160.0,61160.0,61160.0,61083.0
mean,360541.658192,2020.352322,6.526897,88.179186,74.256234,76.602916,86.984513,75.49915,78.730996,67.822606,78.240587,51.91788,43.719585,173.376619,26.893035,23.980592,94.696425
std,175914.277284,1.18028,3.296238,19.900731,25.702345,23.184245,24.39282,30.271958,23.354511,29.247308,27.671276,15.583001,276.397018,181.91099,28.603849,22.566264,6.905733
min,41305.0,2019.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,33.333333
25%,120252.75,2019.0,4.0,77.777778,55.555556,66.666667,77.777778,55.555556,75.0,50.0,66.666667,41.0,8.8,58.0,9.0,8.0,90.697674
50%,406374.5,2020.0,7.0,100.0,75.0,77.777778,100.0,88.888889,77.777778,75.0,88.888889,54.0,17.0,119.0,18.0,17.0,100.0
75%,501831.25,2021.0,9.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,64.0,30.6,219.0,34.0,32.0,100.0
max,587992.0,2022.0,12.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,98.0,6300.0,2047.0,347.0,230.0,100.0


- Median of 122 Characters per Comment (140 original Twitter length)
- Median of 19 Tokens per comment.
- Median TTR of 98.

In [24]:
# Pandas Profiling TextData
profile = ProfileReport(
    df_text[
        [
            #"participant_id",
            #"u_date",
            "Kommentar",
            "Kommentar_Character",
            "Kommentar_TTR",
            "Kommentar_Tokens",
            "Kommentar_Types",
        ]
    ],
    title="TextData",
    lazy=False,
    dark_mode=True,
)

profile.to_file("html/ProfilingTextData.html")



iteritems is deprecated and will be removed in a future version. Use .items instead.



Summarize dataset: 100%|██████████| 46/46 [00:16<00:00,  2.87it/s, Completed]                                       
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.05s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.02it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 267.60it/s]


### Satisfaction Data

In [51]:
satisfaction_codes = config[config['vartype'] == 'satisfaction']['fragecode']

In [52]:
# Pandas Profiling TextData
profile = ProfileReport(
    df[
        satisfaction_codes
    ],
    title="SatisfactionData",
    lazy=False,
    dark_mode=True,
)

profile.to_file("html/ProfilingSatisfactionData.html")



iteritems is deprecated and will be removed in a future version. Use .items instead.


overflow encountered in long_scalars

Summarize dataset: 100%|██████████| 126/126 [00:31<00:00,  3.97it/s, Completed]                                      
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.68s/it]
Render HTML: 100%|██████████| 1/1 [00:03<00:00,  3.14s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 83.82it/s]


In [53]:
df_satisfaction = pd.melt(df, id_vars='yearmonth', value_vars=satisfaction_codes) #wide to long and only keep satisfaction questions
df_satisfaction = df_satisfaction.groupby(['yearmonth', 'variable'])[['value']].mean().reset_index() #groupy monthyear and variable

In [59]:
fig = px.line(
    df_satisfaction,
    x="yearmonth",
    y="value",
    color="variable",
    color_discrete_sequence=color_discrete_sequence,
    template=template,
)

fig.update_layout(
    title="Zufriedenheit mit verschiedenen Aspekten im zeitlichen Verlauf",
    yaxis_title="",
    xaxis_title="",
    height=300,
    legend_title="",
)

fig.update_traces(line_width=2)
fig.update_yaxes(range=(65, 100))
fig.add_vrect(
    x0="2020-03-16",
    x1="2022-03-31",
    line_width=0,
    fillcolor="#0B1F26",
    opacity=0.05,
    annotation_text="Covid",
    annotation_position="top left",
    annotation_font_size=10,
    annotation_font_color="#0B1F26",
)

line_width = 2

fig.show()

pio.write_image(fig, "images/fig-satisfaction_zeitreihe.svg")  # save fig as png


### MetaData

In [63]:
# Pandas Profiling TextData
profile = ProfileReport(
    df[
        [
            "participant_id",
            "u_date",
            "S_alter",
            "S_sex",
            "S_wohnsitz",
            "u_klassencode",
            "u_ga",
            "S_AB3_HTA",
            "R_anschluss",
            "R_stoerung",
            "device_type",
            "dispcode",
            "u_ticket",
            "u_fahrausweis",
            "u_preis",
            "R_zweck",
            "ft_abfahrt",
            "ft_ankunft",
            "ft_startort_uic",
            "ft_tu",
            "ft_vm",
            "ft_vm_kurz",
            "ft_zielort_uic",
            "fg_abfahrt",
            "fg_ankunft",
            "fg_startort_uic",
            "fg_zielort_uic",
            "fg_startort",
            "fg_zielort",
            "ft_startort",
            "ft_zielort",
        ]
    ],
    title="MetaData",
    lazy=False,
    dark_mode=True,
)

profile.to_file("html/ProfilingMetaData.html")



iteritems is deprecated and will be removed in a future version. Use .items instead.


The input array could not be properly checked for nan values. nan values will be ignored.


The input array could not be properly checked for nan values. nan values will be ignored.


The input array could not be properly checked for nan values. nan values will be ignored.


The input array could not be properly checked for nan values. nan values will be ignored.


The input array could not be properly checked for nan values. nan values will be ignored.


The input array could not be properly checked for nan values. nan values will be ignored.


The input array could not be properly checked for nan values. nan values will be ignored.


There was an attempt to calculate the auto correlation, but this failed.
(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/pandas-profiling/issue