In [1]:
import os
import csv
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, HTML
from ipywidgets import interact
import time
import dask.dataframe as dd
import numpy as np
import folium
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker

import


In [73]:
# Function to get the data from the csv files and return a dataframe
def csv_to_df(filepath):
    try:
        return pd.read_csv(filepath, sep=",")
    except pd.errors.ParserError:
        return pd.read_csv(filepath, sep=";")


passagierfrequenz_df = csv_to_df("../raw_data/passagierfrequenz.csv")
haltestelle_df = csv_to_df("../raw_data/haltestelle.csv")
haltepunkt_df = csv_to_df("../raw_data/haltepunkt.csv")

In [5]:
fahrzeiten_filenames = [
    filename
    for filename in os.listdir("../raw_data/fahrzeiten/")
    if filename.endswith(".csv")
]
fahrzeiten_head = pd.read_csv(
    "../raw_data/fahrzeiten/" + fahrzeiten_filenames[0], nrows=0
).columns


def get_fahrzeiten_dask_df(filename, columns=None):
    if columns is None:
        columns = fahrzeiten_head
    df = dd.read_csv("../raw_data/fahrzeiten/" + filename, usecols=columns)
    return df.compute()


fahrzeiten_dfs = {
    filename: get_fahrzeiten_dask_df(filename) for filename in fahrzeiten_filenames
}

In [6]:
# Create widgets
output = widgets.Output()
dropdown = widgets.Dropdown(
    options=fahrzeiten_filenames, layout=widgets.Layout(width="auto")
)
next_button = widgets.Button(description="Next")
prev_button = widgets.Button(description="Previous")
head_button = widgets.Button(description="Head")
tail_button = widgets.Button(description="Tail")
output = widgets.Output(layout=widgets.Layout(overflow_x="scroll"))

# Initialize index
index = [0]


# Event handlers for button clicks
def on_next_button_clicked(b):
    index[0] = (index[0] + 1) % len(fahrzeiten_filenames)
    dropdown.value = fahrzeiten_filenames[index[0]]


def on_prev_button_clicked(b):
    index[0] = (index[0] - 1) % len(fahrzeiten_filenames)
    dropdown.value = fahrzeiten_filenames[index[0]]


def on_head_button_clicked(b):
    with output:
        output.clear_output()
        display(fahrzeiten_dfs[dropdown.value].head())


def on_tail_button_clicked(b):
    with output:
        output.clear_output()
        display(fahrzeiten_dfs[dropdown.value].tail())


# Attach event handlers to buttons
next_button.on_click(on_next_button_clicked)
prev_button.on_click(on_prev_button_clicked)
head_button.on_click(on_head_button_clicked)
tail_button.on_click(on_tail_button_clicked)

title = widgets.HTML('<h2 style="text-align: center;">Fahrzeiten: SOLL und IST</h2>')
box_layout = widgets.Layout(display="flex", justify_content="center")
display(
    widgets.VBox(
        [
            title,
            widgets.HBox([dropdown], layout=box_layout),
            widgets.HBox(
                [head_button, prev_button, next_button, tail_button], layout=box_layout
            ),
            widgets.HBox([output], layout=box_layout),
        ],
        layout=box_layout,
    )
)

VBox(children=(HTML(value='<h2 style="text-align: center;">Fahrzeiten: SOLL und IST</h2>'), HBox(children=(Dro…

In [7]:
# Create a dictionary of your DataFrames
dfs = {
    "Haltepunkte_df": haltepunkt_df,
    "Haltestelle_df": haltestelle_df,
    "Passagierfrequenz_df": passagierfrequenz_df,
}
# Create widgets
output = widgets.Output()
dropdown = widgets.Dropdown(options=list(dfs.keys()))
head_button = widgets.Button(description="Head")
tail_button = widgets.Button(description="Tail")
describe_button = widgets.Button(description="Describe")
shape_button = widgets.Button(description="Shape")
isnull_button = widgets.Button(description="IsNull")
dtype_button = widgets.Button(description="Dtype")
output = widgets.Output()


# Event handlers for button clicks
def on_head_button_clicked(b):
    with output:
        output.clear_output()
        display(dfs[dropdown.value].head())


def on_tail_button_clicked(b):
    with output:
        output.clear_output()
        display(dfs[dropdown.value].tail())


def on_describe_button_clicked(b):
    with output:
        output.clear_output()
        display(dfs[dropdown.value].describe())


def on_shape_button_clicked(b):
    with output:
        output.clear_output()
        print(dfs[dropdown.value].shape)


def on_isnull_button_clicked(b):
    with output:
        output.clear_output()
        display(dfs[dropdown.value].isnull().sum())


def on_info_button_clicked(b):
    with output:
        output.clear_output()
        display(dfs[dropdown.value].info())


def on_dtype_button_clicked(b):
    with output:
        output.clear_output()
        display(dfs[dropdown.value].dtypes)


# Attach event handlers to buttons
head_button.on_click(on_head_button_clicked)
tail_button.on_click(on_tail_button_clicked)
describe_button.on_click(on_describe_button_clicked)
shape_button.on_click(on_shape_button_clicked)
isnull_button.on_click(on_isnull_button_clicked)


title = widgets.HTML('<h2 style="text-align: center;">Other CSVs</h2>')
box_layout = widgets.Layout(display="flex", justify_content="center")
display(
    widgets.VBox(
        [
            title,
            widgets.HBox([dropdown], layout=box_layout),
            widgets.HBox(
                [
                    head_button,
                    tail_button,
                    describe_button,
                    shape_button,
                    isnull_button,
                    dtype_button,
                ],
                layout=box_layout,
            ),
            widgets.HBox([output], layout=box_layout),
        ],
        layout=box_layout,
    )
)

VBox(children=(HTML(value='<h2 style="text-align: center;">Other CSVs</h2>'), HBox(children=(Dropdown(options=…

In [11]:
display(haltestelle_df.dtypes)
display(haltepunkt_df.dtypes)
display(passagierfrequenz_df.dtypes)

halt_id            int64
halt_diva          int64
halt_kurz         object
halt_lang         object
halt_ist_aktiv      bool
dtype: object

halt_punkt_id             int64
halt_punkt_diva           int64
halt_id                   int64
GPS_Latitude            float64
GPS_Longitude           float64
GPS_Bearing             float64
halt_punkt_ist_aktiv       bool
dtype: object

code_codice               object
uic                      float64
bahnhof_gare_stazione     object
kt_ct_cantone             object
isb_gi                    object
jahr_annee_anno          float64
dtv_tjm_tgm              float64
dwv_tmjo_tfm             float64
dnwv_tmjno_tmgnl         float64
evu_ef_itf                object
bemerkungen               object
remarques                 object
note                      object
remarks                   object
geopos                    object
lod                       object
dtype: object

In [12]:
bemerkungen_df = passagierfrequenz_df["bemerkungen"].dropna()
remarks_df = passagierfrequenz_df["remarques"].dropna()
note_df = passagierfrequenz_df["note"].dropna()

display(bemerkungen_df)
display(remarks_df)
display(note_df)

2                                               Ohne TPC.
3                                               Ohne TPC.
6           Weniger als 50 Ein- und Aussteigende pro Tag.
18      Weniger als 50 Ein- und Aussteigende pro Nicht...
19          Weniger als 50 Ein- und Aussteigende pro Tag.
                              ...                        
2289                           Inklusive Bahnverkehr MVR.
2294        Weniger als 50 Ein- und Aussteigende pro Tag.
2296        Weniger als 50 Ein- und Aussteigende pro Tag.
2302                                            Ohne AVA.
2304    Durchschnittswert 2018 durch Streckensperrung ...
Name: bemerkungen, Length: 387, dtype: object

2                                               Sans TPC.
3                                               Sans TPC.
6                         Moins de 50 personnes par jour.
18              Moins de 50 personnes par jour non ouvré.
19                        Moins de 50 personnes par jour.
                              ...                        
2289                                   Trains MVR inclus.
2294                      Moins de 50 personnes par jour.
2296                      Moins de 50 personnes par jour.
2302                                            Sans AVA.
2304    Valeur moyenne 2018 à la baisse en raison de l...
Name: remarques, Length: 387, dtype: object

2                                              Senza TPC.
3                                              Senza TPC.
6                           Meno di 50 persone al giorno.
18           Meno di 50 persone al giorno non lavorativo.
19                          Meno di 50 persone al giorno.
                              ...                        
2289                                     Incl. treni MVR.
2294                        Meno di 50 persone al giorno.
2296                        Meno di 50 persone al giorno.
2302                                           Senza AVA.
2304    Valore medio 2018 compromesso dallo sbarrament...
Name: note, Length: 387, dtype: object

In [13]:
# Create widgets
output2 = widgets.Output()
dropdown2 = widgets.Dropdown(options=fahrzeiten_filenames)
describe_button = widgets.Button(description="Describe")
shape_button = widgets.Button(description="Shape")
isnull_button = widgets.Button(description="Is Null")
dtype_button = widgets.Button(description="Dtype")


# Event handlers for button clicks
def on_describe_button_clicked(b):
    with output2:
        output2.clear_output()
        display(fahrzeiten_dfs[dropdown2.value].describe())


def on_shape_button_clicked(b):
    with output2:
        output2.clear_output()
        print(fahrzeiten_dfs[dropdown2.value].shape)


def on_isnull_button_clicked(b):
    with output2:
        output2.clear_output()
        display(fahrzeiten_dfs[dropdown2.value].isnull().sum())


def on_dtype_button_clicked(b):
    with output2:
        output2.clear_output()
        if dropdown2.value in fahrzeiten_dfs:
            if not fahrzeiten_dfs[dropdown2.value].empty:
                display(fahrzeiten_dfs[dropdown2.value].dtypes)
            else:
                print("DataFrame is empty.")
        else:
            print("Key not found in dictionary.")


# Attach event handlers to buttons
describe_button.on_click(on_describe_button_clicked)
shape_button.on_click(on_shape_button_clicked)
isnull_button.on_click(on_isnull_button_clicked)

# Display widgets
title = widgets.HTML('<h2 style="text-align: center;">Fahrzeiten: SOLL und IST</h2>')
box_layout = widgets.Layout(display="flex", justify_content="center")
display(
    widgets.VBox(
        [
            title,
            widgets.HBox([dropdown2], layout=box_layout),
            widgets.HBox(
                [describe_button, shape_button, isnull_button, dtype_button],
                layout=box_layout,
            ),
            widgets.HBox([output2], layout=box_layout),
        ],
        layout=box_layout,
    )
)

VBox(children=(HTML(value='<h2 style="text-align: center;">Fahrzeiten: SOLL und IST</h2>'), HBox(children=(Dro…

In [16]:
bahnhofe = passagierfrequenz_df["bahnhof_gare_stazione"].unique()

print(f"The unique 'bahnhof_gare_stazione' in the DataFrame are: {list(bahnhofe)}")

The unique 'bahnhof_gare_stazione' in the DataFrame are: ['Aathal', 'Aarburg-Oftringen', 'Aigle', 'Altstätten SG', 'Altendorf', 'Alvaneu', 'Arnegg', 'Arosa', 'Bäch', 'Boudry', 'Bernina Diavolezza', 'Bellinzona', 'Bellach', 'Belfaux CFF', 'Brügg BE', 'Biasca', 'Biel/Bienne Bözingenfeld/Champ', 'Bigenthal', 'Biberegg', 'Biel Mett', 'Birr', 'Biberist Ost', 'Blitzingen', 'Bern Stöckacker', 'Benzenschwil', 'Bôle', 'Boltigen', 'Boswil-Bünzen', 'Bouveret', 'Bowil', 'Belp Steinbach', 'Brusio', 'Brittnau-Wikon', 'Brunnadern-Neckertal', 'Brunnen', 'Brünig-Hasliberg', 'Basel SBB', 'Balsthal', 'Buchs SG', 'Bubikon', 'Buchrain', 'Beinwil am See', 'Buix', 'Bussigny', 'Bischofszell Stadt', 'Campascio', 'Castione-Arbedo', 'Cazis', 'Concise', 'Courchavon', 'Cadenazzo', 'Cavadürli', 'Montreux-Collège', 'Châtelard VD', 'Les Charbonnières', 'Chénens', 'Chernex', 'Châteauneuf-Conthey', 'Chiasso', 'La Chiésaz', "Château-d'Hauteville", 'Chur West', 'Cinuos-chel-Brail', 'Colombier', 'Colombier NE', 'Corgémont

In [17]:
betreiber = passagierfrequenz_df["isb_gi"].unique()

print(f"The unique 'isb_gi' in the DataFrame are: {list(betreiber)}")

The unique 'isb_gi' in the DataFrame are: ['SBB', 'RhB', 'BLS', 'SOB', 'MGB', 'ZB', 'OeBB', 'MOB', 'TRAVYS', 'MVR', 'BOB', 'STB']


In [22]:
kantone = passagierfrequenz_df["kt_ct_cantone"].unique()

print(f"The unique 'kt_ct_cantone' in the DataFrame are: {list(kantone)}")
print(f"The number of unique 'kt_ct_cantone' in the DataFrame are: {len(kantone)}")

The unique 'kt_ct_cantone' in the DataFrame are: ['ZH', 'AG', 'VD', 'SG', 'SZ', 'GR', 'NE', 'TI', 'SO', 'FR', 'BE', 'VS', 'LU', 'BS', 'JU', 'TG', 'GE', 'GL', 'BL', 'OW', 'UR', 'NW', 'ZG', 'Ausland', 'SH', 'AR']
The number of unique 'kt_ct_cantone' in the DataFrame are: 26


In [23]:
test_df = passagierfrequenz_df.sort_values(by="dtv_tjm_tgm", ascending=False).head(10)

display(test_df)

Unnamed: 0,code_codice,uic,bahnhof_gare_stazione,kt_ct_cantone,isb_gi,jahr_annee_anno,dtv_tjm_tgm,dwv_tmjo_tfm,dnwv_tmjno_tmgnl,evu_ef_itf,bemerkungen,remarques,note,remarks,geopos,lod
902,ZUE,8503000.0,Zürich HB,ZH,SBB,,423600.0,471300.0,317500.0,"SBB, SZU, Thurbo",,,,,"47.378176674223226, 8.540212349099065",http://lod.opentransportdata.swiss/didok/8503000
903,ZUE,8503000.0,Zürich HB,ZH,SBB,2022.0,345300.0,360900.0,308700.0,"SBB, SOB, Thurbo",Umfasst auch ZLOE und ZMUS; ohne SZU.,Y compris ZLOE et ZMUS; sans SZU.,Comprende anche ZLOE e ZMUS; senza SZU.,Includes ZLOE and ZMUS; without SZU.,"47.378176674223226, 8.540212349099065",http://lod.opentransportdata.swiss/didok/8503000
952,BN,8507000.0,Bern,BE,SBB,2018.0,184000.0,206400.0,134100.0,"BLS, SBB",Ohne RBS und TPF.,Sans RBS et TPF.,Senza RBS e TPF.,Without RBS and TPF.,"46.948832290498416, 7.439130889923935",http://lod.opentransportdata.swiss/didok/8507000
1914,BN,8507000.0,Bern,BE,SBB,2022.0,155200.0,164800.0,132800.0,"BLS, SBB, SOB, TPF",Ohne RBS.,Sans RBS.,Senza RBS.,Without RBS.,"46.948832290498416, 7.439130889923935",http://lod.opentransportdata.swiss/didok/8507000
1462,BS,8500010.0,Basel SBB,BS,SBB,2018.0,99800.0,111000.0,75000.0,"SBB, SBB GmbH",Einsteigende in Richtung Ausland und Aussteige...,Les personnes montant et descendant des trains...,Sono considerati solo in parte i passeggeri da...,Passengers boarding and alighting to/from abro...,"47.5474120550501, 7.589562790156525",http://lod.opentransportdata.swiss/didok/8500010
2290,W,8506000.0,Winterthur,ZH,SBB,2018.0,95100.0,109300.0,63600.0,"SBB, Thurbo",,,,,"47.500333810466714, 8.72382105540869",http://lod.opentransportdata.swiss/didok/8506000
674,LZ,8505000.0,Luzern,LU,SBB,2018.0,90800.0,97900.0,75300.0,"BLS, SBB, SOB, Zentralbahn",,,,,"47.0501778280856, 8.31018320694279",http://lod.opentransportdata.swiss/didok/8505000
1645,LS,8501120.0,Lausanne,VD,SBB,2018.0,89200.0,102500.0,59800.0,SBB,Einsteigende in Richtung Ausland und Aussteige...,Les personnes montant et descendant des trains...,Sono considerati solo in parte i passeggeri da...,Passengers boarding and alighting to/from abro...,"46.51679183546494, 6.629092303198574",http://lod.opentransportdata.swiss/didok/8501120
665,LS,8501120.0,Lausanne,VD,SBB,2022.0,86400.0,93700.0,69300.0,SBB,Einsteigende in Richtung Ausland und Aussteige...,Les personnes montant et descendant des trains...,Sono considerati solo in parte i passeggeri da...,Passengers boarding and alighting to/from abro...,"46.51679183546494, 6.629092303198574",http://lod.opentransportdata.swiss/didok/8501120
38,BS,8500010.0,Basel SBB,BS,SBB,2022.0,85600.0,90900.0,73100.0,"SBB, SBB GmbH, SOB",Einsteigende in Richtung Ausland und Aussteige...,Les personnes montant et descendant des trains...,Sono considerati solo in parte i passeggeri da...,Passengers boarding and alighting to/from abro...,"47.5474120550501, 7.589562790156525",http://lod.opentransportdata.swiss/didok/8500010


In [38]:
def create_bar_chart_avg_passenger_count_kantone(passagierfrequenz_df, save=False):
    grouped = (
        passagierfrequenz_df.groupby(["jahr_annee_anno", "kt_ct_cantone"])[
            "dtv_tjm_tgm"
        ]
        .sum()
        .unstack()
    )

    # Sort the grouped DataFrame by the sum of 'dtv_tjm_tgm' and select the top 10 'kt_ct_cantone'
    top_10_kantone = grouped.sum().sort_values(ascending=False).head(10).index
    grouped = grouped[top_10_kantone]

    # Convert the DataFrame to a format that can be used with seaborn
    df_grouped = pd.melt(
        grouped.reset_index(), id_vars="jahr_annee_anno", value_vars=top_10_kantone
    )

    # Convert 'jahr_annee_anno' to string
    df_grouped["jahr_annee_anno"] = df_grouped["jahr_annee_anno"].astype(str)

    # Create the horizontal bar chart
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(
        x="value", y="kt_ct_cantone", hue="jahr_annee_anno", data=df_grouped, ax=ax
    )
    ax.xaxis.set_major_formatter(
        ticker.FuncFormatter(lambda x, pos: "{:,.0f}".format(x))
    )
    plt.title("Average passenger count per Kantone for each year (Top 10 Kantone)")
    plt.xlabel("Average Passengers count")
    plt.ylabel("Kantone")
    plt.grid(axis="x")
    plt.xticks(
        np.arange(0, df_grouped["value"].max() + 1, df_grouped["value"].max() // 10)
    )
    plt.legend()

    if save:
        plt.savefig(
            "../figures/exploration/avg_passenger_count_kantone.png",
            bbox_inches="tight",
        )
        plt.close()

In [80]:
create_bar_chart_avg_passenger_count_kantone(passagierfrequenz_df, save=True)

NameError: name 'create_bar_chart_avg_passenger_count_kantone' is not defined

In [40]:
def create_bar_chart_avg_passenger_count_kantone_workweek(
    passagierfrequenz_df, save=False
):
    grouped = (
        passagierfrequenz_df.groupby(["jahr_annee_anno", "kt_ct_cantone"])[
            "dwv_tmjo_tfm"
        ]
        .sum()
        .unstack()
    )

    # Sort the grouped DataFrame by the sum of 'dwv_tmjo_tfm' and select the top 10 'kt_ct_cantone'
    top_10_kantone = grouped.sum().sort_values(ascending=False).head(10).index
    grouped = grouped[top_10_kantone]

    # Convert the DataFrame to a format that can be used with seaborn
    df_grouped = pd.melt(
        grouped.reset_index(), id_vars="jahr_annee_anno", value_vars=top_10_kantone
    )

    # Convert 'jahr_annee_anno' to string
    df_grouped["jahr_annee_anno"] = df_grouped["jahr_annee_anno"].astype(str)

    # Create the horizontal bar chart
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(
        x="value", y="kt_ct_cantone", hue="jahr_annee_anno", data=df_grouped, ax=ax
    )
    ax.xaxis.set_major_formatter(
        ticker.FuncFormatter(lambda x, pos: "{:,.0f}".format(x))
    )
    plt.title(
        "Average passenger count per Kantone for each year. Work week (Top 10 Kantone)"
    )
    plt.xlabel("Average Passengers count")
    plt.ylabel("Kantone")
    plt.grid(axis="x")
    plt.xticks(
        np.arange(0, df_grouped["value"].max() + 1, df_grouped["value"].max() // 10)
    )
    plt.legend()

    if save:
        plt.savefig(
            "../figures/exploration/avg_passenger_count_kantone_workweek.png",
            bbox_inches="tight",
        )
        plt.close()

In [41]:
create_bar_chart_avg_passenger_count_kantone_workweek(passagierfrequenz_df, save=True)

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


In [42]:
def create_bar_chart_avg_passenger_count_kantone_nonworkdays(
    passagierfrequenz_df, save=False
):
    grouped = (
        passagierfrequenz_df.groupby(["jahr_annee_anno", "kt_ct_cantone"])[
            "dnwv_tmjno_tmgnl"
        ]
        .sum()
        .unstack()
    )

    # Sort the grouped DataFrame by the sum of 'dnwv_tmjno_tmgnl' and select the top 10 'kt_ct_cantone'
    top_10_kantone = grouped.sum().sort_values(ascending=False).head(10).index
    grouped = grouped[top_10_kantone]

    # Convert the DataFrame to a format that can be used with seaborn
    df_grouped = pd.melt(
        grouped.reset_index(), id_vars="jahr_annee_anno", value_vars=top_10_kantone
    )

    # Convert 'jahr_annee_anno' to string
    df_grouped["jahr_annee_anno"] = df_grouped["jahr_annee_anno"].astype(str)

    # Create the horizontal bar chart
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(
        x="value", y="kt_ct_cantone", hue="jahr_annee_anno", data=df_grouped, ax=ax
    )
    ax.xaxis.set_major_formatter(
        ticker.FuncFormatter(lambda x, pos: "{:,.0f}".format(x))
    )
    plt.title(
        "Average passenger count per Kantone for each year, non-work days (Top 10 Kantone)"
    )
    plt.xlabel("Average Passengers count")
    plt.ylabel("Kantone")
    plt.grid(axis="x")
    plt.xticks(
        np.arange(0, df_grouped["value"].max() + 1, df_grouped["value"].max() // 10)
    )
    plt.legend()

    if save:
        plt.savefig(
            "../figures/exploration/avg_passenger_count_kantone_nonworkdays.png",
            bbox_inches="tight",
        )
        plt.close()

In [52]:
def create_bar_chart_bahnhofseigner(passagierfrequenz_df, save=False, values=10):
    counts = passagierfrequenz_df["evu_ef_itf"].value_counts()

    if values > len(counts):
        values = len(counts)

    counts = counts.head(values)

    # Create the bar chart
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=counts.index, y=counts.values, ax=ax)
    ax.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda x, pos: "{:,.0f}".format(x))
    )
    plt.title("Top {values} Bahnhofseigner")
    plt.xlabel("Bahnhofseigner")
    plt.ylabel("Number of Bahnhöfe")
    plt.grid(axis="y")

    if save:
        plt.savefig(
            "../figures/exploration/bahnhofseigner_count.png", bbox_inches="tight"
        )
        plt.close()

In [53]:
create_bar_chart_bahnhofseigner(passagierfrequenz_df, save=True)

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


Missing Data


In [None]:
display(fahrzeiten_dfs["Fahrzeiten_SOLL_IST_20220102_20220108.csv"].dtypes)

linie                    int64
richtung                 int64
betriebsdatum           object
fahrzeug                 int64
kurs                     int64
seq_von                  int64
halt_diva_von            int64
halt_punkt_diva_von      int64
halt_kurz_von1          object
datum_von               object
soll_an_von              int64
ist_an_von               int64
soll_ab_von              int64
ist_ab_von               int64
seq_nach                 int64
halt_diva_nach           int64
halt_punkt_diva_nach     int64
halt_kurz_nach1         object
datum_nach              object
soll_an_nach             int64
ist_an_nach1             int64
soll_ab_nach             int64
ist_ab_nach              int64
fahrt_id                 int64
fahrweg_id               int64
fw_no                    int64
fw_typ                   int64
fw_kurz                  int64
fw_lang                 object
umlauf_von               int64
halt_id_von              int64
halt_id_nach             int64
halt_pun

In [None]:
display(fahrzeiten_dfs["Fahrzeiten_SOLL_IST_20220102_20220108.csv"].columns)

Index(['linie', 'richtung', 'betriebsdatum', 'fahrzeug', 'kurs', 'seq_von',
       'halt_diva_von', 'halt_punkt_diva_von', 'halt_kurz_von1', 'datum_von',
       'soll_an_von', 'ist_an_von', 'soll_ab_von', 'ist_ab_von', 'seq_nach',
       'halt_diva_nach', 'halt_punkt_diva_nach', 'halt_kurz_nach1',
       'datum_nach', 'soll_an_nach', 'ist_an_nach1', 'soll_ab_nach',
       'ist_ab_nach', 'fahrt_id', 'fahrweg_id', 'fw_no', 'fw_typ', 'fw_kurz',
       'fw_lang', 'umlauf_von', 'halt_id_von', 'halt_id_nach',
       'halt_punkt_id_von', 'halt_punkt_id_nach'],
      dtype='object')

In [None]:
selected_columns_df = fahrzeiten_dfs["Fahrzeiten_SOLL_IST_20220102_20220108.csv"][
    [
        "linie",
        "halt_id_von",
        "halt_punkt_id_von",
        "halt_kurz_von1",
        "seq_von",
        "seq_nach",
    ]
]

display(selected_columns_df.columns)

Index(['linie', 'halt_id_von', 'halt_punkt_id_von', 'halt_kurz_von1',
       'seq_von', 'seq_nach'],
      dtype='object')

In [None]:
all_haltestellen_df = pd.merge(haltestelle_df, haltepunkt_df, on="halt_id", how="outer")

display(all_haltestellen_df)

Unnamed: 0,halt_id,halt_diva,halt_kurz,halt_lang,halt_ist_aktiv,halt_punkt_id,halt_punkt_diva,GPS_Latitude,GPS_Longitude,GPS_Bearing,halt_punkt_ist_aktiv
0,143,2570,BirWSL,"Birmensdorf ZH, Sternen/WSL",True,303,51,47.360017,8.456337,85.0,False
1,143,2570,BirWSL,"Birmensdorf ZH, Sternen/WSL",True,304,50,47.360153,8.456180,270.0,False
2,143,2570,BirWSL,"Birmensdorf ZH, Sternen/WSL",True,10982,50,47.360153,8.456180,266.0,False
3,143,2570,BirWSL,"Birmensdorf ZH, Sternen/WSL",True,13469,51,47.360035,8.456297,85.0,False
4,143,2570,BirWSL,"Birmensdorf ZH, Sternen/WSL",True,13485,50,47.360154,8.456141,266.0,False
...,...,...,...,...,...,...,...,...,...,...,...
19686,3338,7070,OBER,"Zürich, Obere Hornhalde",True,54979,51,47.331647,8.538054,328.0,True
19687,3339,7065,OGAS,"Oetwil an der Limmat, Gässliacker",True,54811,51,47.429765,8.392162,315.0,True
19688,3339,7065,OGAS,"Oetwil an der Limmat, Gässliacker",True,54820,50,47.429647,8.392346,135.0,True
19689,3360,6923,ZNIE,"Zollikon, Niederhofenrain",True,54815,51,47.347797,8.576690,270.0,True


In [None]:
# Convert pandas DataFrames to Dask DataFrames
selected_columns_dd = dd.from_pandas(selected_columns_df, npartitions=10)
haltestelle_dd = dd.from_pandas(haltestelle_df, npartitions=10)
haltepunkt_dd = dd.from_pandas(haltepunkt_df, npartitions=10)

# Perform the merge operation
merged_dd = selected_columns_dd.merge(
    haltestelle_dd, left_on="halt_id_von", right_on="halt_id"
)
final_dd = merged_dd.merge(haltepunkt_dd, left_on="halt_id_von", right_on="halt_id")

# Convert Dask DataFrame back to pandas DataFrame
final_df = final_dd.compute()

# Select only the necessary columns
bus_stops_for_each_line_df = final_df[
    [
        "linie",
        "halt_id_von",
        "halt_diva",
        "halt_kurz",
        "GPS_Latitude",
        "GPS_Longitude",
        "seq_von",
        "seq_nach",
    ]
]

In [None]:
display(bus_stops_for_each_line_df)

bus_stops_for_each_line_df.sort_values(by=["linie"], inplace=True)

display(bus_stops_for_each_line_df)

Unnamed: 0,linie,halt_id_von,halt_diva,halt_kurz,GPS_Latitude,GPS_Longitude,seq_von,seq_nach
0,32,2702,498,BIRD,47.407217,8.530893,7,8
1,32,2702,498,BIRD,47.407879,8.530364,7,8
2,32,2702,498,BIRD,47.407882,8.530365,7,8
3,32,2702,498,BIRD,47.407220,8.530895,7,8
4,32,2702,498,BIRD,47.407879,8.530364,7,8
...,...,...,...,...,...,...,...,...
6641128,162,1911,75,ALZE,47.320015,8.540657,6,7
6641129,162,1911,75,ALZE,47.319752,8.540890,6,7
6641130,162,1911,75,ALZE,47.319755,8.540891,6,7
6641131,162,1911,75,ALZE,47.320018,8.540658,6,7


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bus_stops_for_each_line_df.sort_values(by=["linie"], inplace=True)


Unnamed: 0,linie,halt_id_von,halt_diva,halt_kurz,GPS_Latitude,GPS_Longitude,seq_von,seq_nach
966180,2,1565,440,BELL,47.367749,8.546028,25,26
9772708,2,2245,1548,LETG,47.380951,8.502915,14,15
9772707,2,2245,1548,LETG,47.380863,8.504768,14,15
9772706,2,2245,1548,LETG,47.381626,8.503909,14,15
9772705,2,2245,1548,LETG,47.380951,8.502915,14,15
...,...,...,...,...,...,...,...,...
5286335,919,2780,2038,REBW,47.326670,8.598042,3,4
5286334,919,2780,2038,REBW,47.326472,8.598104,3,4
5286333,919,2780,2038,REBW,47.326667,8.598041,3,4
5286331,919,2780,2038,REBW,47.326472,8.598104,3,4


TraitError: Invalid selection: value not found

Unnamed: 0,linie,halt_id_von,halt_diva,halt_kurz,GPS_Latitude,GPS_Longitude,seq_von,seq_nach
0,32,2702,498,BIRD,47.407217,8.530893,7,8
1,32,2702,498,BIRD,47.407879,8.530364,7,8
2,32,2702,498,BIRD,47.407879,8.530364,7,8
3,32,2702,498,BIRD,47.407882,8.530365,7,8
4,32,2702,498,BIRD,47.407220,8.530895,7,8
...,...,...,...,...,...,...,...,...
6641128,162,1911,75,ALZE,47.319755,8.540891,5,6
6641129,162,1911,75,ALZE,47.320018,8.540658,5,6
6641130,162,1911,75,ALZE,47.319755,8.540891,5,6
6641131,162,1911,75,ALZE,47.320018,8.540658,5,6


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bus_stops_for_each_line_df.sort_values(by=["linie"], inplace=True)


Unnamed: 0,linie,halt_id_von,halt_diva,halt_kurz,GPS_Latitude,GPS_Longitude,seq_von,seq_nach
5485294,2,2505,935,GASO,47.397819,8.460252,28,29
8143223,2,1535,48,ALBP,47.378092,8.509612,12,13
8143222,2,1535,48,ALBP,47.378384,8.510094,12,13
8143221,2,1535,48,ALBP,47.380041,8.507321,12,13
8143220,2,1535,48,ALBP,47.380013,8.507467,12,13
...,...,...,...,...,...,...,...,...
9696538,919,2864,2167,SIHG,47.321455,8.586683,5,6
9696537,919,2864,2167,SIHG,47.321458,8.586684,5,6
9696536,919,2864,2167,SIHG,47.321649,8.586503,5,6
9696534,919,2864,2167,SIHG,47.321458,8.586684,5,6


In [None]:
# Erstellen Sie eine Karte mit einem Startpunkt (z. B. den geografischen Koordinaten einer Stadt).
m = folium.Map(location=[47, 8], zoom_start=10)

# Erhalten Sie die eindeutigen Werte in der "geopos" Spalte
unique_geopos = passagierfrequenz_df["geopos"].unique()

# Filter out the wrong values
unique_geopos = [geopos for geopos in unique_geopos if type(geopos) == str]

# Fügen Sie Marker für jede eindeutige geopos hinzu
# for geopos in unique_geopos:
#     # Teilen Sie den geopos String in latitude und longitude
#     latitude, longitude = geopos.split(',')

#     folium.Marker(
#         location=[latitude, longitude],
#         icon=folium.Icon(color="red")
#     ).add_to(m)


# haltepunkte_df is a dataframe with all rows from haltepunkt_df where GPS_Latitude and GPS_Longitude are not NaN
haltepunkte_df = haltepunkt_df[
    haltepunkt_df["GPS_Latitude"].notna() & haltepunkt_df["GPS_Longitude"].notna()
]

# Nur die Zeilen, wo eine Unique Komination von GPS_Latitude und GPS_Longitude existiert
haltepunkte_df = haltepunkte_df.drop_duplicates(
    subset=["GPS_Latitude", "GPS_Longitude"]
)

haltepunkte_df = haltepunkte_df.head(10)

locations = []

for index, row in haltepunkte_df.iterrows():
    location = [row["GPS_Latitude"], row["GPS_Longitude"]]
    locations.append(location)
    folium.Marker(location=location, icon=folium.Icon(color="blue")).add_to(m)

# Verbinden Sie die Haltepunkte mit Linien
folium.PolyLine(locations, color="red", weight=2.5, opacity=1).add_to(m)


# folium.TileLayer('Mapbox Bright').add_to(my_map)

<folium.vector_layers.PolyLine at 0x2729eb04310>

Basic Analysis


In [None]:
display(fahrzeiten_dfs["Fahrzeiten_SOLL_IST_20220102_20220108.csv"].dtypes)

linie                    int64
richtung                 int64
betriebsdatum           object
fahrzeug                 int64
kurs                     int64
seq_von                  int64
halt_diva_von            int64
halt_punkt_diva_von      int64
halt_kurz_von1          object
datum_von               object
soll_an_von              int64
ist_an_von               int64
soll_ab_von              int64
ist_ab_von               int64
seq_nach                 int64
halt_diva_nach           int64
halt_punkt_diva_nach     int64
halt_kurz_nach1         object
datum_nach              object
soll_an_nach             int64
ist_an_nach1             int64
soll_ab_nach             int64
ist_ab_nach              int64
fahrt_id                 int64
fahrweg_id               int64
fw_no                    int64
fw_typ                   int64
fw_kurz                  int64
fw_lang                 object
umlauf_von               int64
halt_id_von              int64
halt_id_nach             int64
halt_pun

In [None]:
display(fahrzeiten_dfs["Fahrzeiten_SOLL_IST_20220102_20220108.csv"].columns)

Index(['linie', 'richtung', 'betriebsdatum', 'fahrzeug', 'kurs', 'seq_von',
       'halt_diva_von', 'halt_punkt_diva_von', 'halt_kurz_von1', 'datum_von',
       'soll_an_von', 'ist_an_von', 'soll_ab_von', 'ist_ab_von', 'seq_nach',
       'halt_diva_nach', 'halt_punkt_diva_nach', 'halt_kurz_nach1',
       'datum_nach', 'soll_an_nach', 'ist_an_nach1', 'soll_ab_nach',
       'ist_ab_nach', 'fahrt_id', 'fahrweg_id', 'fw_no', 'fw_typ', 'fw_kurz',
       'fw_lang', 'umlauf_von', 'halt_id_von', 'halt_id_nach',
       'halt_punkt_id_von', 'halt_punkt_id_nach'],
      dtype='object')

In [None]:
selected_columns_df = fahrzeiten_dfs["Fahrzeiten_SOLL_IST_20220102_20220108.csv"][
    [
        "linie",
        "richtung",
        "halt_id_von",
        "halt_punkt_id_von",
        "halt_kurz_von1",
        "seq_von",
        "seq_nach",
    ]
]

display(selected_columns_df.columns)

Index(['linie', 'richtung', 'halt_id_von', 'halt_punkt_id_von',
       'halt_kurz_von1', 'seq_von', 'seq_nach'],
      dtype='object')

In [None]:
all_haltestellen_df = pd.merge(haltestelle_df, haltepunkt_df, on="halt_id", how="outer")

display(all_haltestellen_df)

Unnamed: 0,halt_id,halt_diva,halt_kurz,halt_lang,halt_ist_aktiv,halt_punkt_id,halt_punkt_diva,GPS_Latitude,GPS_Longitude,GPS_Bearing,halt_punkt_ist_aktiv
0,143,2570,BirWSL,"Birmensdorf ZH, Sternen/WSL",True,303,51,47.360017,8.456337,85.0,False
1,143,2570,BirWSL,"Birmensdorf ZH, Sternen/WSL",True,304,50,47.360153,8.456180,270.0,False
2,143,2570,BirWSL,"Birmensdorf ZH, Sternen/WSL",True,10982,50,47.360153,8.456180,266.0,False
3,143,2570,BirWSL,"Birmensdorf ZH, Sternen/WSL",True,13469,51,47.360035,8.456297,85.0,False
4,143,2570,BirWSL,"Birmensdorf ZH, Sternen/WSL",True,13485,50,47.360154,8.456141,266.0,False
...,...,...,...,...,...,...,...,...,...,...,...
19686,3338,7070,OBER,"Zürich, Obere Hornhalde",True,54979,51,47.331647,8.538054,328.0,True
19687,3339,7065,OGAS,"Oetwil an der Limmat, Gässliacker",True,54811,51,47.429765,8.392162,315.0,True
19688,3339,7065,OGAS,"Oetwil an der Limmat, Gässliacker",True,54820,50,47.429647,8.392346,135.0,True
19689,3360,6923,ZNIE,"Zollikon, Niederhofenrain",True,54815,51,47.347797,8.576690,270.0,True


In [None]:
# Convert pandas DataFrames to Dask DataFrames
selected_columns_dd = dd.from_pandas(selected_columns_df, npartitions=10)
haltestelle_dd = dd.from_pandas(haltestelle_df, npartitions=10)
haltepunkt_dd = dd.from_pandas(haltepunkt_df, npartitions=10)

# Perform the merge operation
merged_dd = selected_columns_dd.merge(
    haltestelle_dd, left_on="halt_id_von", right_on="halt_id"
)
final_dd = merged_dd.merge(haltepunkt_dd, left_on="halt_id_von", right_on="halt_id")

# Convert Dask DataFrame back to pandas DataFrame
final_df = final_dd.compute()

# Select only the necessary columns
bus_stops_for_each_line_df = final_df[
    [
        "linie",
        "richtung",
        "halt_id_von",
        "halt_diva",
        "halt_kurz",
        "GPS_Latitude",
        "GPS_Longitude",
        "seq_von",
        "seq_nach",
    ]
]

In [None]:
display(bus_stops_for_each_line_df)


bus_stops_for_each_line_df = bus_stops_for_each_line_df.dropna(subset=["halt_diva"])

bus_stops_for_each_line_df = bus_stops_for_each_line_df.dropna(
    subset=["GPS_Latitude", "GPS_Longitude"]
)

bus_stops_for_each_line_df.sort_values(by=["linie"], inplace=True)


display(bus_stops_for_each_line_df)

Unnamed: 0,linie,richtung,halt_id_von,halt_diva,halt_kurz,GPS_Latitude,GPS_Longitude,seq_von,seq_nach
0,32,1,2756,3029,HSBB,47.386308,8.517975,13,14
1,32,1,2756,3029,HSBB,,,13,14
2,32,1,2756,3029,HSBB,47.385184,8.516973,13,14
3,32,1,2756,3029,HSBB,47.386570,8.518749,13,14
4,32,1,2756,3029,HSBB,47.385343,8.517267,13,14
...,...,...,...,...,...,...,...,...,...
6641128,162,2,1911,75,ALZE,47.320015,8.540657,5,6
6641129,162,2,1911,75,ALZE,47.319752,8.540890,5,6
6641130,162,2,1911,75,ALZE,47.319755,8.540891,5,6
6641131,162,2,1911,75,ALZE,47.320018,8.540658,5,6


Unnamed: 0,linie,richtung,halt_id_von,halt_diva,halt_kurz,GPS_Latitude,GPS_Longitude,seq_von,seq_nach
1646182,2,2,1565,440,BELL,47.366942,8.545467,8,9
1545068,2,1,1565,440,BELL,47.366453,8.546133,25,26
1545067,2,1,1565,440,BELL,47.366697,8.546006,25,26
1545066,2,1,1565,440,BELL,47.367985,8.544854,25,26
1545065,2,1,1565,440,BELL,47.366582,8.544904,25,26
...,...,...,...,...,...,...,...,...,...
5382653,919,1,2096,2826,WALT,47.336093,8.617990,12,13
5382652,919,1,2096,2826,WALT,47.336109,8.618149,12,13
5382651,919,1,2096,2826,WALT,47.336090,8.617988,12,13
5382621,919,1,2096,2826,WALT,47.336090,8.617988,12,13


In [None]:
# Dropdown widget with unique line names
dropdown = widgets.Dropdown(
    options=sorted(
        bus_stops_for_each_line_df["linie"].unique()
    ),  # sort the unique line names
    description="Lines:",
)

# Button widget
button = widgets.Button(description="Show Map")


# Function to be called when the button is clicked
def on_button_clicked(b):
    # Filter DataFrame for selected line
    df = bus_stops_for_each_line_df[
        bus_stops_for_each_line_df["linie"] == dropdown.value
    ]  # replace 'linie' with your column name

    # Sort the DataFrame by 'sq_von'
    df = df.sort_values(by="seq_von")

    display(df)

    # Create a map centered at the mean latitude and longitude values
    m = folium.Map(
        location=[df["GPS_Latitude"].mean(), df["GPS_Longitude"].mean()], zoom_start=13
    )

    # Add a marker for each bus stop
    locations = []
    for _, row in df.iterrows():
        location = [row["GPS_Latitude"], row["GPS_Longitude"]]
        locations.append(location)
        folium.Marker(location).add_to(m)

    # Connect the markers with line
    folium.PolyLine(locations, color="red", weight=2.5, opacity=1).add_to(m)

    # Display map
    display(m)


# Link button
button.on_click(on_button_clicked)

# Display widgets
display(dropdown, button)

Dropdown(description='Lines:', options=(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 31, 32, 33, 35, 37, 38…

Button(description='Show Map', style=ButtonStyle())

TO DO:

- fundamentals für Haltepunkt, Haltestelle und Passagierf

- Geojson in thing
