In [1]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML

# Load data
DATA_PATH = "/home/learner/Desktop/internship/Dealy_analysis/tempocom-app/df_monthly_with_headers.csv"
df_station = pd.read_csv(DATA_PATH)

# Title-case the station names but keep missing values missing.
# A plain astype(str) turns NaN into the literal string "nan" (title-cased to
# "Nan"), which the dropna() below can no longer remove, so a bogus "Nan"
# station would show up in the selector.
_station_names = df_station["Stopping place (FR)"]
df_station["Stopping place (FR)"] = (
    _station_names.astype(str).str.title().where(_station_names.notna())
)

# Station selection
station_options = sorted(df_station["Stopping place (FR)"].dropna().unique())
# NOTE(review): "Brussel-Noord" is a Dutch spelling; confirm it can actually
# occur in the French-name column, otherwise only "Bruxelles-Midi" ever matches.
default_station = [s for s in ["Brussel-Noord", "Bruxelles-Midi"] if s in station_options]

station_selector = widgets.SelectMultiple(
    options=station_options,
    # Fall back to the first available station when no preferred default exists.
    value=default_station or station_options[:1],
    description='Stations:',
    rows=10,
    layout=widgets.Layout(width='50%')
)

# Output area that show_station_chart draws into.
output_station = widgets.Output()

def show_station_chart(stations):
    """
    Render the hourly delay chart and summary figures for the chosen stations.

    Parameters
    ----------
    stations : list of str
        Station names as they appear in the "Stopping place (FR)" column.

    Notes
    -----
    Everything is written into ``output_station``, which is cleared first.

    1. Keep the rows of ``df_station`` whose station is in ``stations``.
    2. Add "Delay at departure" and "Delay at arrival" (missing values count
       as 0) and keep only rows where that sum is strictly positive.
    3. Take the hour from "Actual departure time", falling back to
       "Actual arrival time"; values that cannot be parsed become missing.
    4. Sum the delay per (station, hour) and divide the sums by 60.
    5. Print the total, mean and maximum of those per-(station, hour) sums and
       the number of (station, hour) groups, then show the line chart.

    Returns
    -------
    None
        The statistics and the chart are emitted directly in the notebook.
    """
    station_col = "Stopping place (FR)"
    delay_col = "Total Delay"

    with output_station:
        clear_output()

        # Build the filtered frame in one chain; each step returns a new frame.
        delayed = (
            df_station.loc[df_station[station_col].isin(stations)]
            .assign(**{
                delay_col: lambda d: d["Delay at departure"].fillna(0)
                + d["Delay at arrival"].fillna(0)
            })
            .loc[lambda d: d[delay_col] > 0]
            .assign(
                Hour=lambda d: pd.to_datetime(
                    d["Actual departure time"].combine_first(d["Actual arrival time"]),
                    errors="coerce",
                ).dt.hour
            )
        )

        hourly = (
            delayed.groupby([station_col, "Hour"])[delay_col]
            .sum()
            .div(60)
            .reset_index()
        )

        # Summary figures are taken over the per-(station, hour) sums.
        hourly_delay = hourly[delay_col]
        total = hourly_delay.sum().round(1)
        avg = hourly_delay.mean().round(1)
        max_ = hourly_delay.max().round(1)

        for summary_line in (
            f"🕒 Total Delay: {total} min",
            f"📈 Avg Delay: {avg} min",
            f"🚨 Max Delay: {max_} min",
            f"🧾 Records: {len(hourly)}",
        ):
            print(summary_line)

        palette = ["#1f77b4", "#17becf", "#aec7e8", "#0066cc", "#3399ff"]
        px.line(
            hourly,
            x="Hour",
            y=delay_col,
            color=station_col,
            title="Station Delay by Hour",
            color_discrete_sequence=palette,
        ).show()

# Section heading, then the selector stacked above its output area.
display(
    HTML("<h3 style='color:#1f77b4;'>📍 Delay by Station</h3>"),
    widgets.VBox(children=[station_selector, output_station]),
)
# Redraw the chart whenever the selection changes; the trailing ";" hides the returned widget.
widgets.interactive_output(show_station_chart, dict(stations=station_selector));


VBox(children=(SelectMultiple(description='Stations:', index=(100,), layout=Layout(width='50%'), options=('Aal…

In [2]:
# Read the dataset from DATA_PATH for the per-train views.
df_train = pd.read_csv(DATA_PATH)

# Convert train numbers to strings for the selector but keep missing values
# missing: astype(str) alone turns NaN into the string "nan", which the
# dropna() below cannot remove, so "nan" would be offered as a train.
_train_numbers = df_train["Train number"]
df_train["Train number"] = _train_numbers.astype(str).where(_train_numbers.notna())

train_options = sorted(df_train["Train number"].dropna().unique())
# Preselect up to two trains; slicing already copes with shorter (or empty) lists.
default_train = train_options[:2]

train_selector = widgets.SelectMultiple(
    options=train_options,
    value=default_train,
    description='Trains:',
    rows=10,
    layout=widgets.Layout(width='50%')
)

# Output area that show_train_chart draws into.
output_train = widgets.Output()

def show_train_chart(trains):
    """
    Render the hourly delay chart and summary figures for the chosen trains.

    Parameters
    ----------
    trains : list of str
        Train numbers as they appear in the "Train number" column.

    Notes
    -----
    Everything is written into ``output_train``, which is cleared first.

    1. Keep the rows of ``df_train`` whose train number is in ``trains``.
    2. Add "Delay at departure" and "Delay at arrival" (missing values count
       as 0) into the "Total Delay (min)" column.
    3. Take the hour from "Actual departure time", falling back to
       "Actual arrival time"; values that cannot be parsed become missing.
    4. Sum the delay per (train, hour) and divide the sums by 60.
    5. Print the total, mean and maximum of those per-(train, hour) sums and
       the number of (train, hour) groups, then show the line chart.

    Returns
    -------
    None
        The statistics and the chart are emitted directly in the notebook.
    """
    train_col = "Train number"
    delay_col = "Total Delay (min)"

    with output_train:
        clear_output()

        # Build the working frame in one chain; each step returns a new frame.
        selected = (
            df_train.loc[df_train[train_col].isin(trains)]
            .assign(**{
                delay_col: lambda d: d["Delay at departure"].fillna(0)
                + d["Delay at arrival"].fillna(0)
            })
            .assign(
                Hour=lambda d: pd.to_datetime(
                    d["Actual departure time"].combine_first(d["Actual arrival time"]),
                    errors="coerce",
                ).dt.hour
            )
        )

        hourly = (
            selected.groupby([train_col, "Hour"])[delay_col]
            .sum()
            .div(60)
            .reset_index()
        )

        # Summary figures are taken over the per-(train, hour) sums.
        hourly_delay = hourly[delay_col]
        total = hourly_delay.sum().round(1)
        avg = hourly_delay.mean().round(1)
        max_ = hourly_delay.max().round(1)

        for summary_line in (
            f"🕒 Total Delay: {total} min",
            f"📊 Avg Delay: {avg} min",
            f"🚨 Max Delay: {max_} min",
            f"🔢 Records: {len(hourly)}",
        ):
            print(summary_line)

        palette = ["#2ca02c", "#ff7f0e", "#bcbd22", "#98df8a", "#ffbb78"]
        px.line(
            hourly,
            x="Hour",
            y=delay_col,
            color=train_col,
            title="Train Delay by Hour",
            color_discrete_sequence=palette,
        ).show()

# Section heading, then the selector stacked above its output area.
display(
    HTML("<h3 style='color:#2ca02c;'>🚆 Delay by Train Number</h3>"),
    widgets.VBox(children=[train_selector, output_train]),
)
# Redraw the chart whenever the selection changes; the trailing ";" hides the returned widget.
widgets.interactive_output(show_train_chart, dict(trains=train_selector));


VBox(children=(SelectMultiple(description='Trains:', index=(0, 1), layout=Layout(width='50%'), options=('10', …

In [3]:
# The relation view works on its own copy of the train data.
df_relation = df_train.copy()

relation_options = sorted(df_relation["Relation direction"].dropna().unique())
# Preselect up to three relations; slicing already copes with shorter lists.
default_relations = relation_options[:3]

relation_selector = widgets.SelectMultiple(
    description='Relations:',
    options=relation_options,
    value=default_relations,
    rows=10,
    layout=widgets.Layout(width='50%'),
)

# Output area that show_relation_chart draws into.
output_relation = widgets.Output()

def show_relation_chart(relations):
    """
    Render the hourly delay chart and summary figures for the chosen relations.

    Parameters
    ----------
    relations : list of str
        Values of the "Relation direction" column to include.

    Notes
    -----
    Everything is written into ``output_relation``, which is cleared first.

    1. Keep the rows of ``df_relation`` whose relation direction is in
       ``relations``.
    2. Add "Delay at departure" and "Delay at arrival" (missing values count
       as 0) into the "Total Delay" column.
    3. Take the hour from "Actual departure time", falling back to
       "Actual arrival time"; values that cannot be parsed become missing.
    4. Sum the delay per (relation, hour) and divide the sums by 60.
    5. Print the total, mean and maximum of those per-(relation, hour) sums
       and the number of (relation, hour) groups, then show the line chart.

    Returns
    -------
    None
        The statistics and the chart are emitted directly in the notebook.
    """
    relation_col = "Relation direction"
    delay_col = "Total Delay"

    with output_relation:
        clear_output()

        # Build the working frame in one chain; each step returns a new frame.
        selected = (
            df_relation.loc[df_relation[relation_col].isin(relations)]
            .assign(**{
                delay_col: lambda d: d["Delay at departure"].fillna(0)
                + d["Delay at arrival"].fillna(0)
            })
            .assign(
                Hour=lambda d: pd.to_datetime(
                    d["Actual departure time"].combine_first(d["Actual arrival time"]),
                    errors="coerce",
                ).dt.hour
            )
        )

        hourly = (
            selected.groupby([relation_col, "Hour"])[delay_col]
            .sum()
            .div(60)
            .reset_index()
        )

        # Summary figures are taken over the per-(relation, hour) sums.
        hourly_delay = hourly[delay_col]
        total = hourly_delay.sum().round(1)
        avg = hourly_delay.mean().round(1)
        max_ = hourly_delay.max().round(1)

        for summary_line in (
            f"🕒 Total Delay: {total} min",
            f"📈 Avg Delay: {avg} min",
            f"🚨 Max Delay: {max_} min",
            f"🧾 Records: {len(hourly)}",
        ):
            print(summary_line)

        palette = ["#d62728", "#e377c2", "#ff9896", "#c49c94", "#9467bd"]
        px.line(
            hourly,
            x="Hour",
            y=delay_col,
            color=relation_col,
            title="Relation Direction Delay",
            color_discrete_sequence=palette,
        ).show()

# Section heading, then the selector stacked above its output area.
display(
    HTML("<h3 style='color:#d62728;'>🔁 Delay by Relation Direction</h3>"),
    widgets.VBox(children=[relation_selector, output_relation]),
)
# Redraw the chart whenever the selection changes; the trailing ";" hides the returned widget.
widgets.interactive_output(show_relation_chart, dict(relations=relation_selector));


VBox(children=(SelectMultiple(description='Relations:', index=(0, 1, 2), layout=Layout(width='50%'), options=(…

In [4]:
from scipy.interpolate import PchipInterpolator
import numpy as np

train_dropdown = widgets.Dropdown(
    options=train_options,
    # Guard against an empty default list, where default_train[0] would raise IndexError.
    value=default_train[0] if default_train else None,
    description='Train:',
    layout=widgets.Layout(width='50%')
)

# Output area that show_train_delay_distribution draws into.
output_dist = widgets.Output()

def show_train_delay_distribution(train):
    """
    Display the distribution of total delay, in 5-minute bins, for one train.

    Parameters
    ----------
    train : str
        Train number as it appears in the "Train number" column of ``df_train``.

    Functionality
    -------------
    - Clears ``output_dist`` and draws into it.
    - Filters ``df_train`` for the selected train number.
    - Computes total delay as 'Delay at departure' + 'Delay at arrival'
      (missing values count as 0) and converts it to minutes.
    - Counts rows per 5-minute bin from 0 up to (but excluding) 65 minutes;
      rows outside that range (negative totals, or 65 minutes and more) are
      not counted.
    - Shows a bar chart of the counts with a PCHIP-interpolated line through
      the bar heights at the bin centers.

    Returns
    -------
    None
        The chart is emitted directly in the notebook.
    """
    with output_dist:
        clear_output()

        df = df_train[df_train["Train number"] == train].copy()
        # The hourly delay charts in this notebook divide these same columns by
        # 60 before reporting minutes, i.e. the raw values are treated as
        # seconds. Apply the same conversion so the bins below really are minutes.
        df["Total Delay (min)"] = (
            df["Delay at departure"].fillna(0) + df["Delay at arrival"].fillna(0)
        ).div(60)

        # Half-open 5-minute bins: [0, 5), [5, 10), ..., [60, 65).
        bins = list(range(0, 70, 5))
        labels = [f"{i}-{i+5}" for i in bins[:-1]]

        df["Delay Bin (min)"] = pd.cut(df["Total Delay (min)"], bins=bins, labels=labels, right=False)

        # Count rows per bin, then left-join onto the full label list so that
        # empty bins appear with a frequency of 0.
        grouped = df.groupby("Delay Bin (min)", observed=True).size().reset_index(name="Frequency")
        full_df = pd.DataFrame({"Delay Bin (min)": labels})
        final = pd.merge(full_df, grouped, on="Delay Bin (min)", how="left").fillna(0)
        final["Frequency"] = final["Frequency"].astype(int)

        # Numeric x: bin centers
        bin_centers = np.array([(bins[i] + bins[i+1]) / 2 for i in range(len(bins) - 1)])
        final["Bin Center"] = bin_centers

        # Bar chart with numeric x
        fig = px.bar(
            final,
            x="Bin Center",
            y="Frequency",
            text="Frequency",
            title=f"🚆 Delay Distribution - Train {train}",
            labels={"Frequency": "Count", "Bin Center": "Delay Range (min)"},
            color_discrete_sequence=["#1f77b4"],
        )
        fig.for_each_trace(lambda t: t.update(textposition="outside"))

        # Smooth line with PCHIP interpolation
        pchip = PchipInterpolator(final["Bin Center"], final["Frequency"])
        x_smooth = np.linspace(final["Bin Center"].min(), final["Bin Center"].max(), 300)
        y_smooth = pchip(x_smooth)

        fig.add_scatter(
            x=x_smooth,
            y=y_smooth,
            mode="lines",
            name="Smooth Distribution Line",
            line=dict(color="#ff7f0e", width=2),
            showlegend=False,
        )

        # Set x-axis ticks and labels to match bins
        fig.update_layout(
            xaxis=dict(
                tickmode="array",
                tickvals=final["Bin Center"],
                ticktext=labels,
                title="Delay Range (min)",
            ),
            yaxis_title="Frequency",
            bargap=0.15,
            height=500,
        )
        fig.show()


display(HTML("<h3 style='color:#1f77b4;'>📊 Delay Distribution by Train (Aligned Smooth Line)</h3>"))
display(widgets.VBox([train_dropdown, output_dist]))
# The chart is drawn into output_dist; the trailing ";" keeps the empty Output
# widget returned by interactive_output from being rendered as a stray cell result.
widgets.interactive_output(show_train_delay_distribution, {"train": train_dropdown});


VBox(children=(Dropdown(description='Train:', layout=Layout(width='50%'), options=('10', '10304', '10305', '10…

Output()

In [1]:
import pandas as pd
# Load the comma-separated merged dataset.
Cdf = pd.read_csv(
    "/home/learner/Desktop/internship/Dealy_analysis/tempocom-app/merged_data.csv",
    sep=",",
)

In [2]:
# Inspect the column names of the merged dataset.
Cdf.columns

Index(['DATDEP', 'TRAIN_NO', 'RELATION', 'TRAIN_SERV', 'PTCAR_NO', 'THOP1_COD',
       'LINE_NO_DEP', 'REAL_TIME_ARR', 'REAL_TIME_DEP', 'PLANNED_TIME_ARR',
       'PLANNED_TIME_DEP', 'DELAY_ARR', 'DELAY_DEP', 'CIRC_TYP',
       'RELATION_DIRECTION', 'PTCAR_LG_NM_NL', 'LINE_NO_ARR',
       'PLANNED_DATE_ARR', 'PLANNED_DATE_DEP', 'REAL_DATE_ARR',
       'REAL_DATE_DEP', 'Mois'],
      dtype='object')

In [3]:
from scipy.interpolate import PchipInterpolator
import numpy as np
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML
import plotly.express as px

# Work on a copy of the merged dataset held in Cdf.
df_train = Cdf.copy()

# Convert train numbers to strings for the dropdown but keep missing values
# missing: astype(str) alone turns NaN into the string "nan", which the
# dropna() below cannot remove, so "nan" would be offered as a train.
_train_numbers = df_train["TRAIN_NO"]
df_train["TRAIN_NO"] = _train_numbers.astype(str).where(_train_numbers.notna())
train_options = sorted(df_train["TRAIN_NO"].dropna().unique())
default_train = train_options[0] if train_options else None

train_dropdown = widgets.Dropdown(
    options=train_options,
    value=default_train,
    description='Train:',
    layout=widgets.Layout(width='50%')
)

# Output area that show_train_delay_distribution draws into.
output_dist = widgets.Output()

def show_train_delay_distribution(train):
    """
    Display the distribution of total delay, in 5-minute bins, for one train.

    Parameters
    ----------
    train : str
        Train number as it appears in the "TRAIN_NO" column of ``df_train``.

    Functionality
    -------------
    - Clears ``output_dist`` and draws into it.
    - Filters ``df_train`` for the selected train number.
    - Computes total delay as 'DELAY_ARR' + 'DELAY_DEP' (missing values count
      as 0) and converts it to minutes.
    - Counts rows per 5-minute bin from 0 up to (but excluding) 65 minutes;
      rows outside that range (negative totals, or 65 minutes and more) are
      not counted.
    - Shows a bar chart of the counts with a PCHIP-interpolated line through
      the bar heights at the bin centers.

    Returns
    -------
    None
        The chart is emitted directly in the notebook.
    """
    with output_dist:
        clear_output()

        df = df_train[df_train["TRAIN_NO"] == train].copy()
        # NOTE(review): DELAY_ARR / DELAY_DEP are assumed to be in seconds, like
        # the delay columns of the other dataset in this notebook (which are
        # divided by 60 to report minutes) — confirm against the data dictionary.
        # Without the conversion, seconds would be binned as if they were minutes.
        df["Total Delay (min)"] = (
            df["DELAY_ARR"].fillna(0) + df["DELAY_DEP"].fillna(0)
        ).div(60)

        # Half-open 5-minute bins: [0, 5), [5, 10), ..., [60, 65).
        bins = list(range(0, 70, 5))
        labels = [f"{i}-{i+5}" for i in bins[:-1]]

        df["Delay Bin (min)"] = pd.cut(df["Total Delay (min)"], bins=bins, labels=labels, right=False)

        # Count rows per bin, then left-join onto the full label list so that
        # empty bins appear with a frequency of 0.
        grouped = df.groupby("Delay Bin (min)", observed=True).size().reset_index(name="Frequency")
        full_df = pd.DataFrame({"Delay Bin (min)": labels})
        final = pd.merge(full_df, grouped, on="Delay Bin (min)", how="left").fillna(0)
        final["Frequency"] = final["Frequency"].astype(int)

        # Numeric x: bin centers
        bin_centers = np.array([(bins[i] + bins[i+1]) / 2 for i in range(len(bins) - 1)])
        final["Bin Center"] = bin_centers

        # Bar chart with numeric x
        fig = px.bar(
            final,
            x="Bin Center",
            y="Frequency",
            text="Frequency",
            title=f"🚆 Delay Distribution - Train {train}",
            labels={"Frequency": "Count", "Bin Center": "Delay Range (min)"},
            color_discrete_sequence=["#1f77b4"],
        )
        fig.for_each_trace(lambda t: t.update(textposition="outside"))

        # Smooth line with PCHIP interpolation
        pchip = PchipInterpolator(final["Bin Center"], final["Frequency"])
        x_smooth = np.linspace(final["Bin Center"].min(), final["Bin Center"].max(), 300)
        y_smooth = pchip(x_smooth)

        fig.add_scatter(
            x=x_smooth,
            y=y_smooth,
            mode="lines",
            name="Smooth Distribution Line",
            line=dict(color="#ff7f0e", width=2),
            showlegend=False,
        )

        # Set x-axis ticks and labels to match bins
        fig.update_layout(
            xaxis=dict(
                tickmode="array",
                tickvals=final["Bin Center"],
                ticktext=labels,
                title="Delay Range (min)",
            ),
            yaxis_title="Frequency",
            bargap=0.15,
            height=500,
        )
        fig.show()


display(HTML("<h3 style='color:#1f77b4;'>📊 Delay Distribution by Train (Aligned Smooth Line)</h3>"))
display(widgets.VBox([train_dropdown, output_dist]))
# The chart is drawn into output_dist; the trailing ";" keeps the empty Output
# widget returned by interactive_output from being rendered as a stray cell result.
widgets.interactive_output(show_train_delay_distribution, {"train": train_dropdown});


VBox(children=(Dropdown(description='Train:', layout=Layout(width='50%'), options=('10', '10304', '10305', '10…

Output()