# Test

## Setup

In [1]:
import os
import pandas as pd
import dask.dataframe as dd
import ipywidgets as widgets
from IPython.display import display, HTML
from dask.distributed import Client, progress
import glob
from dask import delayed

In [2]:
# Column dtypes handed to the CSV readers, one mapping per cleaned table.
# The capitalised integer names ('Int8' ... 'Int64') are pandas' nullable
# integer dtypes, so columns with missing values keep an integer type.

# "fahrzeiten" part files.
_fahrzeiten_dtypes = {
    'linie': 'Int16',
    'richtung': 'Int8',
    'fahrzeug': 'Int32',
    'kurs': 'Int16',
    'seq_von': 'Int32',
    'soll_an_von': 'Int32',
    'ist_an_von': 'Int32',
    'soll_ab_von': 'Int32',
    'ist_ab_von': 'Int32',
    'seq_nach': 'Int32',
    'soll_an_nach': 'Int32',
    'ist_an_nach': 'Int32',
    'soll_ab_nach': 'Int32',
    'ist_ab_nach': 'Int32',
    'fahrt_id': 'Int64',
    'fahrweg_id': 'Int64',
    'fw_no': 'Int16',
    'fw_typ': 'Int8',
    'fw_kurz': 'string',
    'fw_lang': 'string',
    'umlauf_von': 'Int64',
    'halt_id_von': 'Int64',
    'halt_id_nach': 'Int64',
    'halt_punkt_id_von': 'Int64',
    'halt_punkt_id_nach': 'Int64',
}

# "haltestellen" table.
_haltestellen_dtypes = {
    'id': 'Int64',
    'diva': 'Int64',
    'halt_kurz': 'string',
    'halt_lang': 'string',
}

# "haltepunkte" table.
_haltepunkte_dtypes = {
    'year': 'Int64',
    'id': 'Int64',
    'diva': 'Int64',
    'halt_id': 'Int64',
    'latitude': 'float64',
    'longitude': 'float64',
    'bearing': 'float64',
    'ist_aktiv': 'bool',
}

# "passagierfrequenz" table.
_passagierfrequenz_dtypes = {
    'bahnhof_kurz': 'string',
    'uic': 'Int64',
    'bahnhof_lang': 'string',
    'kanton': 'string',
    'bahnhofseigner': 'string',
    'jahr': 'Int32',
    'durchschnittlicher_täglicher_verkehr': 'Int64',
    'durchschnittlicher_werktäglicher_verkehr': 'Int64',
    'durchschnittlicher_nicht_werktäglicher_verkehr': 'Int64',
    'einbezogene_bahnunternehmen': 'string',
    'bemerkungen': 'string',
    'latitude': 'float64',
    'longitude': 'float64',
    'link': 'string',
}

# Lookup used by the loading code: table name -> {column name -> dtype}.
dtypes = {
    'fahrzeiten': _fahrzeiten_dtypes,
    'haltestellen': _haltestellen_dtypes,
    'haltepunkte': _haltepunkte_dtypes,
    'passagierfrequenz': _passagierfrequenz_dtypes,
}

# Start a local Dask cluster with eight worker processes.
client = Client(n_workers=8)

# The three lookup tables are read lazily; nothing is loaded until a compute.
passagierfrequenz_df = dd.read_csv('../../cleaned_data/passagierfrequenz.csv', assume_missing=True, dtype=dtypes['passagierfrequenz'])
haltestellen_df = dd.read_csv('../../cleaned_data/haltestellen.csv', assume_missing=True, dtype=dtypes['haltestellen'])
haltepunkte_df = dd.read_csv('../../cleaned_data/haltepunkte.csv', assume_missing=True, dtype=dtypes['haltepunkte'])

# Step 1: create one lazy frame per part file. The paths are sorted because
# glob returns them in arbitrary order, and the row ranges below must be
# reproducible from run to run.
lazy_parts = []  # (year, lazy part frame), in year order then path order
for year in range(2016, 2023):
    for file_path in sorted(glob.glob(f'../../cleaned_data/fahrzeiten_{year}.csv/*.part')):
        part_df = dd.read_csv(file_path, dtype=dtypes['fahrzeiten'], parse_dates=['betriebsdatum', 'datum_von', 'datum_nach'])
        lazy_parts.append((year, part_df))

# Step 2: resolve every part's row count in a single batched compute.
# `shape[0]` of a Dask frame is lazy, so it has to be evaluated *before* it is
# formatted into a key; formatting the lazy object would embed its repr
# instead of a number.
lengths = tuple(int(n) for n in dd.compute(*(part_df.shape[0] for _, part_df in lazy_parts)))

# Step 3: index the parts per year under '<first_row>-<end_row>' keys
# (end exclusive), where the rows of a year are numbered across its parts.
fahrzeiten_dfs = {year: {} for year in range(2016, 2023)}
row_offsets = dict.fromkeys(fahrzeiten_dfs, 0)
for (year, part_df), part_length in zip(lazy_parts, lengths):
    first_row = row_offsets[year]
    end_row = first_row + part_length
    fahrzeiten_dfs[year][f'{first_row}-{end_row}'] = part_df
    row_offsets[year] = end_row

This may cause some slowdown.
Consider scattering data ahead of time and using futures.
2024-01-11 12:30:18,906 - distributed.worker - ERROR - Scheduler was unaware of this worker 'tcp://127.0.0.1:34237'. Shutting down.
2024-01-11 12:30:19,090 - distributed.worker - ERROR - Failed to communicate with scheduler during heartbeat.
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.10/site-packages/distributed/comm/tcp.py", line 225, in read
    frames_nosplit_nbytes_bin = await stream.read_bytes(fmt_size)
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.10/site-packages/distributed/worker.py", line 1252, in heartbeat
    response = await retry_operation(
  File "/home/ubuntu/.local/lib/python3.10/site-packages/distributed/utils_comm.py", line 455, in retry_operation
    return await retry(
  File "/home/ubuntu/.local/

KeyboardInterrupt: 

## Visualization

In [None]:
def show_csvs(page_size=5):
    """Display a small widget UI for paging through the loaded CSV tables.

    A dropdown selects one of the lookup tables or one "Fahrzeiten <year>"
    table; Head / Previous / Next / Tail move a window of ``page_size`` rows
    over it. Fahrzeiten years are stored as several lazy part frames keyed by
    '<first_row>-<end_row>'; only the parts overlapping the current window
    are computed, and at most two computed parts are kept in memory.

    Args:
        page_size: Number of rows shown per page (default 5).

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError('page_size must be at least 1')

    start = 0        # position (within the whole table) of the first shown row
    parts = None     # [(first_row, end_row, frame)], None until "Show" is pressed
    loaded = {}      # first_row -> computed pandas frame (small cache)
    df_length = 0
    df_columns = []

    # Create UI elements
    head_button = widgets.Button(description='Head')
    prev_button = widgets.Button(description='Previous')
    next_button = widgets.Button(description='Next')
    tail_button = widgets.Button(description='Tail')
    output = widgets.Output()
    dimensions_label = widgets.HTML()

    def materialise(frame):
        """Return a pandas frame, computing ``frame`` first if it is lazy."""
        return frame.compute() if hasattr(frame, 'compute') else frame

    def parse_row_range(key):
        """Return (first_row, end_row) from a 'first-end' key, else None."""
        try:
            first_row, end_row = map(int, key.split('-'))
        except ValueError:
            return None
        return first_row, end_row

    def year_parts_in_row_order(year):
        """Return (first_row, end_row, lazy_frame) triples for one year."""
        year_parts = fahrzeiten_dfs[year]
        ranges = [parse_row_range(key) for key in year_parts]
        if all(bounds is not None for bounds in ranges):
            triples = [
                (first_row, end_row, frame)
                for (first_row, end_row), frame in zip(ranges, year_parts.values())
            ]
            return sorted(triples, key=lambda triple: triple[0])
        # The keys do not encode row numbers: measure the parts instead.
        # len() on a lazy frame triggers a computation, so this path is slow.
        triples, offset = [], 0
        for frame in year_parts.values():
            part_length = len(frame)
            triples.append((offset, offset + part_length, frame))
            offset += part_length
        return triples

    def load_part(first_row, frame):
        """Return the computed part starting at ``first_row``, using the cache."""
        if first_row not in loaded:
            # A page can straddle two parts at most, so two cached parts suffice.
            if len(loaded) >= 2:
                loaded.clear()
            loaded[first_row] = materialise(frame)
        return loaded[first_row]

    # Function to load the table chosen in the dropdown
    def show_df(button_instance=None):
        nonlocal start, parts, df_length, df_columns
        lookup_tables = {
            'Passagierfrequenz': passagierfrequenz_df,
            'Haltestellen': haltestellen_df,
            'Haltepunkte': haltepunkte_df,
        }
        selection = dropdown.value
        if selection in lookup_tables:
            # Lazy Dask frames cannot be sliced by row position, so the lookup
            # tables are computed whole (presumably small; the original
            # len(df) call already read them completely).
            frame = materialise(lookup_tables[selection])
            selected_parts = [(0, len(frame), frame)]
            columns = frame.columns
        elif selection.startswith('Fahrzeiten '):
            year = int(selection.replace('Fahrzeiten ', ''))
            selected_parts = year_parts_in_row_order(year)
            columns = selected_parts[0][2].columns if selected_parts else []
        else:
            return
        start = 0
        parts = selected_parts
        loaded.clear()
        df_length = parts[-1][1] if parts else 0
        df_columns = columns
        show_output()

    def last_page_start():
        """First row of the final page; never negative for short tables."""
        return max(0, df_length - page_size)

    # Event handlers for button clicks
    def on_head_button_clicked(b):
        nonlocal start
        start = 0
        show_output()

    def on_prev_button_clicked(b):
        nonlocal start
        start = max(0, start - page_size)
        show_output()

    def on_next_button_clicked(b):
        nonlocal start
        start = min(last_page_start(), start + page_size)
        show_output()

    def on_tail_button_clicked(b):
        nonlocal start
        start = last_page_start()
        show_output()

    def show_output():
        if parts is None:
            # A navigation button was pressed before any table was selected.
            return
        stop = min(start + page_size, df_length)
        pieces = []
        for first_row, end_row, frame in parts:
            if first_row < stop and start < end_row:
                part_df = load_part(first_row, frame)
                pieces.append(part_df.iloc[max(start - first_row, 0):stop - first_row])
        with output:
            output.clear_output()
            if pieces:
                page = pd.concat(pieces)
                # Label rows by their position in the whole table; the parts'
                # own indexes are not continuous across part boundaries.
                page.index = range(start, start + len(page))
                display(HTML('<div style="overflow-x: auto; white-space: nowrap;">'
                             + page.to_html() + '</div>'))
        dimensions_label.value = f'<h4>Dimensions: ({df_length}, {len(df_columns)})</h4>'

    # Get list of dataframe names
    df_names = ['Passagierfrequenz', 'Haltestellen', 'Haltepunkte'] + [f'Fahrzeiten {year}' for year in fahrzeiten_dfs.keys()]

    # Create dropdown and show button
    dropdown = widgets.Dropdown(options=df_names)
    show_button = widgets.Button(description='Show')

    # Display UI elements
    title = widgets.HTML('<h2 style="text-align: center;">CSVs</h2>')
    box_layout = widgets.Layout(display='flex', justify_content='center')
    display(
        widgets.VBox(
            [
                title, widgets.HBox([dropdown, show_button], layout=box_layout),
                output,
                widgets.HBox([dimensions_label], layout=widgets.Layout(justify_content='flex-start')),
                widgets.HBox([head_button, prev_button, next_button, tail_button], layout=box_layout)
            ],
            layout=box_layout
        )
    )

    # Attach event handlers to buttons
    show_button.on_click(show_df)
    head_button.on_click(on_head_button_clicked)
    prev_button.on_click(on_prev_button_clicked)
    next_button.on_click(on_next_button_clicked)
    tail_button.on_click(on_tail_button_clicked)
    
# Build the CSV browser and display it as this cell's output.
show_csvs()