In [2]:
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view
import pandas as pd
import threading
from datetime import datetime, timedelta
from ipywidgets import widgets
import time
from scipy import stats


def fill_gap(best: dict, receiver: pd.DataFrame, gap_indices: [datetime]) -> None:
    """Fill the gap rows of ``receiver`` with values derived from a donor.

    The donor slice described by ``best`` is loaded, its index is shifted by
    ``best["x_offset"]``, and its ``column_name`` values are multiplied by
    ``best["ratio"]`` and shifted by ``best["y_offset"]``. Rows whose value is
    not strictly between the 5 % and 95 % quantiles are dropped. Every gap
    label that is absent from the donor is then inserted as NaN, the donor is
    interpolated along its index, and the resulting values are written into
    ``receiver`` in place.

    Args:
        best: Mapping with the keys ``filename``, ``start``, ``end``,
            ``x_offset``, ``ratio`` and ``y_offset``.
        receiver: Frame whose ``column_name`` column is updated in place.
        gap_indices: Index labels of ``receiver`` to fill.
    """
    global column_name
    donor = get_donor(best["filename"], best["start"], best["end"])
    donor.index += best["x_offset"]
    donor[column_name] *= best["ratio"]
    donor[column_name] += best["y_offset"]

    values = donor[column_name]
    in_band = (values > values.quantile(0.05)) & (values < values.quantile(0.95))
    # Explicit copy: the trimmed frame is enlarged and interpolated below, and
    # mutating a filtered slice directly triggers SettingWithCopy problems.
    donor = donor[in_band].copy()

    for gap_idx in gap_indices:
        if gap_idx not in donor.index:
            donor.loc[gap_idx] = [np.nan]
    donor = donor.sort_index()
    donor = donor.interpolate(method="index")

    for gap_idx in gap_indices:
        # A single .loc call writes straight into ``receiver``. The previous
        # chained ``receiver[col][idx] = ...`` may assign to a temporary and
        # never does so under pandas Copy-on-Write.
        receiver.loc[gap_idx, column_name] = donor.loc[gap_idx, column_name]


def hotdeck_prehook(next_step):
    """Ensure donors have been chosen before continuing with ``next_step``.

    If the global ``cached_donors`` is not ``None``, ``next_step`` is called
    immediately. Otherwise a multi-select listing the compatible files from
    ``file_select.options`` and a confirm button are displayed. Clicking the
    button removes it, disables the selector, stores the selection in the
    global ``donors`` and then calls ``next_step``.

    Args:
        next_step: Callable invoked without arguments once donors are known.
    """
    # Identity test: ``!= None`` would be routed through the object's own
    # ``__ne__`` instead of simply checking for the None singleton.
    if cached_donors is not None:
        next_step()
        return

    files = filter_compatible_files(file_select.options)
    donors_selector = widgets.SelectMultiple(
        options=files,
        description="Select donors: ",
        rows=len(files)
    )
    confirm_button = widgets.Button(description="Confirm selection")
    display(donors_selector)
    display(confirm_button)

    def callback(e):
        # ``donors`` is rebound here, so it must be declared global; the
        # widgets are only read from the enclosing scope.
        global donors
        confirm_button.close()
        donors_selector.disabled = True
        donors = donors_selector.value
        next_step()

    confirm_button.on_click(callback)


def filter_compatible_files(files: [str]) -> [str]:
    """Return the files that can be used together with the current sheet.

    A file is kept when its name contains ``".csv"``, or when the workbook at
    ``config["upload_dir"] + file`` (opened with the openpyxl engine) has a
    sheet named like the global ``sheet_name``. Workbooks are only opened for
    names that do not contain ``".csv"``.

    Args:
        files: File names relative to ``config["upload_dir"]``.

    Returns:
        The compatible file names, in their original order.
    """

    def _workbook_has_sheet(file: str) -> bool:
        # The context manager closes the workbook handle again; a bare
        # pd.ExcelFile(...) would keep one open file per inspected workbook.
        with pd.ExcelFile(config["upload_dir"] + file, engine='openpyxl') as workbook:
            return sheet_name in workbook.sheet_names

    return [file for file in files if ".csv" in file or _workbook_has_sheet(file)]


def get_normalized_dataframe(df: pd.DataFrame, start_time: datetime, end_time: datetime) -> pd.DataFrame:
    """Return a copy of ``df`` cropped to ``[start_time, end_time]``.

    The frame is first narrowed to the rows between the last label at or
    before ``start_time`` and the first label at or after ``end_time``. If a
    boundary label is not present, a NaN row is inserted for it and filled by
    index-based interpolation, so that the neighbouring rows outside the
    window contribute to the boundary value. Finally everything outside the
    window is dropped. ``df`` itself is not modified.

    NOTE(review): the nearest-label lookups presumably require a sorted index
    without duplicates, and ``df`` must not be empty — confirm with callers.

    Args:
        df: Source frame.
        start_time: First label of the returned frame.
        end_time: Last label of the returned frame.

    Returns:
        A new frame whose index lies within ``[start_time, end_time]``.
    """
    index = df.index
    anchor = start_time if start_time >= index[0] else index[0]
    # Index.get_loc(label, method) was removed in pandas 2.0; get_indexer
    # provides the same 'pad' / 'bfill' nearest-label lookup.
    start_idx = index.get_indexer([anchor], method='pad')[0]
    view = df[index >= index[start_idx]]

    if end_time < view.index[-1]:
        end_idx = view.index.get_indexer([end_time], method='bfill')[0]
        view = view[view.index <= view.index[end_idx]]

    start_time_missing = view.index[0] != start_time
    end_time_missing = view.index[-1] != end_time

    cp = view.copy()
    if start_time_missing or end_time_missing:
        if start_time_missing:
            cp.loc[start_time] = [np.nan]
        if end_time_missing:
            cp.loc[end_time] = [np.nan]
        cp = cp.sort_index()
        cp = cp.interpolate(method="index")

    # Drop the helper rows outside the requested window.
    return cp[(cp.index >= start_time) & (cp.index <= end_time)]


def get_donor(filepath: str, start_time: datetime = None, end_time: datetime = None) -> pd.DataFrame:
    """Return an independent copy of a cached donor, optionally cropped in time.

    Args:
        filepath: Key of the donor in the global ``cached_donors`` mapping.
        start_time: If given, rows with an index before it are dropped.
        end_time: If given, rows with an index after it are dropped.

    Returns:
        A new frame; modifying it does not affect the cached donor.

    Raises:
        KeyError: If ``filepath`` is not present in ``cached_donors``.
    """
    global cached_donors
    donor = cached_donors[filepath]
    if start_time is not None:
        donor = donor[donor.index >= start_time]
    if end_time is not None:
        donor = donor[donor.index <= end_time]
    # Copy last: only the selected rows are duplicated and callers receive a
    # frame they can freely mutate in place.
    return donor.copy()


def load_donors_in_cache(donors: [str], custom_progress_status: widgets.HTML):
    """Read and parse every donor file into the global ``cached_donors`` dict.

    The status widget is updated with the number of donors first. The cache is
    then reset to an empty dict and filled one file at a time, keyed by file
    name, with the result of ``parse_uploaded_file_sync``.

    Args:
        donors: File names relative to ``config["upload_dir"]``.
        custom_progress_status: Widget whose ``value`` receives the status text.
    """
    global sheet_name, column_name, cached_donors
    custom_progress_status.value = f"Hotdeck starting, loading {len(donors)} donors..."
    cached_donors = {}
    for donor_name in donors:
        donor_path = config["upload_dir"] + donor_name
        with open(donor_path, 'rb') as handle:
            raw_content = handle.read()
            # Parsed while the file is still open, keyed by the bare file name.
            cached_donors[donor_name] = parse_uploaded_file_sync(
                donor_name, raw_content, sheet_name, column_name
            )


def get_gap_boundaries(df: pd.DataFrame, gap_start_time: datetime, gap_end_time: datetime) -> tuple:
    """Return the positions of the rows directly surrounding a gap.

    Args:
        df: Frame whose index contains both gap labels.
        gap_start_time: Index label of the first gap row.
        gap_end_time: Index label of the last gap row.

    Returns:
        ``(before, after)``: the position one row before ``gap_start_time``
        and one row after ``gap_end_time``, clamped to ``0`` and
        ``len(df) - 1`` respectively.
    """
    row_locator = df.index.get_loc
    before = row_locator(gap_start_time) - 1
    after = row_locator(gap_end_time) + 1

    # Keep both positions inside the frame.
    last_position = len(df) - 1
    return max(before, 0), min(after, last_position)


def get_sampling_durations(receiver: pd.DataFrame, gap_start_idx: int, gap_end_idx: int, gap_start_time: datetime, gap_end_time: datetime) -> tuple:
    """Measure how much gap-free data surrounds a gap on each side.

    Starting at ``gap_start_idx`` and moving backwards, and at ``gap_end_idx``
    moving forwards, rows are consumed until a NaN value in ``column_name`` is
    found, the frame boundary is reached, or the covered time span reaches the
    limit. The limit is the gap duration, but at least 20 minutes. Because the
    limit is checked before each step, a returned duration may exceed it by
    one sample interval.

    Args:
        receiver: Frame containing the gap.
        gap_start_idx: Position of the row before the gap.
        gap_end_idx: Position of the row after the gap.
        gap_start_time: Index label of the first gap row.
        gap_end_time: Index label of the last gap row.

    Returns:
        ``(duration_before, duration_after)``.
    """
    global column_name
    gap_duration = gap_end_time - gap_start_time
    max_duration = max(gap_duration, timedelta(minutes=20))

    # Resolve the column and index once; rows are then addressed by position.
    values = receiver[column_name]
    index = receiver.index
    index_count = len(index)

    duration_before = timedelta(seconds=0)
    # ``>= 0`` lets the very first row contribute, mirroring the forward scan
    # below, which does include the last row.
    while gap_start_idx >= 0 \
            and duration_before < max_duration \
            and not np.isnan(values.iloc[gap_start_idx]):
        duration_before = gap_start_time - index[gap_start_idx]
        gap_start_idx -= 1

    duration_after = timedelta(seconds=0)
    while gap_end_idx < index_count \
            and duration_after < max_duration \
            and not np.isnan(values.iloc[gap_end_idx]):
        duration_after = index[gap_end_idx] - gap_end_time
        gap_end_idx += 1

    return duration_before, duration_after


def run_ui(gap_indices_count: int, custom_progress_status: widgets.HTML):
    """Show a progress bar and keep it updated until all gaps are processed.

    The global ``gaps_done`` counter is polled every half second until it
    equals ``gap_indices_count``. The progress is then refreshed one final
    time and the status widget is set to the total time taken.

    Args:
        gap_indices_count: Number of gaps that count as completion.
        custom_progress_status: Widget whose ``value`` receives the status text.
    """
    global gaps_done
    started_at = datetime.now()
    bar = widgets.IntProgress(min=0, max=gap_indices_count, value=0)
    display(bar)

    while True:
        if gaps_done == bar.max:
            break
        display_progress(started_at, custom_progress_status, bar)
        time.sleep(0.5)

    # Final refresh so the bar shows the completed count before the summary.
    display_progress(started_at, custom_progress_status, bar)
    time_taken = datetime.now() - started_at
    custom_progress_status.value = f"Time taken: {time_taken}"


def display_progress(start_time: datetime, custom_progress_status: widgets.HTML, progress_bar) -> None:
    """Refresh the progress bar and the ETA text from the global ``gaps_done``.

    The total run time is extrapolated from the elapsed time as
    ``elapsed / (gaps_done + 1) * progress_bar.max``; the ``+ 1`` avoids a
    division by zero before the first gap is finished. Both the ETA and the
    elapsed time are shown with whole-second resolution.

    Args:
        start_time: Moment the processing started.
        custom_progress_status: Widget whose ``value`` receives the ETA text.
        progress_bar: Progress widget providing ``max`` and receiving
            ``description`` and ``value``.
    """
    global gaps_done
    # Read the counter once so the label, the bar and the estimate agree even
    # if it is updated elsewhere while this refresh is running.
    done = gaps_done
    progress_bar.description = "%d/%d: " % (done, progress_bar.max)
    progress_bar.value = done
    elapsed = datetime.now() - start_time
    total_time_estimate = (elapsed / (done + 1)) * progress_bar.max
    # With all gaps done (or max == 0) the estimate falls below the elapsed
    # time; clamp so the ETA is never rendered as "-1 day, 23:59:59".
    eta = max(total_time_estimate - elapsed, timedelta(0))
    eta -= timedelta(microseconds=eta.microseconds)
    elapsed -= timedelta(microseconds=elapsed.microseconds)
    custom_progress_status.value = f"ETA: {eta} ({elapsed} elapsed)"