In [None]:
import numpy as np
import pandas as pd
import threading
import multiprocessing
from datetime import datetime, timedelta
from scipy.spatial.distance import directed_hausdorff
from ipywidgets import widgets
import time

# Re-entrant lock guarding the shared list of pending gaps; pop_gap() acquires
# it before inspecting and popping from that list.
gap_indices_lock = threading.RLock()

def fill_gap(receiver: pd.DataFrame, gap: [datetime], gap_start_time: datetime, gap_end_time: datetime, scoreboard: [dict]) -> None:
    """Fill one gap of *receiver* in place from the best-scoring donor.

    Args:
        receiver: Frame whose ``column_name`` column receives the values.
        gap: Timestamps of the rows to fill.
        gap_start_time: Label read from *receiver* for the fallback donor.
        gap_end_time: Label read from *receiver* for the fallback donor.
        scoreboard: Candidate dicts with the keys ``score``, ``filename``,
            ``start``, ``end``, ``x_offset`` and ``y_offset``. The list is
            sorted in place by ascending ``score``.
    """
    global column_name

    if not scoreboard:
        # No candidate at all: build a two-point donor from the receiver's own
        # values at the two given timestamps; transpose_data() interpolates
        # the rows in between.
        edge_times = [gap_start_time, gap_end_time]
        edge_values = [
            receiver[column_name][gap_start_time],
            receiver[column_name][gap_end_time],
        ]
        donor = pd.DataFrame({column_name: edge_values}, index=edge_times)
    else:
        # Lowest score first; the head of the list is the donor that is used.
        scoreboard.sort(key=lambda entry: entry["score"])
        winner = scoreboard[0]
        donor = get_donor(winner["filename"], winner["start"], winner["end"])
        # Shift the donor slice along the index and the value axis using the
        # offsets stored with the candidate.
        donor.index = donor.index + winner["x_offset"]
        donor[column_name] = donor[column_name] - winner["y_offset"]

    transpose_data(donor, receiver, gap)


def filter_compatible_files(files: [str]) -> [str]:
    """Return the entries of *files* that can be used as donors.

    A file is kept when its name contains ``".csv"``, or when the workbook at
    ``config["upload_dir"] + file`` has a sheet named ``sheet_name``.

    Args:
        files: File names relative to ``config["upload_dir"]``.

    Returns:
        The compatible file names, in their original order.
    """
    global sheet_name

    compatible = []
    for file in files:
        if ".csv" in file:
            compatible.append(file)
            continue
        # Open the workbook in a context manager so its file handle is
        # released as soon as the sheet names have been inspected.
        with pd.ExcelFile(config["upload_dir"] + file, engine='openpyxl') as workbook:
            if sheet_name in workbook.sheet_names:
                compatible.append(file)
    return compatible


def get_similarity_score(before: pd.DataFrame, after: pd.DataFrame, donor_before: pd.DataFrame, donor_after: pd.DataFrame) -> float:
    """Score how closely the donor samples match the receiver samples.

    The directed Hausdorff distances of the *before* pair and the *after* pair
    are summed once in the receiver-to-donor direction and once in the
    donor-to-receiver direction; the larger of the two sums is returned.
    """
    receiver_to_donor = (
        directed_hausdorff(before, donor_before)[0]
        + directed_hausdorff(after, donor_after)[0]
    )
    donor_to_receiver = (
        directed_hausdorff(donor_before, before)[0]
        + directed_hausdorff(donor_after, after)[0]
    )
    return max(receiver_to_donor, donor_to_receiver)


def get_y_offsets(original_sample_mean: float, before: pd.DataFrame, after: pd.DataFrame, donor_before: pd.DataFrame, donor_after: pd.DataFrame) -> tuple:
    """Compute the vertical offsets between donor and receiver samples.

    Returns:
        ``(y_offset, adjusted_y_offset)`` where ``adjusted_y_offset`` is the
        donor mean minus the receiver mean (each mean being the average of the
        *before* and *after* sample means of ``column_name``), and ``y_offset``
        is that value reduced by ``original_sample_mean - receiver mean``.
    """
    global column_name

    def _two_sided_mean(first: pd.DataFrame, second: pd.DataFrame) -> float:
        # Average of the two per-sample means, not the mean of the pooled rows.
        return (first[column_name].mean() + second[column_name].mean()) / 2

    receiver_level = _two_sided_mean(before, after)
    donor_level = _two_sided_mean(donor_before, donor_after)

    adjusted_y_offset = donor_level - receiver_level
    receiver_drift = original_sample_mean - receiver_level
    return adjusted_y_offset - receiver_drift, adjusted_y_offset


def _nearest_position(index: pd.Index, key: datetime, method: str) -> int:
    """Return the position in *index* matching *key* using *method* ('pad' or 'bfill')."""
    # Index.get_loc(key, method=...) was removed in pandas 2.0; get_indexer is
    # the supported replacement. It reports "no match" as -1, so translate that
    # back into the KeyError that get_loc used to raise.
    position = index.get_indexer([key], method=method)[0]
    if position == -1:
        raise KeyError(key)
    return position


def get_normalized_dataframe(df: pd.DataFrame, start_time: datetime, end_time: datetime) -> pd.DataFrame:
    """Return a copy of *df* restricted to ``[start_time, end_time]``.

    When *start_time* or *end_time* is not the first/last label of the
    selected window, a NaN row is inserted at that timestamp and the frame is
    interpolated with ``method="index"`` before it is clipped, so the result
    starts and ends exactly at the requested timestamps.

    Args:
        df: Frame to slice. The inserted rows are written as ``[np.nan]``, so
            a single-column frame is assumed -- TODO confirm with callers.
        start_time: Lower bound of the window (inclusive).
        end_time: Upper bound of the window (inclusive).

    Returns:
        A new DataFrame; *df* itself is not modified.

    Raises:
        KeyError: If no index label can be matched for a boundary.
    """
    first_label = df.index[0]
    # Start from the last row at or before start_time (clamped to the first
    # row) so there is a left neighbour to interpolate from.
    start_idx = _nearest_position(
        df.index, start_time if start_time >= first_label else first_label, 'pad'
    )
    view = df[df.index >= df.index[start_idx]]

    if end_time < view.index[-1]:
        # Keep rows up to the first one at or after end_time so there is a
        # right neighbour to interpolate from.
        end_idx = _nearest_position(view.index, end_time, 'bfill')
        view = view[view.index <= view.index[end_idx]]

    start_time_missing = view.index[0] != start_time
    end_time_missing = view.index[-1] != end_time

    cp = view.copy()

    if start_time_missing or end_time_missing:
        if start_time_missing:
            cp.loc[start_time] = [np.nan]
        if end_time_missing:
            cp.loc[end_time] = [np.nan]
        cp.sort_index(inplace=True)
        cp.interpolate(method="index", inplace=True)
        # Drop the helper rows that lie outside the requested window.
        cp = cp[cp.index >= start_time]
        cp = cp[cp.index <= end_time]

    return cp


def get_donor(filepath: str, start_time: datetime = None, end_time: datetime = None) -> pd.DataFrame:
    """Return a copy of a cached donor, optionally restricted to a time window.

    Args:
        filepath: Key of the donor in the module-level ``cached_donors`` dict.
        start_time: If given, rows with an index before it are dropped.
        end_time: If given, rows with an index after it are dropped.

    Returns:
        A DataFrame independent of the cached one, so callers may modify it.

    Raises:
        KeyError: If *filepath* is not present in ``cached_donors``.
    """
    global cached_donors

    donor = cached_donors[filepath].copy()
    # Identity comparison: "no bound" is signalled by None specifically.
    if start_time is not None:
        donor = donor[donor.index >= start_time]
    if end_time is not None:
        donor = donor[donor.index <= end_time]
    return donor


def load_donors_in_cache(donors: [str], custom_progress_status: widgets.HTML):
    """Read and parse every donor file into the module-level ``cached_donors``.

    Args:
        donors: File names relative to ``config["upload_dir"]``.
        custom_progress_status: Widget whose ``value`` receives the status text.
    """
    global sheet_name, column_name, cached_donors

    donor_count = len(donors)
    custom_progress_status.value = f"Hotdeck starting, loading {donor_count} donors..."

    # Start from an empty cache; entries are added one file at a time.
    cached_donors = {}
    for donor_name in donors:
        donor_path = config["upload_dir"] + donor_name
        with open(donor_path, 'rb') as handle:
            raw_bytes = handle.read()
            cached_donors[donor_name] = parse_uploaded_file_sync(
                donor_name, raw_bytes, sheet_name, column_name
            )


def transpose_data(donor: pd.DataFrame, receiver: pd.DataFrame, gap_indices: [datetime]) -> None:
    """Copy donor values into *receiver* at every timestamp in *gap_indices*.

    Timestamps missing from *donor* are first added as NaN rows and filled by
    index-based interpolation. Both frames are modified in place.

    Args:
        donor: Source frame; gains a row for each gap timestamp it lacks.
        receiver: Target frame; its ``column_name`` column is overwritten at
            the gap timestamps.
        gap_indices: Timestamps to fill.
    """
    global column_name

    for gap_idx in gap_indices:
        if gap_idx not in donor.index:
            donor.loc[gap_idx] = [np.nan]

    donor.sort_index(inplace=True)
    donor.interpolate(method="index", inplace=True)

    for gap_idx in gap_indices:
        # Single-step .loc assignment: the chained form
        # receiver[col][idx] = ... writes to a temporary under pandas
        # Copy-on-Write and would leave the receiver unchanged.
        receiver.loc[gap_idx, column_name] = donor.loc[gap_idx, column_name]


def get_gap_boundaries(df: pd.DataFrame, gap_start_time: datetime, gap_end_time: datetime) -> tuple:
    """Return the positions of the rows just outside a gap.

    Returns:
        ``(gap_start_idx, gap_end_idx)``: the position one before
        *gap_start_time* and one after *gap_end_time*, each clamped to the
        valid range ``0 .. len(df) - 1``.
    """
    last_position = len(df) - 1
    position_before = df.index.get_loc(gap_start_time) - 1
    position_after = df.index.get_loc(gap_end_time) + 1

    # Clamp so a gap touching either end of the frame stays in bounds.
    return max(position_before, 0), min(position_after, last_position)


def pop_gap(gap_indices: [[datetime]]) -> [datetime]:
    """Remove and return the first pending gap, or ``None`` when none is left.

    The check and the pop happen while holding ``gap_indices_lock``.
    """
    global gap_indices_lock
    with gap_indices_lock:
        if len(gap_indices) == 0:
            return None
        return gap_indices.pop(0)


def get_sampling_durations(receiver: pd.DataFrame, gap_start_idx: int, gap_end_idx: int, gap_start_time: datetime, gap_end_time: datetime) -> tuple:
    """Measure how much contiguous non-NaN data surrounds a gap.

    Walks outwards from *gap_start_idx* (backwards) and *gap_end_idx*
    (forwards) while the ``column_name`` value is not NaN and the covered
    duration is still below the cap. The cap is 1.5 times the gap duration
    when the gap is longer than 1800 seconds, otherwise 1800 seconds.

    Returns:
        ``(duration_before, duration_after)``.
    """
    global column_name

    half_hour = timedelta(seconds=1800)
    gap_duration = gap_end_time - gap_start_time
    if gap_duration > half_hour:
        max_duration = gap_duration * 1.5
    else:
        max_duration = half_hour
    index_count = len(receiver.index)

    def _has_value(position: int) -> bool:
        # Label-based lookup of the receiver value at the given position.
        return not np.isnan(receiver[column_name][receiver.index[position]])

    # NOTE(review): the backward walk stops at position 1 (`> 0`), so row 0 is
    # never inspected, whereas the forward walk does include the last row.
    # Confirm whether that asymmetry is intended.
    duration_before = timedelta(seconds=0)
    cursor = gap_start_idx
    while cursor > 0 and duration_before < max_duration and _has_value(cursor):
        duration_before = gap_start_time - receiver.index[cursor]
        cursor -= 1

    duration_after = timedelta(seconds=0)
    cursor = gap_end_idx
    while cursor < index_count and duration_after < max_duration and _has_value(cursor):
        duration_after = receiver.index[cursor] - gap_end_time
        cursor += 1

    return duration_before, duration_after


def display_progress(custom_progress_status: widgets.HTML, gaps_left: int) -> None:
    """Update the progress bar and the ETA text.

    Reads the module-level ``progress_bar`` (its ``max`` is used as the total
    number of gaps) and ``start_time`` (when processing began).

    Args:
        custom_progress_status: Widget whose ``value`` receives the ETA text.
        gaps_left: Number of gaps still waiting to be processed.
    """
    global start_time, progress_bar

    total_gaps = progress_bar.max
    gaps_done = total_gaps - gaps_left
    progress_bar.description = "%d/%d: " % (gaps_done, total_gaps)
    progress_bar.value = gaps_done

    elapsed = datetime.now() - start_time
    # The +1 keeps the divisor non-zero before the first gap is finished.
    total_time_estimate = (elapsed / (gaps_done + 1)) * total_gaps
    # Because of that +1 the estimate drops below the elapsed time once all
    # gaps are done; clamp so the ETA never renders as "-1 day, 23:59:59".
    eta = max(total_time_estimate - elapsed, timedelta(0))
    # Strip sub-second precision for display.
    eta -= timedelta(microseconds=eta.microseconds)
    elapsed -= timedelta(microseconds=elapsed.microseconds)
    custom_progress_status.value = f"ETA: {eta} ({elapsed} elapsed)"


def start_workers(receiver: pd.DataFrame, gap_indices: [[datetime]], custom_progress_status: widgets.HTML) -> [threading.Thread]:
    """Spawn and start the worker threads that run ``impute``.

    Every thread receives the same *receiver*, *gap_indices* and
    *custom_progress_status* objects.

    Returns:
        The started threads, in creation order.
    """
    # NOTE(review): max() makes 31 a lower bound rather than a cap, and one
    # fewer thread than thread_count is started -- confirm both are intended.
    thread_count = max(multiprocessing.cpu_count() - 1, 31)
    worker_args = (receiver, gap_indices, custom_progress_status)

    threads = []
    for _ in range(thread_count - 1):
        worker = threading.Thread(target=impute, args=worker_args)
        threads.append(worker)
        worker.start()
    return threads
