# Pipeline

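This notebook runs the hot-deck imputation pipeline end to end: it installs the dependencies, loads and truncates the data, creates artificial gaps, fills them with the hot-deck imputer, and evaluates the results.

TODO: expand this overview.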

## Install dependencies

In [None]:
!pip install openpyxl > /dev/null 2>&1
!pip install jupyterlab-widgets > /dev/null 2>&1
!pip install jsfileupload > /dev/null 2>&1
!pip install pyxlsb > /dev/null 2>&1
!pip install sklearn > /dev/null 2>&1
!pip install scipy > /dev/null 2>&1

## Set an arbitrary random state

In [None]:
import random
# Seeds only the standard-library `random` generator so runs are repeatable;
# NumPy's generator is not seeded here.
# NOTE(review): confirm the helper notebooks draw randomness from `random` only.
random.seed(0)

## Run the pre-imputation script

In other words: load the data, truncate it, and create gaps.

In [None]:
# Execute the helper notebook; the names it defines become available to the
# cells below. The next cell expects it to have defined `df`
# (presumably the loaded dataset — confirm in the helper).
%run helpers/pre_imputation.ipynb

In [None]:
# Keep only the first 10000 rows so the rest of the pipeline works on a
# truncated dataset, then display the result as the cell output.
df = df.iloc[:10000]
df

In [None]:
# Execute the gap-creation helper notebook on the truncated data.
# NOTE(review): presumably this defines `dfs_with_gaps` and `gaps_indices`,
# which the imputation cell further down reads — confirm in the helper.
%run helpers/create_gaps.ipynb

## Define the hotdeck imputer

In [None]:
from datetime import timedelta

# Config
# Extra time added on each side of a gap's sampling window when impute()
# computes the time range requested from every donor.
surrounding_duration = timedelta(days=15)

# Internal globals
# hotdeck() loads the donors into the cache only while this is None
# (presumably assigned by the hot-deck helpers — confirm).
cached_donors = None
# Collection of donor files iterated by impute(); None until assigned
# elsewhere (presumably by the hot-deck helpers — confirm).
donors = None
# Number of gaps processed in the current hotdeck() run: reset to 0 by
# hotdeck() and incremented once per gap by impute().
gaps_done = 0

In [None]:
# Import the helpers by executing the helper notebook in this session.
# NOTE(review): the functions used below but not defined in this notebook
# (load_donors_in_cache, run_ui, get_gap_boundaries, get_sampling_durations,
# get_normalized_dataframe, get_donor, fill_gap, hotdeck_prehook) are
# presumably defined there — confirm.
%run helpers/hotdeck.ipynb

In [None]:
def hotdeck(receiver: pd.DataFrame, config: object) -> pd.DataFrame:
    """Impute the gaps of ``receiver`` with the hot-deck method.

    Displays an HTML status widget, loads the donors into the cache when the
    module-level ``cached_donors`` is still ``None``, resets the ``gaps_done``
    counter, then runs ``impute`` on a copy of ``receiver`` while ``run_ui``
    runs in a separate thread.

    Args:
        receiver: The time series containing the gaps. It is not modified;
            the imputation works on a copy.
        config: Mapping-like object; only ``config["current_gap_indices"]``
            is read here (the gaps to fill).

    Returns:
        The imputed copy of ``receiver``.
    """
    # Only `gaps_done` is rebound here; the other globals are just read.
    global gaps_done

    custom_progress_status = widgets.HTML()
    display(custom_progress_status)

    # Identity check on purpose: `== None` is evaluated element-wise by
    # pandas/NumPy objects and makes the `if` ambiguous or wrong once the
    # cache holds real data.
    if cached_donors is None:
        load_donors_in_cache(donors, custom_progress_status)

    gaps_done = 0
    receiver_cp = receiver.copy()
    gap_indices = config["current_gap_indices"]

    # run_ui gets the total number of gaps and the status widget.
    # NOTE(review): presumably it polls `gaps_done` until that total is
    # reached; if impute() raises, the thread is never joined — confirm how
    # run_ui terminates before adding error handling here.
    ui_thread = threading.Thread(target=run_ui, args=(len(gap_indices), custom_progress_status))
    ui_thread.start()
    impute(receiver_cp, gap_indices)
    ui_thread.join()

    return receiver_cp


def impute(receiver: pd.DataFrame, gap_indices: [[datetime]]) -> None:
    """Fill every gap of ``receiver`` in place using the best-matching donor.

    For each gap, the data right before and right after it is extracted from
    ``receiver``, every donor that has data in the enlarged time window is
    scored with ``scan_donor``, and the lowest-scoring candidate is handed to
    ``fill_gap``. ``gaps_done`` is incremented once per gap, whether or not a
    donor was found. Finally ``receiver`` is interpolated in place with the
    ``"index"`` method.

    Args:
        receiver: Series to impute; modified in place.
        gap_indices: One indexable entry per gap; its first and last items
            are passed to ``get_gap_boundaries``.
    """
    global gaps_done

    for current_gap in gap_indices:
        first_pos, last_pos = get_gap_boundaries(receiver, current_gap[0], current_gap[-1])
        first_time = receiver.index[first_pos]
        last_time = receiver.index[last_pos]

        lead, trail = get_sampling_durations(receiver, first_pos, last_pos, first_time, last_time)

        # Receiver data surrounding the gap, used as the pattern to match
        context_before = get_normalized_dataframe(receiver, first_time - lead, first_time)
        context_after = get_normalized_dataframe(receiver, last_time, last_time + trail)

        # Donors are searched in a window widened by `surrounding_duration`
        window_start = first_time - (lead + surrounding_duration)
        window_end = last_time + (trail + surrounding_duration)

        candidates = []
        for donor_file in donors:
            donor_frame = get_donor(donor_file, window_start, window_end)
            if len(donor_frame.index) == 0:
                # This donor has no data in the window
                continue
            candidates.append(
                scan_donor(context_before.copy(), context_after.copy(), donor_file, donor_frame)
            )

        if candidates:
            # Stable sort: among equal scores the earliest donor wins
            ranked = sorted(candidates, key=lambda entry: entry["score"])
            fill_gap(ranked[0], receiver, current_gap)
        gaps_done += 1

    receiver.interpolate(method="index", inplace=True)


def scan_donor(before: pd.DataFrame, after: pd.DataFrame, donor_filename: str, donor: pd.DataFrame) -> dict:
    """Find the position in ``donor`` that best matches the data around a gap.

    The values of ``before`` and ``after`` (column ``column_name``) are
    concatenated into one key vector. Every pair of sliding windows taken
    from the donor is shifted to the key's mean with ``compare_diff`` and
    scored by the sum of absolute differences to the key. The lowest-scoring
    window pair is returned.

    Args:
        before: Receiver data right before the gap.
        after: Receiver data right after the gap.
        donor_filename: Identifier of the donor, copied into the result.
        donor: Donor data covering the search window.

    Returns:
        A single dict with the keys ``"score"``, ``"x_offset"``,
        ``"y_offset"``, ``"ratio"`` (always 1 with the current comparison),
        ``"start"``, ``"end"`` and ``"filename"``.
    """
    keys = np.concatenate([before[column_name].values, after[column_name].values])

    gap_size = after.index[0] - before.index[-1]
    before_size = before.index[-1] - before.index[0]
    after_size = after.index[-1] - after.index[0]

    # Portions of the donor in which a "before" (resp. "after") window can lie
    donor_before = get_normalized_dataframe(donor, donor.index[0], donor.index[-1] - (after_size + gap_size))
    donor_after = get_normalized_dataframe(donor, donor.index[0] + (gap_size + before_size), donor.index[-1])

    # One row per window of consecutive values, as long as the receiver context
    sliding_before = sliding_window_view(donor_before[column_name].values, window_shape=len(before.index))
    sliding_after = sliding_window_view(donor_after[column_name].values, window_shape=len(after.index))

    # Pair the i-th "before" window with the i-th "after" window
    length = min(len(sliding_before), len(sliding_after))
    matrix = np.concatenate([sliding_before[:length], sliding_after[:length]], axis=1)

    # compare_scale() is an alternative alignment that also rescales the
    # amplitude; when switching to it, report ratios[best] as "ratio" below.
    matrix, y_offsets = compare_diff(keys, matrix)

    # Score of each candidate: sum of absolute differences to the key
    comp = np.absolute(matrix - keys).sum(axis=1)
    # argsort (rather than argmin) is kept on purpose: it orders NaN scores
    # last, so a NaN-scored window is only picked when every score is NaN.
    best = np.argsort(comp)[0]

    # NOTE(review): `best` is a row index into windows built from the output
    # of get_normalized_dataframe but is used to index `donor.index`; this
    # assumes both share the same positions — confirm.
    return {
        "score": comp[best],
        "x_offset": before.index[0] - donor.index[best],
        "y_offset": y_offsets[best],
        "ratio": 1,
        "start": donor.index[best],
        "end": donor.index[best] + before_size + after_size + gap_size,
        "filename": donor_filename,
    }


def compare_diff(keys, matrix):
    """Shift every row of ``matrix`` so that its mean equals the mean of ``keys``.

    Args:
        keys: 1-D NumPy array of reference values.
        matrix: 2-D NumPy array with one candidate per row. It is not
            modified.

    Returns:
        A tuple ``(shifted, y_offsets)``: ``shifted`` is a new array with the
        shape of ``matrix`` and ``y_offsets`` holds the offset added to each
        row.
    """
    y_offsets = keys.mean() - matrix.mean(axis=1)
    # Broadcast one offset per row instead of looping over rows in Python;
    # this also keeps the 2-D shape when `matrix` has no rows.
    matrix = matrix + y_offsets[:, np.newaxis]
    return matrix, y_offsets


def compare_scale(keys, matrix):
    """Rescale each row of ``matrix`` to the amplitude of ``keys``, then align the means.

    Each row is multiplied by ``amplitude(keys) / amplitude(row)``, where the
    amplitude is ``max - min``; rows with zero amplitude keep a ratio of 1.
    The scaled rows are then shifted with ``compare_diff``.

    Args:
        keys: 1-D NumPy array of reference values.
        matrix: 2-D NumPy array with one candidate per row. It is not
            modified.

    Returns:
        A tuple ``(matrix, y_offsets, ratios)`` with the transformed rows,
        the offset added to each row and the factor applied to each row.
    """
    # Scale it
    keys_amp = abs(keys.max() - keys.min())
    mamps = np.absolute(matrix.max(axis=1) - matrix.min(axis=1))
    # Start from ones so that flat rows (zero amplitude) are left unscaled
    ratios = np.ones(len(mamps))
    np.divide(keys_amp, mamps, out=ratios, where=mamps != 0)
    # Broadcast one ratio per row instead of looping over rows in Python
    matrix = matrix * ratios[:, np.newaxis]
    # Shift it by the mean
    matrix, y_offsets = compare_diff(keys, matrix)
    return matrix, y_offsets, ratios


## Run the imputation

In [None]:
# HTML widget whose value do_imputation() updates with the overall progress
imputation_status = widgets.HTML(value="")
display(imputation_status)

# Results collected by do_imputation(): one imputed DataFrame per entry of
# dfs_with_gaps, appended in the same order
imputed_dfs = []


def do_imputation():
    """Run ``hotdeck`` on every gapped dataset and collect the results.

    For each position in ``dfs_with_gaps`` the matching entry of
    ``gaps_indices`` is stored under ``dataset_config['current_gap_indices']``
    before ``hotdeck`` is called. Each result is appended to the module-level
    ``imputed_dfs`` and ``imputation_status`` shows the progress.
    """
    total = len(dfs_with_gaps)
    for idx in range(total):
        # The counter shows the number of datasets already completed
        imputation_status.value = f"Running imputation... ({idx}/{total})"
        dataset_config['current_gap_indices'] = gaps_indices[idx]
        imputed_dfs.append(hotdeck(dfs_with_gaps[idx], dataset_config))
    imputation_status.value = "Imputation complete"


# Hand the imputation routine to hotdeck_prehook as a callback.
# NOTE(review): presumably the hook performs its own setup (e.g. donor
# selection) and then calls do_imputation — confirm in helpers/hotdeck.ipynb.
hotdeck_prehook(do_imputation)

## Run the post-imputation script

In other words: visualize and evaluate the results.

In [None]:
# Execute the evaluation helper notebook in this session.
# NOTE(review): presumably it reads `imputed_dfs` produced above — confirm.
%run helpers/evaluate.ipynb