# Pipeline

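This notebook runs the full imputation pipeline: it installs the dependencies, sets the random state, runs the pre-imputation script (load the data, truncate it and create gaps), defines and runs the hotdeck imputer, and finally runs the post-imputation script to visualize and evaluate the results.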

## Install dependencies

In [None]:
!pip install openpyxl > /dev/null 2>&1
!pip install jupyterlab-widgets > /dev/null 2>&1
!pip install jsfileupload > /dev/null 2>&1
!pip install pyxlsb > /dev/null 2>&1
!pip install sklearn > /dev/null 2>&1
!pip install scipy > /dev/null 2>&1

## Set an arbitrary random state

In [None]:
import random
# Seed Python's built-in generator with a fixed, arbitrary value.
# NOTE(review): this does not seed NumPy's (or any other library's) generator —
# confirm whether the helper notebooks draw random numbers from those.
random.seed(0)

## Run the pre-imputation script

This step loads the data, truncates it and creates the gaps.

In [None]:
# Execute the helper notebook. Later cells use names that are not defined in this
# notebook (e.g. dfs_with_gaps, gaps_indices, dataset_config) — presumably this
# helper provides them; TODO confirm.
%run helpers/pre_imputation.ipynb

## Define the hotdeck imputer

In [None]:
# Execute the hot-deck helper notebook. The functions below call helpers that are
# not defined in this notebook (e.g. pop_gap, get_donor, fill_gap) — presumably
# they are defined there; TODO confirm.
%run helpers/hotdeck.ipynb

# Config
# Amount by which scan_donor() slides the comparison samples on every iteration.
step = timedelta(seconds=300)
# Extra time added on each side of the sampled window when impute() requests donor data.
surrounding_duration = timedelta(seconds=3600)

# Internal globals
# Progress widget; (re)created by hotdeck() on every run.
progress_bar = None
# hotdeck() prompts for donors only while this is None. It is never assigned in
# this notebook — presumably load_donors_in_cache() fills it; TODO confirm.
cached_donors = None
# Replaced with datetime.now() when hotdeck() starts; used to report the time taken.
start_time = 0
# Result of prompt_donor_selection(); iterated by impute().
donors = None

In [None]:
async def hotdeck(receiver: pd.DataFrame, config: dict) -> pd.DataFrame:
    """Run the hot-deck imputation on a copy of ``receiver`` and return that copy.

    While the module-level ``cached_donors`` is ``None`` the user is first
    prompted to select donors, which are then handed to
    ``load_donors_in_cache()``. Afterwards the workers returned by
    ``start_workers()`` and the calling thread itself all run the imputation on
    the same copies of the data and of the gap list.

    Args:
        receiver: Data containing the gaps. It is copied; the copy is what gets
            passed to the imputation.
        config: Mapping whose ``"current_gap_indices"`` entry holds the gaps to
            fill. That entry is copied before use.

    Returns:
        The copy of ``receiver`` that was passed through the imputation.
    """
    global donors, start_time, progress_bar

    custom_progress_status = widgets.HTML()
    # Identity test on purpose: ``== None`` is evaluated element-wise by
    # pandas/NumPy objects, which would make this ``if`` raise.
    if cached_donors is None:
        donors = await prompt_donor_selection()
        display(custom_progress_status)
        load_donors_in_cache(donors, custom_progress_status)
    else:
        display(custom_progress_status)

    # Work on copies so neither the caller's data nor its gap list is consumed.
    receiver_cp = receiver.copy()
    gap_indices_cp = config["current_gap_indices"].copy()

    # Init global variables
    start_time = datetime.now()
    progress_bar = widgets.IntProgress(min=0, max=len(gap_indices_cp), value=0)
    display(progress_bar)

    threads = start_workers(receiver_cp, gap_indices_cp, custom_progress_status)
    # The calling thread takes part in the work as well, then waits for the workers.
    impute(receiver_cp, gap_indices_cp, custom_progress_status)
    for thread in threads:
        thread.join()
    custom_progress_status.value = f"Time taken: {(datetime.now() - start_time)}"
    return receiver_cp


def impute(receiver: pd.DataFrame, gap_indices: [[datetime]], custom_progress_status: widgets.HTML) -> None:
    """Process gaps from ``gap_indices`` until ``pop_gap()`` returns ``None``.

    For every gap, a sample before and a sample after the gap are taken from
    ``receiver``, every non-empty donor window is scored with ``scan_donor()``,
    and the collected scores are handed to ``fill_gap()``.

    Args:
        receiver: Data containing the gaps; passed on to ``fill_gap()``.
        gap_indices: Remaining gaps; consumed through ``pop_gap()``.
        custom_progress_status: Widget passed to ``display_progress()``.
    """
    # ``is not None`` rather than ``!= None``: an identity test cannot be
    # hijacked by the element-wise comparison of array-like gap objects.
    while (gap := pop_gap(gap_indices)) is not None:
        display_progress(custom_progress_status, len(gap_indices))

        gap_start_idx, gap_end_idx = get_gap_boundaries(receiver, gap[0], gap[-1])
        gap_start_time = receiver.index[gap_start_idx]
        gap_end_time = receiver.index[gap_end_idx]

        duration_before, duration_after = get_sampling_durations(receiver, gap_start_idx, gap_end_idx, gap_start_time, gap_end_time)

        # Comparison samples taken on each side of the gap.
        before = get_normalized_dataframe(receiver, gap_start_time - duration_before, gap_start_time)
        after = get_normalized_dataframe(receiver, gap_end_time, gap_end_time + duration_after)

        # The donor window is the sampled span widened by surrounding_duration on both sides.
        donor_start_time = gap_start_time - (duration_before + surrounding_duration)
        donor_end_time = gap_end_time + (duration_after + surrounding_duration)

        scoreboard = []

        for file in donors:
            donor = get_donor(file, donor_start_time, donor_end_time)
            if len(donor.index) == 0:
                continue
            # scan_donor() modifies its samples, so each donor gets fresh copies.
            scoreboard.extend(scan_donor(before.copy(), after.copy(), file, donor))

        fill_gap(receiver, gap, gap_start_time, gap_end_time, scoreboard)


def scan_donor(before: pd.DataFrame, after: pd.DataFrame, donor_filename: str, donor: pd.DataFrame) -> [dict]:
    """Slide the comparison samples across ``donor`` and score every position.

    ``before`` and ``after`` are modified (their index and their
    ``column_name`` values), so callers should pass copies.

    Args:
        before: Comparison sample preceding the gap.
        after: Comparison sample following the gap.
        donor_filename: Stored under ``"filename"`` in every result.
        donor: Donor data to compare against.

    Returns:
        One dict per position with the keys ``"score"``, ``"x_offset"``,
        ``"y_offset"``, ``"start"``, ``"end"`` and ``"filename"``.
    """
    samples = (before, after)
    results = []

    # Mean of the untouched samples; get_y_offsets() receives it on every iteration.
    baseline_mean = (before[column_name].mean() + after[column_name].mean()) / 2

    # Shift the comparison samples to the start of the donor sample, keeping the
    # shift a whole multiple of ``step``.
    x_offset = before.index[0] - donor.index[0]
    x_offset = x_offset - (x_offset % step)
    for sample in samples:
        sample.index = sample.index - x_offset

    while after.index[-1] <= donor.index[-1]:
        # Donor data covering the same time spans as the comparison samples
        donor_before = get_normalized_dataframe(donor, before.index[0], before.index[-1])
        donor_after = get_normalized_dataframe(donor, after.index[0], after.index[-1])

        # The samples still carry the Y shift applied by earlier iterations, hence
        # two offsets: one to report and one to apply now.
        y_offset, adjusted_y_offset = get_y_offsets(baseline_mean, before, after, donor_before, donor_after)

        for sample in samples:
            sample[column_name] = sample[column_name] + adjusted_y_offset

        similarity = get_similarity_score(before, after, donor_before, donor_after)
        results.append({
            "score": similarity,
            "x_offset": x_offset,
            "y_offset": y_offset,
            "start": before.index[0],
            "end": after.index[-1],
            "filename": donor_filename
        })

        # Move on to the next position
        x_offset -= step
        for sample in samples:
            sample.index = sample.index + step

    return results

## Run the imputation

In [None]:
import asyncio
import html

# Status line updated while the imputation runs.
imputation_status = widgets.HTML(value="")
display(imputation_status)

# Filled by do_imputation(), one result per entry of dfs_with_gaps.
imputed_dfs = []


async def do_imputation():
    """Impute every entry of ``dfs_with_gaps`` and collect the results.

    The results are appended to the module-level ``imputed_dfs`` in order.
    Progress, completion and failure are all reported through
    ``imputation_status``.

    Raises:
        Exception: Whatever the imputation raised, after it has been shown in
            the status widget.
    """
    total = len(dfs_with_gaps)
    try:
        for i, df_with_gaps in enumerate(dfs_with_gaps):
            imputation_status.value = f"Running imputation... ({i}/{total})"
            dataset_config['current_gap_indices'] = gaps_indices[i]
            imputed_dfs.append(await hotdeck(df_with_gaps, dataset_config))
    except Exception as error:
        # Nothing awaits this task, so without this the failure would go
        # unnoticed and the status would stay on "Running imputation...".
        # The text is escaped because the widget renders its value as HTML.
        imputation_status.value = f"Imputation failed: {html.escape(repr(error))}"
        raise
    imputation_status.value = "Imputation complete"


# Schedule the coroutine without awaiting it; the reference keeps the task alive.
_ = asyncio.ensure_future(do_imputation())

## Run the post-imputation script

This step visualizes and evaluates the results.

In [None]:
# NOTE(review): the imputation above is only scheduled (asyncio.ensure_future), not
# awaited, so this cell can start before "Imputation complete" is shown — confirm
# that the evaluation is run only once the results are available.
%run helpers/evaluate.ipynb