# Pipeline

This pipeline is intended to simplify the whole process of loading a dataset, creating gaps of different types in it, imputing the missing data, and evaluating the imputation method.
You may want to edit the cells preceded by an <span style="color: red">**EDIT:**</span> sign to fit your needs.

## Install dependencies

In [None]:
!pip install openpyxl
!pip install jupyterlab-widgets
!pip install jsfileupload
!pip install pyxlsb
!pip install sklearn
!pip install scipy

## Set an arbitrary random state

In [None]:
import random

# Seed Python's built-in `random` module so that the gap positions and sizes drawn
# later with random.sample / random.randint are reproducible between runs.
# NOTE(review): this does not seed NumPy's generator — confirm no imputer relies on it.
random.seed(0)

## Load clean data

> The upload form only supports files up to 10 MB. For larger files, please upload them directly to JupyterHub and provide a relative path to them in the text input below.

<span style="color:red">**EDIT**: customize the dataset configuration.</span>

In [None]:
from config import config
import pandas as pd

# Notebook-wide placeholders, declared here so later cells can refer to them.
# They all start as None; presumably the `%run helpers/load_data.ipynb` call
# right after this block fills them in — TODO confirm against that helper.
# Later cells in this notebook read `dataset_config`, `df` and `column_name`.
dataset_config = None
filename: str = None
buffer: bytes = None
xl: pd.ExcelFile = None
sheet: pd.DataFrame = None
sheet_name: str = None
df: pd.DataFrame = None
column_name: str = None

%run helpers/load_data.ipynb

In [None]:
# Keep only the first 10000 rows; every later cell works on this truncated frame.
df = df.head(10000)
# df.scale()
# Bare expression: the notebook renders the truncated frame as the cell output.
df

## Create gaps of different sizes

<span style="color:red">**EDIT**: customize gap sizes.</span>

In [None]:
import numpy as np

# One entry per create_gaps() call: the list of gaps (each a list of row labels)
# that were blanked out in the corresponding gapped dataframe.
gaps_indices = []

def create_gaps(df: pd.DataFrame, gaps_ratio: float, min_gap_size: int, max_gap_size: int):
    """Return a copy of ``df`` in which randomly placed runs of rows are set to NaN.

    ``int(len(df) * gaps_ratio)`` candidate start positions are drawn with
    ``random.sample``; each candidate gets a length drawn with
    ``random.randint(min_gap_size, max_gap_size)``. A candidate is dropped when
    it would end within ``dataset_config["min_gap_distance"]`` positions of the
    next candidate's start, or when it would cover no row at all.

    Positions 0 and ``len(df) - 1`` are never blanked: starts are drawn from 1
    upwards and the (exclusive) gap end is capped at ``len(df) - 1``.

    Args:
        df: Reference data; it is not modified.
        gaps_ratio: Fraction of ``len(df)`` used as the number of candidate gaps.
        min_gap_size: Smallest gap length, in rows (inclusive).
        max_gap_size: Largest gap length, in rows (inclusive).

    Returns:
        A copy of ``df`` with the selected rows replaced by NaN.

    Side effects:
        Appends the list of created gaps (one list of row labels per gap) to
        the global ``gaps_indices``.

    Raises:
        ValueError: Propagated from ``random.sample`` when more candidates are
            requested than there are positions, or from ``random.randint`` when
            ``min_gap_size > max_gap_size``.
    """
    global gaps_indices, dataset_config
    # A list of gaps, each gap being the list of row labels it covers.
    indices_to_remove: list = []
    # The sampling range is kept as-is so that seeded runs draw the same gaps;
    # a start equal to len(df) can only yield an empty gap, which is skipped below.
    gaps_locations = sorted(random.sample(
        range(1, len(df) + 1),
        int(len(df) * gaps_ratio)
    ))
    df_with_gaps = df.copy()

    for i, gap_start in enumerate(gaps_locations):
        # The size is drawn for every candidate, even one that ends up skipped,
        # so the random sequence does not depend on which gaps are kept.
        gap_end = min(
            gap_start + random.randint(min_gap_size, max_gap_size),
            len(df) - 1
        )
        if gap_end <= gap_start:
            # Zero-length gap (size 0, or start at/after the last usable row):
            # nothing to blank out, and it must not be recorded as a gap.
            continue
        if len(gaps_locations) >= i + 2 and gap_end + dataset_config["min_gap_distance"] >= gaps_locations[i + 1]:
            # Too close to the next candidate: drop this one to keep gaps apart.
            continue
        gap_labels = [df.index[position] for position in range(gap_start, gap_end)]
        indices_to_remove.append(gap_labels)
        df_with_gaps.loc[gap_labels] = np.nan
    gaps_indices.append(indices_to_remove)
    return df_with_gaps

# Status line that is rewritten in place while the gapped datasets are built.
gaps_status = widgets.HTML(value="")
display(gaps_status)

# One gapped copy of `df` per entry of dataset_config["gaps"]; each entry is
# unpacked as (min_gap_size, max_gap_size, gaps_ratio).
dfs_with_gaps: [pd.DataFrame] = []
for i, gap_settings in enumerate(dataset_config["gaps"]):
    gaps_status.value = f"Creating gaps... ({i}/{len(dataset_config['gaps'])})"
    min_gap_size, max_gap_size, gaps_ratio = gap_settings
    gapped_copy = create_gaps(df, gaps_ratio, min_gap_size, max_gap_size)
    dfs_with_gaps.append(gapped_copy)
gaps_status.value = "Gaps created"

## Run the imputation

<span style="color:red">**EDIT**: create your own imputation strategy.</span>

In [None]:
%run imputers/interpolate.ipynb
%run imputers/median.ipynb
%run imputers/mode.ipynb
%run imputers/mean.ipynb
%run imputers/knn.ipynb
%run imputers/hotdeck.ipynb

# Registry of the available imputation strategies.
# - The dictionary keys are the options listed in the "Imputer" dropdown.
# - "title" is the human-readable name used in plot titles.
# - "function" is called as function(df_with_gaps, dataset_config, progress_widget)
#   and its return value is stored as the imputed dataframe.
# The referenced functions are presumably defined by the `%run imputers/*.ipynb`
# lines above — TODO confirm. Add an entry here to plug in your own strategy.
imputers = {
    "interpolation": {
        "title": "Interpolation",
        "function": interpolate
    },
    "median imputation": {
        "title": "Median imputation",
        "function": median_imputation
    },
    "mode imputation": {
        "title": "Mode imputation",
        "function": mode_imputation
    },
    "mean imputation": {
        "title": "Mean imputation",
        "function": mean_imputation
    },
    "KNN imputation": {
        "title": "K-Nearest-Neighbors imputation",
        "function": KNNImputation
    },
    "hotdeck imputation": {
        "title": "Hot-deck imputation",
        "function": hotdeck
    }
}

# Dropdown listing the registered imputers; it starts with no selection
# (value=None), so nothing is imputed until the user picks an entry.
imputer_select = widgets.Dropdown(
    options=list(imputers.keys()),
    value=None,
    description='Imputer:'
)
display(imputer_select)
# Currently selected registry entry and its results (one imputed dataframe per
# gapped dataframe). Both stay None until an imputer has been selected.
imputer = None
imputed_dfs: [pd.DataFrame] = None

# Two status lines: overall progress across the gapped datasets, plus a second
# widget that is handed to the imputer function itself.
imputation_status = widgets.HTML(value="")
custom_progress_status = widgets.HTML(value="")
display(imputation_status)
display(custom_progress_status)

# Short alias; it is not used in this cell.
# NOTE(review): presumably referenced by one of the imputer notebooks — confirm.
puf = parse_uploaded_file_sync

def on_imputer_select_change(evt):
    """Run the newly selected imputer on every gapped dataframe.

    Registered as the observer of the dropdown's ``value``. ``evt.new`` is used
    as the key into ``imputers``. The selected entry is stored in the global
    ``imputer`` and the global ``imputed_dfs`` is rebuilt from scratch, holding
    one result per element of ``dfs_with_gaps``.
    """
    global imputed_dfs, imputer
    imputer = imputers[evt.new]
    imputed_dfs = []
    for i, gapped_df in enumerate(dfs_with_gaps):
        imputation_status.value = f"Running imputation... ({i}/{len(dfs_with_gaps)})"
        # Expose the gaps of the dataset being processed to the imputer via the config.
        dataset_config['current_gap_indices'] = gaps_indices[i]
        imputed_df = imputer["function"](gapped_df, dataset_config, custom_progress_status)
        imputed_dfs.append(imputed_df)
    imputation_status.value = "Imputation complete"


# Re-run the imputation every time the dropdown selection changes.
imputer_select.observe(on_imputer_select_change, 'value')

## Plot the imputation results

In [None]:
%matplotlib notebook

def plot_imputation(gapped: pd.DataFrame, imputed: pd.DataFrame, y_label: str, title="Untitled"):
    """Draw the reference, imputed and gapped data on one new figure.

    The reference curve is read from the notebook-level ``df``. Curves are drawn
    in the order reference (green), imputed (red), gapped (cyan), so the later
    ones are painted over the earlier ones.

    Args:
        gapped: Data with the artificial gaps.
        imputed: Result of the imputation for ``gapped``.
        y_label: Label of the vertical axis.
        title: Figure title.
    """
    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.set_xlabel("Time")
    ax.set_ylabel(y_label)

    curves = (
        (df, "green", "Reference data"),
        (imputed, "red", "Imputed data"),
        (gapped, "cyan", "Data with gaps"),
    )
    for data, colour, legend_label in curves:
        ax.plot(data, c=colour, label=legend_label)
    # Optional error curve, left disabled:
    # ax.plot(df - imputed, c="blue", label="Error")

    ax.legend()
    plt.show()

# One figure per gap configuration; gap types are numbered from 1 in the titles.
for i, imputed_df in enumerate(imputed_dfs):
    figure_title = f"{imputer['title']} with gap type {i + 1}"
    plot_imputation(dfs_with_gaps[i], imputed_df, column_name, figure_title)

## Evaluate the imputation results

In [None]:
%run helpers/evaluate.ipynb

# Score every imputed dataset against the reference `df` and print one report
# per gap configuration. Only the first column of each frame is compared.
reference_column = df.iloc[:, 0]
for i, imputed_df in enumerate(imputed_dfs):
    # gaps_indices[i] is a list of gaps (lists of row labels); flatten it so
    # the element-wise metrics are computed on the removed rows only.
    flattened_indices = [it for sublist in gaps_indices[i] for it in sublist]
    imputed_column = imputed_df.iloc[:, 0]
    ref_values = [reference_column[index] for index in flattened_indices]
    pred_values = [imputed_column[index] for index in flattened_indices]

    errors = [ref - pred for ref, pred in zip(ref_values, pred_values)]
    abs_errors = [abs(it) for it in errors]

    # Use the title of the imputer that actually produced the data instead of
    # a hard-coded "Interpolation"; the bracket shows the configured gap sizes.
    title = f"{imputer['title']} with gap type {i + 1} [{dataset_config['gaps'][i][0]};{dataset_config['gaps'][i][1]}]"
    results = {
        # These whole-frame metrics also include the rows that were never removed.
        "Mean Squared Error": mean_squared_error(df, imputed_df),
        "Raw Bias": raw_bias(errors),
        "Absolute Raw Bias": abs_raw_bias(abs_errors),
        "Percent Bias": percent_bias(ref_values, pred_values),
        "Errors sum": sum_error(abs_errors),
        "Maximum error": max_error(errors),
        "Variance error": variance(df) - variance(imputed_df),
        "Kurtosis error": kurtosis(df) - kurtosis(imputed_df),
        "Skewness error": skewness(df) - skewness(imputed_df),
    }

    print(title)
    pretty_print(results, indent=1)
    print("")

#     plot_error_distribution(errors, title)