# Imputation

This notebook compares ExtraMAE with benchmark methods on the imputation task. **The full model files and supporting documents needed to run this notebook are available upon request.** 

In [1]:
import os
import json
import argparse
import numpy as np
import pandas as pd
from modules.utils import extract_factors, load_dict_npy, save_under_impute, load_under_impute, mse_loss
from sklearn.metrics import mean_absolute_error

# Stock

In [2]:
def load_arguments(config_name):
    """Load a JSON experiment config from the working directory.

    Args:
        config_name: File name (or relative path) of the JSON config file,
            joined onto the current working directory.

    Returns:
        argparse.Namespace exposing every top-level key of the config as an
        attribute, plus ``experiment_dir`` set to
        ``<cwd>/storage/<experiment_name>``.

    Raises:
        KeyError: If the config has no ``experiment_name`` entry.
    """
    home = os.getcwd()
    config_path = os.path.join(home, config_name)

    # JSON text is UTF-8; be explicit instead of relying on the platform's
    # locale encoding, which may decode non-ASCII values incorrectly.
    with open(config_path, 'r', encoding='utf-8') as f:
        config_dict = json.load(f)

    # Only the path is derived here; the directory itself is not created.
    config_dict['experiment_dir'] = os.path.join(
        home, 'storage', config_dict['experiment_name'])

    return argparse.Namespace(**config_dict)

In [3]:
def summary_by_metrics(experiment_name):
    """Tabulate the MSE and MAE of each imputation method on the stock data.

    For every mask size returned by ``extract_factors(args.ts_size)`` (with
    ``num_masks`` fixed to 1), the stored imputations, the original data and
    the masks are loaded from ``<args.experiment_dir>/<instance>/impute``.
    Each method is scored with ``mse_loss(..., masks)`` and with the mean
    absolute error over the entries selected by ``masks``; the scores are
    written back to that directory as ``impute_metrics.npy``.

    Args:
        experiment_name: Name of the experiment.
            NOTE(review): this value does not influence which files are
            read. The directory comes from ``args.experiment_dir``, which
            ``load_arguments`` derives from the ``experiment_name`` stored
            in ``stock_config.json``. Confirm the two are meant to agree.

    Returns:
        Tuple ``(mse_results, mae_results)`` of DataFrames with one row per
        instance (labelled ``s<mask_size>n<num_masks>``) and one column per
        method.
    """
    args = load_arguments('stock_config.json')

    # Column order of the reported tables.
    methods = ['mean', 'median', 'soft', 'knn', 'brits', 'mai']
    # (method, file) pairs, listed in the order the files are read.
    sources = (
        ('mean', 'mean_impute.npy'),
        ('median', 'median_impute.npy'),
        ('knn', 'knn_impute.npy'),
        ('soft', 'soft_impute.npy'),
        ('mai', 'mai_impute.npy'),
        ('brits', 'brits_impute.npy'),
    )
    hyper_names = ['mask_size', 'num_masks']

    mse_frames = []
    mae_frames = []

    for mask_size in extract_factors(args.ts_size):
        for num_masks in range(1, 2):
            # Rebuild the directory name used for this hyper-parameter combination.
            parts = []
            for key, val in zip(hyper_names, (mask_size, num_masks)):
                if isinstance(val, float):
                    parts.append('%s=%.4g' % (key, val))
                else:
                    parts.append('%s=%s' % (key, str(val).replace(' ', '_')))
            instance_name = ','.join(parts)

            # The load/save helpers receive the target directory through args.
            args.impute_dir = os.path.join(args.experiment_dir, instance_name, 'impute')

            imputed = {}
            for method, file_name in sources:
                imputed[method] = load_under_impute(args, file_name)
            data_original = load_under_impute(args, 'ori_impute.npy')
            masks = load_under_impute(args, 'masks.npy')

            mse_by_method = {}
            for method, data in imputed.items():
                mse_by_method[method] = mse_loss(data, data_original, masks)
            mae_by_method = {}
            for method, data in imputed.items():
                mae_by_method[method] = mean_absolute_error(data[masks], data_original[masks])

            metrics_df = pd.DataFrame(
                {"mse": [mse_by_method[m] for m in methods],
                 "mae": [mae_by_method[m] for m in methods]},
                index=methods,
            )
            save_under_impute(args, 'impute_metrics.npy', metrics_df)

            # Read the stored metrics back: column 0 holds MSE, column 1 holds MAE.
            stored = load_under_impute(args, 'impute_metrics.npy')
            row_label = [f's{mask_size}n{num_masks}']
            mse_frames.append(
                pd.DataFrame(stored[:, 0].reshape(1, 6), columns=methods, index=row_label))
            mae_frames.append(
                pd.DataFrame(stored[:, 1].reshape(1, 6), columns=methods, index=row_label))

    mse_results = pd.concat(mse_frames)
    mae_results = pd.concat(mae_frames)
    return mse_results, mae_results
            

In [4]:
# Build the per-instance MSE and MAE tables for the stock experiment.
# NOTE(review): summary_by_metrics reads from args.experiment_dir, which is derived
# from the experiment_name in the config file; confirm it matches the name passed here.
mse_results, mae_results = summary_by_metrics('AE_0_EM_0_RE50000')

In [5]:
# Per-method mean and standard deviation of the MSE over all evaluated instances.
mse_mean, mse_std = mse_results.mean(), mse_results.std()
mse_summary = pd.DataFrame({'mse_mean': mse_mean, 'mse_std': mse_std})
print(mse_summary)

        mse_mean   mse_std
mean    0.053171  0.000157
median  0.065231  0.000200
soft    0.021450  0.054681
knn     0.021441  0.054685
brits   0.021618  0.003936
mai     0.000785  0.000185


In [6]:
# Per-method mean and standard deviation of the MAE over all evaluated instances.
mae_mean, mae_std = mae_results.mean(), mae_results.std()
mae_summary = pd.DataFrame({'mae_mean': mae_mean, 'mae_std': mae_std})
print(mae_summary)

        mae_mean   mae_std
mean    0.184434  0.000265
median  0.168861  0.000232
soft    0.053318  0.104599
knn     0.050061  0.106058
brits   0.080178  0.015581
mai     0.011505  0.003260


# Sine

In [7]:
def load_arguments(config_name):
    """Read a JSON experiment config located under the working directory.

    Args:
        config_name: File name (or relative path) of the JSON config file,
            joined onto the current working directory.

    Returns:
        argparse.Namespace whose attributes are the top-level keys of the
        config, extended with ``experiment_dir`` pointing at
        ``<cwd>/storage/<experiment_name>``.

    Raises:
        KeyError: If the config has no ``experiment_name`` entry.
    """
    home = os.getcwd()
    config_path = os.path.join(home, config_name)

    # Decode explicitly as UTF-8 (the JSON interchange encoding) so the result
    # does not depend on the platform's default locale encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        config_dict = json.load(f)

    # Only the path is derived here; the directory itself is not created.
    config_dict['experiment_dir'] = os.path.join(
        home, 'storage', config_dict['experiment_name'])

    return argparse.Namespace(**config_dict)

In [8]:
def summary_by_metrics(experiment_name):
    """Tabulate the MSE and MAE of each imputation method on the sine data.

    For every mask size returned by ``extract_factors(args.ts_size)``, and
    for ``num_masks`` in ``range(1, min(2, ts_size // mask_size))``, the
    stored imputations, the original data and the masks are loaded from
    ``<args.experiment_dir>/<instance>/impute``. Each method is scored with
    ``mse_loss(..., masks)`` and with the mean absolute error over the
    entries selected by ``masks``; the scores are written back to that
    directory as ``impute_metrics.npy``. A mask size for which
    ``ts_size // mask_size`` is 1 produces no instance.

    Args:
        experiment_name: Name of the experiment.
            NOTE(review): this value does not influence which files are
            read. The directory comes from ``args.experiment_dir``, which
            ``load_arguments`` derives from the ``experiment_name`` stored
            in ``sine_config.json``. Confirm the two are meant to agree.

    Returns:
        Tuple ``(mse_results, mae_results)`` of DataFrames with one row per
        instance (labelled ``s<mask_size>n<num_masks>``) and one column per
        method.
    """
    args = load_arguments('sine_config.json')

    # Column order of the reported tables.
    methods = ['mean', 'median', 'soft', 'knn', 'brits', 'mai']
    # (method, file) pairs, listed in the order the files are read.
    sources = (
        ('mean', 'mean_impute.npy'),
        ('median', 'median_impute.npy'),
        ('knn', 'knn_impute.npy'),
        ('soft', 'soft_impute.npy'),
        ('mai', 'mai_impute.npy'),
        ('brits', 'brits_impute.npy'),
    )
    hyper_names = ['mask_size', 'num_masks']

    mse_frames = []
    mae_frames = []

    for mask_size in extract_factors(args.ts_size):
        for num_masks in range(1, min(2, int(args.ts_size // mask_size))):
            # Rebuild the directory name used for this hyper-parameter combination.
            parts = []
            for key, val in zip(hyper_names, (mask_size, num_masks)):
                if isinstance(val, float):
                    parts.append('%s=%.4g' % (key, val))
                else:
                    parts.append('%s=%s' % (key, str(val).replace(' ', '_')))
            instance_name = ','.join(parts)

            # The load/save helpers receive the target directory through args.
            args.impute_dir = os.path.join(args.experiment_dir, instance_name, 'impute')

            imputed = {}
            for method, file_name in sources:
                imputed[method] = load_under_impute(args, file_name)
            data_original = load_under_impute(args, 'ori_impute.npy')
            masks = load_under_impute(args, 'masks.npy')

            mse_by_method = {}
            for method, data in imputed.items():
                mse_by_method[method] = mse_loss(data, data_original, masks)
            mae_by_method = {}
            for method, data in imputed.items():
                mae_by_method[method] = mean_absolute_error(data[masks], data_original[masks])

            metrics_df = pd.DataFrame(
                {"mse": [mse_by_method[m] for m in methods],
                 "mae": [mae_by_method[m] for m in methods]},
                index=methods,
            )
            save_under_impute(args, 'impute_metrics.npy', metrics_df)

            # Read the stored metrics back: column 0 holds MSE, column 1 holds MAE.
            stored = load_under_impute(args, 'impute_metrics.npy')
            row_label = [f's{mask_size}n{num_masks}']
            mse_frames.append(
                pd.DataFrame(stored[:, 0].reshape(1, 6), columns=methods, index=row_label))
            mae_frames.append(
                pd.DataFrame(stored[:, 1].reshape(1, 6), columns=methods, index=row_label))

    mse_results = pd.concat(mse_frames)
    mae_results = pd.concat(mae_frames)
    return mse_results, mae_results
            

In [9]:
# Build the per-instance MSE and MAE tables for the sine experiment.
# NOTE(review): summary_by_metrics reads from args.experiment_dir, which is derived
# from the experiment_name in the config file; confirm it matches the name passed here.
mse_results, mae_results = summary_by_metrics('sine_mask_ratio')

In [10]:
# Per-method mean and standard deviation of the MSE over all evaluated instances.
mse_mean, mse_std = mse_results.mean(), mse_results.std()
mse_summary = pd.DataFrame({'mse_mean': mse_mean, 'mse_std': mse_std})
print(mse_summary)

        mse_mean   mse_std
mean    0.054078  0.000275
median  0.058664  0.000405
soft    0.055536  0.129065
knn     0.049562  0.131116
brits   0.024658  0.026758
mai     0.006909  0.005930


In [11]:
# Per-method mean and standard deviation of the MAE over all evaluated instances.
mae_mean, mae_std = mae_results.mean(), mae_results.std()
mae_summary = pd.DataFrame({'mae_mean': mae_mean, 'mae_std': mae_std})
print(mae_summary)

        mae_mean   mae_std
mean    0.185266  0.000522
median  0.180327  0.000464
soft    0.106553  0.175541
knn     0.071565  0.186300
brits   0.090853  0.057934
mai     0.049477  0.021355


# Energy

In [12]:
def summary_by_metrics(experiment_name):
    """Tabulate the MSE and MAE of each imputation method on the energy data.

    For every mask size returned by ``extract_factors(args.ts_size)``, and
    for ``num_masks`` in ``range(1, min(ts_size // mask_size, 10))``, the
    stored imputations, the original data and the masks are loaded from
    ``<args.experiment_dir>/<instance>/impute``. Each method is scored with
    ``mse_loss(..., masks)`` and with the mean absolute error over the
    entries selected by ``masks``; the scores are written back to that
    directory as ``impute_metrics.npy``. A mask size for which
    ``ts_size // mask_size`` is 1 produces no instance.

    Args:
        experiment_name: Name of the experiment.
            NOTE(review): this value does not influence which files are
            read. The directory comes from ``args.experiment_dir``, which
            ``load_arguments`` derives from the ``experiment_name`` stored
            in ``energy_config.json``. Confirm the two are meant to agree.

    Returns:
        Tuple ``(mse_results, mae_results)`` of DataFrames with one row per
        instance (labelled ``s<mask_size>n<num_masks>``) and one column per
        method.
    """
    args = load_arguments('energy_config.json')

    # Column order of the reported tables.
    methods = ['mean', 'median', 'soft', 'knn', 'brits', 'mai']
    # (method, file) pairs, listed in the order the files are read.
    sources = (
        ('mean', 'mean_impute.npy'),
        ('median', 'median_impute.npy'),
        ('knn', 'knn_impute.npy'),
        ('soft', 'soft_impute.npy'),
        ('mai', 'mai_impute.npy'),
        ('brits', 'brits_impute.npy'),
    )
    hyper_names = ['mask_size', 'num_masks']

    mse_frames = []
    mae_frames = []

    for mask_size in extract_factors(args.ts_size):
        for num_masks in range(1, min(int(args.ts_size // mask_size), 10)):
            # Rebuild the directory name used for this hyper-parameter combination.
            parts = []
            for key, val in zip(hyper_names, (mask_size, num_masks)):
                if isinstance(val, float):
                    parts.append('%s=%.4g' % (key, val))
                else:
                    parts.append('%s=%s' % (key, str(val).replace(' ', '_')))
            instance_name = ','.join(parts)

            # The load/save helpers receive the target directory through args.
            args.impute_dir = os.path.join(args.experiment_dir, instance_name, 'impute')

            imputed = {}
            for method, file_name in sources:
                imputed[method] = load_under_impute(args, file_name)
            data_original = load_under_impute(args, 'ori_impute.npy')
            masks = load_under_impute(args, 'masks.npy')

            mse_by_method = {}
            for method, data in imputed.items():
                mse_by_method[method] = mse_loss(data, data_original, masks)
            mae_by_method = {}
            for method, data in imputed.items():
                mae_by_method[method] = mean_absolute_error(data[masks], data_original[masks])

            metrics_df = pd.DataFrame(
                {"mse": [mse_by_method[m] for m in methods],
                 "mae": [mae_by_method[m] for m in methods]},
                index=methods,
            )
            save_under_impute(args, 'impute_metrics.npy', metrics_df)

            # Read the stored metrics back: column 0 holds MSE, column 1 holds MAE.
            stored = load_under_impute(args, 'impute_metrics.npy')
            row_label = [f's{mask_size}n{num_masks}']
            mse_frames.append(
                pd.DataFrame(stored[:, 0].reshape(1, 6), columns=methods, index=row_label))
            mae_frames.append(
                pd.DataFrame(stored[:, 1].reshape(1, 6), columns=methods, index=row_label))

    mse_results = pd.concat(mse_frames)
    mae_results = pd.concat(mae_frames)
    return mse_results, mae_results

In [13]:
# Build the per-instance MSE and MAE tables for the energy experiment.
# NOTE(review): summary_by_metrics reads from args.experiment_dir, which is derived
# from the experiment_name in the config file; confirm it matches the name passed here.
mse_results, mae_results = summary_by_metrics('energy_mask_ratio')

In [14]:
# Per-method mean and standard deviation of the MSE over all evaluated instances.
mse_mean, mse_std = mse_results.mean(), mse_results.std()
mse_summary = pd.DataFrame({'mse_mean': mse_mean, 'mse_std': mse_std})
print(mse_summary)

        mse_mean   mse_std
mean    0.034793  0.000021
median  0.035361  0.000024
soft    0.051196  0.086141
knn     0.043399  0.084359
brits   0.009238  0.005403
mai     0.008874  0.000965


In [15]:
# Per-method mean and standard deviation of the MAE over all evaluated instances.
mae_mean, mae_std = mae_results.mean(), mae_results.std()
mae_summary = pd.DataFrame({'mae_mean': mae_mean, 'mae_std': mae_std})
print(mae_summary)

        mae_mean   mae_std
mean    0.144116  0.000042
median  0.141666  0.000046
soft    0.120548  0.149576
knn     0.092034  0.143274
brits   0.041971  0.024164
mai     0.036580  0.007327
