# Time Consumption

In [1]:
# Notebook setup: render matplotlib figures inline and enable the autoreload
# extension in mode 2 so edited project modules are re-imported before cells run.
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import os
import warnings
# Silence warnings twice over: via the PYTHONWARNINGS environment variable
# and via an "ignore" filter installed in this interpreter.
os.environ["PYTHONWARNINGS"] = "ignore"
warnings.filterwarnings("ignore")

In [3]:
# Make sure the working directory is the repository root ("data-cleaning-stability").
# When the kernel starts anywhere else, the root is taken to be two levels up.
# os.path.basename is used instead of splitting on '/' so the check also works
# on platforms whose path separator is not '/'.
cur_folder_name = os.path.basename(os.getcwd())
if cur_folder_name != "data-cleaning-stability":
    os.chdir("../..")

print('Current location: ', os.getcwd())

Current location:  /Users/denys_herasymuk/Research/NYU/ML_Lifecycle_Project/Code/data-cleaning-stability


In [4]:
import pandas as pd
from tabulate import tabulate
from source.visualizations.imputers_viz import get_data_for_box_plots_for_diff_imputers_and_datasets_for_mixed_exp
from configs.constants import (ACS_INCOME_DATASET, ACS_EMPLOYMENT_DATASET, LAW_SCHOOL_DATASET, GERMAN_CREDIT_DATASET,
                               CARDIOVASCULAR_DISEASE_DATASET, BANK_MARKETING_DATASET, DIABETES_DATASET, IMPUTERS_ORDER)

## Define global configs

In [5]:
from source.custom_classes.database_client import DatabaseClient, get_secrets_path

# Two database clients: db_client_1 is built with DatabaseClient's defaults,
# db_client_3 with the secrets file 'secrets_3.env'. Both are connected up front.
db_client_1, db_client_3 = (
    DatabaseClient(),
    DatabaseClient(secrets_path=get_secrets_path('secrets_3.env')),
)
for _db_client in (db_client_1, db_client_3):
    _db_client.connect()

In [6]:
# Dataset constant -> list of sensitive attribute names for that dataset.
# Entries containing '&' look like intersections of two attributes — TODO confirm.
DATASETS_ALL_SENSITIVE_ATTRS = {
    ACS_INCOME_DATASET: ['SEX', 'RAC1P', 'SEX&RAC1P'],
    LAW_SCHOOL_DATASET: ['male', 'race', 'male&race'],
    GERMAN_CREDIT_DATASET: ['sex', 'age', 'sex&age'],
    CARDIOVASCULAR_DISEASE_DATASET: ['gender'],
    BANK_MARKETING_DATASET: ['age'],
    DIABETES_DATASET: ['Gender'],
    ACS_EMPLOYMENT_DATASET: ['SEX', 'RAC1P', 'SEX&RAC1P'],
}

# Dataset constant -> a single sensitive attribute name for that dataset.
# Each value is one of the entries listed for the same dataset in
# DATASETS_ALL_SENSITIVE_ATTRS.
DATASETS_SENSITIVE_ATTRS = {
    ACS_INCOME_DATASET: 'SEX&RAC1P',
    LAW_SCHOOL_DATASET: 'male&race',
    GERMAN_CREDIT_DATASET: 'sex',
    CARDIOVASCULAR_DISEASE_DATASET: 'gender',
    BANK_MARKETING_DATASET: 'age',
    DIABETES_DATASET: 'Gender',
    ACS_EMPLOYMENT_DATASET: 'SEX&RAC1P',
}

# Dataset constant -> column names grouped under the keys 'cat' and 'num'
# (presumably categorical vs. numerical columns — confirm against the helper
# that consumes it). Passed as `dataset_to_column_name` to get_imputer_time_data.
DATASET_TO_COLUMN_NAME = {
    DIABETES_DATASET: {'cat': ['Family_Diabetes', 'PhysicallyActive', 'RegularMedicine'], 'num': ['SoundSleep']},
    GERMAN_CREDIT_DATASET: {'cat': ['checking-account', 'savings-account', 'employment-since'], 'num': ['duration', 'credit-amount']},
    ACS_INCOME_DATASET: {'cat': ['SCHL', 'MAR'], 'num': ['AGEP', 'WKHP']},
    LAW_SCHOOL_DATASET: {'cat': ['fam_inc', 'tier'], 'num': ['zfygpa', 'ugpa']},
    BANK_MARKETING_DATASET: {'cat': ['education', 'job'], 'num': ['balance', 'campaign']},
    CARDIOVASCULAR_DISEASE_DATASET: {'cat': ['cholesterol', 'gluc'], 'num': ['weight', 'height']},
    ACS_EMPLOYMENT_DATASET: {'cat': ['SCHL', 'DIS', 'MIL'], 'num': ['AGEP']},
}

## Metric Visualizations

In [7]:
from source.custom_classes.benchmark import Benchmark


def get_mean_sample_size():
    """Average, per dataset, how many train/val rows contain no null after injection.

    For each dataset and each of the four evaluation scenarios a fresh ``Benchmark``
    is created, the data is split with seed 100, ``_inject_nulls`` is applied for
    that scenario, and the rows of the returned train/val frame that survive
    ``dropna()`` are counted.

    Returns:
        dict: dataset name -> mean of the four per-scenario row counts.
    """
    seed = 100
    scenarios = ['mixed_exp', 'exp1_mcar3', 'exp1_mar3', 'exp1_mnar3']
    datasets = [DIABETES_DATASET, GERMAN_CREDIT_DATASET, ACS_INCOME_DATASET, LAW_SCHOOL_DATASET,
                BANK_MARKETING_DATASET, CARDIOVASCULAR_DISEASE_DATASET, ACS_EMPLOYMENT_DATASET]

    complete_row_counts = {}
    for name in datasets:
        counts = []
        complete_row_counts[name] = counts
        for scenario in scenarios:
            # A new Benchmark per scenario, exactly one split and one injection each.
            bench = Benchmark(dataset_name=name,
                              null_imputers=[],
                              model_names=[])
            X_train_val, X_test, _, _ = bench._split_dataset(bench.init_data_loader, seed)
            X_train_val_with_nulls, _ = bench._inject_nulls(X_train_val=X_train_val,
                                                            X_test=X_test,
                                                            evaluation_scenario=scenario,
                                                            experiment_seed=seed)
            counts.append(len(X_train_val_with_nulls.dropna()))

    return {name: sum(counts) / len(counts) for name, counts in complete_row_counts.items()}


def get_training_sample_size(experiment_seed: int = 100, dataset_names=None):
    """Return the shape of the train/val split for each dataset.

    Args:
        experiment_seed: Seed forwarded to ``Benchmark._split_dataset``.
            Defaults to 100, the value that used to be hard-coded.
        dataset_names: Iterable of dataset names to inspect. ``None`` (default)
            selects the seven datasets used throughout this notebook.

    Returns:
        dict: dataset name -> ``X_train_val.shape`` for that dataset, in the
        order the datasets were given.
    """
    if dataset_names is None:
        dataset_names = [DIABETES_DATASET, GERMAN_CREDIT_DATASET, ACS_INCOME_DATASET, LAW_SCHOOL_DATASET,
                         BANK_MARKETING_DATASET, CARDIOVASCULAR_DISEASE_DATASET, ACS_EMPLOYMENT_DATASET]

    training_set_shape_dct = dict()
    for dataset_name in dataset_names:
        benchmark = Benchmark(dataset_name=dataset_name,
                              null_imputers=[],
                              model_names=[])
        # Only the train/val part of the split is needed; the rest is discarded.
        X_train_val, _, _, _ = benchmark._split_dataset(benchmark.init_data_loader, experiment_seed)
        training_set_shape_dct[dataset_name] = X_train_val.shape

    return training_set_shape_dct

In [8]:
# avg_rows_without_nulls_dct = get_mean_sample_size()

In [9]:
# Shape of the train/val split per dataset; the LaTeX table header further down
# reads these shapes to label each dataset column.
training_set_shape_dct = get_training_sample_size()
training_set_shape_dct

Session UUID for all results of experiments in the current benchmark session: 65eb9cbe-d523-11ef-b480-ae7d8bf09116
Session UUID for all results of experiments in the current benchmark session: 65ecb7ca-d523-11ef-b480-ae7d8bf09116
Session UUID for all results of experiments in the current benchmark session: 66c286f2-d523-11ef-b480-ae7d8bf09116
Session UUID for all results of experiments in the current benchmark session: 66c8b978-d523-11ef-b480-ae7d8bf09116
Session UUID for all results of experiments in the current benchmark session: 66d68ef4-d523-11ef-b480-ae7d8bf09116
Session UUID for all results of experiments in the current benchmark session: 66ead120-d523-11ef-b480-ae7d8bf09116
Session UUID for all results of experiments in the current benchmark session: 6b038d6a-d523-11ef-b480-ae7d8bf09116


{'diabetes': (633, 17),
 'german': (700, 21),
 'folk': (12000, 10),
 'law_school': (16638, 11),
 'bank': (32003, 13),
 'heart': (56000, 11),
 'folk_emp': (242112, 16)}

In [10]:
def get_imputer_time_data(missingness_types: list, dataset_to_column_name: dict, imputation_quality_metric_name: str,
                          db_client_1, db_client_3, dataset_to_group: dict = None, without_dummy: bool = False):
    imputation_quality_metrics_df = pd.DataFrame()
    for missingness_type in missingness_types:
        train_injection_scenario, test_injection_scenario = missingness_type['train'], missingness_type['test']
        if train_injection_scenario != 'mixed_exp':
            train_injection_scenario = train_injection_scenario.upper()
        test_injection_scenario = test_injection_scenario.upper()

        db_client = db_client_3 if train_injection_scenario == 'mixed_exp' else db_client_1
        imputation_quality_metrics_sub_df, _ = (
            get_data_for_box_plots_for_diff_imputers_and_datasets_for_mixed_exp(train_injection_scenario=train_injection_scenario,
                                                                                test_injection_scenario=test_injection_scenario,
                                                                                metric_name=imputation_quality_metric_name,
                                                                                dataset_to_column_name=dataset_to_column_name,
                                                                                db_client=db_client,
                                                                                dataset_to_group=dataset_to_group,
                                                                                without_dummy=without_dummy))

        imputation_quality_metrics_sub_df['Missingness_Type'] = train_injection_scenario + ' - ' + test_injection_scenario
        imputation_quality_metrics_df = pd.concat([imputation_quality_metrics_df, imputation_quality_metrics_sub_df])

        print(f'Extraction for {missingness_type} is completed\n\n')

    return imputation_quality_metrics_df

In [11]:
# Three settings with the same mechanism for train and test, followed by the
# mixed training scenario; extraction runs in this order.
_missingness_types = [{'train': kind, 'test': kind} for kind in ('MCAR3', 'MAR3', 'MNAR3')]
_missingness_types.append({'train': 'mixed_exp', 'test': 'MCAR1 & MAR1 & MNAR1'})

imputation_quality_metrics_df = get_imputer_time_data(
    missingness_types=_missingness_types,
    dataset_to_column_name=DATASET_TO_COLUMN_NAME,
    imputation_quality_metric_name='runtime_in_mins',
    db_client_1=db_client_1,
    db_client_3=db_client_3,
)

Extracted data for german
Extracted data for bank
Extracted data for heart
Extracted data for diabetes
Extracted data for law_school
Extracted data for folk
Extracted data for folk_emp
Extraction for {'train': 'mixed_exp', 'test': 'MCAR1 & MAR1 & MNAR1'} is completed




In [12]:
# Preview the first rows of the combined extraction.
imputation_quality_metrics_df.head()

Unnamed: 0,Dataset_Name,Null_Imputer_Name,Evaluation_Scenario,Experiment_Seed,Dataset_Part,Sample_Size,Kl_Divergence_Pred,Kl_Divergence_Total,Rmse,Precision,Recall,F1_Score,Runtime_In_Mins,Null_Imputer_Params_Dct,Missingness_Type
0,bank,automl,mixed_exp,100,X_test_MCAR1 & MAR1 & MNAR1,639.5,1.045818,0.001907,,0.490005,0.490005,0.490005,239.620521,,mixed_exp - MCAR1 & MAR1 & MNAR1
1,bank,automl,mixed_exp,200,X_test_MCAR1 & MAR1 & MNAR1,622.0,1.01426,0.002607,,0.495821,0.495821,0.495821,212.400844,,mixed_exp - MCAR1 & MAR1 & MNAR1
2,bank,automl,mixed_exp,300,X_test_MCAR1 & MAR1 & MNAR1,633.0,2.285223,0.002284,,0.502379,0.502379,0.502379,187.875222,,mixed_exp - MCAR1 & MAR1 & MNAR1
3,bank,automl,mixed_exp,400,X_test_MCAR1 & MAR1 & MNAR1,635.0,2.313469,0.002266,,0.489497,0.489497,0.489497,201.724812,,mixed_exp - MCAR1 & MAR1 & MNAR1
4,bank,automl,mixed_exp,500,X_test_MCAR1 & MAR1 & MNAR1,627.5,2.340687,0.002203,,0.515463,0.515463,0.515463,225.469161,,mixed_exp - MCAR1 & MAR1 & MNAR1


In [13]:
# (rows, columns) of the combined extraction.
imputation_quality_metrics_df.shape

(587, 15)

In [14]:
# Convert minutes to seconds (the column is added to the extraction frame itself).
imputation_quality_metrics_df['Runtime_In_Secs'] = 60.0 * imputation_quality_metrics_df['Runtime_In_Mins']

# Stage 1: mean and std of the runtime over the rows of each
# (dataset, imputer, missingness type) group.
_runtime_per_missingness = (
    imputation_quality_metrics_df
    .groupby(['Dataset_Name', 'Null_Imputer_Name', 'Missingness_Type'])
    .agg(mean_runtime=('Runtime_In_Secs', 'mean'), std_runtime=('Runtime_In_Secs', 'std'))
)

# Stage 2: average both statistics over missingness types, one row per (dataset, imputer).
imp_runtime_df = (
    _runtime_per_missingness
    .groupby(['Dataset_Name', 'Null_Imputer_Name'])
    .agg(avg_mean_runtime=('mean_runtime', 'mean'), avg_std_runtime=('std_runtime', 'mean'))
    .reset_index()
)

In [15]:
# (rows, columns) of the aggregated table: one row per (dataset, imputer) pair.
imp_runtime_df.shape

(98, 4)

In [16]:
# Preview up to 20 rows of the aggregated runtimes (in seconds).
imp_runtime_df.head(20)

Unnamed: 0,Dataset_Name,Null_Imputer_Name,avg_mean_runtime,avg_std_runtime
0,bank,automl,12888.432229,1099.781717
1,bank,datawig,5892.51857,480.435842
2,bank,deletion,0.039361,0.000616
3,bank,edit_gain,29.035677,1.959976
4,bank,gain,2106.398085,38.569102
5,bank,hivae,2405.321782,111.218301
6,bank,k_means_clustering,3914.635583,194.343107
7,bank,median-dummy,0.021424,0.000636
8,bank,median-mode,0.024178,0.000484
9,bank,miss_forest,3966.557256,1017.057177


In [17]:
# Pivot to one row per imputer and one column per (statistic, dataset) pair.
# Passing a list to `values` gives two-level columns, which later cells address
# with tuples such as ('avg_mean_runtime', 'folk_emp').
pivot_table = imp_runtime_df.pivot(
    index='Null_Imputer_Name',
    columns='Dataset_Name',
    values=['avg_mean_runtime', 'avg_std_runtime']
)

In [18]:
# Order imputers by ascending mean runtime on 'folk_emp', i.e. the column
# ('avg_mean_runtime', 'folk_emp'). The resulting index is reused below as the
# row order of the table.
sorted_pivot_table = pivot_table.sort_values(by=('avg_mean_runtime', 'folk_emp'))
sorted_imputer_names = sorted_pivot_table.index
print(sorted_imputer_names)

Index(['median-dummy', 'median-mode', 'deletion', 'mnar_pvae', 'edit_gain',
       'nomi', 'tdm', 'notmiwae', 'gain', 'hivae', 'k_means_clustering',
       'miss_forest', 'datawig', 'automl'],
      dtype='object', name='Null_Imputer_Name')


In [19]:
# Table layout: dataset columns in a fixed order, imputer rows in the runtime
# order computed from the folk_emp column.
x_axis_order = ['diabetes', 'german', 'folk', 'law_school', 'bank', 'heart', 'folk_emp']
y_axis_order = sorted_imputer_names

# Apply both orders; columns are (statistic, dataset) pairs.
pivot_table = pivot_table.reindex(
    index=y_axis_order,
    columns=pd.MultiIndex.from_product([['avg_mean_runtime', 'avg_std_runtime'], x_axis_order]),
)

# Escape underscores in LaTeX
def escape_latex(text):
    """Return ``text`` with every underscore replaced by a math-mode escaped underscore."""
    escaped_underscore = "$\\_$"
    return escaped_underscore.join(text.split("_"))

# Build the LaTeX table content: a bold header cell per dataset (name plus
# training-set shape) and one row per imputer with "mean \pm std" runtime cells.
def _format_runtime(value):
    """Format a runtime in seconds: nearest integer above 1 s, three decimals otherwise."""
    # Format-based rounding replaces int(), which truncated the fractional part
    # (e.g. 5892.52 was rendered as 5892 instead of 5893).
    return f"{value:.0f}" if value > 1 else f"{value:.3f}"


# Header: 'folk' is displayed as 'folk_inc'; the second line of each header cell
# is the (rows, columns) shape taken from training_set_shape_dct.
columns = ['\\textbf{Imputer}'] + [
    '\\makecell[tl]{\\textbf{' + escape_latex("folk_inc" if col == "folk" else col) + '}' + '\\\\' +
    '\\textbf{' + f"({training_set_shape_dct[col][0]}, {training_set_shape_dct[col][1]})" + '}}'
    for col in x_axis_order
]

# Body: one row per imputer, following the row order of pivot_table.
latex_data = []
for imputer in pivot_table.index:
    row = [escape_latex(imputer)]

    for dataset in x_axis_order:
        mean_val = pivot_table.loc[imputer, ('avg_mean_runtime', dataset)]
        std_val = pivot_table.loc[imputer, ('avg_std_runtime', dataset)]
        if pd.notnull(mean_val) and pd.notnull(std_val):
            row.append(f"${_format_runtime(mean_val)} \\pm {_format_runtime(std_val)}$")
        else:
            # A cell is shown as "-" when either statistic is missing.
            row.append("-")
    latex_data.append(row)

In [20]:
# Render header and rows with tabulate's "latex_raw" format so the LaTeX markup
# already present in the cells is kept as written.
latex_table = tabulate(latex_data, headers=columns, tablefmt="latex_raw")

# Print the caption line followed by the table code.
print("Generated LaTeX Table:", latex_table, sep="\n")

Generated LaTeX Table:
\begin{tabular}{llllllll}
\hline
 \textbf{Imputer}         & \makecell[tl]{\textbf{diabetes}\\\textbf{(633, 17)}}   & \makecell[tl]{\textbf{german}\\\textbf{(700, 21)}}   & \makecell[tl]{\textbf{folk$\_$inc}\\\textbf{(12000, 10)}}   & \makecell[tl]{\textbf{law$\_$school}\\\textbf{(16638, 11)}}   & \makecell[tl]{\textbf{bank}\\\textbf{(32003, 13)}}   & \makecell[tl]{\textbf{heart}\\\textbf{(56000, 11)}}   & \makecell[tl]{\textbf{folk$\_$emp}\\\textbf{(242112, 16)}}   \\
\hline
 median-dummy             & $0.009 \pm 0.000$                                      & $0.010 \pm 0.001$                                    & $0.019 \pm 0.002$                                           & $0.021 \pm 0.000$                                             & $0.021 \pm 0.001$                                    & $0.040 \pm 0.001$                                     & $0.590 \pm 0.007$                                            \\
 median-mode              & $0.009 \pm 0.000$          