# Optimization Engineering

In [None]:
from ml_tools.utilities import (select_features_by_shap, 
                                load_dataframe, 
                                save_dataframe,
                                merge_dataframes,
                                serialize_object,
                                distribute_dataset_by_target)
from ml_tools.path_manager import list_csv_paths
from helpers.constants import OPTIMIZATION_TARGETS
from paths import PM
from ml_tools.data_exploration import (summarize_dataframe,
                                       drop_macro,
                                       split_features_targets,
                                       split_continuous_binary,
                                       plot_correlation_heatmap)

## 1. Load dataframe with important features only

In [None]:
# Feature selection driven by the SHAP results stored under the
# training-metrics directory.
# NOTE(review): the exact meaning of shap_threshold=1.0 (absolute value,
# percentage, cumulative share, ...) is defined inside ml_tools - confirm.
chosen_features = select_features_by_shap(
    root_directory=PM["train metrics"],
    shap_threshold=1.0,
)

In [None]:
# Restrict loading to the selected features plus the optimization targets.
all_columns = chosen_features + OPTIMIZATION_TARGETS

# Only the first CSV reported by list_csv_paths is loaded. Taking it with
# next(...) instead of a for/break loop lets us fail loudly when the
# directory contains no CSV at all; previously df_raw would simply stay
# undefined and a later cell would crash with a confusing NameError.
csv_paths = list_csv_paths(directory=PM["mice datasets"], verbose=False)
first_entry = next(iter(csv_paths.items()), None)
if first_entry is None:
    raise FileNotFoundError(f"No CSV files found in {PM['mice datasets']}")

# df_name / df_path stay bound, as they were after the original loop.
df_name, df_path = first_entry

# load_dataframe returns a pair; only the dataframe itself is needed here.
df_raw, _ = load_dataframe(df_path=df_path, use_columns=all_columns, kind="pandas")

In [None]:
# Overview of the freshly loaded dataframe. Being the last expression of the
# cell, whatever summarize_dataframe returns is rendered by the notebook.
summarize_dataframe(df_raw)

## 2. Clean dataset

In [None]:
# Clean the raw dataframe with drop_macro; its logs are written to the
# optimization-engineering metrics directory.
# NOTE(review): skip_targets=True presumably protects the target columns
# from being dropped - confirm against ml_tools.data_exploration.
df_clean = drop_macro(
    df=df_raw,
    log_directory=PM["optimization engineering metrics"],
    targets=OPTIMIZATION_TARGETS,
    skip_targets=True,
)

## 3. Split datasets

In [None]:
# Separate the cleaned data into a feature frame and a target frame, using
# OPTIMIZATION_TARGETS to identify the target columns.
df_features, df_targets = split_features_targets(
    df=df_clean,
    targets=OPTIMIZATION_TARGETS,
)

In [None]:
# Partition the feature frame into its continuous and binary parts; the two
# groups are saved and plotted separately in the following cells.
df_continuous, df_binary = split_continuous_binary(
    df=df_features,
)

### 3.1 Save feature columns

In [None]:
# Persist the continuous feature names. The PM entry is a full file path,
# so it is split into the directory and file name serialize_object expects.
continuous_columns_path = PM["optimization continuous columns"]
serialize_object(
    obj=df_continuous.columns.to_list(),
    save_dir=continuous_columns_path.parent,
    filename=continuous_columns_path.name,
)

In [None]:
# Persist the binary feature names. The PM entry is a full file path,
# so it is split into the directory and file name serialize_object expects.
binary_columns_path = PM["optimization binary columns"]
serialize_object(
    obj=df_binary.columns.to_list(),
    save_dir=binary_columns_path.parent,
    filename=binary_columns_path.name,
)

## 4. Plot correlation heatmaps

In [None]:
# Correlation heatmap for the continuous features, saved alongside the other
# optimization-engineering metrics.
plot_correlation_heatmap(
    df=df_continuous,
    save_dir=PM["optimization engineering metrics"],
    plot_title="Continuous Features Correlation Heatmap",
)

In [None]:
# Correlation heatmap for the binary features, saved alongside the other
# optimization-engineering metrics.
plot_correlation_heatmap(
    df=df_binary,
    save_dir=PM["optimization engineering metrics"],
    plot_title="Binary Features Correlation Heatmap",
)

## 5. Distribute and Save Datasets

In [None]:
# Reassemble the pieces side by side so the final column order is fixed:
# continuous features first, then binary features, then the targets.
df_final = merge_dataframes(
    df_continuous,
    df_binary,
    df_targets,
    direction="horizontal",
)

In [None]:
# distribute_dataset_by_target yields (name, dataframe) pairs built from the
# merged frame and the target columns; each one is written to the
# optimization-engineering directory under the yielded name.
for name, per_target_df in distribute_dataset_by_target(
    df_or_path=df_final,
    target_columns=OPTIMIZATION_TARGETS,
):
    save_dataframe(
        df=per_target_df,
        save_dir=PM["optimization engineering"],
        filename=name,
    )