# Optimization Engineering

In [1]:
from ml_tools.utilities import (select_features_by_shap, 
                                load_dataframe, 
                                save_dataframe,
                                merge_dataframes,
                                serialize_object,
                                distribute_dataset_by_target)
from ml_tools.path_manager import list_csv_paths
from helpers.constants import OPTIMIZATION_TARGETS
from paths import PM
from ml_tools.data_exploration import (summarize_dataframe,
                                       drop_macro,
                                       split_features_targets,
                                       split_continuous_binary,
                                       plot_correlation_heatmap)

## 1. Load dataframe with important features only

In [2]:
# Pick the features whose SHAP value reaches the threshold, scanning the
# SHAP summary files located under the training metrics directory.
chosen_features = select_features_by_shap(
    root_directory=PM["train metrics"],
    shap_threshold=1.0,
)


🐉 2025-10-10 10:37 [✅ [32mINFO[0m] - Starting feature selection with SHAP threshold >= 1.0[0m

🐉 2025-10-10 10:37 [✅ [32mINFO[0m] - Found 6 SHAP summary files to process.[0m

🐉 2025-10-10 10:37 [✅ [32mINFO[0m] - Selected 144 unique features across all files.[0m


In [3]:
# Restrict loading to the SHAP-selected features plus the optimization targets.
all_columns = chosen_features + OPTIMIZATION_TARGETS

# Only the first CSV listed for the MICE datasets directory is loaded.
# Take it explicitly and fail here when nothing is found, rather than leaving
# `df_raw` undefined and hitting a NameError in a later cell.
mice_csv_paths = list_csv_paths(directory=PM["mice datasets"], verbose=False)
if not mice_csv_paths:
    raise FileNotFoundError(f"No CSV files found in '{PM['mice datasets']}'.")

# Same item the original `for ... break` loop would have stopped on.
df_name, df_path = next(iter(mice_csv_paths.items()))
df_raw, _ = load_dataframe(df_path=df_path, use_columns=all_columns, kind="pandas")


🐉 2025-10-10 10:37 [✅ [32mINFO[0m] - 💾 Loaded PANDAS dataset: 'engineered_dataset_clip_MICE' with shape: (3282, 150)[0m


In [4]:
# Show a per-column summary of the freshly loaded frame; left as the last
# expression so the notebook renders the returned table.
summarize_dataframe(df_raw)

DataFrame Shape: (3282, 150)


Unnamed: 0,Data Type,Non-Null Count,Unique Values,Missing %,mean,std,min,25%,50%,75%,max
Epoxy/Curing Ratio,float64,3282,346,0.00,6.46,14.69,1.00,1.81,3.46,4.63,200.00
Carbon Fiber(%),float64,3282,32,0.00,13.06,24.16,0.00,0.00,0.00,0.00,70.00
Filler Proportion(%),float64,3282,168,0.00,20.17,23.30,0.00,4.00,10.00,30.00,85.00
Temperature(K),float64,3282,56,0.00,366.67,35.16,298.15,353.15,363.15,393.15,473.15
Accelerator_2-mi,float64,3282,2,0.00,0.01,0.11,0.00,0.00,0.00,0.00,1.00
...,...,...,...,...,...,...,...,...,...,...,...
Flexural Modulus(MPa),float64,877,679,73.28,12058.86,28412.67,0.62,2483.06,3000.00,3990.00,190990.00
Impact Strength(kJ/m2),float64,1214,767,63.01,38.72,74.21,0.00,9.25,17.00,33.50,400.00
Young Modulus(MPa),float64,1044,802,68.19,2941.05,7998.55,0.08,1011.25,2080.00,3010.00,217300.00
Tensile Strength(MPa),float64,2602,1647,20.72,116.02,330.50,0.13,42.30,60.10,76.68,6000.00


## 2. Clean dataset

In [5]:
# Run the cleaning macro on the raw frame and keep the result under a new
# name so `df_raw` stays available. Its reports go to the optimization
# engineering metrics directory.
# NOTE(review): `skip_targets=True` presumably exempts the target columns
# from the column-dropping rules -- confirm against `drop_macro`.
df_clean = drop_macro(
    df=df_raw,
    log_directory=PM["optimization engineering metrics"],
    targets=OPTIMIZATION_TARGETS,
    skip_targets=True,
)


🐉 2025-10-10 10:38 [✅ [32mINFO[0m] - Saved dataset: 'Missing_Data_start.csv' with shape: (6, 3)[0m

🐉 2025-10-10 10:38 [✅ [32mINFO[0m] - 🧹 Dropped 0 constant columns.[0m

🐉 2025-10-10 10:38 [✅ [32mINFO[0m] - No rows found where all targets are missing.[0m

🐉 2025-10-10 10:38 [✅ [32mINFO[0m] - No rows exceed the 70% missing feature data threshold.[0m

🐉 2025-10-10 10:38 [✅ [32mINFO[0m] - No columns have more than 70% missing data.[0m

🐉 2025-10-10 10:38 [✅ [32mINFO[0m] - Saved dataset: 'Missing_Data_final.csv' with shape: (6, 3)[0m


## 3. Split datasets

In [6]:
# Separate the optimization target columns from the feature columns.
df_features, df_targets = split_features_targets(
    df=df_clean,
    targets=OPTIMIZATION_TARGETS,
)

Original shape: (3282, 150)
Features shape: (3282, 144)
Targets shape: (3282, 6)


In [7]:
# Divide the feature frame into its continuous and its binary columns.
df_continuous, df_binary = split_continuous_binary(
    df=df_features,
)

Continuous columns shape: (3282, 4)
Binary columns shape: (3282, 140)


### 3.1 Save feature columns

In [8]:
# Store the list of continuous column names. The destination comes from a
# single path-manager entry, split into directory and file name.
continuous_columns_path = PM["optimization continuous columns"]
serialize_object(
    obj=df_continuous.columns.to_list(),
    save_dir=continuous_columns_path.parent,
    filename=continuous_columns_path.name,
)


🐉 2025-10-10 10:38 [✅ [32mINFO[0m] - Object of type '<class 'list'>' saved to '/home/karl/Epoxy_Design/data/Optimization Engineering/CONTINUOUS_COLUMNS_list.joblib'[0m


In [9]:
# Store the list of binary column names. The destination comes from a
# single path-manager entry, split into directory and file name.
binary_columns_path = PM["optimization binary columns"]
serialize_object(
    obj=df_binary.columns.to_list(),
    save_dir=binary_columns_path.parent,
    filename=binary_columns_path.name,
)


🐉 2025-10-10 10:38 [✅ [32mINFO[0m] - Object of type '<class 'list'>' saved to '/home/karl/Epoxy_Design/data/Optimization Engineering/BINARY_COLUMNS_list.joblib'[0m


## 4. Plot correlation heatmaps

In [None]:
# Correlation heatmap for the continuous features, with the optimization
# engineering metrics directory as the save location.
plot_correlation_heatmap(
    df=df_continuous,
    save_dir=PM["optimization engineering metrics"],
    plot_title="Continuous Features Correlation Heatmap",
)

In [None]:
# Correlation heatmap for the binary features, with the optimization
# engineering metrics directory as the save location.
plot_correlation_heatmap(
    df=df_binary,
    save_dir=PM["optimization engineering metrics"],
    plot_title="Binary Features Correlation Heatmap",
)

## 5. Distribute and Save Datasets

In [10]:
# Reassemble the frame side by side in a fixed column order:
# continuous features, then binary features, then targets.
df_final = merge_dataframes(
    df_continuous,
    df_binary,
    df_targets,
    direction="horizontal",
)


🐉 2025-10-10 10:38 [✅ [32mINFO[0m] - Merged DataFrame shape: (3282, 150)[0m


➡️ DataFrame 1 shape: (3282, 4)
➡️ DataFrame 2 shape: (3282, 140)
➡️ DataFrame 3 shape: (3282, 6)


In [11]:
# Build one dataset per optimization target from the merged frame, then save
# each one using the name yielded alongside it as the file name.
dataset_iterator = distribute_dataset_by_target(
    df_or_path=df_final,
    target_columns=OPTIMIZATION_TARGETS,
)

for target_name, df_target in dataset_iterator:
    save_dataframe(
        df=df_target,
        save_dir=PM["optimization engineering"],
        filename=target_name,
    )


🐉 2025-10-10 10:38 [✅ [32mINFO[0m] - Saved dataset: 'Flexural_StrengthMPa.csv' with shape: (1643, 145)[0m

🐉 2025-10-10 10:38 [✅ [32mINFO[0m] - Saved dataset: 'Flexural_ModulusMPa.csv' with shape: (877, 145)[0m

🐉 2025-10-10 10:38 [✅ [32mINFO[0m] - Saved dataset: 'Impact_StrengthkJm2.csv' with shape: (1214, 145)[0m

🐉 2025-10-10 10:38 [✅ [32mINFO[0m] - Saved dataset: 'Young_ModulusMPa.csv' with shape: (1044, 145)[0m

🐉 2025-10-10 10:38 [✅ [32mINFO[0m] - Saved dataset: 'Tensile_StrengthMPa.csv' with shape: (2602, 145)[0m

🐉 2025-10-10 10:38 [✅ [32mINFO[0m] - Saved dataset: 'Elongation_at_Break.csv' with shape: (1246, 145)[0m
