In [1]:
import pandas as pd
from train_pipeline import train_model

def print_nested_keys(d, prefix=''):
    if isinstance(d, dict):
        for key in d:
            full_key = f"{prefix}.{key}" if prefix else key
            print(full_key)
            print_nested_keys(d[key], full_key)

def print_nested_keys_and_values(d, prefix=''):
    if isinstance(d, dict):
        for key, value in d.items():
            full_key = f"{prefix}.{key}" if prefix else key
            print_nested_keys_and_values(value, full_key)
    elif isinstance(d, list):
        for idx, item in enumerate(d):
            full_key = f"{prefix}[{idx}]"
            print_nested_keys_and_values(item, full_key)
    else:
        print(f"{prefix}:\n{d}\n")

def print_nested_keys_and_values(d, prefix='', filter_key=None):
    if isinstance(d, dict):
        for key, value in d.items():
            full_key = f"{prefix}.{key}" if prefix else key
            print_nested_keys_and_values(value, full_key, filter_key)
    elif isinstance(d, list):
        for idx, item in enumerate(d):
            full_key = f"{prefix}[{idx}]"
            print_nested_keys_and_values(item, full_key, filter_key)
    else:
        if filter_key is None or prefix.endswith(filter_key):
            print(f"{prefix}:\n{d}\n")


In [2]:
from data_parser import DataParser
# feats_name = "all_feats"
feats_name = "reduced_feats"
results = {}
# Metrics configurations
parser = DataParser()
all_feats = parser.feats_dict[feats_name]
feature_to_remove = all_feats.copy()
feature_to_remove.remove(parser.target)
feature_to_remove = ([x for x in feature_to_remove if x not in parser.reduced_feats]  # when using all features, include additional features not in reduced feats
                     + [ "wf_disp_1", "wf_bind_1", "temp_cold", "time_sinter_1", "temp_sinter_1"])  # "dia_part_1",

# Features with high null percentage values
# features_to_remove = ["wf_disp_1", "wf_bind_1", "temp_cold"]


model_name = "catb_onehot[selected]"
selected_feats = all_feats.copy()
target = parser.target

## Selected Features Model

In [3]:
results[f"selected_features"] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=0, selected_feats=selected_feats)


Parsing data...
Using provided selected features: ['name_part1', 'name_fluid1', 'material_group', 'wf_disp_1', 'wf_bind_1', 'temp_cold', 'time_sinter_1', 'temp_sinter_1', 'vf_total', 'porosity']
Material in test but has less than 5 occurrences in train:
Unique materials in test with less than 5 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: catb_onehot[selected]
----------------------------------------
Selected model: catb_onehot[selected]
Search space: [{'model__n_estimators': [250, 500, 1000], 'model__depth': [8], 'model__eval_metric': ['MAE']}]
Fitting Grid...
CV Results
    mean_fit_

## Removing 1 feature

In [4]:
results_1 = {}
for feat in feature_to_remove:
    selected_feats.remove(feat)
    print(selected_feats)
    results_1[f"{feat}"] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=0, selected_feats=selected_feats)
    selected_feats = all_feats.copy()

['name_part1', 'name_fluid1', 'material_group', 'wf_bind_1', 'temp_cold', 'time_sinter_1', 'temp_sinter_1', 'vf_total']
Parsing data...
Using provided selected features: ['name_part1', 'name_fluid1', 'material_group', 'wf_bind_1', 'temp_cold', 'time_sinter_1', 'temp_sinter_1', 'vf_total']
Material in test but has less than 5 occurrences in train:
Unique materials in test with less than 5 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: catb_onehot[selected]
----------------------------------------
Selected model: catb_onehot[selected]
Search space: [{'model__n_estimators': [250, 500, 1000]

## Removing 2-features combinations

In [18]:
import itertools
results_2 = {}
model_name = "catb_onehot[selected]"
n = 2  # Number of features to remove together
selected_feats = all_feats.copy()  # Reset to all features
# Iterate over all combinations of `n` features to remove
for combo in itertools.combinations(feature_to_remove, n):
    selected_feats = [f for f in all_feats if f not in combo]  # remove the combo
    feat_label = "__".join(combo)
    print(f"Training without: {combo}")

    results_2[f"{feat_label}"] = train_model(
        model_name=model_name,
        feats_name=feats_name,
        shap_opt=0,
        selected_feats=selected_feats + [target]  # target must be included
    )

KeyboardInterrupt: 

## Removing 3 features

In [6]:
import itertools
results_3 = {}
model_name = "catb_onehot[selected]"
n = 3  # Number of features to remove together

# # Iterate over all combinations of `n` features to remove
# for combo in itertools.combinations(feature_to_remove, n):
#     selected_feats = [f for f in all_feats
#     if f not in combo]  # remove the combo
#     feat_label = "__".join(combo)
#     print(f"Training without: {combo}")
#
#     results_3[f"{feat_label}"] = train_model(
#         model_name=model_name,
#         feats_name=feats_name,
#         shap_opt=0,
#         selected_feats=selected_feats + [target]  # target must be included
#     )

## Results

In [7]:
# 1
dfs = []
for model, data in results_1.items():
    if 'overall_metrics' in data:
        df = data['overall_metrics'].copy()
        df['model'] = model
        dfs.append(df)

sorted_metrics = pd.concat(dfs, ignore_index=True).sort_values(by='test_r2', ascending=False)
print(sorted_metrics)

                pipeline train_r2 train_mae train_mse train_mape test_r2 test_mae test_mse test_mape          model
1  catb_onehot[selected]     0.91     0.036     0.004     10.001    0.80    0.060    0.008    15.193      wf_bind_1
2  catb_onehot[selected]     0.91     0.037     0.004     10.350    0.80    0.062    0.009    15.641      temp_cold
3  catb_onehot[selected]     0.91     0.038     0.004     10.500    0.80    0.061    0.008    15.355  time_sinter_1
4  catb_onehot[selected]     0.90     0.040     0.004     10.999    0.80    0.062    0.008    16.170  temp_sinter_1
0  catb_onehot[selected]     0.91     0.037     0.004     10.725    0.79    0.062    0.009    15.276      wf_disp_1


In [8]:
# 2
dfs = []
for model, data in results_2.items():
    if 'overall_metrics' in data:
        df = data['overall_metrics'].copy()
        df['model'] = model
        dfs.append(df)

sorted_metrics = pd.concat(dfs, ignore_index=True).sort_values(by='test_r2', ascending=False)
print(sorted_metrics)

                pipeline train_r2 train_mae train_mse train_mape test_r2 test_mae test_mse test_mape                         model
5  catb_onehot[selected]     0.57     0.098     0.018     32.536    0.35    0.122    0.027    35.746      wf_bind_1__time_sinter_1
6  catb_onehot[selected]     0.48     0.110     0.022     35.400    0.34    0.125    0.028    37.719      wf_bind_1__temp_sinter_1
0  catb_onehot[selected]     0.53     0.101     0.020     32.531    0.29    0.126    0.030    36.585          wf_disp_1__wf_bind_1
2  catb_onehot[selected]     0.53     0.102     0.020     34.495    0.28    0.126    0.030    37.052      wf_disp_1__time_sinter_1
3  catb_onehot[selected]     0.41     0.118     0.025     39.993    0.28    0.131    0.030    40.732      wf_disp_1__temp_sinter_1
4  catb_onehot[selected]     0.49     0.106     0.021     38.103    0.26    0.130    0.031    40.176          wf_bind_1__temp_cold
7  catb_onehot[selected]     0.47     0.107     0.022     39.047    0.25    0.133  

In [9]:
# 3
dfs = []
for model, data in results_3.items():
    if 'overall_metrics' in data:
        df = data['overall_metrics'].copy()
        df['model'] = model
        dfs.append(df)

sorted_metrics = pd.concat(dfs, ignore_index=True).sort_values(by='test_r2', ascending=False)
print(sorted_metrics)

ValueError: No objects to concatenate

In [14]:
from IPython.display import display
import pandas as pd

pd.set_option("display.max_rows", None)  # Show all rows

dfs = []
for model, data in results_2.items():
    if 'overall_metrics' in data:
        df = data['overall_metrics'].copy()
        df['model'] = model
        dfs.append(df)

sorted_metrics = pd.concat(dfs, ignore_index=True).sort_values(by='test_r2', ascending=False)
sorted_metrics['pipeline'] = sorted_metrics['model']
display(sorted_metrics)

Unnamed: 0,pipeline,train_r2,train_mae,train_mse,train_mape,test_r2,test_mae,test_mse,test_mape,model
5,wf_bind_1__time_sinter_1,0.57,0.098,0.018,32.536,0.35,0.122,0.027,35.746,wf_bind_1__time_sinter_1
6,wf_bind_1__temp_sinter_1,0.48,0.11,0.022,35.4,0.34,0.125,0.028,37.719,wf_bind_1__temp_sinter_1
0,wf_disp_1__wf_bind_1,0.53,0.101,0.02,32.531,0.29,0.126,0.03,36.585,wf_disp_1__wf_bind_1
2,wf_disp_1__time_sinter_1,0.53,0.102,0.02,34.495,0.28,0.126,0.03,37.052,wf_disp_1__time_sinter_1
3,wf_disp_1__temp_sinter_1,0.41,0.118,0.025,39.993,0.28,0.131,0.03,40.732,wf_disp_1__temp_sinter_1
4,wf_bind_1__temp_cold,0.49,0.106,0.021,38.103,0.26,0.13,0.031,40.176,wf_bind_1__temp_cold
7,temp_cold__time_sinter_1,0.47,0.107,0.022,39.047,0.25,0.133,0.032,39.869,temp_cold__time_sinter_1
1,wf_disp_1__temp_cold,0.46,0.108,0.023,42.69,0.24,0.132,0.032,47.432,wf_disp_1__temp_cold
9,time_sinter_1__temp_sinter_1,0.31,0.126,0.029,45.58,0.19,0.139,0.034,49.272,time_sinter_1__temp_sinter_1
8,temp_cold__temp_sinter_1,0.34,0.124,0.028,42.393,0.18,0.139,0.035,44.462,temp_cold__temp_sinter_1


In [15]:
from IPython.display import display
import pandas as pd

pd.set_option("display.max_rows", None)  # Show all rows

dfs = []
for results in [results, results_1, results_2, results_3]:
    for model, data in results.items():
        if 'overall_metrics' in data:
            df = data['overall_metrics'].copy()
            df['model'] = model
            dfs.append(df)

# Combine into one DataFrame
all_metrics = pd.concat(dfs, ignore_index=True)


# Combine and sort
sorted_metrics = pd.concat(dfs, ignore_index=True).sort_values(by='test_r2', ascending=False)
sorted_metrics['pipeline'] = sorted_metrics['model']

# Extract removed features
removed = sorted_metrics['pipeline'].str.split('__', expand=True)
for i in range(3):  # removed_1, removed_2, removed_3
    sorted_metrics[f'removed_{i+1}'] = removed[i] if i in removed.columns else None

sorted_metrics['n_removed'] = removed.notna().sum(axis=1)
all_metrics = all_metrics.drop(columns='pipeline', errors='ignore')

# Reorder columns
front_cols = ['removed_1', 'removed_2', 'removed_3', 'n_removed', 'test_r2']
remaining_cols = [col for col in sorted_metrics.columns if col not in front_cols]
sorted_metrics = sorted_metrics[front_cols + remaining_cols]

# Display
display(sorted_metrics)


Unnamed: 0,removed_1,removed_2,removed_3,n_removed,test_r2,pipeline,train_r2,train_mae,train_mse,train_mape,test_mae,test_mse,test_mape,model
0,selected_features,,,1,0.81,selected_features,0.92,0.035,0.004,9.793,0.059,0.008,14.969,selected_features
2,wf_bind_1,,,1,0.8,wf_bind_1,0.91,0.036,0.004,10.001,0.06,0.008,15.193,wf_bind_1
3,temp_cold,,,1,0.8,temp_cold,0.91,0.037,0.004,10.35,0.062,0.009,15.641,temp_cold
4,time_sinter_1,,,1,0.8,time_sinter_1,0.91,0.038,0.004,10.5,0.061,0.008,15.355,time_sinter_1
5,temp_sinter_1,,,1,0.8,temp_sinter_1,0.9,0.04,0.004,10.999,0.062,0.008,16.17,temp_sinter_1
1,wf_disp_1,,,1,0.79,wf_disp_1,0.91,0.037,0.004,10.725,0.062,0.009,15.276,wf_disp_1
11,wf_bind_1,time_sinter_1,,2,0.35,wf_bind_1__time_sinter_1,0.57,0.098,0.018,32.536,0.122,0.027,35.746,wf_bind_1__time_sinter_1
12,wf_bind_1,temp_sinter_1,,2,0.34,wf_bind_1__temp_sinter_1,0.48,0.11,0.022,35.4,0.125,0.028,37.719,wf_bind_1__temp_sinter_1
6,wf_disp_1,wf_bind_1,,2,0.29,wf_disp_1__wf_bind_1,0.53,0.101,0.02,32.531,0.126,0.03,36.585,wf_disp_1__wf_bind_1
8,wf_disp_1,time_sinter_1,,2,0.28,wf_disp_1__time_sinter_1,0.53,0.102,0.02,34.495,0.126,0.03,37.052,wf_disp_1__time_sinter_1
