In [25]:
import pandas as pd
from train_pipeline import train_model

def print_nested_keys(d, prefix=''):
    """Print the dotted path of every key in a nested dict, depth-first.

    Each key is printed on its own line as soon as it is reached, before any of
    its children. Only dict values are descended into; lists and scalar values
    end the walk without printing anything.

    Args:
        d: Object to walk. Anything that is not a dict prints nothing.
        prefix: Dotted path of the parent. '' (the default) or None means
            "top level", in which case the key is printed without a leading dot.
    """
    if not isinstance(d, dict):
        return
    at_root = prefix is None or prefix == ''
    for key, value in d.items():
        # Decide "top level" by comparing with ''/None instead of truthiness:
        # a falsy parent key such as 0 or False must still appear in the path
        # of its children ("0.a", not "a").
        full_key = str(key) if at_root else f"{prefix}.{key}"
        print(full_key)
        print_nested_keys(value, full_key)

def print_nested_keys_and_values(d, prefix='', filter_key=None):
    """Print every leaf of a nested dict/list structure together with its path.

    Dict keys are joined with '.', list positions are appended as '[i]'.
    Each leaf (anything that is neither a dict nor a list) is printed as::

        <path>:
        <value>
        <blank line>

    Args:
        d: Structure to walk; may itself be a dict, a list or a leaf value.
        prefix: Path of the parent. '' (the default) or None means top level.
        filter_key: When given, only leaves whose full path ends with this
            string are printed. This is a plain suffix match on the path text,
            so "r2" matches both "test_r2" and "train_r2".
    """
    if isinstance(d, dict):
        at_root = prefix is None or prefix == ''
        for key, value in d.items():
            # Compare with ''/None rather than testing truthiness so that a
            # falsy parent key (0, False) is not dropped from the path; str()
            # keeps the path a string so the suffix match below always works.
            full_key = str(key) if at_root else f"{prefix}.{key}"
            print_nested_keys_and_values(value, full_key, filter_key)
    elif isinstance(d, list):
        for idx, item in enumerate(d):
            print_nested_keys_and_values(item, f"{prefix}[{idx}]", filter_key)
    elif filter_key is None or prefix.endswith(filter_key):
        print(f"{prefix}:\n{d}\n")


In [26]:
from data_parser import DataParser
# Key into `parser.feats_dict` selecting the feature set under study.
# feats_name = "all_feats"
feats_name = "reduced_feats"

# Outputs of the single-feature ablation runs, keyed by run label.
results = {}

# Metrics configurations
FEATURES_TO_ANALYZE = ["material_group", "name_part1", "name_fluid1", "name_part1_novel_in_test", "name_part1_freq_bin", "year"]
COLUMNS_NOT_NULL = ["name_part1", "name_fluid1", "vf_total", "material_group"]

parser = DataParser()
all_feats = parser.feats_dict[feats_name]

# Ablation candidates, built in two parts:
#   1. every non-target feature that is not part of `parser.reduced_feats`
#      (relevant when using all features, which adds features beyond the reduced set);
#   2. a fixed list of features that is always ablated.
feature_to_remove = list(all_feats)
feature_to_remove.remove(parser.target)
feature_to_remove = [x for x in feature_to_remove if x not in parser.reduced_feats]
feature_to_remove.extend(
    ["dia_part_1", "wf_disp_1", "wf_bind_1", "temp_cold", "time_sinter_1", "temp_sinter_1"]
)

# Working copy of the full feature list.
selected_feats = list(all_feats)

In [27]:
model_name = "catb_onehot[selected]"

# Leave-one-feature-out ablation: train once per entry of `feature_to_remove`,
# each time with that single feature dropped from `all_feats`.
for feat in feature_to_remove:
    # Build the feature list from scratch on every iteration instead of mutating
    # a list created in another cell. If a previous run of this cell was
    # interrupted mid-training, a shared list would still be missing a feature
    # and the next run would silently drop two features at once.
    selected_feats = all_feats.copy()
    selected_feats.remove(feat)  # raises ValueError if `feat` is not in `all_feats`
    print(selected_feats)
    # NOTE(review): feats_name is hard-coded to "all_feats" here although the
    # notebook-level `feats_name` may differ; presumably `selected_feats`
    # takes precedence inside train_model — confirm.
    results[f"{model_name}_{feat}"] = train_model(model_name=model_name, feats_name="all_feats", shap_opt=0, selected_feats=selected_feats)

# Leave `selected_feats` holding the full feature list for any later cell that reads it.
selected_feats = all_feats.copy()

['name_part1', 'name_fluid1', 'material_group', 'wf_disp_1', 'wf_bind_1', 'temp_cold', 'time_sinter_1', 'temp_sinter_1', 'vf_total', 'porosity']
Parsing data...
Using provided selected features: ['name_part1', 'name_fluid1', 'material_group', 'wf_disp_1', 'wf_bind_1', 'temp_cold', 'time_sinter_1', 'temp_sinter_1', 'vf_total', 'porosity']
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: catb_onehot[selected]
----------------------------------------
Selected model: catb_onehot[selected]
Searc

In [28]:
# Gather the 'overall_metrics' table of every run in `results`, tagging each
# with the run label so rows stay identifiable after concatenation.
dfs = []
for model, data in results.items():
    # Runs without an 'overall_metrics' entry are skipped.
    if 'overall_metrics' in data:
        df = data['overall_metrics'].copy()  # copy so the stored result is not modified
        df['model'] = model
        dfs.append(df)

# pd.concat() on an empty list fails with an opaque "No objects to concatenate";
# fail with a message that points at the actual cause instead.
if not dfs:
    raise ValueError("No entry in `results` contains 'overall_metrics'; nothing to compare.")

# Best test R^2 first.
sorted_metrics = pd.concat(dfs, ignore_index=True).sort_values(by='test_r2', ascending=False)
print(sorted_metrics)

                pipeline train_r2 train_mae train_mse train_mape test_r2 test_mae test_mse test_mape                                model
0  catb_onehot[selected]     0.92     0.035     0.004      9.793    0.81    0.059    0.008    14.969     catb_onehot[selected]_dia_part_1
2  catb_onehot[selected]     0.92     0.034     0.003      9.507    0.81    0.059    0.008    14.832      catb_onehot[selected]_wf_bind_1
3  catb_onehot[selected]     0.92     0.034     0.003      9.476    0.81    0.059    0.008    15.142      catb_onehot[selected]_temp_cold
4  catb_onehot[selected]     0.92     0.034     0.003      9.705    0.81    0.059    0.008    14.882  catb_onehot[selected]_time_sinter_1
5  catb_onehot[selected]     0.91     0.036     0.004     10.307    0.81    0.058    0.008    14.972  catb_onehot[selected]_temp_sinter_1
1  catb_onehot[selected]     0.91     0.035     0.004     10.475    0.79    0.061    0.009    16.213      catb_onehot[selected]_wf_disp_1


## Removing two features at a time

In [29]:
import itertools

model_name = "catb_onehot[selected]"
n = 2  # size of each group of features dropped together
results_2 = {}  # run label -> train_model output

target = parser.target
all_feats = parser.feats_dict["all_feats"]

# Ablation candidates: every feature except the target column.
base_feats = [feat for feat in all_feats if feat != target]

# One training run per n-sized combination of candidate features.
for combo in itertools.combinations(base_feats, n):
    feat_label = "__".join(combo)
    selected_feats = [feat for feat in base_feats if feat not in combo]  # drop the combo
    print(f"Training without: {combo}")

    run_key = f"{model_name}_rm_{feat_label}"
    results_2[run_key] = train_model(
        model_name=model_name,
        feats_name="all_feats",
        shap_opt=0,
        selected_feats=[*selected_feats, target],  # the target is appended to the feature list
    )


Training without: ('name_part1', 'name_part2')
Parsing data...
Using provided selected features: ['name_fluid1', 'dia_part_1', 'material_group', 'name_disp_1', 'wf_disp_1', 'wf_bind_1', 'temp_cold', 'time_sinter_1', 'temp_sinter_1', 'vf_total', 'porosity']
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: catb_onehot[selected]
----------------------------------------
Selected model: catb_onehot[selected]
Search space: [{'model__n_estimators': [250, 500, 1000], 'model__depth': [8], 'model__ev

In [30]:
from IPython.display import display
import pandas as pd

pd.set_option("display.max_rows", None)  # Show all rows

# Gather the 'overall_metrics' table of every run in `results_2`, tagging each
# with the run label so rows stay identifiable after concatenation.
dfs = []
for model, data in results_2.items():
    # Runs without an 'overall_metrics' entry are skipped.
    if 'overall_metrics' in data:
        df = data['overall_metrics'].copy()  # copy so the stored result is not modified
        df['model'] = model
        dfs.append(df)

# pd.concat() on an empty list fails with an opaque "No objects to concatenate";
# fail with a message that points at the actual cause instead.
if not dfs:
    raise ValueError("No entry in `results_2` contains 'overall_metrics'; nothing to compare.")

# Best test R^2 first.
sorted_metrics = pd.concat(dfs, ignore_index=True).sort_values(by='test_r2', ascending=False)
# Replace the (constant) pipeline name with the run label, which names the removed features.
sorted_metrics['pipeline'] = sorted_metrics['model']

# The run labels are longer than pandas' default column width (50 characters),
# which would cut them off with "..." and hide which features were removed.
# Lift the limit for this display only.
with pd.option_context("display.max_colwidth", None):
    display(sorted_metrics)

Unnamed: 0,pipeline,train_r2,train_mae,train_mse,train_mape,test_r2,test_mae,test_mse,test_mape,model
30,catb_onehot[selected]_rm_dia_part_1__material_...,0.91,0.035,0.004,10.071,0.82,0.059,0.008,16.533,catb_onehot[selected]_rm_dia_part_1__material_...
38,catb_onehot[selected]_rm_material_group__name_...,0.92,0.033,0.003,9.661,0.82,0.057,0.008,16.165,catb_onehot[selected]_rm_material_group__name_...
41,catb_onehot[selected]_rm_material_group__temp_...,0.92,0.034,0.004,9.883,0.82,0.057,0.008,16.063,catb_onehot[selected]_rm_material_group__temp_...
40,catb_onehot[selected]_rm_material_group__wf_bi...,0.92,0.033,0.003,9.519,0.82,0.057,0.008,15.402,catb_onehot[selected]_rm_material_group__wf_bi...
28,catb_onehot[selected]_rm_name_fluid1__temp_sin...,0.91,0.036,0.004,10.032,0.81,0.059,0.008,15.377,catb_onehot[selected]_rm_name_fluid1__temp_sin...
22,catb_onehot[selected]_rm_name_fluid1__material...,0.92,0.033,0.003,9.604,0.81,0.058,0.008,15.721,catb_onehot[selected]_rm_name_fluid1__material...
23,catb_onehot[selected]_rm_name_fluid1__name_disp_1,0.92,0.033,0.003,10.082,0.81,0.059,0.008,15.64,catb_onehot[selected]_rm_name_fluid1__name_disp_1
48,catb_onehot[selected]_rm_name_disp_1__time_sin...,0.92,0.033,0.003,9.405,0.81,0.058,0.008,15.064,catb_onehot[selected]_rm_name_disp_1__time_sin...
25,catb_onehot[selected]_rm_name_fluid1__wf_bind_1,0.92,0.033,0.003,9.328,0.81,0.058,0.008,15.288,catb_onehot[selected]_rm_name_fluid1__wf_bind_1
27,catb_onehot[selected]_rm_name_fluid1__time_sin...,0.92,0.033,0.003,9.58,0.81,0.058,0.008,15.394,catb_onehot[selected]_rm_name_fluid1__time_sin...
