# Training Pipelines

### Setup

In [1]:
import pandas as pd
from train_pipeline import train_model

def print_nested_keys(d, prefix=''):
    if isinstance(d, dict):
        for key in d:
            full_key = f"{prefix}.{key}" if prefix else key
            print(full_key)
            print_nested_keys(d[key], full_key)

def print_nested_keys_and_values(d, prefix=''):
    if isinstance(d, dict):
        for key, value in d.items():
            full_key = f"{prefix}.{key}" if prefix else key
            print_nested_keys_and_values(value, full_key)
    elif isinstance(d, list):
        for idx, item in enumerate(d):
            full_key = f"{prefix}[{idx}]"
            print_nested_keys_and_values(item, full_key)
    else:
        print(f"{prefix}:\n{d}\n")

def print_nested_keys_and_values(d, prefix='', filter_key=None):
    if isinstance(d, dict):
        for key, value in d.items():
            full_key = f"{prefix}.{key}" if prefix else key
            print_nested_keys_and_values(value, full_key, filter_key)
    elif isinstance(d, list):
        for idx, item in enumerate(d):
            full_key = f"{prefix}[{idx}]"
            print_nested_keys_and_values(item, full_key, filter_key)
    else:
        if filter_key is None or prefix.endswith(filter_key):
            print(f"{prefix}:\n{d}\n")


In [2]:
feats_name = "reduced_feats"
shap_opt = 0
model_names =  [
    "catb_native",
    "catb_onehot[selected]",
    "catb_onehot_impute",
    "xgb_onehot", "xgb_impute",
    "lr", "lr_solidloading",
    "rf"
                ]
results = {}
# Metrics configurations
FEATURES_TO_ANALYZE = ["material_group", "name_part1", "name_fluid1", "name_part1_novel_in_test", "name_part1_freq_bin", "year"]
COLUMNS_NOT_NULL = ["name_part1", "name_fluid1", "vf_total", "material_group"]

## Catb native

In [3]:
model_name = "catb_native"
results[model_name] = train_model(model_name=model_name, feats_name="reduced_feats", shap_opt=shap_opt)

Parsing data...
Material in test but has less than 5 occurrences in train:
Unique materials in test with less than 5 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: catb_native
----------------------------------------
Selected model: catb_native
Search space: [{'model__n_estimators': [250, 500, 1000], 'model__depth': [8], 'model__eval_metric': ['MAE']}]
Fitting Grid...
CV Results
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__depth param_model__eval_metric  param_model__n_estimators                                             params  split0_test_score  spli

## Catb + onehot

In [4]:
model_name = "catb_onehot[selected]"
results[model_name] = train_model(model_name=model_name, feats_name="reduced_feats", shap_opt=shap_opt)

Parsing data...
Material in test but has less than 5 occurrences in train:
Unique materials in test with less than 5 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: catb_onehot[selected]
----------------------------------------
Selected model: catb_onehot[selected]
Search space: [{'model__n_estimators': [250, 500, 1000], 'model__depth': [8], 'model__eval_metric': ['MAE']}]
Fitting Grid...
CV Results
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__depth param_model__eval_metric  param_model__n_estimators                                             params  spl

## Catb + onehot + impute

In [5]:
model_name = "catb_onehot_impute"
results[model_name] = train_model(model_name=model_name, feats_name="reduced_feats", shap_opt=shap_opt)

Parsing data...
Material in test but has less than 5 occurrences in train:
Unique materials in test with less than 5 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: catb_onehot_impute
----------------------------------------
Selected model: catb_onehot_impute
Search space: [{'model__n_estimators': [250, 500, 1000], 'model__depth': [8], 'model__eval_metric': ['MAE']}]
Fitting Grid...
CV Results
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__depth param_model__eval_metric  param_model__n_estimators                                             params  split0_te

## XGB + onehot

In [6]:
model_name = "xgb_onehot"
results[model_name] = train_model(model_name=model_name, feats_name="reduced_feats", shap_opt=shap_opt)

Parsing data...
Material in test but has less than 5 occurrences in train:
Unique materials in test with less than 5 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: xgb_onehot
----------------------------------------
Selected model: xgb_onehot
Search space: [{'model__n_estimators': [50, 100, 250, 500, 1000], 'model__max_depth': [3, 5, 8]}]
Fitting Grid...
CV Results
     mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__max_depth  param_model__n_estimators                                             params  split0_test_score  split1_test_score  split2_test_score  

## XGB + impute

In [7]:
model_name = "xgb_onehot_impute"
results[model_name] = train_model(model_name=model_name, feats_name="reduced_feats", shap_opt=shap_opt)

Parsing data...
Material in test but has less than 5 occurrences in train:
Unique materials in test with less than 5 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: xgb_onehot_impute
----------------------------------------
Selected model: xgb_onehot_impute
Search space: [{'model__n_estimators': [50, 100, 250, 500, 1000], 'model__max_depth': [3, 5, 8]}]
Fitting Grid...
CV Results
     mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__max_depth  param_model__n_estimators                                             params  split0_test_score  split1_test_score  split

# Random Forest

In [8]:
model_name = "rf"
results[model_name] = train_model(model_name=model_name, feats_name="reduced_feats", shap_opt=shap_opt)

Parsing data...
Material in test but has less than 5 occurrences in train:
Unique materials in test with less than 5 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: rf
----------------------------------------
Selected model: rf
Search space: [{'model__n_estimators': [50, 100, 200], 'model__max_features': ['log2', 'sqrt', None], 'model__max_depth': [10, 20, 40]}]
Fitting Grid...
CV Results
     mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__max_depth param_model__max_features  param_model__n_estimators                                             params  split0_t

## LR

In [9]:
import train_pipeline
import importlib
importlib.reload(train_pipeline)
model_name = "lr"
results[model_name] = train_pipeline.train_model(model_name=model_name, feats_name="reduced_feats", shap_opt=0)

Parsing data...
Material in test but has less than 5 occurrences in train:
Unique materials in test with less than 5 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: lr
----------------------------------------
Selected model: lr
Search space: [{}]
Fitting Grid...
CV Results
    mean_fit_time  std_fit_time  mean_score_time  std_score_time params  split0_test_score  split1_test_score  split2_test_score  split3_test_score  split4_test_score  mean_test_score  std_test_score  rank_test_score
0       0.015881      0.005858         0.006885        0.001972     {}           0.417892           0.43

## LR  (solid loading only)

In [10]:
model_name = "lr_solidloading"
results[model_name] = train_model(model_name=model_name, feats_name="reduced_feats", shap_opt=0)


Parsing data...
Using only Solid Loading feature for lr_solidloading model
Material in test but has less than 5 occurrences in train:
Unique materials in test with less than 5 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
lr_solidloading selected. Removing other features except Solid Loading
Model selected: lr_solidloading
----------------------------------------
Selected model: lr_solidloading
Search space: [{}]
Fitting Grid...
CV Results
    mean_fit_time  std_fit_time  mean_score_time  std_score_time params  split0_test_score  split1_test_score  split2_test_score  split3_test_score  split4_test_scor

## ALL metrics

In [11]:
print_nested_keys(results)

catb_native
catb_native.overall_metrics
catb_native.no_nan_metrics
catb_native.only_nans_metrics
catb_native.group_metrics
catb_native.group_metrics.material_group
catb_native.group_metrics.material_group.topn
catb_native.group_metrics.material_group.all
catb_native.group_metrics.name_part1
catb_native.group_metrics.name_part1.topn
catb_native.group_metrics.name_part1.all
catb_native.group_metrics.name_fluid1
catb_native.group_metrics.name_fluid1.topn
catb_native.group_metrics.name_fluid1.all
catb_native.group_metrics.name_part1_novel_in_test
catb_native.group_metrics.name_part1_novel_in_test.topn
catb_native.group_metrics.name_part1_novel_in_test.all
catb_native.group_metrics.name_part1_freq_bin
catb_native.group_metrics.name_part1_freq_bin.topn
catb_native.group_metrics.name_part1_freq_bin.all
catb_native.group_metrics.year
catb_native.group_metrics.year.topn
catb_native.group_metrics.year.all
catb_native.additional_metrics
catb_native.additional_metrics.title
catb_native.additional_

In [12]:
print_nested_keys_and_values(results)

catb_native.overall_metrics:
      pipeline train_r2 train_mae train_mse train_mape test_r2 test_mae test_mse test_mape
0  catb_native     0.88     0.047     0.005     14.553    0.75    0.069    0.010    18.243

catb_native.no_nan_metrics:
      pipeline  count no_nan_r2 no_nan_mae no_nan_mse no_nan_mape
0  catb_native     27      0.37      0.075      0.014      15.834

catb_native.only_nans_metrics:
      pipeline  count only_nans_r2 only_nans_mae only_nans_mse only_nans_mape
0  catb_native    345         0.77         0.068         0.010         18.432

catb_native.group_metrics.material_group.topn:
    material_group train_r2 train_mae train_mse train_mape  train_count test_r2 test_mae test_mse test_mape  test_count
0          Ceramic     0.85     0.048     0.006     14.601         1276    0.72    0.068    0.010    16.671         311
1          Polymer     0.92     0.030     0.002      4.226           77    0.54    0.058    0.006     8.149          14
2            Metal     0.90     

## Overall Results

In [13]:
dfs = []
for model, data in results.items():
    if 'overall_metrics' in data:
        df = data['overall_metrics'].copy()
        df['model'] = model
        dfs.append(df)

sorted_metrics = pd.concat(dfs, ignore_index=True).sort_values(by='test_r2', ascending=False)
print(sorted_metrics)

                pipeline train_r2 train_mae train_mse train_mape test_r2 test_mae test_mse test_mape                  model
1  catb_onehot[selected]     0.92     0.035     0.004      9.793    0.81    0.059    0.008    14.969  catb_onehot[selected]
2     catb_onehot_impute     0.91     0.036     0.004      9.856    0.81    0.060    0.008    14.625     catb_onehot_impute
3             xgb_onehot     0.91     0.036     0.004     10.263    0.79    0.064    0.009    16.292             xgb_onehot
5                     rf     0.91     0.035     0.004     11.674    0.78    0.063    0.009    17.745                     rf
4      xgb_onehot_impute     0.91     0.036     0.004     10.027    0.76    0.063    0.010    14.536      xgb_onehot_impute
0            catb_native     0.88     0.047     0.005     14.553    0.75    0.069    0.010    18.243            catb_native
6                     lr     0.43     0.118     0.024     42.504    0.35    0.130    0.027    42.214                     lr
7       

## By Material Group

In [14]:
print_nested_keys_and_values(results, filter_key="group_metrics.material_group.all")

catb_native.group_metrics.material_group.all:
    material_group train_r2 train_mae train_mse train_mape  train_count test_r2 test_mae test_mse test_mape  test_count
0          Ceramic     0.85     0.048     0.006     14.601         1276    0.72    0.068    0.010    16.671         311
1  Ceramic/Polymer     0.85     0.048     0.006     11.101           36    0.88    0.049    0.006    16.390           9
2            Metal     0.90     0.046     0.004     12.176           66    0.82    0.068    0.010    19.900          24
3    Metal/Ceramic     0.88     0.036     0.003     53.838           27    0.59    0.129    0.029    66.169          13
4   Pharmaceutical     0.11     0.026     0.001      5.505            5     nan    0.011    0.000     2.437           1
5          Polymer     0.92     0.030     0.002      4.226           77    0.54    0.058    0.006     8.149          14

catb_onehot[selected].group_metrics.material_group.all:
    material_group train_r2 train_mae train_mse train_map

## By Year

In [15]:
print_nested_keys_and_values(results, filter_key="catb_onehot[selected].group_metrics.year.all")

catb_onehot[selected].group_metrics.year.all:
      year train_r2 train_mae train_mse train_mape  train_count test_r2 test_mae test_mse test_mape  test_count
0   2001.0     0.96     0.023     0.001     24.588           26    0.84    0.040    0.002     9.575           4
1   2002.0     0.94     0.012     0.000      1.991            9   -0.08    0.073    0.011    15.462           4
2   2004.0     0.94     0.052     0.004    106.465           19    0.79    0.078    0.009    26.113           4
3   2005.0     0.82     0.056     0.007     21.051           15    0.90    0.074    0.008    17.510           3
4   2006.0     0.99     0.017     0.000      8.033           28    0.95    0.049    0.003    28.498           4
5   2007.0     0.97     0.028     0.002     12.498           75    0.94    0.052    0.005    15.451          20
6   2008.0     0.97     0.018     0.001      3.508           88    0.80    0.054    0.007    25.514          18
7   2009.0     0.93     0.028     0.002      8.376        

## By novel materials

In [16]:
filter_key = "catb_onehot[selected].group_metrics.name_part1_novel_in_test.all"
print_nested_keys_and_values(results, filter_key=filter_key)


catb_onehot[selected].group_metrics.name_part1_novel_in_test.all:
   name_part1_novel_in_test train_r2 train_mae train_mse train_mape  train_count test_r2 test_mae test_mse test_mape  test_count
0                     False     0.91     0.036     0.004      9.896         1443    0.82    0.057    0.007    14.976         342
1                      True     0.98     0.024     0.001      6.429           44    0.60    0.079    0.014    14.886          30



In [17]:
filter_key = "catb_onehot_impute.group_metrics.name_part1_novel_in_test.all"
print_nested_keys_and_values(results, filter_key=filter_key)


catb_onehot_impute.group_metrics.name_part1_novel_in_test.all:
   name_part1_novel_in_test train_r2 train_mae train_mse train_mape  train_count test_r2 test_mae test_mse test_mape  test_count
0                     False     0.91     0.036     0.004      9.884         1443    0.82    0.058    0.008    14.533         342
1                      True     0.98     0.025     0.001      8.909           44    0.60    0.081    0.014    15.670          30



In [18]:
for model_key in results:
    filter_key = f"{model_key}.group_metrics.name_part1_novel_in_test.all"
    print(f"\nModel: {model_key}")
    print_nested_keys_and_values(results, filter_key=filter_key)



Model: catb_native
catb_native.group_metrics.name_part1_novel_in_test.all:
   name_part1_novel_in_test train_r2 train_mae train_mse train_mape  train_count test_r2 test_mae test_mse test_mape  test_count
0                     False     0.87     0.047     0.005     14.197         1443    0.76    0.068    0.010    18.281         342
1                      True     0.95     0.040     0.003     26.223           44    0.57    0.082    0.015    17.815          30


Model: catb_onehot[selected]
catb_onehot[selected].group_metrics.name_part1_novel_in_test.all:
   name_part1_novel_in_test train_r2 train_mae train_mse train_mape  train_count test_r2 test_mae test_mse test_mape  test_count
0                     False     0.91     0.036     0.004      9.896         1443    0.82    0.057    0.007    14.976         342
1                      True     0.98     0.024     0.001      6.429           44    0.60    0.079    0.014    14.886          30


Model: catb_onehot_impute
catb_onehot_impute.group_

## By novel material frequency in train

In [19]:
filter_key = "catb_onehot[selected].group_metrics.name_part1_freq_bin.all"
print_nested_keys_and_values(results, filter_key=filter_key)


catb_onehot[selected].group_metrics.name_part1_freq_bin.all:
  name_part1_freq_bin train_r2 train_mae train_mse train_mape  train_count test_r2 test_mae test_mse test_mape  test_count
0                  10     0.95     0.031     0.002      5.473           73    0.65    0.069    0.010    13.165          32
1                 200     0.86     0.037     0.004      9.502          359    0.85    0.051    0.005    11.898          77
2                200+     0.92     0.037     0.004     11.783          643    0.80    0.059    0.008    14.465         166
3                   5     0.98     0.021     0.001      5.034           79    0.84    0.053    0.006    11.197          26
4                  50     0.92     0.034     0.004      8.342          333    0.89    0.051    0.006    20.643          67
5                Rare     None      None      None       None            0  -11.93    0.252    0.063    38.863           4



In [28]:
dfs = []
for model, data in results.items():
    if 'overall_metrics' in data:
        if model == 'lr' or model == 'lr_solidloading':
            continue
        df = data['group_metrics']['name_part1_freq_bin']['all'].copy()
        df['model'] = model
        dfs.append(df)

sorted_metrics = pd.concat(dfs, ignore_index=True).sort_values(by='test_r2', ascending=False)
sorted_metrics[['model', 'name_part1_freq_bin', 'train_count','test_count', 'test_r2', 'test_mae']]

Unnamed: 0,model,name_part1_freq_bin,train_count,test_count,test_r2,test_mae
10,catb_onehot[selected],50,333,67,0.89,0.051
28,xgb_onehot_impute,50,333,67,0.88,0.051
16,catb_onehot_impute,50,333,67,0.88,0.052
22,xgb_onehot,50,333,67,0.87,0.057
34,rf,50,333,67,0.86,0.058
7,catb_onehot[selected],200,359,77,0.85,0.051
13,catb_onehot_impute,200,359,77,0.85,0.051
19,xgb_onehot,200,359,77,0.84,0.052
9,catb_onehot[selected],5,79,26,0.84,0.053
3,catb_native,5,79,26,0.83,0.055


## By Additional recent paper materials

In [20]:
filter_key = "catb_onehot[selected].additional_metrics.title"
print_nested_keys_and_values(results, filter_key=filter_key)


catb_onehot[selected].additional_metrics.title:
                                                   count       r2    mae    mse    mape
Bone-like structure by modified freeze casting         2   -30.69  0.182  0.033  31.798
Design of porous aluminum oxide ceramics using ...     3   -22.49  0.153  0.025  20.978
Dual-Scale Porosity Alumina Structures Using Ce...     2    -0.20  0.129  0.020  20.780
Freeze casting of hydroxyapatite-titania compos...     4  -255.92  0.156  0.025  29.170
The Production of Porous Hydroxyapatite Scaffol...     6     0.77  0.067  0.012  47.391



In [21]:
filter_key = "catb_onehot_impute.additional_metrics"
print_nested_keys_and_values(results, filter_key=filter_key)


##