# Training Pipelines

## Setup

In [1]:
import pandas as pd
from train_pipeline import train_model

def print_nested_keys(d, prefix=''):
    """Print the path of every key in a nested dict, one path per line.

    Nested keys are joined to their parent path with "."; top-level keys are
    printed as-is. Only dicts are descended into: any other value (including
    lists) ends the walk for that branch without printing anything.

    Args:
        d: Object to inspect. Non-dict inputs produce no output.
        prefix: Path of the parent key; leave empty for the top-level call.
    """
    if not isinstance(d, dict):
        return
    for name, child in d.items():
        path = name if not prefix else f"{prefix}.{name}"
        print(path)
        print_nested_keys(child, path)

def print_nested_keys_and_values(d, prefix='', filter_key=None):
    """Print the leaves of a nested dict/list structure, labelled by path.

    Dict keys are joined with "." and list positions are rendered as
    "[idx]", giving paths such as ``a.b[0]``. Each reported leaf is printed
    as ``<path>:``, then the value on the following line, then a blank line.

    Args:
        d: Object to walk. Dicts and lists are recursed into; any other
            value is treated as a leaf.
        prefix: Path accumulated so far; leave empty for the top-level call.
        filter_key: If given, only leaves whose full path ends with this
            string are printed. ``None`` (the default) prints every leaf.
    """
    if isinstance(d, dict):
        for key, value in d.items():
            # Top-level keys are used as-is; nested keys get a dotted path.
            full_key = f"{prefix}.{key}" if prefix else key
            print_nested_keys_and_values(value, full_key, filter_key)
    elif isinstance(d, list):
        for idx, item in enumerate(d):
            full_key = f"{prefix}[{idx}]"
            print_nested_keys_and_values(item, full_key, filter_key)
    elif filter_key is None or prefix.endswith(filter_key):
        # Leaf: report it unless a suffix filter excludes this path.
        print(f"{prefix}:\n{d}\n")


In [2]:
# Feature-set name for this notebook's training runs.
feats_name = "reduced_feats"
# SHAP option forwarded to train_model as `shap_opt` (the LR cells pass 0 instead).
shap_opt = 1
# Model identifiers trained below; each one is also the key under which the
# model's metrics are stored in `results`.
model_names = [
    "catb_native",
    "catb_onehot[selected]",
    "catb_onehot_impute",
    "xgb_onehot", "xgb_onehot_impute",
    "lr", "lr_solidloading",
    "rf",
]
# Filled by the training cells: model name -> value returned by train_model.
results = {}
# Metrics configurations
# NOTE(review): these two constants are not referenced by the cells in this
# notebook; FEATURES_TO_ANALYZE matches the group_metrics breakdown names
# printed further down — presumably mirrors settings in train_pipeline; confirm.
FEATURES_TO_ANALYZE = ["material_group", "name_part1", "name_fluid1", "name_part1_novel_in_test", "name_part1_freq_bin", "year"]
COLUMNS_NOT_NULL = ["name_part1", "name_fluid1", "vf_total", "material_group"]

## Catb native

In [3]:
# Train the "catb_native" pipeline and store what train_model returns under that key.
# The feature set comes from the notebook-wide `feats_name` setting rather than a repeated literal.
model_name = "catb_native"
results[model_name] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=shap_opt)

Parsing data...
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: catb_native
----------------------------------------
Selected model: catb_native
Search space: [{'model__n_estimators': [250, 500, 1000], 'model__depth': [8], 'model__eval_metric': ['MAE']}]
Fitting Grid...
CV Results
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__depth param_model__eval_metric  param_model__n_estimators                                             params  split0_test_score  spli

## Catb + onehot

In [4]:
# Train the "catb_onehot[selected]" pipeline and store what train_model returns under that key.
# The feature set comes from the notebook-wide `feats_name` setting rather than a repeated literal.
model_name = "catb_onehot[selected]"
results[model_name] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=shap_opt)

Parsing data...
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: catb_onehot[selected]
----------------------------------------
Selected model: catb_onehot[selected]
Search space: [{'model__n_estimators': [250, 500, 1000], 'model__depth': [8], 'model__eval_metric': ['MAE']}]
Fitting Grid...
CV Results
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__depth param_model__eval_metric  param_model__n_estimators                                             params  spl

## Catb + onehot + impute

In [5]:
# Train the "catb_onehot_impute" pipeline and store what train_model returns under that key.
# The feature set comes from the notebook-wide `feats_name` setting rather than a repeated literal.
model_name = "catb_onehot_impute"
results[model_name] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=shap_opt)

Parsing data...
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: catb_onehot_impute
----------------------------------------
Selected model: catb_onehot_impute
Search space: [{'model__n_estimators': [250, 500, 1000], 'model__depth': [8], 'model__eval_metric': ['MAE']}]
Fitting Grid...
CV Results
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__depth param_model__eval_metric  param_model__n_estimators                                             params  split0_te

## XGB + onehot

In [6]:
# Train the "xgb_onehot" pipeline and store what train_model returns under that key.
# The feature set comes from the notebook-wide `feats_name` setting rather than a repeated literal.
model_name = "xgb_onehot"
results[model_name] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=shap_opt)

Parsing data...
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: xgb_onehot
----------------------------------------
Selected model: xgb_onehot
Search space: [{'model__n_estimators': [50, 100, 250, 500, 1000], 'model__max_depth': [3, 5, 8]}]
Fitting Grid...
CV Results
     mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__max_depth  param_model__n_estimators                                             params  split0_test_score  split1_test_score  split2_test_score  

## XGB + onehot + impute

In [7]:
# Train the "xgb_onehot_impute" pipeline and store what train_model returns under that key.
# The feature set comes from the notebook-wide `feats_name` setting rather than a repeated literal.
model_name = "xgb_onehot_impute"
results[model_name] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=shap_opt)

Parsing data...
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: xgb_onehot_impute
----------------------------------------
Selected model: xgb_onehot_impute
Search space: [{'model__n_estimators': [50, 100, 250, 500, 1000], 'model__max_depth': [3, 5, 8]}]
Fitting Grid...
CV Results
     mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__max_depth  param_model__n_estimators                                             params  split0_test_score  split1_test_score  split

## Random Forest

In [8]:
# Train the "rf" pipeline and store what train_model returns under that key.
# The feature set comes from the notebook-wide `feats_name` setting rather than a repeated literal.
model_name = "rf"
results[model_name] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=shap_opt)

Parsing data...
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: rf
----------------------------------------
Selected model: rf
Search space: [{'model__n_estimators': [50, 100, 200], 'model__max_features': ['log2', 'sqrt', None], 'model__max_depth': [10, 20, 40]}]
Fitting Grid...
CV Results
     mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__max_depth param_model__max_features  param_model__n_estimators                                             params  split0_t

## LR

In [9]:
import importlib

import train_pipeline

# Re-execute train_pipeline so edits made to the module since it was first
# imported are picked up without restarting the kernel.
importlib.reload(train_pipeline)
# reload() does not update names bound earlier with
# `from train_pipeline import train_model`, so rebind it here; otherwise the
# cells that follow would keep calling the pre-reload function.
train_model = train_pipeline.train_model

model_name = "lr"
# shap_opt is passed as 0 for this model rather than the notebook-wide `shap_opt`.
results[model_name] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=0)

Parsing data...
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: lr
----------------------------------------
Selected model: lr
Search space: [{}]
Fitting Grid...
CV Results
    mean_fit_time  std_fit_time  mean_score_time  std_score_time params  split0_test_score  split1_test_score  split2_test_score  split3_test_score  split4_test_score  mean_test_score  std_test_score  rank_test_score
0       0.022969      0.014894         0.006598        0.000593     {}           0.418302           0.43

## LR (solid loading only)

In [10]:
# Train the "lr_solidloading" pipeline and store what train_model returns under that key.
# The feature set comes from the notebook-wide `feats_name` setting rather than a repeated literal;
# shap_opt is passed as 0 for this model rather than the notebook-wide `shap_opt`.
model_name = "lr_solidloading"
results[model_name] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=0)


Parsing data...
Using only Solid Loading feature for lr_solidloading model
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
lr_solidloading selected. Removing other features except Solid Loading
Model selected: lr_solidloading
----------------------------------------
Selected model: lr_solidloading
Search space: [{}]
Fitting Grid...
CV Results
    mean_fit_time  std_fit_time  mean_score_time  std_score_time params  split0_test_score  split1_test_score  split2_test_score  split3_test_score  split4_test_scor

## All metrics

In [11]:
# List the dotted path of every dict key in `results` (structure only, no values).
print_nested_keys(results)

catb_native
catb_native.overall_metrics
catb_native.no_nan_metrics
catb_native.only_nans_metrics
catb_native.group_metrics
catb_native.group_metrics.material_group
catb_native.group_metrics.material_group.topn
catb_native.group_metrics.material_group.all
catb_native.group_metrics.name_part1
catb_native.group_metrics.name_part1.topn
catb_native.group_metrics.name_part1.all
catb_native.group_metrics.name_fluid1
catb_native.group_metrics.name_fluid1.topn
catb_native.group_metrics.name_fluid1.all
catb_native.group_metrics.name_part1_novel_in_test
catb_native.group_metrics.name_part1_novel_in_test.topn
catb_native.group_metrics.name_part1_novel_in_test.all
catb_native.group_metrics.name_part1_freq_bin
catb_native.group_metrics.name_part1_freq_bin.topn
catb_native.group_metrics.name_part1_freq_bin.all
catb_native.group_metrics.year
catb_native.group_metrics.year.topn
catb_native.group_metrics.year.all
catb_native.additional_metrics
catb_onehot[selected]
catb_onehot[selected].overall_metrics


In [12]:
# Dump every leaf value stored in `results`, each labelled with its dotted path.
print_nested_keys_and_values(results)

catb_native.overall_metrics:
      pipeline train_r2 train_mae train_mse train_mape test_r2 test_mae test_mse test_mape
0  catb_native     0.88     0.045     0.005     14.117    0.77    0.067    0.010    17.600

catb_native.no_nan_metrics:
      pipeline  count no_nan_r2 no_nan_mae no_nan_mse no_nan_mape
0  catb_native     23      0.79      0.046      0.005      11.722

catb_native.only_nans_metrics:
      pipeline  count only_nans_r2 only_nans_mae only_nans_mse only_nans_mape
0  catb_native    349         0.77         0.068         0.010         17.987

catb_native.group_metrics.material_group.topn:
    material_group train_r2 train_mae train_mse train_mape  train_count test_r2 test_mae test_mse test_mape  test_count
0          Ceramic     0.86     0.046     0.005     14.077         1276    0.74    0.066    0.009    15.807         311
1          Polymer     0.91     0.031     0.002      4.305           77    0.54    0.062    0.006     8.500          14
2            Metal     0.90     

## Overall Results

In [13]:
# Gather each model's overall-metrics table, tagging its rows with the key the
# model is stored under in `results`.
dfs = []
for model, data in results.items():
    if 'overall_metrics' in data:
        df = data['overall_metrics'].copy()  # copy so the stored frame is not mutated
        df['model'] = model
        dfs.append(df)

if dfs:
    # Rank by test R² using numeric comparison. NOTE(review): the per-group
    # tables in this notebook show a lowercase `nan`, which suggests the metric
    # columns may hold formatted strings; a plain sort on strings would order
    # them lexicographically (wrong for negative or differently-sized values).
    # pd.to_numeric leaves numeric columns unchanged and turns unparsable
    # entries into NaN, which sort_values places last.
    sorted_metrics = pd.concat(dfs, ignore_index=True).sort_values(
        by='test_r2',
        ascending=False,
        key=lambda col: pd.to_numeric(col, errors='coerce'),
    )
    print(sorted_metrics)
else:
    # pd.concat raises on an empty list; report the situation instead.
    sorted_metrics = pd.DataFrame()
    print("No overall metrics available.")

                pipeline train_r2 train_mae train_mse train_mape test_r2 test_mae test_mse test_mape                  model
1  catb_onehot[selected]     0.92     0.034     0.003      9.704    0.81    0.059    0.008    15.148  catb_onehot[selected]
2     catb_onehot_impute     0.92     0.033     0.003      9.404    0.81    0.058    0.008    14.187     catb_onehot_impute
5                     rf     0.92     0.033     0.004     11.348    0.80    0.061    0.009    17.779                     rf
3             xgb_onehot     0.93     0.026     0.003      7.451    0.78    0.061    0.009    14.498             xgb_onehot
0            catb_native     0.88     0.045     0.005     14.117    0.77    0.067    0.010    17.600            catb_native
4      xgb_onehot_impute     0.93     0.029     0.003      8.374    0.77    0.062    0.009    15.218      xgb_onehot_impute
6                     lr     0.43     0.118     0.024     42.395    0.35    0.131    0.027    42.102                     lr
7       

## By Material Group

In [14]:
# filter_key is matched against the end of each leaf's path, so this suffix
# selects the "all" material-group table of every model in `results`.
print_nested_keys_and_values(results, filter_key="group_metrics.material_group.all")

catb_native.group_metrics.material_group.all:
    material_group train_r2 train_mae train_mse train_mape  train_count test_r2 test_mae test_mse test_mape  test_count
0          Ceramic     0.86     0.046     0.005     14.077         1276    0.74    0.066    0.009    15.807         311
1  Ceramic/Polymer     0.86     0.049     0.006     10.534           36    0.88    0.055    0.006    16.949           9
2            Metal     0.90     0.043     0.004     11.616           66    0.84    0.061    0.009    18.527          24
3    Metal/Ceramic     0.86     0.037     0.004     56.689           27    0.66    0.120    0.024    70.332          13
4   Pharmaceutical     0.47     0.020     0.001      4.308            5     nan    0.003    0.000     0.683           1
5          Polymer     0.91     0.031     0.002      4.305           77    0.54    0.062    0.006     8.500          14

catb_onehot[selected].group_metrics.material_group.all:
    material_group train_r2 train_mae train_mse train_map

## By Year

In [15]:
# The filter starts with a model key, restricting output to the per-year table
# of "catb_onehot[selected]". "[selected]" is part of the model name itself,
# not a list index added by the printer.
print_nested_keys_and_values(results, filter_key="catb_onehot[selected].group_metrics.year.all")

catb_onehot[selected].group_metrics.year.all:
      year train_r2 train_mae train_mse train_mape  train_count test_r2 test_mae test_mse test_mape  test_count
0   2001.0     0.96     0.023     0.001     25.019           26    0.79    0.043    0.002    10.186           4
1   2002.0     0.89     0.016     0.000      2.663            9   -0.10    0.073    0.011    15.283           4
2   2004.0     0.94     0.055     0.004    108.948           19    0.83    0.069    0.007    21.792           4
3   2005.0     0.83     0.054     0.007     20.211           15    0.89    0.084    0.009    19.349           3
4   2006.0     0.99     0.018     0.001      7.327           28    0.93    0.051    0.004    38.179           4
5   2007.0     0.97     0.027     0.002     13.362           75    0.94    0.052    0.005    15.225          20
6   2008.0     0.97     0.017     0.001      3.445           88    0.82    0.051    0.006    23.538          18
7   2009.0     0.94     0.026     0.002      7.842        

## By novel materials

In [16]:
# Path suffix selecting the "catb_onehot[selected]" table that is grouped by
# the name_part1_novel_in_test flag.
filter_key = "catb_onehot[selected].group_metrics.name_part1_novel_in_test.all"
print_nested_keys_and_values(results, filter_key=filter_key)


catb_onehot[selected].group_metrics.name_part1_novel_in_test.all:
   name_part1_novel_in_test train_r2 train_mae train_mse train_mape  train_count test_r2 test_mae test_mse test_mape  test_count
0                     False     0.91     0.035     0.004      9.998         1408    0.82    0.057    0.007    15.086         342
1                      True     0.98     0.020     0.001      4.467           79    0.63    0.078    0.012    15.856          30

