# Training Pipelines

## Setup

In [1]:
import pandas as pd
from train_pipeline import train_model

def print_nested_keys(d, prefix=''):
    """Print the dotted path of every key in a nested dict, depth-first.

    Args:
        d: Object to walk. Anything that is not a dict is ignored, so the
            walk stops at the first non-dict value (lists are not entered).
        prefix: Path of the parent key; empty at the top level.

    Returns:
        None. Paths are written to stdout, one per line, each parent
        before its children.
    """
    if not isinstance(d, dict):
        return
    for name, child in d.items():
        # A falsy prefix means there is no parent path to prepend.
        path = f"{prefix}.{name}" if prefix else name
        print(path)
        print_nested_keys(child, path)

def print_nested_keys_and_values(d, prefix='', filter_key=None):
    """Print every leaf of a nested dict/list structure with its key path.

    Dict keys are joined with "." and list positions are written as
    "[idx]", e.g. "model.group_metrics.year.all" or "runs[0].score".
    Each leaf is printed as the path, a colon, the value on the next
    line, and a trailing blank line.

    Args:
        d: Structure to walk. Dicts and lists are descended into; any
            other object is treated as a leaf and printed.
        prefix: Path accumulated so far; empty at the top level.
        filter_key: When given, only leaves whose full path ends with
            this string are printed. None prints every leaf.

    Returns:
        None. Output goes to stdout.
    """
    if isinstance(d, dict):
        for key, value in d.items():
            # A falsy prefix means there is no parent path to prepend.
            full_key = f"{prefix}.{key}" if prefix else key
            print_nested_keys_and_values(value, full_key, filter_key)
    elif isinstance(d, list):
        for idx, item in enumerate(d):
            print_nested_keys_and_values(item, f"{prefix}[{idx}]", filter_key)
    # str() because a top-level non-string dict key arrives here unformatted.
    elif filter_key is None or str(prefix).endswith(filter_key):
        print(f"{prefix}:\n{d}\n")


In [2]:
# Feature-set name; same value the training cells pass to train_model as feats_name.
feats_name = "reduced_feats"
# Forwarded to train_model as shap_opt by the training cells (the LR cells pass 0 instead).
shap_opt = 1
# Pipeline identifiers trained in this notebook. "xgb_onehot_impute" is the
# name the XGB + impute cell actually trains and stores in results.
model_names = [
    "catb_native",
    "catb_onehot[selected]",
    "catb_onehot_impute",
    "xgb_onehot",
    "xgb_onehot_impute",
    "lr",
    "lr_solidloading",
    "rf",
]
# model name -> value returned by train_model for that model.
results = {}
# Metrics configurations
# NOTE(review): neither constant is referenced in the visible cells; the names
# in FEATURES_TO_ANALYZE match the group_metrics keys printed further down —
# presumably consumed by train_pipeline or later cells, confirm.
FEATURES_TO_ANALYZE = ["material_group", "name_part1", "name_fluid1", "name_part1_novel_in_test", "name_part1_freq_bin", "year"]
COLUMNS_NOT_NULL = ["name_part1", "name_fluid1", "vf_total", "material_group"]

## Catb native

In [3]:
# Train the "catb_native" pipeline and keep its return value under that key.
# feats_name comes from the configuration cell rather than a repeated literal,
# so the feature set is switched in one place.
model_name = "catb_native"
results[model_name] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=shap_opt)

Parsing data...
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: catb_native
----------------------------------------
Selected model: catb_native
Search space: [{'model__n_estimators': [250, 500, 1000], 'model__depth': [8], 'model__eval_metric': ['MAE']}]
Fitting Grid...
CV Results
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__depth param_model__eval_metric  param_model__n_estimators                                             params  split0_test_score  spli

## Catb + onehot

In [4]:
# Train the "catb_onehot[selected]" pipeline and keep its return value under that key.
# feats_name comes from the configuration cell rather than a repeated literal,
# so the feature set is switched in one place.
model_name = "catb_onehot[selected]"
results[model_name] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=shap_opt)

Parsing data...
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: catb_onehot[selected]
----------------------------------------
Selected model: catb_onehot[selected]
Search space: [{'model__n_estimators': [250, 500, 1000], 'model__depth': [8], 'model__eval_metric': ['MAE']}]
Fitting Grid...
CV Results
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__depth param_model__eval_metric  param_model__n_estimators                                             params  spl

## Catb + onehot + impute

In [5]:
# Train the "catb_onehot_impute" pipeline and keep its return value under that key.
# feats_name comes from the configuration cell rather than a repeated literal,
# so the feature set is switched in one place.
model_name = "catb_onehot_impute"
results[model_name] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=shap_opt)

Parsing data...
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: catb_onehot_impute
----------------------------------------
Selected model: catb_onehot_impute
Search space: [{'model__n_estimators': [250, 500, 1000], 'model__depth': [8], 'model__eval_metric': ['MAE']}]
Fitting Grid...
CV Results
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__depth param_model__eval_metric  param_model__n_estimators                                             params  split0_te

## XGB + onehot

In [6]:
# Train the "xgb_onehot" pipeline and keep its return value under that key.
# feats_name comes from the configuration cell rather than a repeated literal,
# so the feature set is switched in one place.
model_name = "xgb_onehot"
results[model_name] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=shap_opt)

Parsing data...
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: xgb_onehot
----------------------------------------
Selected model: xgb_onehot
Search space: [{'model__n_estimators': [50, 100, 250, 500, 1000], 'model__max_depth': [3, 5, 8]}]
Fitting Grid...
CV Results
     mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__max_depth  param_model__n_estimators                                             params  split0_test_score  split1_test_score  split2_test_score  

## XGB + onehot + impute

In [8]:
# Train the "xgb_onehot_impute" pipeline and keep its return value under that key.
# feats_name comes from the configuration cell rather than a repeated literal,
# so the feature set is switched in one place.
model_name = "xgb_onehot_impute"
results[model_name] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=shap_opt)

Parsing data...
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: xgb_onehot_impute
----------------------------------------
Selected model: xgb_onehot_impute
Search space: [{'model__n_estimators': [50, 100, 250, 500, 1000], 'model__max_depth': [3, 5, 8]}]
Fitting Grid...
CV Results
     mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__max_depth  param_model__n_estimators                                             params  split0_test_score  split1_test_score  split

## Random Forest

In [9]:
# Train the "rf" pipeline and keep its return value under that key.
# feats_name comes from the configuration cell rather than a repeated literal,
# so the feature set is switched in one place.
model_name = "rf"
results[model_name] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=shap_opt)

Parsing data...
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: rf
----------------------------------------
Selected model: rf
Search space: [{'model__n_estimators': [50, 100, 200], 'model__max_features': ['log2', 'sqrt', None], 'model__max_depth': [10, 20, 40]}]
Fitting Grid...
CV Results
     mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_model__max_depth param_model__max_features  param_model__n_estimators                                             params  split0_t

## LR

In [10]:
import importlib

import train_pipeline

# Re-execute train_pipeline so edits made to the module after the kernel
# started are picked up.
importlib.reload(train_pipeline)
# reload() does not refresh names bound earlier with
# `from train_pipeline import train_model`; rebind the name so this cell and
# every later cell call the reloaded implementation.
train_model = train_pipeline.train_model

model_name = "lr"
# shap_opt is 0 here instead of the notebook-level shap_opt.
# NOTE(review): presumably this turns SHAP off for this model — confirm in train_pipeline.
results[model_name] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=0)

Parsing data...
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
Model selected: lr
----------------------------------------
Selected model: lr
Search space: [{}]
Fitting Grid...
CV Results
    mean_fit_time  std_fit_time  mean_score_time  std_score_time params  split0_test_score  split1_test_score  split2_test_score  split3_test_score  split4_test_score  mean_test_score  std_test_score  rank_test_score
0        0.02118      0.010409         0.006543        0.000786     {}           0.418302           0.43

## LR (solid loading only)

In [11]:
# Train the "lr_solidloading" pipeline and keep its return value under that key.
# feats_name comes from the configuration cell rather than a repeated literal.
# shap_opt is 0 here instead of the notebook-level shap_opt.
# NOTE(review): presumably this turns SHAP off for this model — confirm in train_pipeline.
model_name = "lr_solidloading"
results[model_name] = train_model(model_name=model_name, feats_name=feats_name, shap_opt=0)


Parsing data...
Using only Solid Loading feature for lr_solidloading model
Material in test but has less than 6 occurrences in train:
Unique materials in test with less than 6 occurrences in train:
name_part1
NiTiNbO            4
Ni                 3
CaSiO3             3
polysiloxane       3
Si                 2
fireclay           2
Zeo                2
LSCF               1
BT                 1
theophylline       1
Ca3ZrSi2O9         1
stainless steel    1
CuO                1
PU                 1
AlN                1
Merwinite          1
Cr2O3              1
CaP                1
Name: count, dtype: int64
lr_solidloading selected. Removing other features except Solid Loading
Model selected: lr_solidloading
----------------------------------------
Selected model: lr_solidloading
Search space: [{}]
Fitting Grid...
CV Results
    mean_fit_time  std_fit_time  mean_score_time  std_score_time params  split0_test_score  split1_test_score  split2_test_score  split3_test_score  split4_test_scor

## ALL metrics

In [12]:
# Outline of what was collected: every nested key path in results, keys only.
print_nested_keys(d=results)

catb_native
catb_native.overall_metrics
catb_native.no_nan_metrics
catb_native.only_nans_metrics
catb_native.group_metrics
catb_native.group_metrics.material_group
catb_native.group_metrics.material_group.topn
catb_native.group_metrics.material_group.all
catb_native.group_metrics.name_part1
catb_native.group_metrics.name_part1.topn
catb_native.group_metrics.name_part1.all
catb_native.group_metrics.name_fluid1
catb_native.group_metrics.name_fluid1.topn
catb_native.group_metrics.name_fluid1.all
catb_native.group_metrics.name_part1_novel_in_test
catb_native.group_metrics.name_part1_novel_in_test.topn
catb_native.group_metrics.name_part1_novel_in_test.all
catb_native.group_metrics.name_part1_freq_bin
catb_native.group_metrics.name_part1_freq_bin.topn
catb_native.group_metrics.name_part1_freq_bin.all
catb_native.group_metrics.year
catb_native.group_metrics.year.topn
catb_native.group_metrics.year.all
catb_native.additonal_metrics
catb_onehot[selected]
catb_onehot[selected].overall_metrics
c

In [13]:
# Full dump: every leaf value in results, each preceded by its key path.
print_nested_keys_and_values(d=results)

catb_native.overall_metrics:
      pipeline  train_r2  train_mae  train_mse  train_mape  test_r2  test_mae  test_mse  test_mape
0  catb_native    -96457         61       4061       10588   -95580        61      4019      10617

catb_native.no_nan_metrics:
      pipeline  count  no_nan_r2  no_nan_mae  no_nan_mse  no_nan_mape
0  catb_native     23    -153741          60        3828        10778

catb_native.only_nans_metrics:
      pipeline  count  only_nans_r2  only_nans_mae  only_nans_mse  only_nans_mape
0  catb_native    349        -93527             61           4031           10606

catb_native.group_metrics.material_group.topn:
    material_group  train_r2  train_mae  train_mse  train_mape  train_count  test_r2  test_mae  test_mse  test_mape  test_count
0          Ceramic         1          0          0          14         1276        1         0         0         16         311
1          Polymer         1          0          0           4           77        1         0         0

## Overall Results

In [14]:
# Gather each model's overall-metrics table, tagging its rows with the
# results key it was stored under. Entries without that table are skipped.
dfs = []
for model, data in results.items():
    if 'overall_metrics' not in data:
        continue
    dfs.append(data['overall_metrics'].assign(model=model))

# Stack the per-model tables, then order by test_r2 with the highest first.
combined_metrics = pd.concat(dfs, ignore_index=True)
sorted_metrics = combined_metrics.sort_values(by='test_r2', ascending=False)
print(sorted_metrics)

                pipeline  train_r2  train_mae  train_mse  train_mape  test_r2  test_mae  test_mse  test_mape                  model
7        lr_solidloading    -92058         61       3876       12708   -93071        61      3913      12769        lr_solidloading
6                     lr    -92781         61       3907       12597   -93354        61      3925      12404                     lr
4      xgb_onehot_impute    -97512         61       4106       10240   -95254        60      4005      10262      xgb_onehot_impute
3             xgb_onehot    -97645         61       4112       10209   -95407        60      4011      10199             xgb_onehot
0            catb_native    -96457         61       4061       10588   -95580        61      4019      10617            catb_native
2     catb_onehot_impute    -97194         61       4093       10311   -95841        61      4030      10383     catb_onehot_impute
1  catb_onehot[selected]    -97157         61       4091       10327   -9600

## By Material Group

In [15]:
# Per model, print only the leaf whose key path ends with the material-group "all" table.
print_nested_keys_and_values(
    d=results,
    filter_key="group_metrics.material_group.all",
)

catb_native.group_metrics.material_group.all:
    material_group  train_r2  train_mae  train_mse  train_mape  train_count test_r2  test_mae  test_mse  test_mape  test_count
0          Ceramic         1          0          0          14         1276       1         0         0         16         311
1  Ceramic/Polymer         1          0          0          11           36       1         0         0         17           9
2            Metal         1          0          0          12           66       1         0         0         19          24
3    Metal/Ceramic         1          0          0          57           27       1         0         0         70          13
4   Pharmaceutical         0          0          0           4            5     nan         0         0          1           1
5          Polymer         1          0          0           4           77       1         0         0          8          14

catb_onehot[selected].group_metrics.material_group.all:
    mate

## By Year

In [16]:
# Per model, print only the leaf whose key path ends with the by-year "all" table.
print_nested_keys_and_values(
    d=results,
    filter_key="group_metrics.year.all",
)

catb_native.group_metrics.year.all:
      year  train_r2  train_mae  train_mse  train_mape  train_count  test_r2  test_mae  test_mse  test_mape  test_count
0   2001.0         1          0          0          23           26        1         0         0          9           4
1   2002.0         1          0          0           3            9        0         0         0         14           4
2   2004.0         1          0          0         130           19        1         0         0         20           4
3   2005.0         1          0          0          18           15        1         0         0         16           3
4   2006.0         1          0          0          30           28        1         0         0         44           4
5   2007.0         1          0          0          38           75        1         0         0         28          20
6   2008.0         1          0          0           7           88        1         0         0         37          18
7   