# Step 5: Machine Learning

In [1]:
from ml_tools.ensemble_learning import RegressionTreeModels, run_ensemble_pipeline
from paths import MICE_IMPUTED_DATASETS_DIR, VIF_IMPUTED_DATASETS_DIR, ENGINEERED_CSVS_DIR, MODEL_METRICS_DIR, TRAIN_DATASETS_DIR, LOGS_DIR
from helpers.constants import TARGETS
from ml_tools.logger import custom_logger
from ml_tools.utilities import distribute_datasets_by_target, save_dataframe, list_csv_paths

## 1. Distribute datasets by target for training

In [2]:
# Each source directory is paired with the suffix appended to the per-target
# filename, so the three dataset variants can coexist in TRAIN_DATASETS_DIR.
dataset_sources = [
    (ENGINEERED_CSVS_DIR, ""),
    (MICE_IMPUTED_DATASETS_DIR, "_MICE"),
    (VIF_IMPUTED_DATASETS_DIR, "_VIF"),
]

# NOTE(review): the saved filename depends only on the target name and the
# suffix, not on the CSV name. If a source directory ever holds more than one
# CSV, the per-target files would collide -- confirm one CSV per directory.
for source_dir, filename_suffix in dataset_sources:
    for _csv_name, df_path in list_csv_paths(source_dir).items():
        for target_name, df in distribute_datasets_by_target(df_or_path=df_path, target_columns=TARGETS):
            save_dataframe(df=df, save_dir=TRAIN_DATASETS_DIR, filename=target_name + filename_suffix)

🗂️ CSV files found:
	engineered_dataset

💿 Loaded dataset: 'engineered_dataset' with shape: (1220, 119)
✅ Saved dataset: 'capacitymAhg.csv' with shape: (1063, 117)
✅ Saved dataset: 'capacity_retention.csv' with shape: (974, 117)
✅ Saved dataset: 'first_coulombic_efficiency.csv' with shape: (702, 117)
🗂️ CSV files found:
	engineered_dataset_MICE

💿 Loaded dataset: 'engineered_dataset_MICE' with shape: (1220, 119)
✅ Saved dataset: 'capacitymAhg_MICE.csv' with shape: (1063, 117)
✅ Saved dataset: 'capacity_retention_MICE.csv' with shape: (974, 117)
✅ Saved dataset: 'first_coulombic_efficiency_MICE.csv' with shape: (702, 117)
🗂️ CSV files found:
	engineered_dataset_MICE_VIF

💿 Loaded dataset: 'engineered_dataset_MICE_VIF' with shape: (1220, 110)
✅ Saved dataset: 'capacitymAhg_VIF.csv' with shape: (1063, 108)
✅ Saved dataset: 'capacity_retention_VIF.csv' with shape: (974, 108)
✅ Saved dataset: 'first_coulombic_efficiency_VIF.csv' with shape: (702, 108)


## 2. Configure the factory class that creates the models

In [3]:
# Hyperparameters handed to the model factory as keyword arguments.
# Trailing notes name the libraries each entry was annotated for in this
# notebook; entries without a note had no library annotation.
hyperparameters = dict(
    L1_regularization=0,
    L2_regularization=0,
    learning_rate=0.001,
    n_estimators=3000,      # XGBoost, LightGBM
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,   # XGBoost, LightGBM
    min_samples_leaf=50,    # HistGB
    max_iter=1000,          # HistGB
    min_child_weight=3,     # XGBoost
    gamma=1,                # XGBoost
    num_leaves=31,          # LightGBM
    min_data_in_leaf=40,    # LightGBM
)

# Factory object built from the settings above; passed to the training step below.
factory_class = RegressionTreeModels(**hyperparameters)

## 3. Train Ensemble models

In [4]:
# Run the ensemble pipeline over the per-target datasets in TRAIN_DATASETS_DIR
# with the configured factory object; save_dir points at MODEL_METRICS_DIR.
run_ensemble_pipeline(
    datasets_dir=TRAIN_DATASETS_DIR,
    save_dir=MODEL_METRICS_DIR,
    target_columns=TARGETS,
    model_object=factory_class,
    save_model=True,
)

🗂️ CSV files found:
	capacity_retention_VIF
	capacitymAhg
	capacity_retention
	capacity_retention_MICE
	capacitymAhg_MICE
	first_coulombic_efficiency
	capacitymAhg_VIF
	first_coulombic_efficiency_MICE
	first_coulombic_efficiency_VIF

💿 Loaded dataset: 'capacity_retention_VIF' with shape: (974, 108)
	Training model: XGBoost for Target: capacity_retention(%)...
	Training model: LightGBM for Target: capacity_retention(%)...
	Training model: HistGB for Target: capacity_retention(%)...

💿 Loaded dataset: 'capacitymAhg' with shape: (1063, 117)
	Training model: XGBoost for Target: capacity(mAh/g)...
	Training model: LightGBM for Target: capacity(mAh/g)...
	Training model: HistGB for Target: capacity(mAh/g)...

💿 Loaded dataset: 'capacity_retention' with shape: (974, 117)
	Training model: XGBoost for Target: capacity_retention(%)...
	Training model: LightGBM for Target: capacity_retention(%)...
	Training model: HistGB for Target: capacity_retention(%)...

💿 Loaded dataset: 'capacity_retention_