# Hyperparameter Optimization for ExtraRandomTrees

In [None]:
# import libraries
import os
import sys
import numpy as np
import pandas as pd
# Label used for this model in log messages and in the metrics tables
MODEL_NAME = 'ExtraRandomTrees'
# Single fixed seed: applied to NumPy's global RNG here and passed as
# `random_state` to the estimators and CV splitters further down
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)

In [None]:
# Remember the working directory; later cells build the data directory
# and the `hp_search` output directory relative to it.
current_path = os.getcwd()
# Bare expression: echoes the path as the notebook cell's output.
current_path

In [None]:
# To import the custom module from a specific path:
# put the parent of the working directory at the front of sys.path
# (presumably where the project-local `Utools` package lives — confirm).
sys.path.insert(0, os.path.join(current_path, '../'))
# Importing the custom module
from Utools.SingleModel import SingleModel

In [None]:
from sklearn.base import clone
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

## Load Data

In [None]:
# Load the DFT, experimental (EXP) and mixed datasets from ../Data and split
# each one into a feature matrix (X) and the `band_gap` target (y).
file_dir = os.path.join(current_path, '../Data')


def _read_split(subdir, filename):
    """Read the CSV at ``file_dir/subdir/filename`` into a DataFrame."""
    return pd.read_csv(os.path.join(file_dir, subdir, filename))


def _split_features_target(frame):
    """Return ``(X, y)`` for one dataset.

    X is `frame` without the 'composition' and 'band_gap' columns;
    y is the 'band_gap' column.
    """
    return frame.drop(columns=['composition', 'band_gap']), frame['band_gap']


# dft data
dft_train = _read_split('train_data', 'mp_train_set.csv')
dft_test = _read_split('test_data', 'mp_test_set.csv')
# exp data
exp_train = _read_split('train_data', 'exp_train_set.csv')
exp_test = _read_split('test_data', 'exp_test_set.csv')
# mix test data (read a single time and reused for the X/y split below)
mix_test = _read_split('test_data', 'mix_test_set.csv')

dft_train_X, dft_train_y = _split_features_target(dft_train)
exp_train_X, exp_train_y = _split_features_target(exp_train)
dft_test_X, dft_test_y = _split_features_target(dft_test)
exp_test_X, exp_test_y = _split_features_target(exp_test)
mixed_test_X, mixed_test_y = _split_features_target(mix_test)

## Model Training

### DFT Model

In [None]:
# Randomized search for the best ExtraTrees hyperparameters of the DFT model
print(f"['DFT {MODEL_NAME}'] HP optimazation begin...")

# create a pipeline with a scaler and the model; the `et__` prefix in the
# search space below routes each parameter to the 'et' pipeline step
extra_trees_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('et', ExtraTreesRegressor(random_state=RANDOM_SEED, n_jobs=-1))
])
# NOTE(review): both the estimator and the search request n_jobs=-1 —
# confirm this nesting does not oversubscribe the CPUs on the target machine.

# define the search space for the model (3 * 4 * 4 * 3 * 5 = 720 combinations)
# author's note: cpu = 60 (run time was not recorded)
param_distributions = {
    'et__n_estimators': [100, 300, 500],  # number of trees in the forest
    'et__max_depth': [None, 5, 7, 10],  # maximum depth of the tree, None means nodes are expanded until all leaves are pure
    'et__min_samples_split': [2, 5, 7, 10],  # minimum number of samples required to split an internal node
    'et__min_samples_leaf': [1, 2, 4],    # minimum number of samples required to be at a leaf node
    # number of features to consider when looking for the best split:
    # floats are fractions of the feature count, so 1.0 means all features.
    # It must be the float 1.0 — the int 1 would mean exactly ONE feature per split.
    'et__max_features': ['sqrt', 'log2', 0.25, 0.5, 1.0]
}

# create a KFold object for cross-validation (5 shuffled folds, fixed seed)
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# create a RandomizedSearchCV object that tries n_iter=360 of the 720
# combinations (author's timing note: n_iter=10, time=30s).
# Scoring is negative MSE, so scores closer to zero are better; train scores
# are kept so train and validation error can be compared afterwards.
random_search = RandomizedSearchCV(estimator=extra_trees_pipe, param_distributions=param_distributions,
                n_iter=360, cv=kf, verbose=1, random_state=RANDOM_SEED,
                scoring='neg_mean_squared_error', return_train_score=True, n_jobs=-1)

# fit the model to the training data
random_search.fit(dft_train_X, dft_train_y)

In [None]:
# Report the winning configuration found by the randomized search
best_params = random_search.best_params_
print(f"Best Hyperparameters: {best_params}")
# the search scores with negative MSE, so negate before taking the root
best_score = np.sqrt(np.negative(random_search.best_score_))
print(f"Best Cross-Validation RMSE: {best_score:.4f}")

# Tabulate every evaluated configuration and add RMSE columns derived from
# the mean (negative-MSE) validation and training scores
results = pd.DataFrame(random_search.cv_results_)
results['test_rmse'] = np.sqrt(np.negative(results['mean_test_score']))
results['train_rmse'] = np.sqrt(np.negative(results['mean_train_score']))

# Keep only the rank, the searched parameters and both RMSEs, ordered from
# the lowest to the highest validation RMSE
report_columns = [
    'rank_test_score',
    'param_et__n_estimators',
    'param_et__max_depth',
    'param_et__min_samples_split',
    'param_et__min_samples_leaf',
    'param_et__max_features',
    'train_rmse',
    'test_rmse',
]
results_sorted = results.sort_values(by='test_rmse')[report_columns]

# Persist the table under <current_path>/hp_search (created when missing)
hp_path = os.path.join(current_path, 'hp_search')
os.makedirs(hp_path, exist_ok=True)
dft_csv_path = os.path.join(hp_path, 'ERT_dft_hp.csv')
results_sorted.to_csv(dft_csv_path, index=False)
results_sorted.head()

### Overfitting check

In [None]:
# Overfitting check for the DFT model: refit ExtraTrees on the full DFT
# training set, then compare the error on the training data with the error
# on the DFT and EXP test sets.
# The hyperparameters are hard-coded here (per the original note they are the
# best ones from the random search) rather than read from the search object.
extra_trees_best = ExtraTreesRegressor(
    n_estimators=500, max_depth=None,
    min_samples_split=7, min_samples_leaf=1,
    max_features=0.5,
    random_state=RANDOM_SEED, n_jobs=-1,
)
# empty table that collects one row of metrics per evaluation
metric_columns = ['Model', 'Error_Type', 'Train_set', 'Test_set', 'R²', 'RMSE', 'MAE']
model_metrics = pd.DataFrame(columns=metric_columns)

# Train on the DFT data
print("#" * 100)
print(f"[DFT - {MODEL_NAME}] Training {MODEL_NAME} on DFT data:")
print("-" * 100)

# clone() hands SingleModel an unfitted copy of the configured estimator
dft_model = SingleModel(clone(extra_trees_best), random_state=RANDOM_SEED)
dft_model.train(dft_train_X, dft_train_y)

# (log message, error type, evaluated data set, features, target) for each
# evaluation, in the order they are run: DFT train, DFT test, EXP test
dft_evaluations = [
    (f"[Train_error] Evaluating DFT {MODEL_NAME} on DFT train set:",
     'Train', 'dft', dft_train_X, dft_train_y),
    (f"[Test DFT -> DFT] Evaluating DFT {MODEL_NAME} on DFT test set:",
     'Test', 'dft', dft_test_X, dft_test_y),
    (f"[Test DFT -> EXP] Evaluating DFT {MODEL_NAME} on EXP test set:",
     'Test', 'exp', exp_test_X, exp_test_y),
]
for banner, error_type, eval_set, eval_X, eval_y in dft_evaluations:
    print(banner)
    # evaluate() result is read through its 'r2', 'rmse' and 'mae' keys
    metrics = dft_model.evaluate(eval_X, eval_y)
    # append as the next row of the table
    model_metrics.loc[len(model_metrics)] = {
        'Model': MODEL_NAME,
        'Error_Type': error_type,
        'Train_set': 'dft',
        'Test_set': eval_set,
        'R²': metrics['r2'],
        'RMSE': metrics['rmse'],
        'MAE': metrics['mae'],
    }

In [None]:
# Display the first rows of the metrics table as the notebook cell's output.
model_metrics.head()

### EXP Model

In [None]:
# using grid search to find the best hyperparameters for the exp model
print(f"['EXP {MODEL_NAME}'] HP optimazation begin...")

# create a pipeline with a scaler and the model; the `et__` prefix in the
# grid below routes each parameter to the 'et' pipeline step
extra_trees_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('et', ExtraTreesRegressor(random_state=RANDOM_SEED, n_jobs=-1))
])
# NOTE(review): both the estimator and the search request n_jobs=-1 —
# confirm this nesting does not oversubscribe the CPUs on the target machine.

# define the parameter grid for model (3 * 4 * 4 * 4 * 5 = 960 combinations)
param_distributions = {
    'et__n_estimators': [200, 300, 500],  # number of trees in the forest
    'et__max_depth': [None, 5, 7, 10],  # maximum depth of the tree, None means nodes are expanded until all leaves are pure
    'et__min_samples_split': [2, 4, 5, 8],  # minimum number of samples required to split an internal node
    'et__min_samples_leaf': [1, 2, 3, 4],    # minimum number of samples required to be at a leaf node
    # number of features to consider when looking for the best split:
    # floats are fractions of the feature count, so 1.0 means all features.
    # It must be the float 1.0 — the int 1 would mean exactly ONE feature per split.
    'et__max_features': ['sqrt', 'log2', 0.25, 0.5, 1.0]
}

# create a KFold object for cross-validation (5 shuffled folds, fixed seed)
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
# create a GridSearchCV object: every combination of the grid is evaluated.
# Scoring is negative MSE, so scores closer to zero are better; train scores
# are kept so train and validation error can be compared afterwards.
grid_search = GridSearchCV(estimator=extra_trees_pipe, param_grid=param_distributions,
                cv=kf, verbose=1, scoring='neg_mean_squared_error', return_train_score=True, n_jobs=-1)

# fit the model to the training data
grid_search.fit(exp_train_X, exp_train_y)

In [None]:
# Report the winning configuration found by the grid search
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")
# the search scores with negative MSE, so negate before taking the root
best_score = np.sqrt(np.negative(grid_search.best_score_))
print(f"Best Cross-Validation RMSE: {best_score:.4f}")

# Tabulate every evaluated configuration and add RMSE columns derived from
# the mean (negative-MSE) validation and training scores
results = pd.DataFrame(grid_search.cv_results_)
results['test_rmse'] = np.sqrt(np.negative(results['mean_test_score']))
results['train_rmse'] = np.sqrt(np.negative(results['mean_train_score']))

# Keep only the rank, the searched parameters and both RMSEs, ordered from
# the lowest to the highest validation RMSE
report_columns = [
    'rank_test_score',
    'param_et__n_estimators',
    'param_et__max_depth',
    'param_et__min_samples_split',
    'param_et__min_samples_leaf',
    'param_et__max_features',
    'train_rmse',
    'test_rmse',
]
results_sorted = results.sort_values(by='test_rmse')[report_columns]

# Persist the table under <current_path>/hp_search (created when missing)
hp_path = os.path.join(current_path, 'hp_search')
os.makedirs(hp_path, exist_ok=True)
exp_csv_path = os.path.join(hp_path, 'ERT_exp_hp.csv')
results_sorted.to_csv(exp_csv_path, index=False)
results_sorted.head()

### Overfitting check

In [None]:
# Overfitting check for the EXP model: refit ExtraTrees on the full EXP
# training set, then compare the error on the training data with the error
# on the EXP and DFT test sets.
# The hyperparameters are hard-coded here rather than read from the search
# object (the original note describes them as the best ones found by the search).
extra_trees_best = ExtraTreesRegressor(
    n_estimators=200, max_depth=None,
    min_samples_split=2, min_samples_leaf=1,
    max_features=0.5,
    random_state=RANDOM_SEED, n_jobs=-1,
)
# Rebinding model_metrics starts a fresh, empty table: rows recorded under
# this name by earlier cells are no longer part of it.
metric_columns = ['Model', 'Error_Type', 'Train_set', 'Test_set', 'R²', 'RMSE', 'MAE']
model_metrics = pd.DataFrame(columns=metric_columns)

# Train on the EXP data
print("#" * 100)
print(f"[EXP - {MODEL_NAME}] Training {MODEL_NAME} on EXP data:")
print("-" * 100)

# clone() hands SingleModel an unfitted copy of the configured estimator
exp_model = SingleModel(clone(extra_trees_best), random_state=RANDOM_SEED)
exp_model.train(exp_train_X, exp_train_y)

# (log message, error type, evaluated data set, features, target) for each
# evaluation, in the order they are run: EXP train, EXP test, DFT test
exp_evaluations = [
    (f"[Train_error] Evaluating EXP {MODEL_NAME} on EXP train set:",
     'Train', 'exp', exp_train_X, exp_train_y),
    (f"[Test EXP -> EXP] Evaluating EXP {MODEL_NAME} on EXP test set:",
     'Test', 'exp', exp_test_X, exp_test_y),
    (f"[Test EXP -> DFT] Evaluating EXP {MODEL_NAME} on DFT test set:",
     'Test', 'dft', dft_test_X, dft_test_y),
]
for banner, error_type, eval_set, eval_X, eval_y in exp_evaluations:
    print(banner)
    # evaluate() result is read through its 'r2', 'rmse' and 'mae' keys
    metrics = exp_model.evaluate(eval_X, eval_y)
    # append as the next row of the table
    model_metrics.loc[len(model_metrics)] = {
        'Model': MODEL_NAME,
        'Error_Type': error_type,
        'Train_set': 'exp',
        'Test_set': eval_set,
        'R²': metrics['r2'],
        'RMSE': metrics['rmse'],
        'MAE': metrics['mae'],
    }

In [None]:
# Display the first rows of the metrics table as the notebook cell's output.
model_metrics.head()