# Import Library

In [1]:
import src.util as util
import numpy as np
import pandas as pd
import copy
import hashlib
import json
import warnings

from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import tree
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

from sklearn.model_selection import KFold, cross_val_score

warnings.filterwarnings('ignore')

In [2]:
config_data = util.load_config()

## <b>1. LOAD DATASET</b>
-----

In [4]:
def load_train_clean(config_data: pd.DataFrame) -> pd.DataFrame:
        X = util.pickle_load(config_data["train_set_clean"][0])
        y = util.pickle_load(config_data["train_set_clean"][1])

        return X, y
    
def load_valid_clean(config_data: pd.DataFrame) -> pd.DataFrame:
        X = util.pickle_load(config_data["valid_set_clean"][0])
        y = util.pickle_load(config_data["valid_set_clean"][1])

        return X, y

def load_test_clean(config_data: pd.DataFrame) -> pd.DataFrame:
        X = util.pickle_load(config_data["test_set_clean"][0])
        y = util.pickle_load(config_data["test_set_clean"][1])

        return X, y

In [5]:
X_train, y_train = load_train_clean(config_data)
X_valid, y_valid = load_valid_clean(config_data)
X_test, y_test = load_test_clean(config_data)

    Check for X_valid has same configuration and features as X_train

In [6]:
for config in X_train:
    if X_train[config].shape == X_valid[config].shape:
        if all(X_train[config].columns == X_valid[config].columns):
            print(f"X_valid {config} has the same configuration and features as X_train {config}.")
        else:
            print(f"X_valid {config} has different column names compared to X_train {config}.")
    else:
        print(f"X_valid {config} has a different shape compared to X_train {config}.")

X_valid filter has a different shape compared to X_train filter.
X_valid lasso has a different shape compared to X_train lasso.
X_valid rf has a different shape compared to X_train rf.


Check does mean of model could be enough to predict data.

<b>Reference:</b> <br>
[Metrics](https://coderzcolumn.com/tutorials/machine-learning/model-evaluation-scoring-metrics-scikit-learn-sklearn)

## <b> 2. Creating Training Log Template</b>

In [7]:
util.print_debug('Create training log template')

2023-07-14 22:45:26.050143 Create training log template


In [22]:
## Create training log function
def training_log_template() -> dict:
    # Debug message
    util.print_debug("creating training log template")

    # Template for training Log
    logger = {
        "model_name": [],
        "model_uid": [],
        "training_time": [],
        "training_date": [],
        "mse": [],
        "r2_score": [],
    }

    # Debug message
    util.print_debug("Training log template created")

    return logger

In [9]:
def training_log_updater(current_log: dict, params: dict) -> list:
    # create copy of current log
    current_log = copy.deepcopy(current_log)

    # Path for training log file
    log_path = params["training_log_path"]

    # Try to load training log file
    try:
        with open(log_path, "r") as file:
            last_log = json.load(file)
        file.close()

    # If file not found create a new one
    except FileNotFoundError as fe:
        with open(log_path, "w") as file:
            file.write("[]")
        file.close()

        with open(log_path, "r") as file:
            last_log = json.load(file)
        file.close()

    # Add current log to previous log
    last_log.append(current_log)

    # Save updated log
    with open(log_path, "w") as file:
        json.dump(last_log, file)
        file.close()

    # Return log
    return last_log

In [24]:
## Create model object of ML model
def create_model_object(params: dict) -> list:
    # Debug message
    util.print_debug("Creating model objects.")

    # Create model objects
    baseline_knn = KNeighborsRegressor()
    baseline_dt = DecisionTreeRegressor()
    baseline_lr = LinearRegression()
    baseline_rf = RandomForestRegressor()
    baseline_ada = AdaBoostRegressor()
    baseline_gr = GradientBoostingRegressor()
    baseline_xgb = XGBRegressor()

    # Create list of model
    list_of_model = [
        {"model_name": baseline_knn.__class__.__name__, "model_object": baseline_knn, "model_uid": ""},
        {"model_name": baseline_dt.__class__.__name__, "model_object": baseline_dt, "model_uid": ""},
        {"model_name": baseline_lr.__class__.__name__, "model_object": baseline_lr, "model_uid": ""},
        {"model_name": baseline_rf.__class__.__name__, "model_object": baseline_rf, "model_uid": ""},
        {"model_name": baseline_ada.__class__.__name__, "model_object": baseline_ada, "model_uid": ""},
        {"model_name": baseline_gr.__class__.__name__, "model_object": baseline_gr, "model_uid": ""},
        {"model_name": baseline_xgb.__class__.__name__, "model_object": baseline_xgb, "model_uid": ""},
    ]

    # Debug message
    util.print_debug("Models object created")

    return list_of_model
    

## **3. Creating Baseline Modeling**

    Type I : List of model per config
    Use this to Create List of Model per Config (Filter, Lasso, Random Forest)

In [11]:
def train_eval(configuration_model: str, params: dict, hyperparams_model: list = None, data: str = 'filter'):

    # Variable to store trained models
    list_of_trained_model = dict()

    # Create training log template
    training_log = training_log_template()

    # Training for every data configuration
    util.print_debug("Training model based on current dataset")

    if hyperparams_model == None:
        list_of_model = create_model_object(params)
    else:
        list_of_model = copy.deepcopy(hyperparams_model)

    # Variable to store trained model
    trained_model = list()

    # Train each model by current dataset
    for model in list_of_model:
        # Debug message
        util.print_debug("Training model: {}".format(model["model_name"]))

        # Training
        training_time = util.time_stamp()
        model["model_object"].fit(X_train[data], y_train[data])
        training_time = (util.time_stamp() - training_time).total_seconds()

        # Debug message
        util.print_debug("Evaluating model: {}".format(model["model_name"]))

        # Evaluation
        y_predict = model["model_object"].predict(X_valid[data])
        mse = mean_squared_error(y_valid[data], y_predict, squared=False)
        r2 = r2_score(y_valid[data], y_predict)

        # Debug message
        util.print_debug("Logging: {}".format(model["model_name"]))

        # Create UID
        uid = hashlib.md5(str(training_time).encode()).hexdigest()

        model["model_uid"] = uid

        # Create training log data
        training_log["model_name"].append("{}--{}".format(configuration_model, model["model_name"]))
        training_log["model_uid"].append(uid)
        training_log["training_time"].append(training_time)
        training_log["training_date"].append(util.time_stamp())
        training_log["mse"].append(mse)
        training_log["r2_score"].append(r2)

        # Collenct current trained model
        trained_model.append(copy.deepcopy(model))

        # Debug Message
        util.print_debug("Model {} has been trained".format(model["model_name"]))

    # Debug message
    util.print_debug("All combination models and data has been trained.")


    return trained_model, training_log

    Type II : List of Model All
    Use this function to create model by ignoring config of data,
    This notebook use this function instead of above code

In [29]:
def train_eval(configuration_model: str, params: dict, hyperparams_model: list = None):

    # Variable to store trained models
    list_of_trained_model = dict()

    # Create training log template
    training_log = training_log_template()

    for config in X_train:
        # Debug message
        util.print_debug("Training model based on configuration data: {}".format(config))

        if hyperparams_model == None:
            list_of_model = create_model_object(params)
        else:
            list_of_model = copy.deepcopy(hyperparams_model)

        # Variable to store trained model
        trained_model = list()

        X_train_data = X_train[config]
        y_train_data = y_train[config]
        X_valid_data = X_valid[config]
        y_valid_data = y_valid[config]

        # Train each model by current dataset
        for model in list_of_model:
            # Debug message
            util.print_debug("Training model: {}".format(model["model_name"]))

            # Training
            training_time = util.time_stamp()
            model["model_object"].fit(X_train_data, y_train_data)
            training_time = (util.time_stamp() - training_time).total_seconds()

            # Debug message
            util.print_debug("Evaluating model: {}".format(model["model_name"]))

            # Evaluation
            y_predict = model["model_object"].predict(X_valid_data)
            mse = mean_squared_error(y_valid_data, y_predict, squared=False)
            r2 = r2_score(y_valid_data, y_predict)

            # Debug message
            util.print_debug("Logging: {}".format(model["model_name"]))

            # Create UID
            uid = hashlib.md5(str(training_time).encode()).hexdigest()

            model["model_uid"] = uid

            # Create training log data
            training_log["model_name"].append("{}-{}-{}".format(configuration_model, config, model["model_name"]))
            training_log["model_uid"].append(uid)
            training_log["training_time"].append(training_time)
            training_log["training_date"].append(util.time_stamp())
            training_log["mse"].append(mse)
            training_log["r2_score"].append(r2)

            # Collenct current trained model
            trained_model.append(copy.deepcopy(model))

            # Debug Message
            util.print_debug("Model {} has been trained".format(model["model_name"]))
            util.print_debug("-"*40)

        # Collect current trained list of model
        list_of_trained_model[config] = copy.deepcopy(trained_model)
        util.print_debug("="*40)

    # Debug message
    util.print_debug("All combination models and data has been trained.")


    return list_of_trained_model, training_log

In [30]:
list_of_trained_model, training_log = train_eval("baseline", config_data)

2023-07-14 23:11:15.007524 creating training log template
2023-07-14 23:11:15.008591 Training log template created
2023-07-14 23:11:15.008591 Training model based on configuration data: filter
2023-07-14 23:11:15.008591 Creating model objects.
2023-07-14 23:11:15.008591 Models object created
2023-07-14 23:11:15.008591 Training model: KNeighborsRegressor
2023-07-14 23:11:15.017689 Evaluating model: KNeighborsRegressor


2023-07-14 23:11:16.012186 Logging: KNeighborsRegressor
2023-07-14 23:11:16.015497 Model KNeighborsRegressor has been trained
2023-07-14 23:11:16.015497 ----------------------------------------
2023-07-14 23:11:16.015497 Training model: DecisionTreeRegressor
2023-07-14 23:11:16.551140 Evaluating model: DecisionTreeRegressor
2023-07-14 23:11:16.557724 Logging: DecisionTreeRegressor
2023-07-14 23:11:16.558752 Model DecisionTreeRegressor has been trained
2023-07-14 23:11:16.558752 ----------------------------------------
2023-07-14 23:11:16.558752 Training model: LinearRegression
2023-07-14 23:11:16.584584 Evaluating model: LinearRegression
2023-07-14 23:11:16.589587 Logging: LinearRegression
2023-07-14 23:11:16.589587 Model LinearRegression has been trained
2023-07-14 23:11:16.589587 ----------------------------------------
2023-07-14 23:11:16.589587 Training model: RandomForestRegressor
2023-07-14 23:11:50.640981 Evaluating model: RandomForestRegressor
2023-07-14 23:11:50.809356 Logging

## <b>4. Save Training Log and Production Model</b>

In [31]:
def get_production_model(list_of_model, training_log, params):
    # Create copy list of model
    list_of_model = copy.deepcopy(list_of_model)
    
    # Debug message
    util.print_debug("Choosing model by metrics score.")

    # Create required predefined variabel
    curr_production_model = None
    prev_production_model = None
    production_model_log = None

    # Debug message
    util.print_debug("Converting training log type of data from dict to dataframe.")

    # Convert dictionary to pandas for easy operation
    training_log = pd.DataFrame(copy.deepcopy(training_log))

    # Debug message
    util.print_debug("Trying to load previous production model.")

    # Check if there is a previous production model
    try:
        prev_production_model = util.pickle_load(params["production_model_path"])
        util.print_debug("Previous production model loaded.")

    except FileNotFoundError as fe:
        util.print_debug("No previous production model detected, choosing best model only from current trained model.")

    # If previous production model detected:
    if prev_production_model != None:
        # Debug message
        util.print_debug("Loading validation data.")
        X_valid['filter'], y_valid['filter']
        
        # Debug message
        util.print_debug("Checking compatibilty previous production model's input with current train data's features.")

        # Check list features of previous production model and current dataset
        production_model_features = set(prev_production_model["model_data"]["model_object"].feature_names_in_)
        current_dataset_features = set(X_valid['filter'].columns)
        number_of_different_features = len((production_model_features - current_dataset_features) | (current_dataset_features - production_model_features))

        # If feature matched:
        if number_of_different_features == 0:
            # Debug message
            util.print_debug("Features compatible.")

            # Debug message
            util.print_debug("Reassesing previous model performance using current validation data.")

            # Re-predict previous production model to provide valid metrics compared to other current models
            y_pred = prev_production_model["model_data"]["model_object"].predict(X_valid['filter'])

            # Re-asses prediction result
            eval_res = mean_squared_error(y_valid['filter'], y_pred, squared = False)
            eval_r2 = r2_score(y_valid['filter'], y_pred)

            # Debug message
            util.print_debug("Assessing complete.")

            # Debug message
            util.print_debug("Storing new metrics data to previous model structure.")

            # Update their performance log
            prev_production_model["model_log"]["mse"] = eval_res
            prev_production_model["model_log"]["r2_score"] = eval_r2

            # Debug message
            util.print_debug("Adding previous model data to current training log and list of model")

            # Added previous production model log to current logs to compere who has the greatest f1 score
            training_log = pd.concat([training_log, pd.DataFrame([prev_production_model["model_log"]])])

            # Added previous production model to current list of models to choose from if it has the greatest f1 score
            list_of_model["prev_production_model"] = [copy.deepcopy(prev_production_model["model_data"])]
        else:
            # To indicate that we are not using previous production model
            prev_production_model = None

            # Debug message
            util.print_debug("Different features between production model with current dataset is detected, ignoring production dataset.")

    # Debug message
    util.print_debug("Sorting training log by f1 macro avg and training time.")

    # Sort training log by f1 score macro avg and trining time
    best_model_log = training_log.sort_values(["mse", "training_time"], ascending = [True, True]).iloc[0]
    
    # Debug message
    util.print_debug("Searching model data based on sorted training log.")

    # Get model object with greatest f1 score macro avg by using UID
    for configuration_data in list_of_model:
        for model_data in list_of_model[configuration_data]:
            if model_data["model_uid"] == best_model_log["model_uid"]:
                curr_production_model = dict()
                curr_production_model["model_data"] = copy.deepcopy(model_data)
                curr_production_model["model_log"] = copy.deepcopy(best_model_log.to_dict())
                curr_production_model["model_log"]["model_name"] = "Filter-{}".format(curr_production_model["model_data"]["model_name"])
                curr_production_model["model_log"]["training_date"] = str(curr_production_model["model_log"]["training_date"])
                production_model_log = training_log_updater(curr_production_model["model_log"], params)
                break
    
    # In case UID not found
    if curr_production_model == None:
        raise RuntimeError("The best model not found in your list of model.")
    
    # Debug message
    util.print_debug("Model chosen.")

    # Dump chosen production model
    util.pickle_dump(curr_production_model, params["production_model_path"])
    
    # Return current chosen production model, log of production models and current training log
    return curr_production_model, production_model_log, training_log

In [32]:
model, production_model_log, training_logs = get_production_model(list_of_trained_model, training_log, config_data)

2023-07-14 23:14:46.098924 Choosing model by metrics score.
2023-07-14 23:14:46.100181 Converting training log type of data from dict to dataframe.
2023-07-14 23:14:46.103225 Trying to load previous production model.
2023-07-14 23:14:46.192159 Previous production model loaded.
2023-07-14 23:14:46.192159 Loading validation data.
2023-07-14 23:14:46.192159 Checking compatibilty previous production model's input with current train data's features.
2023-07-14 23:14:46.192159 Features compatible.
2023-07-14 23:14:46.192159 Reassesing previous model performance using current validation data.
2023-07-14 23:14:46.359175 Assessing complete.
2023-07-14 23:14:46.360198 Storing new metrics data to previous model structure.
2023-07-14 23:14:46.360198 Adding previous model data to current training log and list of model
2023-07-14 23:14:46.387205 Sorting training log by f1 macro avg and training time.
2023-07-14 23:14:46.389128 Searching model data based on sorted training log.
2023-07-14 23:14:46.42

## <b>5. Model Performance</b>

In [34]:
pd.set_option('display.max_colwidth', None)
# pd.reset_option('display.max_colwidth')
training_logs[["model_name", "mse", "r2_score", "training_time"]].sort_values(["mse", "r2_score","training_time"], 
                                                                               ascending=[True, True, True])\
                                                                                   .head(10)

Unnamed: 0,model_name,mse,r2_score,training_time
0,Filter-RandomForestRegressor,1.008912,0.99888,21.635677
3,baseline-filter-RandomForestRegressor,1.031461,0.998829,34.051394
10,baseline-lasso-RandomForestRegressor,1.111917,0.99864,26.013328
6,baseline-filter-XGBRegressor,1.266875,0.998234,3.025813
8,baseline-lasso-DecisionTreeRegressor,1.308961,0.998115,0.425583
13,baseline-lasso-XGBRegressor,1.378939,0.997908,2.395559
1,baseline-filter-DecisionTreeRegressor,1.763415,0.996578,0.535643
12,baseline-lasso-GradientBoostingRegressor,19.642598,0.575457,10.488397
5,baseline-filter-GradientBoostingRegressor,20.202004,0.550931,12.755331
0,baseline-filter-KNeighborsRegressor,24.25241,0.352807,0.009098


Based on the baseline model evaluation, the <b> <i>Filter method</i> applied on <mark><i>Random Forest Regression</i> appears to be the best model. </mark></b> <br> 
However, it is worth noting that <b>this model takes <mark>more time for predictions</mark>.</b> If training time is a significant consideration, alternative methods such as <b><i>Lasso Method</i> on <i>Decision Tree Regression</i> or <i>XGBoost Regressor</i> </b> could be viable options.

It is important to mention that <b><i>Decision Tree Regression</i> may result in a high variance model, potentially leading to overfitting.</b> To assess the model's performance on the test set, further evaluation should be conducted. Nevertheless, <i>Decision Tree</i> models are relatively easier to interpret due to their inherent structure.

On the other hand, if <b>the objective is to minimize error within a <mark>shorter amount of time, <i>XGBoost Regression</i> is the recommended choice</mark></b>. However, it is worth noting that XGBoost models are generally <b>more complex and can be more challenging to interpret.</b>

Ultimately, the choice of the model depends on the specific requirements and trade-offs between factors such as <b>accuracy, interpretability, training time, and ease of use.</b>

## <b>6. Cross Validation Score</b>

In [35]:
list_of_model = create_model_object(config_data)

2023-07-14 23:16:03.861509 Creating model objects.
2023-07-14 23:16:03.862016 Models object created


<b>Cross Validation score (CVS)</b>
----

<mark><b>CVS</mark> is performed to understand the distribution of data that we can be sure out model generalises well accross the whole dataset and not just a single portion.</b> <br>

    How do we now that single dataset is representative?

Cross Val Score train and test our model on <mark><b>multiple folds</b></mark> and give a better understanding of model performance over the whole dataset instead of just a single train/test split. <br>

If we see that the metrics for all folds cvs are significant differences between them then this <b>may indicate over-fitting to certain folds.</b> <br>

<b>!! Scoring: </b> <br>
----

<b><i>neg_mean_squared_error</i></b> always <mark><b>return negative (-)</b></mark>, it because cross_val_score function will <b>return maximize value as sign higher is better, the cross_val_score will turn it into negative (-), </b><br>
hence, cross_val_score will return the smaller value.

<b>As example of,</b> <br>
MSE Score 5 is better than 9. <br>
Cross val score will return the higher which is 9. <br>
As of that, cross_val_score function will turn it into -5 and -9, and <br>
cross_val_score will return -5 as the higher value. <br>

**Reference:**<br>
[cross_val_score](https://scikit-learn.org/stable/modules/model_evaluation.html)

In [36]:
model_object = []
model_name = []

for model in list_of_model:
    model_object.append(model["model_object"])
    model_name.append(model["model_name"])

cv = KFold(n_splits=5)

for index, model in enumerate(model_object):
    cvs = cross_val_score(estimator=model, X=X_train['filter'], 
                          y=y_train['filter'], 
                          cv=cv, 
                          scoring='neg_root_mean_squared_error')
    mean = np.round(cvs.mean(), 3)
    std = np.round(cvs.std(), 3)
    print(f"cross validation score for the model {model_name[index]} is {np.abs(mean)} +/- {std}.")



cross validation score for the model KNeighborsRegressor is -25.007 +/- 0.246.
cross validation score for the model DecisionTreeRegressor is -1.34 +/- 0.164.
cross validation score for the model LinearRegression is -29.54 +/- 0.18.
cross validation score for the model RandomForestRegressor is -1.039 +/- 0.138.
cross validation score for the model AdaBoostRegressor is -28.665 +/- 0.224.
cross validation score for the model GradientBoostingRegressor is -19.941 +/- 0.24.
cross validation score for the model XGBRegressor is -1.3 +/- 0.089.


    Cross Val Score show Decision Tree Regressor, Random Forest Regressor, and XGBoost Regressor indicates the dataset is generalised.

## <b>7. Perform Hyperparameter for Model</b>

### Create Param Distribution

Define Best parameter for each model: 

In [45]:
def create_dist_params(model_name:str) -> dict:
    dist_params_dt = {
        "criterion" : ['squared_error', 'absolute_error'],
        "min_samples_split" : np.arange(3,10,1).tolist(),
        "min_samples_leaf" : np.arange(2,10,1).tolist(),
        "max_depth" : np.arange(50,501,50).tolist(),
        "random_state" : [42]
    }

    dist_params_rf = {
        "n_estimators" : np.arange(100,500,100).tolist(),
        "criterion" : ["squared_error", "absolute_error"],
        "max_depth" : np.arange(2,11,1).tolist(),
        "min_samples_split" : np.arange(1,10,1).tolist(),
        "random_state" : [42]
    }

    dist_params_xgb = {
        "eta" : np.arange(0.1,1,0.1).tolist(),
        "max_depth" : np.arange(1,11,1).tolist(),
        "alpha" : np.arange(1,10,1).tolist(),
        "random_state" : [42]
    }

    dist_params = {
        "DecisionTreeRegressor" : dist_params_dt,
        "RandomForestRegressor" : dist_params_rf,
        "XGBRegressor" : dist_params_xgb
    }

    return dist_params[model_name]

### Create Hyper Param Function into one dict

In [37]:
def hyper_params_tuning(model: list) -> list:
    # Create copy of current best baseline model
    model_list = []
    trained_model = [DecisionTreeRegressor(),
                     RandomForestRegressor(),
                     XGBRegressor()]
    
    for col, mod in list(zip(model, trained_model)):
        dist_params = create_dist_params(col)
        model_rsc = RandomizedSearchCV(
            estimator = mod,
            param_distributions = dist_params,
            cv = cv,
            scoring = 'neg_root_mean_squared_error',
            n_jobs=-1
        )
        
        model = {
            "model_name" : col,
            "model_object" : model_rsc,
            "model_uid" : ""
        }

        model_list.append(model.copy())

    return model_list

In [46]:
list_model_ = hyper_params_tuning(['DecisionTreeRegressor',
                                   'RandomForestRegressor',
                                   'XGBRegressor'])

list_model_

[{'model_name': 'DecisionTreeRegressor',
  'model_object': RandomizedSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
                     estimator=DecisionTreeRegressor(), n_jobs=-1,
                     param_distributions={'criterion': ['squared_error',
                                                        'absolute_error'],
                                          'max_depth': [50, 100, 150, 200, 250,
                                                        300, 350, 400, 450, 500],
                                          'min_samples_leaf': [2, 3, 4, 5, 6, 7,
                                                               8, 9],
                                          'min_samples_split': [3, 4, 5, 6, 7, 8,
                                                                9],
                                          'random_state': [42]},
                     scoring='neg_root_mean_squared_error'),
  'model_uid': ''},
 {'model_name': 'RandomForestRegressor',
 

    Copy list of trained model

In [47]:
list__model = copy.deepcopy(list_of_trained_model)

    Update list of trained model with hyperparams model

In [48]:
for config in list__model:
    for l in (list_model_[0], list_model_[1], list_model_[2]):
        list__model[config].append(l)

In [49]:
for model in list__model['filter']:
    print(model['model_name'])

KNeighborsRegressor
DecisionTreeRegressor
LinearRegression
RandomForestRegressor
AdaBoostRegressor
GradientBoostingRegressor
XGBRegressor
DecisionTreeRegressor
RandomForestRegressor
XGBRegressor


    Check the parameter distribution we added into list__model

In [60]:
list__model['filter'][-3:]

[{'model_name': 'DecisionTreeRegressor',
  'model_object': RandomizedSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
                     estimator=DecisionTreeRegressor(), n_jobs=-1,
                     param_distributions={'criterion': ['squared_error',
                                                        'absolute_error'],
                                          'max_depth': [50, 100, 150, 200, 250,
                                                        300, 350, 400, 450, 500],
                                          'min_samples_leaf': [2, 3, 4, 5, 6, 7,
                                                               8, 9],
                                          'min_samples_split': [3, 4, 5, 6, 7, 8,
                                                                9],
                                          'random_state': [42]},
                     scoring='neg_root_mean_squared_error'),
  'model_uid': ''},
 {'model_name': 'RandomForestRegressor',
 

### <b> Run Hyperparamater Model </b>

<b><mark>!! Be Aware this function takes long times to finish </b><mark>

In [61]:
def train_eval_hyper(configuration_model: str, params: dict, hyperparams_model: list = None, log_path: str = None):

    # Variable to store trained models
    list_of_trained_model = dict()

    # Create training log template
    training_log = training_log_template()

    for config in X_train:
        # Debug message
        util.print_debug("Training model based on configuration data: {}".format(config))

        if hyperparams_model == None:
            list_of_model = create_model_object(params)
        else:
            ## list of hyperparams is three models behind (look list__model[config])
            list_of_model = copy.deepcopy(hyperparams_model[config][-3:])

        # Variable to store trained model
        trained_model = list()

        X_train_data = X_train[config]
        y_train_data = y_train[config]
        X_valid_data = X_valid[config]
        y_valid_data = y_valid[config]

        # Train each model by current dataset
        for model in list_of_model:
            # Debug message
            util.print_debug("Training model: {}".format(model["model_name"]))

            # Training
            training_time = util.time_stamp()
            model["model_object"].fit(X_train_data, y_train_data)
            training_time = (util.time_stamp() - training_time).total_seconds()

            # Debug message
            util.print_debug("Evaluating model: {}".format(model["model_name"]))

            # Evaluation
            y_predict = model["model_object"].predict(X_valid_data)
            mse = mean_squared_error(y_valid_data, y_predict, squared=False)
            r2 = r2_score(y_valid_data, y_predict)

            # Debug message
            util.print_debug("Logging: {}".format(model["model_name"]))

            # Create UID
            uid = hashlib.md5(str(training_time).encode()).hexdigest()

            model["model_uid"] = uid

            # Create training log data
            training_log["model_name"].append("{}-{}-{}".format(configuration_model, config, model["model_name"]))
            training_log["model_uid"].append(uid)
            training_log["training_time"].append(training_time)
            training_log["training_date"].append(util.time_stamp())
            training_log["mse"].append(mse)
            training_log["r2_score"].append(r2)

            # Collenct current trained model
            trained_model.append(copy.deepcopy(model))

            # Debug Message
            util.print_debug("Model {} has been trained".format(model["model_name"]))

        # Collect current trained list of model
        list_of_trained_model[config] = copy.deepcopy(trained_model)

    # Debug message
    util.print_debug("All combination models and data has been trained.")

    if log_path == None:
        training_log_ = training_log
    else:
        training_log_ = training_log_updater(training_log, log_path)

    return list_of_trained_model, training_log_

In [62]:
list_of_hyperparam_model, training_log_ = train_eval_hyper("hyperparam", 
                                                           config_data, 
                                                           list__model, 
                                                           config_data['training_log_path'])

2023-07-15 00:20:37.162309 creating training log template
2023-07-15 00:20:37.162309 Training log template created
2023-07-15 00:20:37.162309 Training model based on configuration data: filter
2023-07-15 00:20:37.163818 Training model: DecisionTreeRegressor
2023-07-15 00:29:44.269418 Evaluating model: DecisionTreeRegressor
2023-07-15 00:29:44.272417 Logging: DecisionTreeRegressor
2023-07-15 00:29:44.274496 Model DecisionTreeRegressor has been trained
2023-07-15 00:29:44.274496 Training model: RandomForestRegressor


## <b>8. Try the MODEL to Test Dataset</b>

In [None]:
def train_eval(configuration_model: str, params: dict, hyperparams_model: list = None,
               X_t: pd.DataFrame = pd.DataFrame(), y_t: pd.DataFrame = pd.DataFrame()):

    # Variable to store trained models
    list_of_trained_model = dict()

    # Create training log template
    training_log = training_log_template()

    for config in X_train:
        # Debug message
        util.print_debug("Training model based on configuration data: {}".format(config))

        if hyperparams_model == None:
            list_of_model = create_model_object(params)
        else:
            list_of_model = copy.deepcopy(hyperparams_model)

        # Variable to store trained model
        trained_model = list()

        X_train_data = X_train[config]
        y_train_data = y_train[config]
        X_valid_data = X_t[config]
        y_valid_data = y_t[config]

        # Train each model by current dataset
        for model in list_of_model:
            # Debug message
            util.print_debug("Training model: {}".format(model["model_name"]))

            # Training
            training_time = util.time_stamp()
            model["model_object"].fit(X_train_data, y_train_data)
            training_time = (util.time_stamp() - training_time).total_seconds()

            # Debug message
            util.print_debug("Evaluating model: {}".format(model["model_name"]))

            # Evaluation
            y_predict = model["model_object"].predict(X_valid_data)
            mse = mean_squared_error(y_valid_data, y_predict, squared=False)
            r2 = r2_score(y_valid_data, y_predict)

            # Debug message
            util.print_debug("Logging: {}".format(model["model_name"]))

            # Create UID
            uid = hashlib.md5(str(training_time).encode()).hexdigest()

            model["model_uid"] = uid

            # Create training log data
            training_log["model_name"].append("{}-{}-{}".format(configuration_model, config, model["model_name"]))
            training_log["model_uid"].append(uid)
            training_log["training_time"].append(training_time)
            training_log["training_date"].append(util.time_stamp())
            training_log["mse"].append(mse)
            training_log["r2_score"].append(r2)

            # Collenct current trained model
            trained_model.append(copy.deepcopy(model))

            # Debug Message
            util.print_debug("Model {} has been trained".format(model["model_name"]))
            util.print_debug("-"*40)

        # Collect current trained list of model
        list_of_trained_model[config] = copy.deepcopy(trained_model)
        util.print_debug("="*40)

    # Debug message
    util.print_debug("All combination models and data has been trained.")


    return list_of_trained_model, training_log

In [53]:
list_of_test_model, testing_log = train_eval("baseline", config_data, 
                                             X_t=X_test, 
                                             y_t=y_test)

2023-07-14 23:28:14.217449 creating training log template
2023-07-14 23:28:14.217449 Training log template created
2023-07-14 23:28:14.217449 Training model based on configuration data: filter
2023-07-14 23:28:14.217449 Creating model objects.
2023-07-14 23:28:14.217449 Models object created
2023-07-14 23:28:14.217449 Training model: KNeighborsRegressor
2023-07-14 23:28:14.221955 Evaluating model: KNeighborsRegressor
2023-07-14 23:28:15.085797 Logging: KNeighborsRegressor
2023-07-14 23:28:15.089801 Model KNeighborsRegressor has been trained
2023-07-14 23:28:15.089801 Training model: DecisionTreeRegressor
2023-07-14 23:28:15.682975 Evaluating model: DecisionTreeRegressor
2023-07-14 23:28:15.692243 Logging: DecisionTreeRegressor
2023-07-14 23:28:15.692753 Model DecisionTreeRegressor has been trained
2023-07-14 23:28:15.692753 Training model: LinearRegression
2023-07-14 23:28:15.723696 Evaluating model: LinearRegression
2023-07-14 23:28:15.730756 Logging: LinearRegression
2023-07-14 23:28

### <b>Show Best Performance Model </b>

In [54]:
pd.DataFrame(testing_log)[['model_name',
                           'training_time',
                           'mse',
                           'r2_score']]\
.sort_values(['mse', 'r2_score', 'training_time'],
             ascending=[True,True,True])\
.head(5)

Unnamed: 0,model_name,training_time,mse,r2_score
10,baseline-lasso-RandomForestRegressor,29.038199,0.817855,0.999256
3,baseline-filter-RandomForestRegressor,37.377706,0.962922,0.998969
8,baseline-lasso-DecisionTreeRegressor,0.463786,0.9684,0.998957
13,baseline-lasso-XGBRegressor,2.130459,1.115612,0.998616
6,baseline-filter-XGBRegressor,3.24182,1.16849,0.998482


## <b> Check Final Model </b>

In [59]:
final_model = util.pickle_load(config_data["production_model_path"])

final_model

{'model_data': {'model_name': 'RandomForestRegressor',
  'model_object': RandomForestRegressor(),
  'model_uid': '7fb91c0e0bf0ac7694a7dfbf32a743ae'},
 'model_log': {'model_name': 'Filter-RandomForestRegressor',
  'model_uid': '7fb91c0e0bf0ac7694a7dfbf32a743ae',
  'training_time': 21.635677,
  'training_date': '2023-07-14 22:49:32.996386',
  'mse': 1.0089119124699573,
  'r2_score': 0.9988799674630775,
  'rmse': 1.0179032471237865}}

<b>Best Model</b><br>
-----------

Best model performance based on <b>validation data is <mark><i>Random Forest Regressor</i></mark> on <i>Filter Data Configuration</i>,</b><br>
Show up with <b>MSE Score = 1.023</b> and <b>R2_Score = 0.998</b> <br>
However, it defent on <b>training time: 47.97s</b>

If you prefer more fast training time with nearly score, you can choose:
<b><i>Random Forest Regressor</i> on <i>Lasso Data Configuration</i></b><br>

----------