In [6]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Hide UserWarning messages so they do not clutter the cell outputs below.
warnings.filterwarnings('ignore', category=UserWarning)

# Notebook-wide pandas display settings: wide text columns (the best-params
# dicts are long), four decimals for floats, and up to 250 rows per frame.
display_options = {
    'display.max_colwidth': 200,
    'display.float_format': '{:.4f}'.format,
    'display.max_rows': 250,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Random Forest
    # n_estimators: It defines the number of trees in the forest.
    # max_features: Limits the number of features to consider when splitting a node.
    # max_depth: Controls the maximum depth of each tree.
    # max_leaf_nodes: Limits the number of leaf nodes in the tree hence controlling its size and complexity.
    # max_samples: Number (or fraction) of training samples drawn, when bootstrapping, to build each tree.
    # min_samples_split: Specifies the minimum number of samples required to split an internal node.

# XGBoost
    # learning_rate: Step size shrinkage
    # max_depth: Max tree depth
    # n_estimators: Number of trees
    # subsample: Fraction of data used per tree
    # colsample_bytree: Fraction of features per tree
    # reg_alpha: L1 regularization
    # reg_lambda: L2 regularization
    # gamma: Minimum loss reduction to make further partition
    # booster: Type of booster: gbtree, gblinear, dart
    # tree_method: Can be 'exact', 'approx', 'hist'
    # scale_pos_weight: Useful for imbalanced data
    # early_stopping_rounds: Stops training early if no improvement.
    # eval_metric: Evaluation metric (e.g., 'auc', 'logloss').

In [None]:
def _run_grid_search(estimator, param_grid, X, y, cv, scoring):
    """Grid-search one estimator and summarise the best configuration found.

    The estimator is wrapped in a single-step Pipeline whose step is named
    ``'model'``, so every key in ``param_grid`` must use the ``model__`` prefix.

    Returns:
        dict with the keys 'Best Score', 'Best Params' and 'Best Estimator',
        taken from the fitted GridSearchCV object.
    """
    pipeline = Pipeline([('model', estimator)])
    # n_jobs=-1 lets the search use every available CPU core.
    search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, n_jobs=-1)
    search.fit(X, y)
    return {
        'Best Score': search.best_score_,
        'Best Params': search.best_params_,
        'Best Estimator': search.best_estimator_,
    }


def grid_search_models(X, y, cv_splits=5, scoring='r2', Model='All'):
    """
    Run a hyper-parameter grid search for Random Forest and/or XGBoost regressors.

    Parameters:
    - X: Input features (DataFrame)
    - y: Target variable
    - cv_splits: Number of cross-validation folds (default: 5)
    - scoring: Scoring metric passed to GridSearchCV (default: 'r2')
    - Model: 'RF', 'XGB', or 'All' (default: 'All')

    Returns:
    - Dictionary keyed by 'Random Forest' and/or 'XGBoost'; each value is a
      dict with 'Best Score', 'Best Params' and 'Best Estimator'.

    Raises:
    - ValueError: if Model is not one of 'RF', 'XGB' or 'All'.
    """
    # Reject unknown selectors up front: without this check a typo such as
    # 'rf' would silently skip both searches and return an empty dict.
    valid_models = ('RF', 'XGB', 'All')
    if Model not in valid_models:
        raise ValueError(f"Model must be one of {valid_models}, got {Model!r}")

    results = {}

    # The same shuffled, seeded folds are reused for every model so that the
    # reported scores are comparable and reproducible.
    cv = KFold(n_splits=cv_splits, shuffle=True, random_state=42)

    if Model in ('RF', 'All'):
        param_rf = {
            'model__n_estimators': [50, 100, 200],
            'model__max_features': ['sqrt', 'log2'],
            'model__max_depth': [None, 10, 20, 30],
            'model__max_leaf_nodes': [None, 10, 20],
            'model__max_samples': [None, 0.5, 0.7],
            'model__min_samples_split': [2, 5, 10]
        }
        results['Random Forest'] = _run_grid_search(
            RandomForestRegressor(random_state=42), param_rf, X, y, cv, scoring
        )
        print("Random Forest Grid Search Completed")

    if Model in ('XGB', 'All'):
        param_xgb = {
            'model__learning_rate': [0.01, 0.1, 0.3],
            'model__max_depth': [3, 7],
            'model__n_estimators': [50, 100],
            'model__subsample': [0.7, 1.0],
            'model__colsample_bytree': [0.7, 1.0],
            'model__reg_alpha': [0, 0.1],
            'model__reg_lambda': [0, 0.1]
        }
        results['XGBoost'] = _run_grid_search(
            XGBRegressor(objective='reg:squarederror', random_state=42, verbosity=0),
            param_xgb, X, y, cv, scoring
        )
        print("XGBoost Grid Search Completed")

    return results


# Alasan Pemilihan Fitur
Berdasarkan EDA tentang korelasi fitur, terdapat 9 fitur yang nilai korelasinya di atas 0.5, yaitu:
Drivewheel, Wheelbase, Carlength, Carwidth, Curbweight, Enginesize, Fuelsystem, Boreratio, Horsepower

In [8]:
# Predictor columns chosen for the models; the target is the car price.
feature_columns = [
    'drivewheel', 'wheelbase', 'carlength', 'carwidth', 'curbweight',
    'enginesize', 'fuelsystem', 'boreratio', 'horsepower',
]

df = pd.read_csv('../data/CarPrice_Assignment_cleaned.csv')
X = df[feature_columns]
y = df['price']

# Tune each model family in its own call (Random Forest first, then XGBoost).
resultsRF = grid_search_models(X, y, Model='RF')
resultsXGB = grid_search_models(X, y, Model='XGB')

# Gather the winning entry of each search under its display name, then build
# one summary row per model from that mapping.
best_by_model = {
    'Random Forest': resultsRF['Random Forest'],
    'XGBoost': resultsXGB['XGBoost'],
}
summary = pd.DataFrame({
    'Model': list(best_by_model),
    'Best Score': [entry['Best Score'] for entry in best_by_model.values()],
    'Best Params': [entry['Best Params'] for entry in best_by_model.values()],
})

# Display the table
print("\n=== Summary of Grid Search Results ===")
print(summary.to_markdown(index=False))

Random Forest Grid Search Completed
XGBoost Grid Search Completed

=== Summary of Grid Search Results ===
| Model         |   Best Score | Best Params                                                                                                                                                                             |
|:--------------|-------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Random Forest |     0.923629 | {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__max_leaf_nodes': None, 'model__max_samples': None, 'model__min_samples_split': 2, 'model__n_estimators': 50}          |
| XGBoost       |     0.921015 | {'model__colsample_bytree': 0.7, 'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 50, 'model__reg_alpha': 0, 'model__reg_lambda': 0, 'model__subsample': 0.7} |


In [10]:
import mlflow

# Best entry of each grid search, as produced by the earlier tuning cell.
best_rf = resultsRF['Random Forest']
best_xgb = resultsXGB['XGBoost']

with mlflow.start_run():
    # Search settings: these mirror the defaults grid_search_models was called
    # with (5 folds, R^2 scoring); both model families were tuned.
    mlflow.log_params({
        'cv_splits': 5,
        'scoring': 'r2',
        'Model': 'All'
    })
    mlflow.log_metric('Random Forest Best Score', best_rf['Best Score'])
    mlflow.log_param('Random Forest Best Params', best_rf['Best Params'])
    mlflow.log_metric('XGBoost Best Score', best_xgb['Best Score'])
    mlflow.log_param('XGBoost Best Params', best_xgb['Best Params'])

    # Store the tuned pipelines explicitly. Autologging cannot do this here:
    # the models were already fitted in an earlier cell, and autolog only
    # captures fits that happen after it has been enabled.
    mlflow.sklearn.log_model(best_rf['Best Estimator'], 'random_forest_model')
    mlflow.sklearn.log_model(best_xgb['Best Estimator'], 'xgboost_model')

# Enable autologging for models fitted from this point onward; it has no
# effect on the grid searches that already ran above.
mlflow.autolog()

2025/07/29 20:33:15 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/07/29 20:33:15 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
