In [5]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [6]:
def _build_model_specs():
    """Return fresh ``(name, pipeline, param_grid, n_jobs)`` specs, one per model.

    A new list with new estimator objects is built on every call so that no
    estimator instance is shared between separate grid searches.
    """
    return [
        (
            'Linear Regression',
            # Only the linear model gets a scaling step; the tree-based
            # pipelines below are built without one.
            Pipeline([
                ('scaler', StandardScaler()),
                ('model', LinearRegression()),
            ]),
            {},    # empty grid: just cross-validates the default model
            None,  # GridSearchCV's default n_jobs
        ),
        (
            'Random Forest',
            Pipeline([
                ('model', RandomForestRegressor(random_state=42)),
            ]),
            {
                'model__n_estimators': [50, 100],
                'model__max_depth': [None, 5, 10],
                'model__min_samples_split': [2, 5],
                'model__min_samples_leaf': [1, 2],
            },
            -1,
        ),
        (
            'XGBoost',
            Pipeline([
                ('model', XGBRegressor(objective='reg:squarederror', random_state=42, verbosity=0)),
            ]),
            {
                'model__n_estimators': [50, 100],
                'model__max_depth': [3, 5, 10],
                'model__learning_rate': [0.01, 0.1],
                'model__subsample': [0.7, 1],
                'model__colsample_bytree': [0.7, 1],
            },
            -1,
        ),
    ]


def grid_search_models_cv_compare(X, y, scoring='r2', cv_list=(3, 5, 10)):
    """
    Run GridSearchCV for Linear Regression, Random Forest, and XGBoost with
    several CV fold counts so their performance can be compared.

    Relies on the names imported in the notebook's import cell
    (GridSearchCV, KFold, Pipeline, StandardScaler and the three regressors).

    Parameters:
    - X : pd.DataFrame, input features
    - y : pd.Series/array, target values
    - scoring : str, evaluation metric passed to GridSearchCV (default='r2')
    - cv_list : iterable of int, fold counts to try; each value is used as
      ``n_splits`` of a shuffled KFold (default=(3, 5, 10))

    Returns:
    - results : dict keyed by fold count; each value is a dict keyed by model
      name ('Linear Regression', 'Random Forest', 'XGBoost') holding
      'Best Score' (``best_score_`` of the search) and 'Best Params'
      (``best_params_`` of the search). An empty ``cv_list`` yields ``{}``.
    """
    results = {}

    for cv_split in cv_list:
        # One seeded, shuffled splitter per fold count, shared by all three
        # searches for that fold count.
        cv = KFold(n_splits=cv_split, shuffle=True, random_state=42)

        results[cv_split] = {}

        for model_name, pipeline, param_grid, n_jobs in _build_model_specs():
            search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)
            search.fit(X, y)
            results[cv_split][model_name] = {
                'Best Score': search.best_score_,
                'Best Params': search.best_params_,
            }

    return results


In [7]:
# Run the grid searches for every fold count under comparison.
cv_results = grid_search_models_cv_compare(X, y, scoring='r2', cv_list=[3, 5, 10])

# Report the best score and hyper-parameters, grouped by fold count.
for cv_split in cv_results:
    models = cv_results[cv_split]
    print(f"\n=== CV={cv_split} ===")
    for model_name in models:
        info = models[model_name]
        print(f"{model_name}: R2 = {info['Best Score']:.4f}, Params = {info['Best Params']}")


🚀 Running GridSearch with CV=3

🚀 Running GridSearch with CV=5

🚀 Running GridSearch with CV=10

=== CV=3 ===
Linear Regression: R2 = 0.8299, Params = {}
Random Forest: R2 = 0.9251, Params = {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
XGBoost: R2 = 0.9220, Params = {'model__colsample_bytree': 1, 'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__subsample': 0.7}

=== CV=5 ===
Linear Regression: R2 = 0.8433, Params = {}
Random Forest: R2 = 0.9336, Params = {'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
XGBoost: R2 = 0.9298, Params = {'model__colsample_bytree': 0.7, 'model__learning_rate': 0.1, 'model__max_depth': 10, 'model__n_estimators': 100, 'model__subsample': 0.7}

=== CV=10 ===
Linear Regression: R2 = 0.8432, Params = {}
Random Forest: R2 = 0.9170, Params = {'model__max_depth': 10, 'model__min_samples_leaf'