In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd

def _fit_grid(pipeline, param_grid, X, y, cv, scoring, n_jobs=None):
    """Run one cross-validated grid search and summarise the winner.

    Args:
        pipeline: Estimator (here a ``Pipeline``) to tune.
        param_grid: Mapping of parameter names to candidate values, as
            accepted by ``GridSearchCV``.
        X: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target values passed to ``GridSearchCV.fit``.
        cv: Cross-validation splitter shared by every search.
        scoring: Scoring name forwarded to ``GridSearchCV``.
        n_jobs: Parallelism of the search itself; ``None`` keeps the
            ``GridSearchCV`` default.

    Returns:
        Dict with the keys ``'Best Score'``, ``'Best Params'`` and
        ``'Best Estimator'`` taken from the fitted search.
    """
    search = GridSearchCV(
        pipeline, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs
    )
    search.fit(X, y)
    return {
        'Best Score': search.best_score_,
        'Best Params': search.best_params_,
        'Best Estimator': search.best_estimator_,
    }


def grid_search_models(X, y, cv_splits=5, scoring='r2'):
    """Tune three regressors with grid search and report the best of each.

    A linear regression (with standard scaling), a random forest and an
    XGBoost regressor are each tuned with ``GridSearchCV`` on the same
    shuffled ``KFold`` splitter, so their scores are comparable.

    Args:
        X: Feature matrix.
        y: Target values.
        cv_splits: Number of folds for the shared ``KFold`` splitter.
        scoring: Scoring name forwarded to every ``GridSearchCV``.

    Returns:
        Dict keyed by ``'Linear Regression'``, ``'Random Forest'`` and
        ``'XGBoost'`` (in that order). Each value is a dict with
        ``'Best Score'``, ``'Best Params'`` and ``'Best Estimator'``.
    """
    # One seeded splitter for all models: identical folds, reproducible runs.
    cv = KFold(n_splits=cv_splits, shuffle=True, random_state=42)

    results = {}

    # 1. Linear Regression (wrapped in a pipeline so features are scaled).
    pipe_lr = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LinearRegression()),
    ])
    # LinearRegression has no hyperparameters tuned here, so the grid is
    # empty and the search reduces to a single cross-validated fit.
    param_lr = {}
    results['Linear Regression'] = _fit_grid(
        pipe_lr, param_lr, X, y, cv, scoring
    )

    # 2. Random Forest
    pipe_rf = Pipeline([
        ('model', RandomForestRegressor(random_state=42)),
    ])
    param_rf = {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 5, 10, 20],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
    }
    results['Random Forest'] = _fit_grid(
        pipe_rf, param_rf, X, y, cv, scoring, n_jobs=-1
    )

    # 3. XGBoost
    # NOTE(review): the search below runs with n_jobs=-1 while XGBRegressor
    # is left at its own default threading; if fits are unexpectedly slow,
    # check for thread oversubscription and pin one of the two levels.
    pipe_xgb = Pipeline([
        ('model', XGBRegressor(objective='reg:squarederror', random_state=42, verbosity=0)),
    ])
    param_xgb = {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [3, 5, 10],
        'model__learning_rate': [0.01, 0.1, 0.3],
        'model__subsample': [0.7, 1],
        'model__colsample_bytree': [0.7, 1],
    }
    results['XGBoost'] = _fit_grid(
        pipe_xgb, param_xgb, X, y, cv, scoring, n_jobs=-1
    )

    return results

In [None]:
# Load the cleaned car-price data; every column except 'price' is a feature.
df = pd.read_csv('CarPrice_Assignment_cleaned.csv')
X = df.drop(columns=['price'])
y = df['price']

if __name__ == "__main__":
    # Tune all models, then print one summary block per model.
    results = grid_search_models(X, y)
    for model, result in results.items():
        print(
            f"{model}:",
            f"  Best Score: {result['Best Score']}",
            f"  Best Params: {result['Best Params']}",
            # Trailing "\n" plus print's own newline leaves a blank line
            # between consecutive model summaries.
            f"  Best Estimator: {result['Best Estimator']}\n",
            sep="\n",
        )