In [60]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import joblib
import warnings
warnings.filterwarnings('ignore', category=UserWarning)


In [61]:
# Load dataset
df = pd.read_csv("../data/CarPrice_Assignment_cleaned.csv")
target = 'price'

# Gunakan hanya fitur yang konsisten dan mudah diinput manual
# Drivewheel, Wheelbase, Carlength, Carwidth, Curbwheight, Enginesize, Fuelsystem, Boreratio,Horsepower
features = [
    'drivewheel', 
    'wheelbase', 
    'carlength', 
    'carwidth', 
    'curbweight', 
    'enginesize', 
    'fuelsystem', 
    'boreratio', 
    'horsepower'
]

X = df[features]
y = df[target]

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Alasan Pemilihan Parameter
Berdasarkan Percobaan di Tes4 dengan GridSearch untuk menentukan parameter dengan R2 terbaik

=== Summary of Grid Search Results ===
| Model         |   Best Score | Best Params                                                                                                                                                                             |
|:--------------|-------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Random Forest |     0.923629 | {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__max_leaf_nodes': None, 'model__max_samples': None, 'model__min_samples_split': 2, 'model__n_estimators': 50}          |
| XGBoost       |     0.921015 | {'model__colsample_bytree': 0.7, 'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 50, 'model__reg_alpha': 0, 'model__reg_lambda': 0, 'model__subsample': 0.7} |

In [62]:
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(max_depth=None, 
                                    max_features='sqrt',max_leaf_nodes=None,
                                    max_samples=None,min_samples_split=2,
                                    n_estimators=50, random_state=42))
])

rf_cv_scores = cross_val_score(rf_pipeline, X, y, cv=kf, scoring='r2')
print(f"Random Forest CV R2: {np.mean(rf_cv_scores):.4f}")

rf_pipeline.fit(X, y)
joblib.dump(rf_pipeline, '../model/model_rf.pkl')


Random Forest CV R2: 0.9240


['../model/model_rf.pkl']

In [63]:
xgb_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', XGBRegressor(objective='reg:squarederror', verbosity=0, random_state=42, col_sample_bytree=0.7,
                            learning_rate=0.1, max_depth=7, n_estimators=50, reg_alpha=0, reg_lambda=0, subsample=0.7))
])

xgb_cv_scores = cross_val_score(xgb_pipeline, X, y, cv=kf, scoring='r2')
print(f"XGBoost CV R2: {np.mean(xgb_cv_scores):.4f}")

xgb_pipeline.fit(X, y)
joblib.dump(xgb_pipeline, '../model/model_xgb.pkl')


XGBoost CV R2: 0.9110


['../model/model_xgb.pkl']

In [64]:
# alphas = [0.1, 0.5, 0.9]
# models = {}
# for alpha in alphas:
#     lgbm_pipeline = Pipeline([
#         ('scaler', StandardScaler()),
#         ('model', LGBMRegressor(objective='quantile', alpha=alpha, n_estimators=100, random_state=42))
#     ])

#     cv_scores = cross_val_score(lgbm_pipeline, X, y, cv=kf, scoring='r2')
#     print(f"LGBM Quantile α={alpha:.1f} | CV R2: {np.mean(cv_scores):.4f}")

#     # Fit & save
#     lgbm_pipeline.fit(X, y)
#     model_path = f'../model/model_lgbm_q{int(alpha*100)}.pkl'
#     joblib.dump(lgbm_pipeline, model_path)
#     models[alpha] = lgbm_pipeline