In [16]:
from sklearn.linear_model import (
    LinearRegression,
    Ridge,
    Lasso,
    ElasticNet,
    BayesianRidge)
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor)
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import lightgbm as lgb
from keras.models import Sequential
from keras.layers import Dense
from sklearn.ensemble import StackingRegressor

# Other essentials
import numpy as np
import pandas as pd
from time import time
from sklearn.model_selection import train_test_split,cross_val_score, KFold
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.float_format', '{:.4f}'.format)
pd.set_option('display.max_rows', 250)


In [17]:
# Load the raw table from the CSV file in the working directory.
df = pd.read_csv('CarPrice_Assignment.csv')

# Modelling frame: the raw data without three columns that are not used as predictors.
df_model = df.drop(columns=['car_ID', 'CarName', 'symboling'])

# Predictor names grouped by dtype. 'price' is removed from the numeric group
# so that it never ends up in the candidate feature list.
categorical_cols = list(df_model.select_dtypes(include='object').columns)
numerical_cols = df_model.select_dtypes(include=np.number).columns.drop('price').tolist()
all_features = [*categorical_cols, *numerical_cols]

# Regressors to compare. The order here is the order in which they are evaluated.
listmodels = [
    # Linear family
    LinearRegression(),
    Ridge(alpha=1.0),
    Lasso(alpha=0.1),
    ElasticNet(alpha=0.1, l1_ratio=0.5),
    # Tree-based models
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(learning_rate=0.1, random_state=42),
    DecisionTreeRegressor(max_depth=5, random_state=42),
    # Kernel and neighbour based models
    SVR(C=1.0, epsilon=0.2),
    KNeighborsRegressor(n_neighbors=5),
    # Gradient boosting via xgboost
    xgb.XGBRegressor(learning_rate=0.1, random_state=42),
]

In [18]:
def greedy_forward_feature_selection(models):
    """Greedy forward feature selection scored on a single hold-out split.

    Starting from an empty set, every round tries each remaining name in the
    notebook-level ``all_features`` on top of the features selected so far.
    For each candidate set a pipeline (one-hot encoding of the categorical
    columns, remaining columns passed through, then ``models``) is fitted on
    an 80/20 train/test split of ``df_model`` (``random_state=42``) and scored
    with R² on the test part. The candidate with the highest R² is added and
    the process repeats until no features remain.

    Parameters
    ----------
    models : estimator
        A single scikit-learn compatible regressor (despite the plural name).
        The same instance is reused for every candidate fit.

    Returns
    -------
    pandas.DataFrame
        One row per round, in selection order, with the columns
        ``'Selected Features'`` (tuple of the features chosen so far),
        ``'R2 Score'`` (test R² of the winning candidate) and
        ``'Execution Time'`` (seconds spent fitting, predicting and scoring
        the winning candidate).

    Notes
    -----
    Features are picked by their score on the test split, so the reported R²
    values are optimistic estimates rather than unbiased hold-out scores.
    """
    remaining_features = all_features.copy()
    selected_features = []
    history = []
    y = df_model['price']  # the target is identical for every candidate

    while remaining_features:
        # R² has no lower bound (a poor model can score far below -1), so the
        # running best must start below every real score, not at -1.
        best_r2 = -np.inf
        best_feature = None
        best_time = 0

        for feature in remaining_features:
            current_features = selected_features + [feature]
            X = df_model[current_features]

            # One-hot encode only the categorical columns of this candidate
            # set; numeric columns are passed through unchanged.
            preprocessor = ColumnTransformer(
                transformers=[
                    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'),
                     [col for col in current_features if col in categorical_cols])
                ],
                remainder='passthrough'
            )

            model = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('regressor', models)
            ])

            # Fixed seed: every candidate is evaluated on the same rows.
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            start_time = time()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            r2 = r2_score(y_test, y_pred)
            elapsed_time = time() - start_time

            # A NaN score never compares greater, so it cannot win a round.
            if r2 > best_r2:
                best_r2 = r2
                best_feature = feature
                best_time = elapsed_time

        if best_feature is None:
            # No candidate produced a comparable score (e.g. all NaN). Stop
            # with the rounds completed so far instead of failing on
            # remove(None).
            warnings.warn(
                "No remaining feature produced a valid R2 score; "
                "stopping forward selection early.",
                RuntimeWarning,
                stacklevel=2,
            )
            break

        selected_features.append(best_feature)
        remaining_features.remove(best_feature)
        history.append((tuple(selected_features), best_r2, best_time))

    # Return the per-round history as a DataFrame.
    history = pd.DataFrame(history, columns=['Selected Features', 'R2 Score', 'Execution Time'])
    return history

In [None]:
def greedy_forward_feature_selection_cv(models, cv_folds=5):
    """Greedy forward feature selection scored with K-fold cross-validation.

    Starting from an empty set, every round tries each remaining name in the
    notebook-level ``all_features`` on top of the features selected so far.
    Each candidate set is evaluated with a pipeline (one-hot encoding of the
    categorical columns, remaining columns passed through, then ``models``)
    using shuffled K-fold cross-validation on ``df_model`` with
    ``random_state=42``. The candidate with the highest mean R² is added and
    the process repeats until no features remain.

    Parameters
    ----------
    models : estimator
        A single scikit-learn compatible regressor (despite the plural name).
    cv_folds : int, default 5
        Number of folds passed to ``KFold``.

    Returns
    -------
    list of tuple
        One ``(selected_features, mean_r2, elapsed_seconds)`` tuple per round,
        where ``selected_features`` is the tuple of features chosen up to that
        round and ``elapsed_seconds`` is the cross-validation time of the
        winning candidate. The list is sorted by ``mean_r2`` in descending
        order, not in selection order.
    """
    remaining_features = all_features.copy()
    selected_features = []
    history = []
    y = df_model['price']  # the target is identical for every candidate

    # The splitter does not depend on the candidate; with an integer seed it
    # yields the same folds on every call, so one instance can be shared.
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

    while remaining_features:
        # R² has no lower bound (a poor model can score far below -1), so the
        # running best must start below every real score, not at -1.
        best_r2 = -np.inf
        best_feature = None
        best_time = 0

        for feature in remaining_features:
            current_features = selected_features + [feature]
            X = df_model[current_features]

            # One-hot encode only the categorical columns of this candidate
            # set; numeric columns are passed through unchanged.
            preprocessor = ColumnTransformer(
                transformers=[
                    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'),
                     [col for col in current_features if col in categorical_cols])
                ],
                remainder='passthrough'
            )

            model = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('regressor', models)
            ])

            start_time = time()
            scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
            elapsed_time = time() - start_time
            mean_r2 = scores.mean()

            # A NaN mean (e.g. from a failed fold) never compares greater, so
            # it cannot win a round.
            if mean_r2 > best_r2:
                best_r2 = mean_r2
                best_feature = feature
                best_time = elapsed_time

        if best_feature is None:
            # No candidate produced a comparable score (e.g. all NaN). Stop
            # with the rounds completed so far instead of failing on
            # remove(None).
            warnings.warn(
                "No remaining feature produced a valid mean R2 score; "
                "stopping forward selection early.",
                RuntimeWarning,
                stacklevel=2,
            )
            break

        selected_features.append(best_feature)
        remaining_features.remove(best_feature)
        history.append((tuple(selected_features), best_r2, best_time))

    # Best-scoring feature combination first.
    historysorted = sorted(history, key=lambda x: x[1], reverse=True)
    return historysorted

In [20]:
# Run cross-validated forward selection for every model and collect one record
# per (model, feature combination).
results = []
for model in listmodels:
    model_name = model.__class__.__name__
    print(f"Evaluating model: {model_name}")
    eval_results = greedy_forward_feature_selection_cv(model, cv_folds=5)
    # The score is unpacked as `mean_r2` inside a generator, not as a
    # top-level loop variable named `r2_score`: that name would overwrite the
    # sklearn.metrics.r2_score function in the notebook namespace and break
    # any later call to greedy_forward_feature_selection.
    results.extend(
        {
            'Model': model_name,
            'Feature Combination': feature_comb,
            'R2 Score (CV Mean)': mean_r2,
            'Execution Time': exec_time
        }
        for feature_comb, mean_r2, exec_time in eval_results
    )

# Rank every evaluated combination across all models, best score first.
result_df = pd.DataFrame(results)
result_df = result_df.sort_values(by='R2 Score (CV Mean)', ascending=False)
result_df

Evaluating model: LinearRegression
Evaluating model: Ridge
Evaluating model: Lasso
Evaluating model: ElasticNet
Evaluating model: RandomForestRegressor
Evaluating model: GradientBoostingRegressor
Evaluating model: DecisionTreeRegressor
Evaluating model: SVR
Evaluating model: KNeighborsRegressor
Evaluating model: XGBRegressor


Unnamed: 0,Model,Feature Combination,R2 Score (CV Mean),Execution Time
199,XGBRegressor,"(enginesize, wheelbase, horsepower, carlength, curbweight, carheight, enginetype, fueltype, enginelocation)",0.9329,0.7382
198,XGBRegressor,"(enginesize, wheelbase, horsepower, carlength, curbweight, carheight, enginetype, fueltype)",0.9329,0.6245
200,XGBRegressor,"(enginesize, wheelbase, horsepower, carlength, curbweight, carheight, enginetype, fueltype, enginelocation, stroke)",0.9325,0.6171
88,RandomForestRegressor,"(enginesize, wheelbase, horsepower, carlength, carheight, curbweight, doornumber, peakrpm, fuelsystem, citympg, enginelocation, fueltype, aspiration, enginetype, stroke, carwidth, highwaympg)",0.9322,2.5746
201,XGBRegressor,"(enginesize, wheelbase, horsepower, carlength, curbweight, carheight, enginetype, fueltype, enginelocation, stroke, fuelsystem, doornumber)",0.9318,0.6449
202,XGBRegressor,"(enginesize, wheelbase, horsepower, carlength, curbweight, carheight, enginetype, fueltype, enginelocation, stroke, fuelsystem)",0.9317,0.6156
89,RandomForestRegressor,"(enginesize, wheelbase, horsepower, carlength, carheight, curbweight, doornumber, peakrpm, fuelsystem, citympg, enginelocation, fueltype, aspiration, enginetype, stroke, carwidth, highwaympg, comp...",0.9314,3.1872
203,XGBRegressor,"(enginesize, wheelbase, horsepower, carlength, curbweight, carheight, enginetype)",0.9312,0.5733
90,RandomForestRegressor,"(enginesize, wheelbase, horsepower, carlength, carheight, curbweight, doornumber, peakrpm, fuelsystem, citympg, enginelocation, fueltype, aspiration)",0.931,1.9629
91,RandomForestRegressor,"(enginesize, wheelbase, horsepower, carlength, carheight, curbweight, doornumber, peakrpm, fuelsystem, citympg, enginelocation, fueltype, aspiration, enginetype, stroke, carwidth, highwaympg, comp...",0.9309,2.3086
