In [None]:
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
# Other essentials
import numpy as np
import pandas as pd
from time import time
from sklearn.model_selection import train_test_split,cross_val_score, KFold
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.float_format', '{:.4f}'.format)
pd.set_option('display.max_rows', 250)


In [None]:
# Load the cleaned car-price table; 'price' is the regression target.
df = pd.read_csv('CarPrice_Assignment_cleaned.csv')
X = df.drop(columns=['price'])
y = df['price']

# greedy_forward_feature_selection_cv reads these three module-level names.
# They were not defined anywhere in the visible notebook, so a fresh
# "Restart & Run All" failed with NameError. Derive them from the loaded frame.
# NOTE(review): assumes every non-target column is a candidate feature and that
# every non-numeric column should be one-hot encoded — confirm against the
# original session if a narrower feature list was intended.
df_model = df
all_features = X.columns.tolist()
categorical_cols = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()

# Regressors to compare; both use a fixed random_state for repeatable runs.
listmodels = [
    RandomForestRegressor(max_depth=20,max_features='sqrt',min_samples_leaf=1,min_samples_split=2,n_estimators=100,random_state=42),
    xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=4, random_state=42)
]

In [None]:
def greedy_forward_feature_selection_cv(models, cv_folds=5):
    """Greedy forward feature selection scored by cross-validated R².

    Starting from an empty set, each round tries adding every remaining
    feature, scores the resulting pipeline (one-hot encoding + regressor)
    with shuffled K-fold cross-validation, and keeps the feature that gives
    the highest mean R². This repeats until no features remain.

    Reads three module-level names that must exist before calling:
    ``df_model`` (DataFrame holding the features and a ``'price'`` column),
    ``all_features`` (iterable of candidate column names) and
    ``categorical_cols`` (names of the columns to one-hot encode).

    Parameters
    ----------
    models : estimator
        A single scikit-learn compatible regressor used as the final
        pipeline step (the name is plural for historical reasons).
    cv_folds : int, default 5
        Number of K-fold splits.

    Returns
    -------
    list of tuple
        One ``(selected_features, mean_r2, elapsed_seconds)`` entry per
        round, where ``selected_features`` is the tuple of features chosen
        so far and ``elapsed_seconds`` is the wall time of the winning
        candidate's cross-validation. Sorted by ``mean_r2``, best first.
    """
    # list(...) rather than .copy() so any iterable (list, tuple, Index) works
    # and the caller's object is never mutated.
    remaining_features = list(all_features)
    selected_features = []
    history = []

    # Loop-invariant: with an integer random_state the splitter yields the
    # same folds on every call, so one instance serves every candidate.
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    y = df_model['price']

    while remaining_features:
        # R² has no lower bound, so start from -inf; the previous sentinel of
        # -1 left best_feature as None whenever every candidate scored <= -1.
        best_r2 = float('-inf')
        best_feature = None
        best_time = 0

        for feature in remaining_features:
            current_features = selected_features + [feature]
            X = df_model[current_features]

            # One-hot encode the categorical columns in this candidate set;
            # every other column is passed through unchanged.
            preprocessor = ColumnTransformer(
                transformers=[
                    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'),
                     [col for col in current_features if col in categorical_cols])
                ],
                remainder='passthrough'
            )

            model = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('regressor', models)
            ])

            # Time only the cross-validation itself.
            start_time = time()
            scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
            elapsed_time = time() - start_time
            mean_r2 = scores.mean()

            if mean_r2 > best_r2:
                best_r2 = mean_r2
                best_feature = feature
                best_time = elapsed_time

        if best_feature is None:
            # Every candidate produced a non-comparable score (NaN), so no
            # feature can be ranked; stop instead of failing on remove(None).
            break

        selected_features.append(best_feature)
        remaining_features.remove(best_feature)
        history.append((tuple(selected_features), best_r2, best_time))

    return sorted(history, key=lambda entry: entry[1], reverse=True)

In [None]:
# Run forward selection for every model and collect one row per selection round.
results = []
for model in listmodels:
    model_name = model.__class__.__name__
    print(f"Evaluating model: {model_name}")
    eval_results = greedy_forward_feature_selection_cv(model, cv_folds=5)
    # The score variable is named `mean_r2`, not `r2_score`: the old name
    # rebound the sklearn.metrics.r2_score function imported at the top of the
    # notebook to a float, breaking any later r2_score(...) call in the kernel.
    for feature_comb, mean_r2, exec_time in eval_results:
        results.append({
            'Model': model_name,
            'Feature Combination': feature_comb,
            'R2 Score (CV Mean)': mean_r2,
            'Execution Time': exec_time
        })

# Rank every (model, feature set) pair across models, best CV score first.
result_df = pd.DataFrame(results)
result_df = result_df.sort_values(by='R2 Score (CV Mean)', ascending=False)
result_df

Evaluating model: RandomForestRegressor
Evaluating model: XGBRegressor


Unnamed: 0,Model,Feature Combination,R2 Score (CV Mean),Execution Time
0,RandomForestRegressor,"(enginesize, wheelbase, horsepower, carlength, carheight, curbweight, doornumber, peakrpm, fuelsystem, citympg, enginelocation, fueltype, aspiration, enginetype, stroke, carwidth, highwaympg)",0.9322,2.0567
1,RandomForestRegressor,"(enginesize, wheelbase, horsepower, carlength, carheight, curbweight, doornumber, peakrpm, fuelsystem, citympg, enginelocation, fueltype, aspiration, enginetype, stroke, carwidth, highwaympg, comp...",0.9314,2.1597
2,RandomForestRegressor,"(enginesize, wheelbase, horsepower, carlength, carheight, curbweight, doornumber, peakrpm, fuelsystem, citympg, enginelocation, fueltype, aspiration)",0.931,1.8355
3,RandomForestRegressor,"(enginesize, wheelbase, horsepower, carlength, carheight, curbweight, doornumber, peakrpm, fuelsystem, citympg, enginelocation, fueltype, aspiration, enginetype, stroke, carwidth, highwaympg, comp...",0.9309,2.1345
4,RandomForestRegressor,"(enginesize, wheelbase, horsepower, carlength, carheight, curbweight, doornumber, peakrpm, fuelsystem, citympg, enginelocation, fueltype)",0.9308,1.8535
5,RandomForestRegressor,"(enginesize, wheelbase, horsepower, carlength, carheight, curbweight, doornumber, peakrpm, fuelsystem, citympg)",0.9307,1.8497
6,RandomForestRegressor,"(enginesize, wheelbase, horsepower, carlength, carheight, curbweight, doornumber, peakrpm, fuelsystem, citympg, enginelocation, fueltype, aspiration, enginetype, stroke)",0.9306,1.8637
7,RandomForestRegressor,"(enginesize, wheelbase, horsepower, carlength, carheight, curbweight, doornumber, peakrpm, fuelsystem, citympg, enginelocation)",0.9303,1.812
8,RandomForestRegressor,"(enginesize, wheelbase, horsepower, carlength, carheight, curbweight, doornumber, peakrpm, fuelsystem, citympg, enginelocation, fueltype, aspiration, enginetype, stroke, carwidth, highwaympg, comp...",0.9303,2.2888
9,RandomForestRegressor,"(enginesize, wheelbase, horsepower, carlength, carheight, curbweight, doornumber, peakrpm, fuelsystem, citympg, enginelocation, fueltype, aspiration, enginetype, stroke, carwidth)",0.9301,2.0821
