In [120]:
from sklearn.linear_model import (
    LinearRegression,
    Ridge,
    Lasso,
    ElasticNet,
    BayesianRidge)
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor)
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from sklearn.ensemble import StackingRegressor


# Other essentials
import numpy as np
import pandas as pd
from time import time
from sklearn.model_selection import train_test_split,cross_val_score, KFold
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from itertools import combinations
import warnings
# Silence every UserWarning for the rest of the session (the filter is global, not cell-local).
# NOTE(review): presumably aimed at scikit-learn's preprocessing warnings emitted during the
# feature sweeps below — confirm, because it hides unrelated UserWarnings as well.
warnings.filterwarnings('ignore', category=UserWarning)
# Display tweaks for the result tables rendered further down:
# allow up to 200 characters per cell so feature-combination tuples are not truncated,
pd.set_option('display.max_colwidth', 200)
# print floats with four decimals (R2 scores and timings),
pd.set_option('display.float_format', '{:.4f}'.format)
# and show up to 250 rows before pandas elides the middle of a frame.
pd.set_option('display.max_rows', 250)


In [121]:
# Load the raw table; the path is relative to the notebook's working directory.
df = pd.read_csv('CarPrice_Assignment.csv')

# Modelling frame: the raw data minus three columns that are not used as predictors here.
df_model = df.drop(columns=['car_ID', 'CarName', 'symboling'])

# Candidate predictors grouped by dtype. `price` is the regression target, so it is
# removed from the numeric group. Categorical names come first in `all_features`.
categorical_cols = (
    df_model
    .select_dtypes(include='object')
    .columns
    .tolist()
)
numerical_cols = (
    df_model
    .select_dtypes(include=np.number)
    .drop(columns=['price'])
    .columns
    .tolist()
)
all_features = categorical_cols + numerical_cols

# Regressors compared in the feature-subset sweeps below. Every estimator that
# accepts a seed is pinned to 42 so repeated runs give the same scores.
listmodels = [
    # Linear family
    LinearRegression(),
    Ridge(alpha=1.0),
    Lasso(alpha=0.1),
    ElasticNet(alpha=0.1, l1_ratio=0.5),
    # Tree ensembles and a single tree
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(learning_rate=0.1, random_state=42),
    DecisionTreeRegressor(max_depth=5, random_state=42),
    # Kernel and instance-based models
    SVR(C=1.0, epsilon=0.2),
    KNeighborsRegressor(n_neighbors=5),
    # Gradient boosting via xgboost
    xgb.XGBRegressor(learning_rate=0.1, random_state=42),
]

In [122]:
def evaluate_feature_combinations(models,n_features):
    """Rank every ``n_features``-sized subset of ``all_features`` by hold-out R2.

    For each combination, the categorical columns in it are one-hot encoded
    (all other selected columns pass through unchanged), the estimator is
    fitted on an 80 % training split and scored with R2 on the remaining 20 %.

    Reads the notebook globals ``df_model``, ``all_features`` and
    ``categorical_cols``.

    Parameters
    ----------
    models : estimator
        A single scikit-learn-compatible regressor (despite the plural name).
        A fresh clone is fitted for every combination, so the object passed
        in is never fitted or otherwise modified.
    n_features : int
        Number of features in each evaluated combination.

    Returns
    -------
    list of tuple
        ``(feature_comb, r2, elapsed_time)`` triples sorted by ``r2`` in
        descending order. ``elapsed_time`` is in seconds and covers fitting,
        predicting and scoring.
    """
    # Function-scope import: `clone` is not among the notebook's top-level imports.
    from sklearn.base import clone

    # The split depends only on the number of rows and the fixed seed, so it is
    # done once on the full feature frame; each combination then just selects
    # its columns from the already-split halves (same rows as splitting per
    # combination, without repeating the shuffle).
    X_train_all, X_test_all, y_train, y_test = train_test_split(
        df_model[all_features], df_model['price'], test_size=0.2, random_state=42
    )

    results = []
    for feature_comb in combinations(all_features, n_features):
        selected = list(feature_comb)
        cat_selected = [col for col in selected if col in categorical_cols]

        # NOTE: with drop='first', a category unseen during fit is encoded as
        # all zeros, i.e. exactly like the dropped first category; scikit-learn
        # only reports this through a warning.
        preprocessor = ColumnTransformer(
            transformers=[
                ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_selected)
            ],
            remainder='passthrough'
        )

        # Clone so the estimator shared via `listmodels` is not fitted in place.
        model = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', clone(models))
        ])

        X_train = X_train_all[selected]
        X_test = X_test_all[selected]

        start_time = time()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        elapsed_time = time() - start_time
        results.append((feature_comb, r2, elapsed_time))

    # Best-scoring combinations first.
    return sorted(results, key=lambda x: x[1], reverse=True)

In [123]:
def evaluate_feature_combinations_cv(models,n_features):
    """Rank every ``n_features``-sized subset of ``all_features`` by mean CV R2.

    For each combination, the categorical columns in it are one-hot encoded
    (all other selected columns pass through unchanged) and the resulting
    pipeline is scored with shuffled 5-fold cross-validation using R2.

    Reads the notebook globals ``df_model``, ``all_features`` and
    ``categorical_cols``.

    Parameters
    ----------
    models : estimator
        A single scikit-learn-compatible regressor (despite the plural name).
    n_features : int
        Number of features in each evaluated combination.

    Returns
    -------
    list of tuple
        ``(feature_comb, mean_r2, elapsed_time)`` triples sorted by
        ``mean_r2`` in descending order. ``elapsed_time`` is the wall-clock
        time in seconds of the whole ``cross_val_score`` call.
    """
    # Loop invariants: the target never changes, and a KFold with an integer
    # random_state produces the same folds on every use, so one splitter can
    # be shared by all combinations.
    y = df_model['price']
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    results = []
    for feature_comb in combinations(all_features, n_features):
        selected = list(feature_comb)
        cat_selected = [col for col in selected if col in categorical_cols]
        X = df_model[selected]

        # NOTE: with drop='first', a category missing from a training fold is
        # encoded as all zeros at prediction time, i.e. exactly like the
        # dropped first category; scikit-learn only reports this through a
        # warning.
        preprocessor = ColumnTransformer(
            transformers=[
                ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_selected)
            ],
            remainder='passthrough'
        )

        model = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', models)
        ])

        start_time = time()
        scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
        elapsed_time = time() - start_time
        results.append((feature_comb, scores.mean(), elapsed_time))

    # Best-scoring combinations first.
    return sorted(results, key=lambda x: x[1], reverse=True)

In [124]:
# Cross-validated sweep over every single-feature model, for each regressor in `listmodels`.
results = []
for model in listmodels:
    model_name = model.__class__.__name__
    print(f"Evaluating model: {model_name}")
    eval_results = evaluate_feature_combinations_cv(model, 1)
    # The score is unpacked as `mean_r2`, not `r2_score`: a notebook-level variable
    # called `r2_score` would rebind the function imported from sklearn.metrics and
    # break `evaluate_feature_combinations` for the rest of the session.
    for feature_comb, mean_r2, exec_time in eval_results:
        results.append({
            'Model': model_name,
            'Feature Combination': feature_comb,
            'R2 Score (CV Mean)': mean_r2,
            'Execution Time': exec_time
        })

# One row per (model, feature combination), best mean CV R2 first.
result_df = pd.DataFrame(results)
result_df = result_df.sort_values(by='R2 Score (CV Mean)', ascending=False)
result_df

Evaluating model: LinearRegression
Evaluating model: Ridge
Evaluating model: Lasso
Evaluating model: ElasticNet
Evaluating model: RandomForestRegressor
Evaluating model: GradientBoostingRegressor
Evaluating model: DecisionTreeRegressor
Evaluating model: SVR
Evaluating model: KNeighborsRegressor
Evaluating model: XGBRegressor


Unnamed: 0,Model,Feature Combination,R2 Score (CV Mean),Execution Time
132,DecisionTreeRegressor,"(enginesize,)",0.8619,0.064
88,RandomForestRegressor,"(enginesize,)",0.8602,1.2841
110,GradientBoostingRegressor,"(enginesize,)",0.8561,0.6361
198,XGBRegressor,"(enginesize,)",0.8532,0.3826
199,XGBRegressor,"(horsepower,)",0.8359,0.418
176,KNeighborsRegressor,"(enginesize,)",0.8166,0.0592
111,GradientBoostingRegressor,"(horsepower,)",0.8,0.6464
200,XGBRegressor,"(carwidth,)",0.7988,0.4475
89,RandomForestRegressor,"(horsepower,)",0.7913,1.308
112,GradientBoostingRegressor,"(carwidth,)",0.7822,0.6669


In [125]:
# Cross-validated sweep over every two-feature combination, for each regressor in `listmodels`.
results = []
for model in listmodels:
    model_name = model.__class__.__name__
    print(f"Evaluating model: {model_name}")
    eval_results = evaluate_feature_combinations_cv(model, 2)
    # The score is unpacked as `mean_r2`, not `r2_score`: a notebook-level variable
    # called `r2_score` would rebind the function imported from sklearn.metrics and
    # break `evaluate_feature_combinations` for the rest of the session.
    for feature_comb, mean_r2, exec_time in eval_results:
        results.append({
            'Model': model_name,
            'Feature Combination': feature_comb,
            'R2 Score (CV Mean)': mean_r2,
            'Execution Time': exec_time
        })

# One row per (model, feature combination), best mean CV R2 first.
result_df = pd.DataFrame(results)
result_df = result_df.sort_values(by='R2 Score (CV Mean)', ascending=False)
result_df

Evaluating model: LinearRegression
Evaluating model: Ridge
Evaluating model: Lasso
Evaluating model: ElasticNet
Evaluating model: RandomForestRegressor
Evaluating model: GradientBoostingRegressor
Evaluating model: DecisionTreeRegressor
Evaluating model: SVR
Evaluating model: KNeighborsRegressor
Evaluating model: XGBRegressor


Unnamed: 0,Model,Feature Combination,R2 Score (CV Mean),Execution Time
924,RandomForestRegressor,"(wheelbase, enginesize)",0.9074,1.2913
925,RandomForestRegressor,"(enginesize, highwaympg)",0.9015,1.2952
926,RandomForestRegressor,"(carlength, enginesize)",0.8982,1.3519
2079,XGBRegressor,"(wheelbase, enginesize)",0.8977,0.4053
1155,GradientBoostingRegressor,"(enginesize, horsepower)",0.8968,0.6199
...,...,...,...,...
2074,KNeighborsRegressor,"(doornumber, carbody)",-0.2098,0.0814
2075,KNeighborsRegressor,"(fueltype, aspiration)",-0.2483,0.0839
2076,KNeighborsRegressor,"(fueltype, enginelocation)",-0.2733,0.0878
2077,KNeighborsRegressor,"(doornumber, enginelocation)",-0.4356,0.0750


In [None]:
# Cross-validated sweep over every three-feature combination, for each regressor in `listmodels`.
results = []
for model in listmodels:
    model_name = model.__class__.__name__
    print(f"Evaluating model: {model_name}")
    eval_results = evaluate_feature_combinations_cv(model, 3)
    # The score is unpacked as `mean_r2`, not `r2_score`: a notebook-level variable
    # called `r2_score` would rebind the function imported from sklearn.metrics and
    # break `evaluate_feature_combinations` for the rest of the session.
    for feature_comb, mean_r2, exec_time in eval_results:
        results.append({
            'Model': model_name,
            'Feature Combination': feature_comb,
            'R2 Score (CV Mean)': mean_r2,
            'Execution Time': exec_time
        })

# One row per (model, feature combination), best mean CV R2 first.
result_df = pd.DataFrame(results)
result_df = result_df.sort_values(by='R2 Score (CV Mean)', ascending=False)
result_df

Evaluating model: LinearRegression
Evaluating model: Ridge
Evaluating model: Lasso
Evaluating model: ElasticNet
Evaluating model: RandomForestRegressor
