In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [15]:
# Load the dataset, using the first CSV column as the row index, then preview it.
DATA_FILE = 'cardekho_imputated.csv'
df = pd.read_csv(DATA_FILE, index_col=0)
df.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


## Data Cleaning

#### Handling Missing values

In [16]:
# Count missing entries per column (isna is the same check as isnull).
df.isna().sum()

car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [17]:
# Drop the two descriptive columns; `model` is kept as the car identifier.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run after the columns have already been removed.
# `columns=` already names the axis, so the redundant axis=1 is gone.
df = df.drop(columns=['car_name', 'brand'], errors='ignore')

In [18]:
# Preview the first five rows of the current frame.
df.head(n=5)

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [19]:
# Partition the columns by dtype: object columns count as categorical,
# every other dtype counts as numerical.
num_features = []
cat_features = []
for column in df.columns:
    if df[column].dtype == 'O':
        cat_features.append(column)
    else:
        num_features.append(column)
print("Num of Numerical Features :", len(num_features))
print("Num of Categorical Features :", len(cat_features))

# A numerical column with fewer than 25 distinct values is labelled discrete;
# the remaining numerical columns are labelled continuous.
discrete_feature = [
    column for column in num_features
    if df[column].nunique(dropna=False) < 25
]
print("Num of Discrete Features :", len(discrete_feature))
discrete_lookup = set(discrete_feature)
continuous_feature = [
    column for column in num_features
    if column not in discrete_lookup
]
print("Num of Continuous Features :", len(continuous_feature))

Num of Numerical Features : 7
Num of Categorical Features : 4
Num of Discrete Features : 2
Num of Continuous Features : 5


In [20]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors.
target_column = 'selling_price'
y = df[target_column]
X = df.drop(columns=target_column)

## Feature Encoding and Scaling

#### Label encoding for `model`; one-hot encoding for the non-discrete columns with few unique values

In [21]:
from sklearn.preprocessing import LabelEncoder

# Replace each distinct `model` string in X with an integer code.
le = LabelEncoder()
le.fit(X['model'])
X['model'] = le.transform(X['model'])

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Column groups: the three listed categorical columns are one-hot encoded and
# every non-object column of X is standard-scaled.
# NOTE(review): `model` was label-encoded to integers earlier, so it falls into
# the scaled group here - confirm that this is intended.
ohehot_columns = ['fuel_type', 'transmission_type', 'seller_type']
num_features = X.select_dtypes(exclude='object').columns

# drop='first' omits the first category of each encoded column.
oh_transformer = OneHotEncoder(drop='first')
numeric_transformer = StandardScaler()

# Columns not named in either group are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, ohehot_columns),
        ("StandardScaler", numeric_transformer, num_features),
    ],
    remainder='passthrough',
)

In [23]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Learn the encoding and scaling from the training rows only, then apply the
# fitted transformer to both splits. Both names are rebound to the transformed
# output, so this cell expects the untransformed frames produced by the split.
preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

## Model Training and Model Selection

In [36]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score,mean_absolute_error,mean_squared_error,roc_auc_score,roc_curve,r2_score

In [41]:
def evaluate_model(true, predicted):
    """Score regression predictions against the observed values.

    Args:
        true: Observed target values.
        predicted: Model predictions, aligned with ``true``.

    Returns:
        A 4-tuple ``(mae, mse, rmse, r2)``: mean absolute error, mean squared
        error, the square root of the mean squared error, and the R2 score.
    """
    squared_error = mean_squared_error(true, predicted)
    return (
        mean_absolute_error(true, predicted),
        squared_error,
        np.sqrt(squared_error),
        r2_score(true, predicted),
    )

In [42]:
# Baseline candidates with default hyperparameters. The two tree-based models
# get a fixed seed so that re-running the cell reproduces the same metrics.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=42),
    "RandomForestRegressor": RandomForestRegressor(random_state=42),
}

# Iterate over the name/estimator pairs directly instead of rebuilding the
# key and value lists on every index lookup.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predict on both splits so train and test error can be compared.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_mse, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_mse, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(f"Model Name : {model_name}")
    print("--------------------------------------------------")
    print(f"Train MAE : {model_train_mae}, Test MAE : {model_test_mae}")
    print(f"Train MSE : {model_train_mse}, Test MSE : {model_test_mse}")
    print(f"Train RMSE : {model_train_rmse}, Test RMSE : {model_test_rmse}")
    print(f"Train R2 : {model_train_r2}, Test R2 : {model_test_r2}")
    print('\n')

Model Name : LinearRegression
--------------------------------------------------
Train MAE : 268101.60708299326, Test MAE : 279618.57941584237
Train MSE : 306756099359.75964, Test MSE : 252550062888.56573
Train RMSE : 553855.6665411664, Test RMSE : 502543.5930230986
Train R2 : 0.6217719576765959, Test R2 : 0.6645109298852003


Model Name : Ridge
--------------------------------------------------
Train MAE : 268060.0140124582, Test MAE : 279557.4540451886
Train MSE : 306756818582.0534, Test MSE : 252540889637.0159
Train RMSE : 553856.3158275379, Test RMSE : 502534.46611851
Train R2 : 0.6217710708807318, Test R2 : 0.664523115689461


Model Name : Lasso
--------------------------------------------------
Train MAE : 268099.22866122884, Test MAE : 279614.75677125243
Train MSE : 306756104247.9428, Test MSE : 252549201782.8731
Train RMSE : 553855.6709540336, Test RMSE : 502542.73627510836
Train R2 : 0.6217719516495015, Test R2 : 0.6645120737833439


Model Name : ElasticNet
-------------------

In [43]:
# Candidate neighbour counts for the KNN regressor search.
knn_params = {
    'n_neighbors': [1, 2, 5, 6, 7, 9, 10, 15, 20, 30, 35, 40, 50],
}

# Candidate settings for the random forest search.
rf_params = {
    'n_estimators': [50, 100, 200, 300, 400, 500, 600, 700],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

In [45]:
# Each entry is (display name, unfitted estimator, hyperparameter search space).
knn_search = ('KNeighborsRegressor', KNeighborsRegressor(), knn_params)
rf_search = ('RandomForestRegressor', RandomForestRegressor(), rf_params)
randomcv_models = [knn_search, rf_search]

In [46]:
# StratifiedKFold stays imported so that the name remains available to any
# other cell that relies on it; it is no longer used here.
from sklearn.model_selection import KFold, RandomizedSearchCV, StratifiedKFold

# selling_price is a continuous regression target, so a plain shuffled K-fold
# is the appropriate splitter. StratifiedKFold balances folds by class label,
# which is meant for classification targets rather than a price column.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Best hyperparameters found for each searched model, keyed by model name.
model_params = {}
for name, model, params in randomcv_models:
    random_cv = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=100,
        scoring='neg_mean_squared_error',
        cv=cv,
        verbose=3,
        n_jobs=-1,
        refit=True,
        random_state=42,  # fixed seed so the sampled candidates are repeatable
    )
    random_cv.fit(X_train, y_train)
    model_params[name] = random_cv.best_params_




Fitting 5 folds for each of 13 candidates, totalling 65 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits




In [47]:
# Report the winning hyperparameter set recorded for each searched model.
for model_name in model_params.keys():
    print(f"Best parameters for {model_name}: {model_params[model_name]}")

Best parameters for KNeighborsRegressor: {'n_neighbors': 5}
Best parameters for RandomForestRegressor: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None}


In [48]:
# Rebuild the two tuned models from the hyperparameters recorded in
# model_params by the randomized search, instead of hand-copying them from the
# printed output; the values therefore stay in sync if the search is re-run.
# The forest gets a fixed seed so the reported metrics are reproducible.
models = {
    "Random Forest Regressor": RandomForestRegressor(
        **model_params['RandomForestRegressor'],
        random_state=42,
    ),
    "KNeighborsRegressor": KNeighborsRegressor(**model_params['KNeighborsRegressor']),
}

# Iterate over the name/estimator pairs directly instead of rebuilding the
# key and value lists on every index lookup.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predict on both splits so train and test error can be compared.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_mse, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_mse, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(f"Model Name : {model_name}")
    print("--------------------------------------------------")
    print(f"Train MAE : {model_train_mae}, Test MAE : {model_test_mae}")
    print(f"Train MSE : {model_train_mse}, Test MSE : {model_test_mse}")
    print(f"Train RMSE : {model_train_rmse}, Test RMSE : {model_test_rmse}")
    print(f"Train R2 : {model_train_r2}, Test R2 : {model_test_r2}")
    print('\n')

Model Name : Random Forest Regressor
--------------------------------------------------
Train MAE : 39500.75152971326, Test MAE : 101618.21861709039
Train MSE : 16383693385.688322, Test MSE : 52072643383.76283
Train RMSE : 127998.80228224138, Test RMSE : 228194.31058587512
Train R2 : 0.9797990250618349, Test R2 : 0.9308263775212502


Model Name : KNeighborsRegressor
--------------------------------------------------
Train MAE : 92495.19792342634, Test MAE : 112934.39020434642
Train MSE : 103356413030.49968, Test MSE : 66442643222.713264
Train RMSE : 321490.9221587752, Test RMSE : 257764.70515319443
Train R2 : 0.8725622934843486, Test R2 : 0.9117371805977614




In [None]:
# Best Hyperparameter Grids for Top Performing Models
# Based on industry best practices and Kaggle competitions

import numpy as np
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.tree import DecisionTreeRegressor
import time

# 1. Decision Tree Regressor
# 'auto' is not listed for max_features: scikit-learn removed it in 1.3, and
# for regressors it meant "use all features", which None already covers.
decision_tree_params = {
    'max_depth': [3, 5, 7, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2', None],
    'criterion': ['squared_error', 'friedman_mse'],
    'splitter': ['best', 'random']
}

# 2. Random Forest Regressor (Most Important Parameters)
# max_features: 1.0 (all features) replaces the removed 'auto' option.
# oob_score is not searched: it only adds an out-of-bag estimate to the fitted
# forest, and scikit-learn rejects oob_score=True when bootstrap=False, so
# sampling the two independently would produce failing candidates.
random_forest_params = {
    'n_estimators': [50, 100, 200, 300, 500],
    'max_depth': [3, 5, 7, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [1.0, 'sqrt', 'log2', 0.3, 0.5, 0.7],
    'bootstrap': [True, False]
}

# 3. LightGBM Regressor (Gradient Boosting)
# NOTE(review): LightGBM is understood to apply `subsample` only when
# `subsample_freq` > 0 - confirm against the LightGBM docs before relying on it.
lightgbm_params = {
    'n_estimators': [100, 200, 300, 500, 1000],
    'max_depth': [3, 5, 7, 10, 15, -1],  # -1 means no limit
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'num_leaves': [15, 31, 50, 100, 200],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1.0],  # L1 regularization
    'reg_lambda': [0, 0.1, 0.5, 1.0], # L2 regularization
    'min_child_samples': [5, 10, 20, 30]
}

# 4. XGBoost Regressor
xgboost_params = {
    'n_estimators': [100, 200, 300, 500, 1000],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bylevel': [0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1.0],  # L1 regularization
    'reg_lambda': [0, 0.1, 0.5, 1.0], # L2 regularization
    'gamma': [0, 0.1, 0.2, 0.5],      # Minimum split loss
    'min_child_weight': [1, 3, 5, 7]
}

# 5. CatBoost Regressor
catboost_params = {
    'iterations': [100, 200, 300, 500, 1000],
    'depth': [3, 4, 5, 6, 7, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'l2_leaf_reg': [1, 3, 5, 7, 9],    # L2 regularization
    'border_count': [32, 64, 128, 255], # Feature discretization
    'bagging_temperature': [0, 0.5, 1.0],
    'random_strength': [0, 1, 2, 3],
    'od_type': ['IncToDec', 'Iter'],    # Overfitting detection
    'od_wait': [10, 20, 30, 50]
}

def _count_combinations(grid):
    """Multiply the number of candidate values across every key of a grid."""
    return np.prod([len(candidates) for candidates in grid.values()])


# Total number of distinct settings each full grid can produce (for reference).
dt_combinations = _count_combinations(decision_tree_params)
rf_combinations = _count_combinations(random_forest_params)
lgb_combinations = _count_combinations(lightgbm_params)
xgb_combinations = _count_combinations(xgboost_params)
cat_combinations = _count_combinations(catboost_params)

# How many hyperparameters each grid tunes.
print("🎯 HYPERPARAMETER GRIDS FOR BEST MODELS")
print("="*50)
print(f"Decision Tree parameters: {len(decision_tree_params)} hyperparameters")
print(f"Random Forest parameters: {len(random_forest_params)} hyperparameters")
print(f"LightGBM parameters: {len(lightgbm_params)} hyperparameters")
print(f"XGBoost parameters: {len(xgboost_params)} hyperparameters")
print(f"CatBoost parameters: {len(catboost_params)} hyperparameters")

# Size of each search space.
print("\n📊 TOTAL POSSIBLE COMBINATIONS:")
print(f"Decision Tree: {dt_combinations:,}")
print(f"Random Forest: {rf_combinations:,}")
print(f"LightGBM: {lgb_combinations:,}")
print(f"XGBoost: {xgb_combinations:,}")
print(f"CatBoost: {cat_combinations:,}")

# General tuning advice.
print("\n💡 RECOMMENDATIONS:")
print("• Use RandomizedSearchCV with n_iter=100-200 for faster results")
print("• Use GridSearchCV only for final tuning with smaller parameter space")
print("• Start with fewer parameters, then expand based on results")
print("• Use early stopping for tree-based models to prevent overfitting")

# PRACTICAL IMPLEMENTATION - REDUCED PARAMETER GRIDS FOR FASTER SEARCH
# Use these for initial hyperparameter tuning

# Reduced grids for faster RandomizedSearchCV
# max_features no longer lists 'auto': scikit-learn removed that option in 1.3.
# For the decision tree it meant "all features", which None already covers;
# for the random forest the equivalent explicit value is 1.0.
quick_decision_tree_params = {
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', None]
}

quick_random_forest_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1.0, 'sqrt', 0.5]
}

quick_lightgbm_params = {
    'n_estimators': [100, 200, 500],
    'max_depth': [5, 7, 10, -1],
    'learning_rate': [0.05, 0.1, 0.15],
    'num_leaves': [31, 50, 100],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

quick_xgboost_params = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.15],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5]
}

quick_catboost_params = {
    'iterations': [100, 200, 500],
    'depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.15],
    'l2_leaf_reg': [1, 3, 5]
}

# Report the size of each reduced grid: the product of the number of candidate
# values listed under every key, formatted with thousands separators.
print("\n⚡ QUICK SEARCH GRIDS (Recommended for initial tuning)")
print("="*55)
print("Use these reduced grids with RandomizedSearchCV for faster results:")
print(f"• Quick Decision Tree: {np.prod([len(v) for v in quick_decision_tree_params.values()]):,} combinations")
print(f"• Quick Random Forest: {np.prod([len(v) for v in quick_random_forest_params.values()]):,} combinations")
print(f"• Quick LightGBM: {np.prod([len(v) for v in quick_lightgbm_params.values()]):,} combinations")
print(f"• Quick XGBoost: {np.prod([len(v) for v in quick_xgboost_params.values()]):,} combinations")
print(f"• Quick CatBoost: {np.prod([len(v) for v in quick_catboost_params.values()]):,} combinations")

# EXAMPLE IMPLEMENTATION WITH RANDOMIZEDSEARCHCV
def run_hyperparameter_tuning(model, param_grid, model_name, X_train, y_train, n_iter=50):
    """Tune ``model`` with a randomized search and report the outcome.

    The search uses 5-fold shuffled cross-validation (seed 42), scores
    candidates with R2, runs on all CPU cores and refits the best candidate on
    the full training data. Progress, elapsed time, the best score and the
    best parameters are printed.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Mapping of parameter name to candidate values, passed as
            ``param_distributions``.
        model_name: Label used in the printed messages.
        X_train: Training features.
        y_train: Training target.
        n_iter: Number of parameter settings to sample.

    Returns:
        Tuple ``(best_estimator, best_params)`` taken from the fitted search.
    """
    print(f"\n🔍 Tuning {model_name}...")

    # Collect the search configuration in one place before building the object.
    search_settings = dict(
        estimator=model,
        param_distributions=param_grid,
        n_iter=n_iter,
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        scoring='r2',
        n_jobs=-1,
        verbose=1,
        random_state=42,
        refit=True,
    )
    random_search = RandomizedSearchCV(**search_settings)

    # Time only the fit itself.
    start_time = time.time()
    random_search.fit(X_train, y_train)
    end_time = time.time()

    print(f"⏱️  Tuning completed in {end_time - start_time:.2f} seconds")
    print(f"🏆 Best Score (R2): {random_search.best_score_:.4f}")
    print(f"🎯 Best Parameters: {random_search.best_params_}")

    return random_search.best_estimator_, random_search.best_params_

# Example usage (uncomment to run):
#
# # Tune Random Forest
# best_rf, best_rf_params = run_hyperparameter_tuning(
#     model=RandomForestRegressor(random_state=42),
#     param_grid=quick_random_forest_params,
#     model_name="Random Forest",
#     X_train=X_train,
#     y_train=y_train,
#     n_iter=50
# )
#
# # Tune Decision Tree
# best_dt, best_dt_params = run_hyperparameter_tuning(
#     model=DecisionTreeRegressor(random_state=42),
#     param_grid=quick_decision_tree_params,
#     model_name="Decision Tree",
#     X_train=X_train,
#     y_train=y_train,
#     n_iter=30
# )

# Print a fixed checklist of tuning advice; nothing here is computed from data.
print("\n📝 IMPLEMENTATION NOTES:")
print("1. Start with 'quick_' parameter grids for faster results")
print("2. Use n_iter=50-100 for RandomizedSearchCV")
print("3. Increase n_iter for final tuning")
print("4. Use the full parameter grids for competition-level performance")
print("5. Consider using early_stopping_rounds for XGBoost/LightGBM/CatBoost")
print("6. For regression tasks, use 'r2' or 'neg_mean_squared_error' as scoring")
print("7. Use KFold instead of StratifiedKFold for regression problems")