In [1]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

In [22]:
path = 'cleaned_car_data.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,brand_0,brand_1,brand_2,brand_3,brand_4,brand_5,model,model_year,milage,fuel_type,ext_col,int_col,accident,price,engin_hp,engine_displacement,transmission_type,num_gears
0,0,0,0,0,0,1,1,2013,51000,4,0,4,1,10300,300.0,3.7,0,6
1,0,0,0,0,1,0,1,2021,34742,1,-1,2,1,38005,265.27976,3.8,1,8
2,0,0,0,0,1,1,1,2022,22372,1,-1,4,0,54598,265.27976,3.5,1,6
3,0,0,0,1,0,0,1,2015,88900,3,0,4,0,15500,354.0,3.5,0,7
4,0,0,0,1,0,1,1,2021,9835,1,-1,4,0,34999,265.27976,2.0,1,8


In [23]:
df['car_age'] = 2024 - df['model_year']

In [95]:
df.skew()

brand_0                2.174529
brand_1                0.773334
brand_2                0.535272
brand_3                0.106594
brand_4               -0.032964
brand_5               -0.458425
model                 -0.643352
model_year            -0.318872
milage                -1.413768
fuel_type              2.598193
ext_col               -0.022609
int_col                0.315403
accident               1.128366
engin_hp              -0.175272
engine_displacement   -0.011478
transmission_type      1.647703
num_gears             -0.369509
car_age               -0.318872
price                  0.023367
dtype: float64

In [56]:
def transform_column(df):
    for column in df.columns:
        if df[column].skew() > 0.5:
            df[column] = np.log1p(df[column])  
        elif df[column].skew() < -0.5:
            df[column] = np.log1p(df[column].max() - df[column])  
    return df


In [57]:
df = transform_column(df) 

In [72]:
df.shape

(3703, 19)

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3703 entries, 0 to 3702
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   brand_0              3703 non-null   float64
 1   brand_1              3703 non-null   float64
 2   brand_2              3703 non-null   float64
 3   brand_3              3703 non-null   int64  
 4   brand_4              3703 non-null   int64  
 5   brand_5              3703 non-null   int64  
 6   model                3703 non-null   float64
 7   model_year           3703 non-null   float64
 8   milage               3703 non-null   float64
 9   fuel_type            3703 non-null   float64
 10  ext_col              3703 non-null   float64
 11  int_col              3703 non-null   float64
 12  accident             3703 non-null   float64
 13  price                3703 non-null   float64
 14  engin_hp             3703 non-null   float64
 15  engine_displacement  3703 non-null   f

In [59]:
x = df.drop(columns=['price'] , axis=1)
y = df['price']

In [60]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(x , y , random_state=42 , 
                                                       test_size=42)

In [63]:
# code to remove the infinit value from the data set 
# some of columns conation infinite values!
def clean_data(df):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(df.mean(), inplace=True)
    return df

x_train = clean_data(x_train)
x_test = clean_data(x_test)

In [85]:
# Replace these with your actual feature names
feature_columns = ['brand_0', 'brand_1', 'brand_2', 'brand_3', 'brand_4', 'brand_5', 'model', 'model_year', 'milage', 
                   'fuel_type', 'ext_col', 'int_col', 'accident', 'engin_hp', 'engine_displacement', 'transmission_type', 'num_gears' , 'car_age']

In [89]:
import pandas as pd


x_train_df = pd.DataFrame(x_train, columns=feature_columns) 
x_test_df = pd.DataFrame(x_test, columns=feature_columns)      
y_train_df = pd.DataFrame(y_train, columns=['price'])        
y_test_df = pd.DataFrame(y_test, columns=['price'])           

train_df = pd.concat([x_train_df, y_train_df], axis=1)
test_df = pd.concat([x_test_df, y_test_df], axis=1)

df = pd.concat([train_df, test_df], axis=0)


In [90]:
df.head(1)

Unnamed: 0,brand_0,brand_1,brand_2,brand_3,brand_4,brand_5,model,model_year,milage,fuel_type,ext_col,int_col,accident,engin_hp,engine_displacement,transmission_type,num_gears,car_age,price
0,-0.390314,-0.687062,1.300735,-0.950682,-1.016248,0.797393,0.554549,-0.666757,0.220588,-0.36591,0.169686,1.477127,-0.584815,1.719023,0.987167,-0.478665,-0.423437,-0.666757,9.239996


In [64]:

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVC

from xgboost import XGBRegressor, XGBRFRegressor
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def check_all_models(X_train, X_test, y_train, y_test):
    # # Scaling features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    models = {
        'Linear Regression': LinearRegression(),
        'K-Nearest Neighbors': KNeighborsRegressor(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'AdaBoost': AdaBoostRegressor(),
        'XGBoost': XGBRegressor(),
        'XGBoostRandomForest': XGBRFRegressor(),
    }

    results = []

    for name, model in models.items():
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = mse ** 0.5  
        r2 = r2_score(y_test, y_pred)
        
        train_score = model.score(X_train, y_train) 
        test_score = model.score(X_test, y_test)    
        
        overfitting = test_score < train_score
        underfitting = train_score < 0.5 and test_score < 0.5
        
        results.append({
            'Model': name,
            'Train R²': train_score,
            'Test R²': test_score,
            'MAE': mae,
            'MSE': mse,
            'RMSE': rmse,
            'R²': r2,
            'Overfitting': overfitting,
            'Underfitting': underfitting
        })

    model_results_df = pd.DataFrame(results)
    
    display(model_results_df)


In [65]:
check_all_models(x_train, x_test, y_train, y_test)

Unnamed: 0,Model,Train R²,Test R²,MAE,MSE,RMSE,R²,Overfitting,Underfitting
0,Linear Regression,0.720947,0.594509,0.35221,0.233375,0.483089,0.594509,True,False
1,K-Nearest Neighbors,0.853461,0.704018,0.321207,0.170348,0.412733,0.704018,True,False
2,Decision Tree,1.0,0.631402,0.307987,0.212142,0.460589,0.631402,True,False
3,Random Forest,0.980377,0.70371,0.273827,0.170526,0.412948,0.70371,True,False
4,Gradient Boosting,0.875207,0.773271,0.245104,0.130491,0.361235,0.773271,True,False
5,AdaBoost,0.768042,0.585616,0.351598,0.238493,0.488358,0.585616,True,False
6,XGBoost,0.988315,0.832776,0.216595,0.096244,0.310232,0.832776,True,False
7,XGBoostRandomForest,0.849373,0.695097,0.283041,0.175483,0.418907,0.695097,True,False


### Hyperparemeter Tuning for best 03 Model's!

In [66]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb

models = {
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'XGBoost': xgb.XGBRegressor()
}

param_grids = {
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [5, 10, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'max_features': ['auto', 'sqrt']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },
    'XGBoost': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

best_models = {}

for model_name in models:
    print(f"Running GridSearchCV for {model_name}...")
    grid_search = GridSearchCV(models[model_name], param_grids[model_name], cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
    grid_search.fit(x_train, y_train)
    
    best_models[model_name] = grid_search.best_estimator_

    cv_score = cross_val_score(grid_search.best_estimator_, x_train, y_train, cv=5, scoring='neg_mean_squared_error')
    print(f"{model_name} - Cross-validation RMSE: {-cv_score.mean():.4f}")


Running GridSearchCV for Random Forest...
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Random Forest - Cross-validation RMSE: 0.0986
Running GridSearchCV for Gradient Boosting...
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Gradient Boosting - Cross-validation RMSE: 0.0872
Running GridSearchCV for XGBoost...
Fitting 5 folds for each of 48 candidates, totalling 240 fits
XGBoost - Cross-validation RMSE: 0.0811


In [67]:
for model_name, model in best_models.items():
    test_score = model.score(x_test, y_test)
    print(f"{model_name} - Test R²: {test_score:.4f}")

Random Forest - Test R²: 0.7638
Gradient Boosting - Test R²: 0.7704
XGBoost - Test R²: 0.8107


In [68]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Scaling the data
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

xgboost_model = xgb.XGBRegressor()

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.2]
}

grid_search = GridSearchCV(estimator=xgboost_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(x_train, y_train)

best_xgboost_model = grid_search.best_estimator_

y_pred = best_xgboost_model.predict(x_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"Best Parameters from GridSearchCV: {grid_search.best_params_}")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test R²: {r2:.4f}")


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, m

In [71]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

best_params = {
    'learning_rate': 0.05,
    'max_depth': 7,
    'n_estimators': 300,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0
}

xgb_model = xgb.XGBRegressor(**best_params)

xgb_model.fit(x_train, y_train)

y_pred_train = xgb_model.predict(x_train)
y_pred_test = xgb_model.predict(x_test)

rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

if r2_train > r2_test:
    if rmse_train < rmse_test:
        print("Overfitting detected")
    else:
        print("Possible overfitting, need tuning")
else:
    print("No overfitting detected")

print(f"Train RMSE: {rmse_train:.4f}")
print(f"Test RMSE: {rmse_test:.4f}")
print(f"Train R²: {r2_train:.4f}")
print(f"Test R²: {r2_test:.4f}")
print("r2 score is: " , r2_score(y_test , y_pred))


Overfitting detected
Train RMSE: 0.0942
Test RMSE: 0.3023
Train R²: 0.9869
Test R²: 0.8412
r2 score is:  0.8412481011892343


In [None]:
# Corrected input data with 18 features
input_data = np.array([[0.0, 0.0, 0.0, 0, 0, 1, 0.693147, 2.484907, 10.839601, 1.609438, 0.0, 0.0, 0.693147, 
                        0.0, 5.70711, 1.547563, 0.0, 1.94591]])

predicted_price = xgb_model.predict(input_data)
print(f"Predicted Price: {predicted_price[0]:.4f}")

Predicted Price: 9.2170
