In [1]:
pip install lightgbm




In [2]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pickle

In [4]:
X = pd.read_csv("preprocessed_cars.csv")
y = pd.read_csv("target_price.csv")

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### **Linear Regression**

In [6]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [7]:
print("Linear Regression RMSE:", mean_squared_error(y_test, y_pred_lr, squared=False))
print("Linear Regression R2:", r2_score(y_test, y_pred_lr))

Linear Regression RMSE: 9924.39635291173
Linear Regression R2: 0.6309010957510905


### **Gradient Boosting Regressor**

In [8]:
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbr.fit(X_train, y_train.values.ravel())
y_pred_gbr = gbr.predict(X_test)

In [40]:
rmse = mean_squared_error(y_test, y_pred_gbr, squared=False)
r2 = r2_score(y_test, y_pred_gbr)

In [41]:
print("Gradient Boosting Results")
print("Root Mean Squared Error:", rmse)
print("R^2 Score:", r2)

Gradient Boosting Results
Root Mean Squared Error: 8609.830420207834
R^2 Score: 0.722205431376266


### **Random Forest Regressor**

In [11]:
rf = RandomForestRegressor(n_estimators=20, random_state=42)
rf.fit(X_train, y_train.values.ravel())
y_pred_rf = rf.predict(X_test)

In [12]:
print("Random Forest RMSE:", mean_squared_error(y_test, y_pred_rf, squared=False))
print("Random Forest R2:", r2_score(y_test, y_pred_rf))

Random Forest RMSE: 5647.498658942838
Random Forest R2: 0.8804782817658328


### **Light Gradient Boosting**

In [13]:
lgb_model = LGBMRegressor(
    n_estimators=1000,     
    learning_rate=0.1,
    max_depth=-1,
    random_state=42
)

In [14]:
lgb_model.fit(
    X_train, y_train.values.ravel(),
    eval_set=[(X_test, y_test.values.ravel())],
    eval_metric='rmse'
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.104198 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16065
[LightGBM] [Info] Number of data points in the train set: 552296, number of used features: 63
[LightGBM] [Info] Start training from score 30759.424484


In [15]:
y_pred_lgb = lgb_model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred_lgb, squared=False)
r2 = r2_score(y_test, y_pred_lgb)

In [16]:
print("LightGBM RMSE:", rmse)
print("LightGBM R^2:", r2)

LightGBM RMSE: 5705.3237853874825
LightGBM R^2: 0.8780181691297103


### **XGBoosting**

In [17]:
xgb_model = XGBRegressor(
    n_estimators=150,       
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    objective='reg:squarederror'
)

In [18]:
xgb_model.fit(X_train, y_train.values.ravel(), eval_set=[(X_test, y_test.values.ravel())])
y_pred_xgb = xgb_model.predict(X_test)

[0]	validation_0-rmse:15429.67364
[1]	validation_0-rmse:14616.58374
[2]	validation_0-rmse:13910.55643
[3]	validation_0-rmse:13259.31646
[4]	validation_0-rmse:12707.02709
[5]	validation_0-rmse:12214.95237
[6]	validation_0-rmse:11797.36295
[7]	validation_0-rmse:11409.93069
[8]	validation_0-rmse:11052.52952
[9]	validation_0-rmse:10772.21305
[10]	validation_0-rmse:10494.48124
[11]	validation_0-rmse:10250.32122
[12]	validation_0-rmse:10029.49514
[13]	validation_0-rmse:9828.76619
[14]	validation_0-rmse:9650.41508
[15]	validation_0-rmse:9481.47325
[16]	validation_0-rmse:9325.25219
[17]	validation_0-rmse:9196.57516
[18]	validation_0-rmse:9071.48202
[19]	validation_0-rmse:8951.37431
[20]	validation_0-rmse:8840.25355
[21]	validation_0-rmse:8753.10393
[22]	validation_0-rmse:8662.31069
[23]	validation_0-rmse:8579.92815
[24]	validation_0-rmse:8508.04202
[25]	validation_0-rmse:8440.24481
[26]	validation_0-rmse:8366.84486
[27]	validation_0-rmse:8300.29518
[28]	validation_0-rmse:8241.25132
[29]	valida

In [19]:
rmse = mean_squared_error(y_test, y_pred_xgb, squared=False)
r2 = r2_score(y_test, y_pred_xgb)

print("XGBoost RMSE:", rmse)
print("XGBoost R^2:", r2)

XGBoost RMSE: 6570.709879901799
XGBoost R^2: 0.838207219406396


### **Hyperparameter Tuning**

### **LGB Tuning**

In [20]:
lgb_param_grid = {
    'n_estimators': [500, 1000, 1500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 10, 15, -1],
    'num_leaves': [31, 63, 127],
    'min_child_samples': [20, 50, 100],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

In [21]:
lgb_model = LGBMRegressor(random_state=42)
random_search_lgb = RandomizedSearchCV(
    lgb_model, lgb_param_grid, n_iter=20, cv=3,
    scoring='neg_root_mean_squared_error', n_jobs=-1, random_state=42
)

In [22]:
random_search_lgb.fit(X_train, y_train.values.ravel())
print("Best LGBM Params:", random_search_lgb.best_params_)
print("Best LGBM RMSE:", -random_search_lgb.best_score_)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.122227 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16065
[LightGBM] [Info] Number of data points in the train set: 552296, number of used features: 63
[LightGBM] [Info] Start training from score 30759.424484
Best LGBM Params: {'subsample': 0.6, 'num_leaves': 127, 'n_estimators': 1500, 'min_child_samples': 20, 'max_depth': 15, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
Best LGBM RMSE: 5400.242631075926


In [23]:
best_lgb_model = random_search_lgb.best_estimator_

y_train_pred = best_lgb_model.predict(X_train)
y_test_pred = best_lgb_model.predict(X_test)

In [24]:
train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

In [25]:
print("Train RMSE:", train_rmse)
print("Train MAE:", train_mae)
print("Train R²:", train_r2)

print("\nTest RMSE:", test_rmse)
print("Test MAE:", test_mae)
print("Test R²:", test_r2)

print("\nBest LGB Params:", random_search_lgb.best_params_)
print("Validation RMSE (CV):", -random_search_lgb.best_score_)

Train RMSE: 4197.717268940712
Train MAE: 2904.9012859742375
Train R²: 0.9340051617699818

Test RMSE: 5306.16068781092
Test MAE: 3448.887215034038
Test R²: 0.8944895803535876

Best LGB Params: {'subsample': 0.6, 'num_leaves': 127, 'n_estimators': 1500, 'min_child_samples': 20, 'max_depth': 15, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
Validation RMSE (CV): 5400.242631075926


### **Random Forest Tuning**

In [26]:
rf_param_grid = {
    'n_estimators': [100, 200],          
    'max_depth': [10, 20, None],          
    'min_samples_split': [2, 5],          
    'min_samples_leaf': [1, 2],           
    'max_features': ['sqrt', 'log2']      
}

In [27]:
rf_model = RandomForestRegressor(random_state=42)
random_search_rf = RandomizedSearchCV(
    rf_model, rf_param_grid, n_iter=20, cv=3,
    scoring='neg_root_mean_squared_error', n_jobs=2, random_state=42
)

In [28]:
random_search_rf.fit(X_train, y_train.values.ravel())
print("Best RF Params:", random_search_rf.best_params_)
print("Best RF RMSE:", -random_search_rf.best_score_)

Best RF Params: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None}
Best RF RMSE: 5771.239322130953


In [29]:
best_rf = random_search_rf.best_estimator_

y_train_pred = best_rf.predict(X_train)
y_test_pred = best_rf.predict(X_test)

In [30]:
train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

In [31]:
print("Train RMSE:", train_rmse)
print("Train MAE:", train_mae)
print("Train R²:", train_r2)

print("\nTest RMSE:", test_rmse)
print("Test MAE:", test_mae)
print("Test R²:", test_r2)

print("\nBest RF Params:", random_search_rf.best_params_)
print("Validation RMSE (CV):", -random_search_rf.best_score_)

Train RMSE: 2704.6864762992527
Train MAE: 1676.5577919579086
Train R²: 0.9726020884218682

Test RMSE: 5575.833591181349
Test MAE: 3558.9970471051724
Test R²: 0.8834924250286198

Best RF Params: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None}
Validation RMSE (CV): 5771.239322130953


### **XGB Tuning**

In [32]:
xgb_param_grid = {
    'n_estimators': [200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [4, 6, 8],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

In [33]:
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror', tree_method='hist')
random_search_xgb = RandomizedSearchCV(
    xgb_model, xgb_param_grid, n_iter=20, cv=3,
    scoring='neg_root_mean_squared_error', n_jobs=4, random_state=42
)

In [34]:
random_search_xgb.fit(X_train, y_train.values.ravel())
print("Best XGB Params:", random_search_xgb.best_params_)
print("Best XGB RMSE:", -random_search_xgb.best_score_)

Best XGB Params: {'subsample': 0.6, 'reg_lambda': 1, 'reg_alpha': 0.1, 'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
Best XGB RMSE: 5570.723571048205


In [35]:
best_xgb = random_search_xgb.best_estimator_

y_train_pred = best_xgb.predict(X_train)
y_test_pred = best_xgb.predict(X_test)

In [36]:
train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

In [37]:
print("Train RMSE:", train_rmse)
print("Train MAE:", train_mae)
print("Train R²:", train_r2)

print("\nTest RMSE:", test_rmse)
print("Test MAE:", test_mae)
print("Test R²:", test_r2)

print("\nBest XGB Params:", random_search_xgb.best_params_)
print("Validation RMSE (CV):", -random_search_xgb.best_score_)

Train RMSE: 4430.156913601413
Train MAE: 3076.420358072595
Train R²: 0.9264941644529384

Test RMSE: 5475.329640118488
Test MAE: 3571.8540430690873
Test R²: 0.8876546512796727

Best XGB Params: {'subsample': 0.6, 'reg_lambda': 1, 'reg_alpha': 0.1, 'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
Validation RMSE (CV): 5570.723571048205


## **Finalized Model**

In [38]:
best_model = random_search_lgb.best_estimator_
best_model.fit(X_train, y_train.values.ravel())
y_pred_final = best_model.predict(X_test)

print("Final Model RMSE:", mean_squared_error(y_test, y_pred_final, squared=False))
print("Final Model R2:", r2_score(y_test, y_pred_final))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.118599 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16065
[LightGBM] [Info] Number of data points in the train set: 552296, number of used features: 63
[LightGBM] [Info] Start training from score 30759.424484
Final Model RMSE: 5306.16068781092
Final Model R2: 0.8944895803535876


In [43]:
with open("final_model.pkl", "wb") as f:
    pickle.dump(best_model, f)