### Importing libraries

In [59]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
import joblib

import warnings
warnings.filterwarnings("ignore")

### Loading Data

In [7]:
df = pd.read_csv('https://raw.githubusercontent.com/Basharat-Asghar/PakWheels-car-Price-Prediction/refs/heads/main/data/cleaned_df.csv')

In [8]:
df.head()

Unnamed: 0,year,fuel,transmission,price_pkr_lacs,mileage_km,engine_type,company,model,engine_cc,battery_kwh,car_age,boxcox_mileage
0,2016,Hybrid,Automatic,235.0,75943,ICE,Porsche,Other,3000.0,,9,313.502884
1,2022,Petrol,Automatic,70.0,12319,ICE,Proton,X70,1500.0,,3,139.877942
2,2014,Petrol,Manual,7.0,300000,ICE,Faw,X-PV,1000.0,,11,574.756158
3,2023,Petrol,Automatic,40.0,25762,ICE,Honda,N Wgn,660.0,,2,194.220223
4,2017,Petrol,Manual,17.5,147219,ICE,Suzuki,Wagon R,1000.0,,8,419.939567


#### Custom Transformer: Frequency Encoding

In [9]:
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
      self.freq_maps = {}

    def fit(self, X, y=None):
      for col in X.columns:
          freqs = X[col].value_counts(normalize=True)
          self.freq_maps[col] = freqs
      return self

    def transform(self, X):
      X_encoded = X.copy()
      for col in X.columns:
          X_encoded[col] = X_encoded[col].map(self.freq_maps[col]).fillna(0)
      return X_encoded

#### Adding missingness flag

In [14]:
def add_missing_flags(df):
  df = df.copy()
  df['engine_cc_missing'] = df['engine_cc'].isna().astype(int)
  df['battery_kwh_missing'] = df['battery_kwh'].isna().astype(int)

  df['engine_cc'] = df['engine_cc'].fillna(0)
  df['battery_kwh'] = df['battery_kwh'].fillna(0)

  return df

In [15]:
missing_flagger = FunctionTransformer(add_missing_flags, validate=False)

In [17]:
num_features = ["year", "mileage_km", "engine_cc", "battery_kwh", "car_age", "boxcox_mileage", "engine_cc_missing", "battery_kwh_missing"]
cat_features = ["fuel", "transmission", "engine_type", "company"]
freq_features = ["model"]

Creating pipelines for scaling, frequency encoding, and one hot encoding.

In [18]:
num_transformer = Pipeline(
    steps = [(
        "scaler", RobustScaler()
    )]
)

freq_transformer = Pipeline(
    steps = [(
        "freq", FrequencyEncoder()
    )]
)

onehot_transformer = Pipeline(
    steps = [(
        "onehot", OneHotEncoder(handle_unknown="ignore")
    )]
)

In [24]:
preprocessor = Pipeline(
    steps = [
        ("missing_flags", missing_flagger),
        ("transformers", ColumnTransformer(
            transformers = [
                ("num", num_transformer, num_features),
                ("freq", freq_transformer, freq_features),
                ("onehot", onehot_transformer, cat_features),
            ]
        ))
    ]
)

#### Splitting train, test data

In [20]:
X = df.drop(columns=["price_pkr_lacs"])
y = df['price_pkr_lacs']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [27]:
def eval_model(pipeline, X_train, X_test, y_train, y_test):
  pipeline.fit(X_train, y_train)
  y_pred = pipeline.predict(X_test)

  mae = mean_absolute_error(y_test, y_pred)
  r2 = r2_score(y_test, y_pred)

  return {"Mean Absolute Error": mae, "R2 Score": r2}

#### Linear Regression Pipeline

In [25]:
lr_pipeline = Pipeline(
    steps = [
        ("preprocessor", preprocessor),
        ("regressor", LinearRegression())
    ]
)

In [28]:
lr_results = eval_model(lr_pipeline, X_train, X_test, y_train, y_test)



In [29]:
lr_results

{'Mean Absolute Error': 16.322211923100607, 'R2 Score': 0.6134466255527352}

#### Ridge Regression Pipeline

In [30]:
ridge_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", Ridge(
        alpha=1.0,
        random_state=42
    ))
])

In [31]:
ridge_results = eval_model(ridge_pipeline, X_train, X_test, y_train, y_test)



In [32]:
ridge_results

{'Mean Absolute Error': 16.321065299110487, 'R2 Score': 0.6133704002285385}

#### Lasso Regression Pipeline

In [33]:
lasso_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", Lasso(
        alpha=0.01,
        random_state=42,
        max_iter=10000
    ))
])

In [34]:
lasso_results = eval_model(lasso_pipeline, X_train, X_test, y_train, y_test)



In [35]:
lasso_results

{'Mean Absolute Error': 16.293092088984285, 'R2 Score': 0.6132698018138527}

#### ElasticNet Pipeline

In [36]:
elasticnet_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", ElasticNet(
        alpha=0.1,        # overall regularization strength
        l1_ratio=0.5,     # mix between L1 and L2 (0=L2, 1=L1)
        random_state=42,
        max_iter=10000    # ensure convergence
    ))
])

In [37]:
elasticnet_results = eval_model(elasticnet_pipeline, X_train, X_test, y_train, y_test)



In [38]:
elasticnet_results

{'Mean Absolute Error': 16.74397284213921, 'R2 Score': 0.5593196177851423}

#### Decision Tree Pipeline

In [39]:
dt_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", DecisionTreeRegressor(
        max_depth=10,       # limit depth to avoid overfitting
        min_samples_leaf=5, # prevent very small leaves
        random_state=42
    ))
])

In [40]:
dt_results = eval_model(dt_pipeline, X_train, X_test, y_train, y_test)



In [41]:
dt_results

{'Mean Absolute Error': 7.0079914958461185, 'R2 Score': 0.8572364785117008}

#### Random Forest Pipeline

In [42]:
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(
        n_estimators=500,       # number of trees
        max_depth=15,           # limit depth to avoid overfitting
        min_samples_leaf=5,
        max_features="sqrt",    # features considered at each split
        n_jobs=-1,              # parallelize
        random_state=42
    ))
])

In [43]:
rf_results = eval_model(rf_pipeline, X_train, X_test, y_train, y_test)



In [44]:
rf_results

{'Mean Absolute Error': 7.323542301957356, 'R2 Score': 0.8658887523144212}

In [46]:
adaboost_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", AdaBoostRegressor(
        estimator=DecisionTreeRegressor(max_depth=4),
        n_estimators=200,
        learning_rate=0.1,
        random_state=42
    ))
])

In [47]:
adaboost_results = eval_model(adaboost_pipeline, X_train, X_test, y_train, y_test)



In [48]:
adaboost_results

{'Mean Absolute Error': 23.614958520704207, 'R2 Score': 0.6247240280655244}

#### XGBoost Pipeline

In [50]:
xgb_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    ))
])

In [51]:
xgb_results = eval_model(xgb_pipeline, X_train, X_test, y_train, y_test)



In [52]:
xgb_results

{'Mean Absolute Error': 5.1555290523482835, 'R2 Score': 0.8957356864095991}

#### LightGBM Pipeline

In [53]:
lgbm_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    ))
])

In [57]:
lgbm_results = eval_model(lgbm_pipeline, X_train, X_test, y_train, y_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009645 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 914
[LightGBM] [Info] Number of data points in the train set: 47542, number of used features: 48
[LightGBM] [Info] Start training from score 42.515597


In [58]:
lgbm_results

{'Mean Absolute Error': 5.5156707666654, 'R2 Score': 0.9030619154416788}

#### LGBM Cross-Validation

In [60]:
mae_scores = -cross_val_score(
    lgbm_pipeline,
    X_train, y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1
)

In [61]:
r2_scores = cross_val_score(
    lgbm_pipeline,
    X_train, y_train,
    cv=5,
    scoring='r2',
    n_jobs=-1
)

In [62]:
print("Cross-validated MAE:", np.mean(mae_scores))
print("Cross-validated R2:", np.mean(r2_scores))

Cross-validated MAE: 5.605050945639161
Cross-validated R2: 0.9033323300376445


#### LGBM HyperParameter Tuning

In [63]:
param_grid = {
    "regressor__n_estimators": [300, 500, 700],
    "regressor__learning_rate": [0.01, 0.05, 0.1],
    "regressor__max_depth": [4, 6, 8],
    "regressor__num_leaves": [31, 50, 70],
    "regressor__subsample": [0.7, 0.8, 0.9],
    "regressor__colsample_bytree": [0.7, 0.8, 0.9]
}

In [64]:
random_search = RandomizedSearchCV(
    lgbm_pipeline,
    param_distributions=param_grid,
    n_iter=20,
    cv=3,
    scoring='r2',
    verbose=2,
    n_jobs=-1,
    random_state=42
)

In [65]:
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 914
[LightGBM] [Info] Number of data points in the train set: 47542, number of used features: 48
[LightGBM] [Info] Start training from score 42.515597


In [66]:
print("Best Parameters:", random_search.best_params_)
print("Best CV R2:", random_search.best_score_)

Best Parameters: {'regressor__subsample': 0.7, 'regressor__num_leaves': 31, 'regressor__n_estimators': 500, 'regressor__max_depth': 8, 'regressor__learning_rate': 0.1, 'regressor__colsample_bytree': 0.8}
Best CV R2: 0.9009956256781876


#### Final LGBM Pipeline

In [67]:
final_lgbm_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.1,
        max_depth=8,
        num_leaves=31,
        subsample=0.7,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    ))
])

In [68]:
final_lgbm_results = eval_model(final_lgbm_pipeline, X_train, X_test, y_train, y_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007866 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 914
[LightGBM] [Info] Number of data points in the train set: 47542, number of used features: 48
[LightGBM] [Info] Start training from score 42.515597


In [69]:
final_lgbm_results

{'Mean Absolute Error': 5.143706743162843, 'R2 Score': 0.9036960598935401}

In [71]:
import joblib
joblib.dump(final_lgbm_pipeline, "lgbm_pipeline_final.pkl")

['lgbm_pipeline_final.pkl']