# 🚗 Rusty Bargain: Car Price Prediction Project

This project predicts used car prices for Rusty Bargain using various machine learning models. Models are compared based on RMSE, training time, and prediction speed.

In [2]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation


ModuleNotFoundError: No module named 'lightgbm'

In [None]:
# Load the listings, drop unused columns, filter implausible rows, and split
# into training/validation sets.
# NOTE(review): DATA_PATH is an absolute, environment-specific location —
# consider making it relative to the repository before sharing the notebook.
DATA_PATH = '/workspaces/Used-Car-Price-Model/car_data.csv'
TARGET_COL = 'Price'

df = pd.read_csv(DATA_PATH)
df = df.drop(['DateCrawled', 'DateCreated', 'LastSeen', 'NumberOfPictures', 'PostalCode'], axis=1)

# Keep rows with 0 < Power < 500 and 1950 <= RegistrationYear <= 2025.
# The explicit .copy() detaches the filtered frame from the original so the
# column assignments below operate on an independent DataFrame instead of a
# slice (avoids SettingWithCopyWarning).
plausible_rows = (
    (df['Power'] > 0) & (df['Power'] < 500)
    & (df['RegistrationYear'] >= 1950) & (df['RegistrationYear'] <= 2025)
)
df = df[plausible_rows].copy()

# Missing categorical values become an explicit 'unknown' level, and the
# columns are stored as pandas 'category' dtype.
cat_cols = ['VehicleType', 'Gearbox', 'Model', 'FuelType', 'Brand', 'NotRepaired']
for col in cat_cols:
    df[col] = df[col].fillna('unknown').astype('category')

# `features` holds every column except the target; `target` is the Price Series.
features = df.drop(columns=[TARGET_COL])
target = df[TARGET_COL]
X_train, X_valid, y_train, y_valid = train_test_split(features, target, test_size=0.25, random_state=42)


In [None]:
# Partition the training columns by dtype: 'category' columns are treated as
# categorical features, int64/float64 columns as numeric features.
categorical_view = X_train.select_dtypes(include='category')
numeric_view = X_train.select_dtypes(include=['int64', 'float64'])

cat_features = list(categorical_view.columns)
num_features = list(numeric_view.columns)

# Show both lists, categorical first.
for feature_group in (cat_features, num_features):
    print(feature_group)

['VehicleType', 'Gearbox', 'Model', 'FuelType', 'Brand', 'NotRepaired']
['RegistrationYear', 'Power', 'Mileage', 'RegistrationMonth']


In [None]:
# Preprocessing: one-hot encode the categorical columns (dense output,
# categories unseen at fit time are ignored) and pass numeric columns through.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features),
    ('num', 'passthrough', num_features)
])

# Baseline model: preprocessing followed by ordinary least squares.
lr_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', LinearRegression())
])

# perf_counter() is a monotonic clock, so the measured durations are not
# affected by system clock adjustments.
start_train = time.perf_counter()
lr_pipeline.fit(X_train, y_train)
train_time = time.perf_counter() - start_train

start_pred = time.perf_counter()
y_pred = lr_pipeline.predict(X_valid)
pred_time = time.perf_counter() - start_pred

# RMSE computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated in recent scikit-learn releases and later
# removed, so this form works regardless of the installed version.
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
# NOTE(review): the recorded RMSE for this model is orders of magnitude larger
# than for the tree-based models below — worth investigating (e.g. numerical
# instability from the un-dropped one-hot columns) before trusting it.

print(f"🔹 Linear Regression\nTrain time: {train_time:.2f} sec\nPrediction time: {pred_time:.2f} sec\nRMSE: {rmse:.2f}")

# Encoded matrices for the models that are trained outside a Pipeline:
# the preprocessor is fitted on the training split only and then applied to
# the validation split.
X_train_encoded = preprocessor.fit_transform(X_train)
X_valid_encoded = preprocessor.transform(X_valid)




🔹 Linear Regression
Train time: 7.89 sec
Prediction time: 0.49 sec
RMSE: 1735955.99


In [None]:
# Decision tree (depth capped at 12) trained on the one-hot encoded matrix.
tree_model = DecisionTreeRegressor(max_depth=12, random_state=42)

# perf_counter() is a monotonic clock suited to measuring elapsed time.
start_train = time.perf_counter()
tree_model.fit(X_train_encoded, y_train)
train_time = time.perf_counter() - start_train

start_pred = time.perf_counter()
y_pred = tree_model.predict(X_valid_encoded)
pred_time = time.perf_counter() - start_pred

# RMSE as sqrt(MSE): avoids the `squared=False` argument, which is deprecated
# in recent scikit-learn releases and later removed.
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))

print(f"🌳 Decision Tree\nTrain time: {train_time:.2f} sec\nPrediction time: {pred_time:.2f} sec\nRMSE: {rmse:.2f}")


🌳 Decision Tree
Train time: 16.60 sec
Prediction time: 0.12 sec
RMSE: 1985.21




In [None]:
# Random forest: 100 trees, depth capped at 12, using all available cores
# (n_jobs=-1), trained on the one-hot encoded matrix.
rf_model = RandomForestRegressor(n_estimators=100, max_depth=12, random_state=42, n_jobs=-1)

# perf_counter() is a monotonic clock suited to measuring elapsed time.
start_train = time.perf_counter()
rf_model.fit(X_train_encoded, y_train)
train_time = time.perf_counter() - start_train

start_pred = time.perf_counter()
y_pred = rf_model.predict(X_valid_encoded)
pred_time = time.perf_counter() - start_pred

# RMSE as sqrt(MSE): avoids the `squared=False` argument, which is deprecated
# in recent scikit-learn releases and later removed.
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))

print(f"🌲 Random Forest\nTrain time: {train_time:.2f} sec\nPrediction time: {pred_time:.2f} sec\nRMSE: {rmse:.2f}")


🌲 Random Forest
Train time: 216.89 sec
Prediction time: 0.58 sec
RMSE: 1863.99




In [None]:
# Make sure every categorical feature is stored as pandas 'category' dtype.
# The frames are rebound to the result of astype() rather than assigned to
# column-by-column, which avoids chained-assignment warnings on the split
# outputs. The columns were already cast on the full frame before the split,
# so this is expected to be a no-op that keeps train/valid categories aligned.
category_dtypes = {col: 'category' for col in cat_features}
X_train = X_train.astype(category_dtypes)
X_valid = X_valid.astype(category_dtypes)

# The validation Dataset references the training Dataset so both are
# constructed consistently.
lgb_train = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_features)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_features, reference=lgb_train)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.1,
    'max_depth': 12,
    'num_leaves': 31,
    'verbose': -1,
    'random_state': 42
}

# Train for at most 100 rounds, stopping early if the validation RMSE has not
# improved for 10 consecutive rounds; the metric is logged every 20 rounds.
# NOTE(review): the same validation split drives early stopping and the final
# score below, so the reported RMSE may be slightly optimistic.
start_train = time.perf_counter()
lgb_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_valid],
    callbacks=[
        early_stopping(stopping_rounds=10),
        log_evaluation(period=20)
    ]
)
train_time = time.perf_counter() - start_train

start_pred = time.perf_counter()
y_pred = lgb_model.predict(X_valid)
pred_time = time.perf_counter() - start_pred

# RMSE as sqrt(MSE): avoids the `squared=False` argument, which is deprecated
# in recent scikit-learn releases and later removed.
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))

print(f"⚡ LightGBM\nTrain time: {train_time:.2f} sec\nPrediction time: {pred_time:.2f} sec\nRMSE: {rmse:.2f}")


Training until validation scores don't improve for 10 rounds
[20]	valid_0's rmse: 2035.76
[40]	valid_0's rmse: 1801
[60]	valid_0's rmse: 1751.12
[80]	valid_0's rmse: 1728.21
[100]	valid_0's rmse: 1715.29
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 1715.29
⚡ LightGBM
Train time: 2.22 sec
Prediction time: 0.23 sec
RMSE: 1715.29


