VEHICLE PRICE PREDICTOR

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
# Install a blanket "ignore" filter: every warning category is silenced for
# the rest of the kernel session.
# NOTE(review): this also hides any pandas / scikit-learn warnings that later
# cells would otherwise surface.
warnings.simplefilter('ignore')


In [2]:
# Read dataset.csv into a DataFrame; the relative path is resolved against
# the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='dataset.csv')

In [3]:
# Name of the column the model is trained to predict.
target = 'price'

# Predictor columns, in the order they are selected from the frame.
features = [
    'make',
    'model',
    'year',
    'engine',
    'cylinders',
    'fuel',
    'mileage',
    'transmission',
    'drivetrain',
]

In [4]:
# Row-level cleaning.
#
# 1. Keep only rows whose target is present and strictly positive.
#    The explicit .copy() makes the result an independent frame, so the
#    column assignments in the loop below are not flagged as writes onto a
#    slice of another frame (SettingWithCopyWarning).
df = df.dropna(subset=[target])
df = df[df[target] > 0].copy()

# 2. Force the numeric predictors to numeric dtype. errors='coerce' turns
#    anything unparseable into NaN instead of raising.
for col in ['year', 'cylinders', 'mileage']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 3. Drop rows with a missing value in any predictor. This includes the NaNs
#    produced by the coercion above. Copy again so later cells that assign
#    columns on `df` also operate on an independent frame.
df = df.dropna(subset=features).copy()

In [5]:
# For each of these text columns keep the 15 most frequent values and replace
# every other value with the literal 'Other'.
# NOTE(review): the frequency counts are taken on the whole frame, i.e. before
# the train/test split performed in a later cell.
for col in ('make', 'model', 'engine'):
    frequent_values = df[col].value_counts().nlargest(15).index
    is_rare = ~df[col].isin(frequent_values)
    df[col] = df[col].mask(is_rare, 'Other')

In [6]:
# Tukey-fence outlier removal on the target: keep rows whose value lies in
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], both ends inclusive.
# NOTE(review): applied to the whole frame, so the rows later held out for
# testing are trimmed by the same fences.
Q1, Q3 = df[target].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
df = df[df[target].between(lower_fence, upper_fence)]

In [7]:
# Separate predictors from the target, then hold out 20% of the rows for the
# final evaluation. random_state pins the shuffle so the split is repeatable.
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Column groups routed to the two preprocessing branches.
categorical_features = ['make', 'model', 'engine', 'fuel', 'transmission', 'drivetrain']
numeric_features = ['year', 'cylinders', 'mileage']

# Categorical branch: fill gaps with the literal 'missing', then one-hot
# encode. handle_unknown='ignore' means a category unseen during fit does not
# raise at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Numeric output columns come first, categorical second; the order of this
# list fixes the column order of the transformed matrix.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [9]:
# End-to-end estimator: preprocessing followed by a random forest. The step
# name 'rf' is what the 'rf__*' keys of the hyper-parameter grid refer to.
forest = RandomForestRegressor(random_state=42)
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('rf', forest),
])

In [11]:
# Hyper-parameter grid for the 'rf' pipeline step ("<step>__<param>" naming).
# 3 x 3 x 3 x 3 = 81 candidate combinations.
param_grid = dict(
    rf__n_estimators=[100, 200, 300],
    rf__max_depth=[10, 20, 30],
    rf__min_samples_split=[2, 5, 10],
    rf__min_samples_leaf=[1, 2, 4],
)

In [12]:
# Exhaustive search over param_grid, scored by R² with 3-fold
# cross-validation. n_jobs=-1 uses every available core; verbose=2 logs
# per-fit progress.
grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Kept as the cell's last expression so the fitted search object is displayed.
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [14]:
# Score the winning pipeline on the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# RMSE is the square root of the mean squared error, so it is expressed in
# the same units as the target.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [15]:
# Report the selected hyper-parameters, then the three hold-out metrics, one
# per line.
metric_lines = [
    f"Test RMSE: {rmse:.2f}",
    f"Test MAE: {mae:.2f}",
    f"Test R²: {r2:.3f}",
]
print("Best Parameters:", grid_search.best_params_)
print("\n".join(metric_lines))

Best Parameters: {'rf__max_depth': 30, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 100}
Test RMSE: 6872.78
Test MAE: 4861.53
Test R²: 0.803


In [16]:
import joblib

# Persist the whole fitted pipeline (preprocessing + forest) so it can be
# reloaded later with joblib.load and used directly on raw feature columns.
joblib.dump(value=best_model, filename='best_random_forest_vehicle_price.pkl')
print("Model saved as 'best_random_forest_vehicle_price.pkl'")

Model saved as 'best_random_forest_vehicle_price.pkl'
