VEHICLE PRICE PREDICTOR

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('cleaned_vehicles.csv')

In [6]:
features = ['make', 'model', 'year', 'engine', 'cylinders', 'fuel', 'mileage', 'transmission', 'drivetrain']
target = 'price'

In [7]:
df = df.dropna(subset=[target])
X = df[features]
y = df[target]


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
numeric_features = ['year', 'cylinders', 'mileage']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['make', 'model', 'engine', 'fuel', 'transmission', 'drivetrain']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [10]:
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
    ('XGBoost', XGBRegressor(random_state=42)),
    ('KNN', KNeighborsRegressor()),
    ('SVR', SVR(kernel='rbf'))
]


In [16]:
import time

results = []
for name, model in models:
    start_time = time.time()
    
    # Create pipeline
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

In [18]:
pipe.fit(X_train, y_train)

In [19]:
y_pred = pipe.predict(X_test)

In [22]:
from sklearn.metrics import mean_absolute_error, explained_variance_score
import numpy as np

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)

In [24]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(pipe, X, y, cv=5, scoring='r2')

In [25]:
training_time = time.time() - start_time

In [30]:
results.append({
        'Model': name,
        'RMSE': round(rmse, 2),
        'MAE': round(mae, 2),
        'R²': round(r2, 2),
        'Explained Variance': round(evs, 2),
        'CV R² Mean': round(cv_scores.mean(), 2),
        'CV R² Std': round(cv_scores.std(), 2),
        'Training Time (s)': round(training_time, 2)
    })

In [31]:
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('RMSE').reset_index(drop=True)
print(results_df.to_string(index=False))

Model     RMSE      MAE    R²  Explained Variance  CV R² Mean  CV R² Std  Training Time (s)
  SVR 17705.43 12835.41 -0.03                 0.0       -0.03       0.02             150.15
  SVR 17705.43 12835.41 -0.03                 0.0       -0.03       0.02             150.15


In [32]:
results_df = pd.DataFrame(results)
print(results_df)

  Model      RMSE       MAE    R²  Explained Variance  CV R² Mean  CV R² Std  \
0   SVR  17705.43  12835.41 -0.03                 0.0       -0.03       0.02   
1   SVR  17705.43  12835.41 -0.03                 0.0       -0.03       0.02   

   Training Time (s)  
0             150.15  
1             150.15  
