VEHICLE PRICE PREDICTOR

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('dataset.csv')

In [3]:
print("Dataset shape:", df.shape)
print("Missing values in price:", df['price'].isnull().sum())

Dataset shape: (1002, 17)
Missing values in price: 23


In [4]:
df = df.dropna(subset=['price'])
df = df[df['price'] > 0]

In [5]:
features = ['make', 'model', 'year', 'engine', 'cylinders', 'fuel', 'mileage', 'transmission', 'drivetrain']
target = 'price'

In [6]:
for col in features:
    if col in df.columns:
        df[col] = df[col].fillna('Unknown')
        if col in ['year', 'cylinders', 'mileage']:
            df[col] = pd.to_numeric(df[col], errors='coerce')

In [7]:
for col in ['make', 'model', 'engine']:
    if col in df.columns:
        # Keep only top 20 categories, group others as 'Other'
        top_categories = df[col].value_counts().head(20).index
        df[col] = df[col].apply(lambda x: x if x in top_categories else 'Other')

In [8]:
Q1 = df[target].quantile(0.25)
Q3 = df[target].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[(df[target] >= lower_bound) & (df[target] <= upper_bound)]

print(f"After cleaning: {df.shape[0]} rows")

After cleaning: 953 rows


In [9]:
X = df[features].copy()
y = df[target].copy()

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
numeric_features = ['year', 'cylinders', 'mileage']
categorical_features = ['make', 'model', 'engine', 'fuel', 'transmission', 'drivetrain']

In [12]:
numeric_features = ['year', 'cylinders', 'mileage']
categorical_features = ['make', 'model', 'engine', 'fuel', 'transmission', 'drivetrain']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [13]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', max_categories=50))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [14]:
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge', Ridge(alpha=1.0)),
    ('Lasso', Lasso(alpha=1.0)),
    ('Decision Tree', DecisionTreeRegressor(random_state=42, max_depth=20)),
    ('Random Forest', RandomForestRegressor(random_state=42, n_estimators=100)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42, n_estimators=100)),
    ('XGBoost', XGBRegressor(random_state=42, n_estimators=100)),
    ('KNN', KNeighborsRegressor(n_neighbors=5)),
    ('SVR', SVR(kernel='rbf', C=1000, gamma='scale'))
]

In [15]:
results = []
best_score = -np.inf
best_model_name = None
best_pipeline = None

print("Evaluating models...")
for name, model in models:
    print(f"Training {name}...")

Evaluating models...
Training Linear Regression...
Training Ridge...
Training Lasso...
Training Decision Tree...
Training Random Forest...
Training Gradient Boosting...
Training XGBoost...
Training KNN...
Training SVR...


In [16]:
pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

In [17]:
pipe.fit(X_train, y_train)

In [18]:
y_pred = pipe.predict(X_test)

In [21]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [22]:
cv_scores = cross_val_score(pipe, X_train, y_train, cv=3, scoring='r2')

In [23]:
results.append({
            'Model': name,
            'RMSE': round(rmse, 2),
            'MAE': round(mae, 2),
            'R²': round(r2, 3),
            'CV R² Mean': round(cv_scores.mean(), 3),
            'CV R² Std': round(cv_scores.std(), 3)
        })

In [24]:
if r2 > best_score:
            best_score = r2
            best_model_name = name
            best_pipeline = pipe

In [26]:
try:
    # Place the code that may raise an exception here
    # For example:
    # pipe.fit(X_train, y_train)
    # y_pred = pipe.predict(X_test)
    pass  # Remove this and add your code above
except Exception as e:
    print(f"Error with {name}: {e}")
    # If inside a loop, you can use 'continue'
    # continue

In [27]:
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('R²', ascending=False).reset_index(drop=True)
print("\nModel Comparison Results:")
print(results_df.to_string(index=False))

print(f"\nBest Model: {best_model_name} with R² = {best_score:.3f}")


Model Comparison Results:
Model    RMSE     MAE    R²  CV R² Mean  CV R² Std
  SVR 9938.29 6969.46 0.572       0.495      0.012

Best Model: SVR with R² = 0.572


In [28]:
print(f"\nHypertuning {best_model_name}...")

if best_model_name == 'Random Forest':
    param_grid = {
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [10, 20, None],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    }


Hypertuning SVR...


In [30]:
if best_model_name == 'XGBoost':
    param_grid = {
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [3, 6, 10],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__subsample': [0.8, 0.9, 1.0]
    }

In [32]:
if best_model_name == 'Gradient Boosting':
    param_grid = {
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [3, 6, 10],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__subsample': [0.8, 0.9, 1.0]
    }

In [33]:
if best_model_name == 'SVR':
    param_grid = {
        'model__C': [1, 10, 100, 1000],
        'model__gamma': ['scale', 'auto', 0.001, 0.01],
        'model__kernel': ['rbf', 'linear']
    }
else:
    param_grid = {}

In [34]:
if param_grid:
    # Recreate pipeline for hypertuning
    best_model_class = next(model for name, model in models if name == best_model_name)
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', best_model_class)
    ])

In [36]:
grid_search = GridSearchCV(
    pipe, 
    param_grid, 
    cv=3, 
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


In [39]:
final_pred = grid_search.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, final_pred))
final_mae = mean_absolute_error(y_test, final_pred)
final_r2 = r2_score(y_test, final_pred)

In [40]:
print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"\nFinal Model Performance:")
print(f"RMSE: {final_rmse:.2f}")
print(f"MAE: {final_mae:.2f}")
print(f"R²: {final_r2:.3f}")


Best Parameters: {'model__C': 1000, 'model__gamma': 'scale', 'model__kernel': 'linear'}

Final Model Performance:
RMSE: 7440.72
MAE: 5326.82
R²: 0.760


In [41]:
import joblib
joblib.dump(grid_search.best_estimator_, 'best_vehicle_price_model.pkl')
print("\nBest model saved as 'best_vehicle_price_model.pkl'")


Best model saved as 'best_vehicle_price_model.pkl'


In [43]:
if hasattr(grid_search.best_estimator_.named_steps['model'], 'feature_importances_'):
	print(grid_search.best_estimator_.named_steps['model'].feature_importances_)

In [45]:
model = grid_search.best_estimator_.named_steps['model']

if hasattr(model, "feature_importances_"):
    feature_names = list(
        grid_search.best_estimator_.named_steps['preprocessor']
            .named_transformers_['cat']
            .named_steps['onehot']
            .get_feature_names_out(categorical_features)
    )
    importances = model.feature_importances_
    feature_imp = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)

    print(f"\nTop 10 Feature Importances:")
    print(feature_imp.head(10).to_string(index=False))
else:
    print("Feature importances are not available for this model (e.g., SVR).")

Feature importances are not available for this model (e.g., SVR).


In [46]:
models = [
    ('Random Forest', RandomForestRegressor(random_state=42, n_estimators=100)),
    ('XGBoost', XGBRegressor(random_state=42, n_estimators=100)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42, n_estimators=100)),
    ('Linear Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42, max_depth=15)),
    ('KNN', KNeighborsRegressor(n_neighbors=5))
]

In [48]:
results = []
for name, model in models:
    pipe = Pipeline([('preprocessor', preprocessor), ('model', model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results.append({
        'Model': name,
        'RMSE': round(rmse, 2),
        'MAE': round(mae, 2),
        'R²': round(r2, 3)
    })

In [49]:
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('R²', ascending=False)
print(results_df.to_string(index=False))

            Model    RMSE     MAE    R²
    Random Forest 5684.60 4027.71 0.860
            Ridge 5972.98 4672.66 0.846
Linear Regression 6039.96 4639.20 0.842
          XGBoost 6213.25 4326.48 0.833
Gradient Boosting 6242.77 4820.62 0.831
              KNN 6553.41 4834.20 0.814
    Decision Tree 7426.16 4995.88 0.761


In [50]:
best_model = results_df.iloc[0]
print(f"\nRecommended Model: {best_model['Model']}")
print(f"R² Score: {best_model['R²']}")
print(f"RMSE: ${best_model['RMSE']:,.2f}")


Recommended Model: Random Forest
R² Score: 0.86
RMSE: $5,684.60
