In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import influence_plot
import numpy as np

In [2]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv("vehicledata.csv")
data

Unnamed: 0,name,description,make,model,year,price,engine,cylinders,fuel,mileage,transmission,trim,body,doors,exterior_color,interior_color,drivetrain
0,2024 Jeep Wagoneer Series II,"\n \n Heated Leather Seats, Nav Sy...",Jeep,Wagoneer,2024,74600.0,24V GDI DOHC Twin Turbo,6.0,Gasoline,10.0,8-Speed Automatic,Series II,SUV,4.0,White,Global Black,Four-wheel Drive
1,2024 Jeep Grand Cherokee Laredo,Al West is committed to offering every custome...,Jeep,Grand Cherokee,2024,50170.0,OHV,6.0,Gasoline,1.0,8-Speed Automatic,Laredo,SUV,4.0,Metallic,Global Black,Four-wheel Drive
2,2024 GMC Yukon XL Denali,,GMC,Yukon XL,2024,96410.0,"6.2L V-8 gasoline direct injection, variable v...",8.0,Gasoline,0.0,Automatic,Denali,SUV,4.0,Summit White,Teak/Light Shale,Four-wheel Drive
3,2023 Dodge Durango Pursuit,White Knuckle Clearcoat 2023 Dodge Durango Pur...,Dodge,Durango,2023,46835.0,16V MPFI OHV,8.0,Gasoline,32.0,8-Speed Automatic,Pursuit,SUV,4.0,White Knuckle Clearcoat,Black,All-wheel Drive
4,2024 RAM 3500 Laramie,\n \n 2024 Ram 3500 Laramie Billet...,RAM,3500,2024,81663.0,24V DDI OHV Turbo Diesel,6.0,Diesel,10.0,6-Speed Automatic,Laramie,Pickup Truck,4.0,Silver,Black,Four-wheel Drive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,2024 Mercedes-Benz Sprinter 2500 Standard Roof,2024 Mercedes-Benz Sprinter 2500 Cargo 144 WB ...,Mercedes-Benz,Sprinter 2500,2024,59037.0,16V DDI DOHC Turbo Diesel,4.0,Diesel,10.0,9-Speed Automatic,Standard Roof,Cargo Van,3.0,Arctic White,Black,Rear-wheel Drive
998,2024 Dodge Hornet Hornet R/T Plus Eawd,Dealer Comments +++ Price Ends 5/31/2024 +++ A...,Dodge,Hornet,2024,49720.0,"4 gasoline direct injection, DOHC, Multiair va...",4.0,Gasoline,0.0,6-Spd Aisin F21-250 PHEV Auto Trans,Hornet R/T Plus Eawd,SUV,4.0,Acapulco Gold,Black,All-wheel Drive
999,2024 Jeep Wagoneer Base,\n \n The ALL New Friendship CDJR ...,Jeep,Wagoneer,2024,69085.0,24V GDI DOHC Twin Turbo,6.0,Gasoline,20.0,8-Speed Automatic,Base,SUV,4.0,Diamond Black,Black,Four-wheel Drive
1000,2024 Nissan Murano SV Intelligent AWD,"\n \n CVT with Xtronic, AWD.At Tod...",Nissan,Murano,2024,43495.0,"6 DOHC, variable valve control, regular unlead...",6.0,Gasoline,6.0,Automatic,SV Intelligent AWD,SUV,4.0,Pearl White Tricoat,Graphite,All-wheel Drive


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            1002 non-null   object 
 1   description     946 non-null    object 
 2   make            1002 non-null   object 
 3   model           1002 non-null   object 
 4   year            1002 non-null   int64  
 5   price           979 non-null    float64
 6   engine          1000 non-null   object 
 7   cylinders       897 non-null    float64
 8   fuel            995 non-null    object 
 9   mileage         968 non-null    float64
 10  transmission    1000 non-null   object 
 11  trim            1001 non-null   object 
 12  body            999 non-null    object 
 13  doors           995 non-null    float64
 14  exterior_color  997 non-null    object 
 15  interior_color  964 non-null    object 
 16  drivetrain      1002 non-null   object 
dtypes: float64(4), int64(1), object(1

In [5]:
# check for missing values
data.isna().sum()

name                0
description        56
make                0
model               0
year                0
price              23
engine              2
cylinders         105
fuel                7
mileage            34
transmission        2
trim                1
body                3
doors               7
exterior_color      5
interior_color     38
drivetrain          0
dtype: int64

In [7]:
#drop irrelevant columns

df = data.drop(columns=['name', 'description'])

In [8]:
df

Unnamed: 0,make,model,year,price,engine,cylinders,fuel,mileage,transmission,trim,body,doors,exterior_color,interior_color,drivetrain
0,Jeep,Wagoneer,2024,74600.0,24V GDI DOHC Twin Turbo,6.0,Gasoline,10.0,8-Speed Automatic,Series II,SUV,4.0,White,Global Black,Four-wheel Drive
1,Jeep,Grand Cherokee,2024,50170.0,OHV,6.0,Gasoline,1.0,8-Speed Automatic,Laredo,SUV,4.0,Metallic,Global Black,Four-wheel Drive
2,GMC,Yukon XL,2024,96410.0,"6.2L V-8 gasoline direct injection, variable v...",8.0,Gasoline,0.0,Automatic,Denali,SUV,4.0,Summit White,Teak/Light Shale,Four-wheel Drive
3,Dodge,Durango,2023,46835.0,16V MPFI OHV,8.0,Gasoline,32.0,8-Speed Automatic,Pursuit,SUV,4.0,White Knuckle Clearcoat,Black,All-wheel Drive
4,RAM,3500,2024,81663.0,24V DDI OHV Turbo Diesel,6.0,Diesel,10.0,6-Speed Automatic,Laramie,Pickup Truck,4.0,Silver,Black,Four-wheel Drive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,Mercedes-Benz,Sprinter 2500,2024,59037.0,16V DDI DOHC Turbo Diesel,4.0,Diesel,10.0,9-Speed Automatic,Standard Roof,Cargo Van,3.0,Arctic White,Black,Rear-wheel Drive
998,Dodge,Hornet,2024,49720.0,"4 gasoline direct injection, DOHC, Multiair va...",4.0,Gasoline,0.0,6-Spd Aisin F21-250 PHEV Auto Trans,Hornet R/T Plus Eawd,SUV,4.0,Acapulco Gold,Black,All-wheel Drive
999,Jeep,Wagoneer,2024,69085.0,24V GDI DOHC Twin Turbo,6.0,Gasoline,20.0,8-Speed Automatic,Base,SUV,4.0,Diamond Black,Black,Four-wheel Drive
1000,Nissan,Murano,2024,43495.0,"6 DOHC, variable valve control, regular unlead...",6.0,Gasoline,6.0,Automatic,SV Intelligent AWD,SUV,4.0,Pearl White Tricoat,Graphite,All-wheel Drive


In [10]:
# Drop irrelevant columns
df = data.drop(columns=['name', 'description'])

# Drop rows with missing price (critical for training)
df = df.dropna(subset=['price']).copy() 

# Impute numerical columns with median
num_cols = ['cylinders', 'mileage', 'doors']
df.loc[:, num_cols] = df[num_cols].fillna(df[num_cols].median())

# Impute categorical columns with mode
cat_cols = ['engine', 'transmission', 'trim', 'body', 'exterior_color', 'interior_color']
df.loc[:, cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])


In [11]:
# Make surew working on a fresh copy
df = df.copy()

# Create vehicle_age (current year - manufacturing year)
df['vehicle_age'] = 2025 - df['year']

# Extract engine type (e.g., "V6" → 6)
df['engine_type'] = df['engine'].str.extract(r'(\d+)').astype(float)

# Drop original 'engine' column if it's no longer needed
df.drop(columns=['engine'], inplace=True)


In [12]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# One-hot encode low-cardinality categorical features
df = pd.get_dummies(df, columns=['fuel', 'transmission', 'drivetrain', 'body'], drop_first=True)

# Label encode high-cardinality categorical features
label_cols = ['make', 'model', 'trim', 'exterior_color', 'interior_color']
le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col].astype(str))

In [13]:
# Define the feature columns
features = [
    'make', 'model', 'vehicle_age', 'mileage', 'cylinders', 
    'doors', 'engine_type', 'fuel_Gasoline', 'transmission_Automatic', 
    'drivetrain_Front-wheel Drive', 'body_Sedan'
]

# Split into features (X) and target variable (y)
X = df[features]
y = df['price']

# Peek at the shapes to be sure
print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (979, 11)
y shape: (979,)


### RandomForest Model

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Step 1: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 2: Initialize and train the model
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Step 3: Predict on the test set
y_pred = model.predict(X_test)

# Step 4: Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f" Mean Absolute Error (MAE): {mae:.2f}")
print(f" R² Score: {r2:.2f}")


 Mean Absolute Error (MAE): 5680.21
 R² Score: 0.76


### Observations

* R² Score: 0.76 → Model explains 76% of the variance in car prices
* MAE: 5680.21 → On average, your model's predictions are about 5.6K off from the actual car price

### Hyperparameter Tuning with GridSearchCV (Optimized RF)

In [15]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)
print("Best params:", grid_search.best_params_)

Best params: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 300}


In [16]:
# Use the best parameters from GridSearchCV
optimized_model = RandomForestRegressor(
    n_estimators=300,
    max_depth=20,
    min_samples_split=5,
    random_state=42
)

# Retrain with optimized params
optimized_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_optimized = optimized_model.predict(X_test)

from sklearn.metrics import mean_absolute_error, r2_score
mae_opt = mean_absolute_error(y_test, y_pred_optimized)
r2_opt = r2_score(y_test, y_pred_optimized)

print(f" Optimized MAE: {mae_opt:.2f}")
print(f" Optimized R² Score: {r2_opt:.2f}")


 Optimized MAE: 5761.53
 Optimized R² Score: 0.75


### Observations
* The hyperparameter tuning (with the parameters) did not significantly improve model’s performance

### XGBoost Model

In [17]:
from xgboost import XGBRegressor

xgb_model = XGBRegressor(n_estimators=300, max_depth=6, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)

print(f"XGBoost MAE: {mean_absolute_error(y_test, xgb_preds):.2f}")
print(f"XGBoost R² Score: {r2_score(y_test, xgb_preds):.2f}")

XGBoost MAE: 5170.58
XGBoost R² Score: 0.73


### Observations
* MAE dropped to 5170.58 → best so far!
* R² dipped to 0.73 → slightly lower than RF 
- This means:
* XGBoost is more accurate in predicting prices on average 
* But it might be playing it safer—less confident on high/low extremes (hence the lower R²)

### Tuning XGBoost

In [18]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.2]
}

grid = GridSearchCV(XGBRegressor(random_state=42), param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)


Best params: {'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 300}


In [19]:
from xgboost import XGBRegressor

best_xgb = XGBRegressor(
    learning_rate=0.05,
    max_depth=6,
    n_estimators=300,
    random_state=42
)

best_xgb.fit(X_train, y_train)
y_pred = best_xgb.predict(X_test)

from sklearn.metrics import mean_absolute_error, r2_score
print(f"Optimized XGBoost MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"Optimized XGBoost R² Score: {r2_score(y_test, y_pred):.2f}")


Optimized XGBoost MAE: 5300.57
Optimized XGBoost R² Score: 0.73


### Observations
* Basic XGBoost is chosen as the final model due to:
* Lowest MAE (more accurate price prediction)
* Competitive R² score
* Faster training with default params

In [21]:
import joblib

# Save it
joblib.dump(xgb_model, 'vehicle_price_xgb_model.pkl')

# Later, to load it:
# xgb_model = joblib.load('vehicle_price_xgb_model.pkl')

['vehicle_price_xgb_model.pkl']

In [1]:
pwd

'C:\\Users\\akshitha alluri\\VehiclePricePrediction'