In [3]:
import pandas as pd
import numpy as np

In [4]:
dataset = pd.read_csv('vehicles_dataset.csv')

In [5]:
dataset.head(1)

Unnamed: 0,name,description,make,model,type,year,price,engine,cylinders,fuel,mileage,transmission,trim,body,doors,exterior_color,interior_color,drivetrain
0,2024 Jeep Wagoneer Series II,"\n \n Heated Leather Seats, Nav Sy...",Jeep,Wagoneer,New,2024,74600.0,24V GDI DOHC Twin Turbo,6.0,Gasoline,10.0,8-Speed Automatic,Series II,SUV,4.0,White,Global Black,Four-wheel Drive


In [6]:
dataset.columns

Index(['name', 'description', 'make', 'model', 'type', 'year', 'price',
       'engine', 'cylinders', 'fuel', 'mileage', 'transmission', 'trim',
       'body', 'doors', 'exterior_color', 'interior_color', 'drivetrain'],
      dtype='object')

In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            1002 non-null   object 
 1   description     946 non-null    object 
 2   make            1002 non-null   object 
 3   model           1002 non-null   object 
 4   type            1002 non-null   object 
 5   year            1002 non-null   int64  
 6   price           979 non-null    float64
 7   engine          1000 non-null   object 
 8   cylinders       897 non-null    float64
 9   fuel            995 non-null    object 
 10  mileage         968 non-null    float64
 11  transmission    1000 non-null   object 
 12  trim            1001 non-null   object 
 13  body            999 non-null    object 
 14  doors           995 non-null    float64
 15  exterior_color  997 non-null    object 
 16  interior_color  964 non-null    object 
 17  drivetrain      1002 non-null   o

In [8]:
dataset.isnull().sum()

name                0
description        56
make                0
model               0
type                0
year                0
price              23
engine              2
cylinders         105
fuel                7
mileage            34
transmission        2
trim                1
body                3
doors               7
exterior_color      5
interior_color     38
drivetrain          0
dtype: int64

In [9]:
dataset_copy = dataset.copy()

In [10]:
dataset.drop(columns=['name','description'],inplace=True)

In [11]:
dataset.isnull().sum()

make                0
model               0
type                0
year                0
price              23
engine              2
cylinders         105
fuel                7
mileage            34
transmission        2
trim                1
body                3
doors               7
exterior_color      5
interior_color     38
drivetrain          0
dtype: int64

In [12]:
dataset.shape

(1002, 16)

In [13]:
dataset.dropna(subset=['price'],inplace=True)

In [14]:
mode_engine = dataset['engine'].mode()

In [15]:
x = dataset.drop(columns=['price'])
y = dataset['price']

In [16]:
categorical_col = ['make','model','type','engine','fuel','transmission','trim','body','exterior_color','interior_color','drivetrain']
numeric_col = [col for col in x.columns if col not in categorical_col]

In [17]:
numeric_col

['year', 'cylinders', 'mileage', 'doors']

In [18]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Index: 979 entries, 0 to 1001
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   make            979 non-null    object 
 1   model           979 non-null    object 
 2   type            979 non-null    object 
 3   year            979 non-null    int64  
 4   engine          977 non-null    object 
 5   cylinders       877 non-null    float64
 6   fuel            972 non-null    object 
 7   mileage         945 non-null    float64
 8   transmission    977 non-null    object 
 9   trim            978 non-null    object 
 10  body            976 non-null    object 
 11  doors           972 non-null    float64
 12  exterior_color  974 non-null    object 
 13  interior_color  942 non-null    object 
 14  drivetrain      979 non-null    object 
dtypes: float64(3), int64(1), object(11)
memory usage: 122.4+ KB


In [19]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
 

In [20]:
from sklearn.preprocessing import OneHotEncoder

In [21]:
from sklearn.compose import ColumnTransformer

In [22]:
categorical_transform = Pipeline(
    steps=[
        ('impute',SimpleImputer(strategy='most_frequent')),
        ('onehotencode',OneHotEncoder(handle_unknown='ignore'))
    ]
)

numeric_transform = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaing',StandardScaler())
])



preproccess = ColumnTransformer(transformers=[
    ('num',numeric_transform,numeric_col),
    ('categorical',categorical_transform,categorical_col)
])


In [23]:
from sklearn.ensemble import RandomForestRegressor

In [24]:
model = Pipeline(steps=[
    ('preprocess',preproccess),
    ('regressionmodel',RandomForestRegressor())
])

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
xt,xte,yt,yte = train_test_split(x,y,random_state=42,test_size=0.2)

In [27]:
model.fit(xt,yt)

In [28]:
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [29]:
predictions = model.predict(xte)

In [30]:
mean_absolute_error(yte,predictions)

4710.934894301986

In [31]:
mean_squared_error(yte,predictions)

72330942.30127208

In [32]:
baseline_pred = yt.mean()

In [33]:
baseline_pred

50193.708812260535

In [34]:
(4687/baseline_pred)*100

9.337823625528012

In [35]:

baseline_mae = mean_absolute_error(yte, [baseline_pred] * len(yte))
print(f"Baseline MAE: {baseline_mae:.4f}")

Baseline MAE: 13242.9835


In [36]:
import pickle

In [37]:
pickle.dump(model,open('vehicle_model.pkl','wb'))

In [38]:
xt.columns

Index(['make', 'model', 'type', 'year', 'engine', 'cylinders', 'fuel',
       'mileage', 'transmission', 'trim', 'body', 'doors', 'exterior_color',
       'interior_color', 'drivetrain'],
      dtype='object')

In [39]:
xt['engine'].unique()

array(['16V GDI DOHC', '16V GDI DOHC Turbo', 'c',
       '24V PDI DOHC Flexible Fuel', 'c ZEV 320hp',
       '32V DDI OHV Turbo Diesel', '16V GDI DOHC Turbo Hybrid',
       '16V GDI OHV',
       '6.7L I-6 diesel direct injection, VVT intercooled turbo,',
       '24V DDI OHV Turbo Diesel', '16V GDI DOHC Hybrid', '16V PDI DOHC',
       '24V GDI SOHC',
       'DOHC, D-CVVT variable valve control, regular unleaded, en',
       '16V MPFI DOHC',
       'o 1.5L I-3 gasoline direct injection, DOHC, CVTCS variabl',
       'ce 5.6L V-8 gasoline direct injection, DOHC, variable val',
       'oled Turbo Diesel I-6 6.7 L/408',
       '4 gasoline direct injection, DOHC, variable valve control',
       '16V MPFI OHV',
       '6.2L V-8 gasoline direct injection, variable valve contr',
       '16V DDI DOHC Turbo Diesel', 'der', '24V GDI DOHC Twin Turbo',
       '24V MPFI DOHC', '24V GDI DOHC Turbo', 'DOHC 16V LEV3-SULEV30',
       'ar 3.6L V-6 DOHC, variable valve control, regular unleade', 'OHV',
    

In [40]:
xt['make'].value_counts()

make
Jeep             155
Dodge             88
Hyundai           84
RAM               67
Ford              64
Chevrolet         45
Kia               43
Volkswagen        34
Nissan            33
Mazda             33
Mercedes-Benz     20
Honda             17
GMC               16
BMW               15
Audi              13
Cadillac          10
Acura              8
Chrysler           8
Buick              7
Subaru             5
Toyota             5
INFINITI           4
Lincoln            2
Volvo              2
Land Rover         2
Genesis            2
Lexus              1
Name: count, dtype: int64

In [41]:
xt['body'].unique()

array(['SUV', 'Cargo Van', 'Sedan', 'Pickup Truck', 'Hatchback',
       'Passenger Van', nan, 'Convertible'], dtype=object)

In [42]:
xt['cylinders'].unique()

array([ 4., nan,  6.,  8.,  3.])

In [43]:
xt['doors'].value_counts()

doors
4.0    740
3.0     28
2.0      8
Name: count, dtype: int64

In [44]:
xt.columns

Index(['make', 'model', 'type', 'year', 'engine', 'cylinders', 'fuel',
       'mileage', 'transmission', 'trim', 'body', 'doors', 'exterior_color',
       'interior_color', 'drivetrain'],
      dtype='object')

In [52]:
unique_values = {
    'make': xt['make'].unique(),
    'model': xt['model'].unique(),
    'type': xt['type'].unique(),
    'fuel': xt['fuel'].unique(),
    'transmission': xt['transmission'].unique(),
    'trim': xt['trim'].unique(),
    'body': xt['body'].unique(),
    'exterior_color': xt['exterior_color'].unique(),
    'interior_color': xt['interior_color'].unique(),
    'drivetrain': xt['drivetrain'].unique(),
    'doors':xt['doors'].unique(),
    'cylinders':xt['cylinders'].unique(),
    'engine':xt['engine'].unique(),
    'year':xt['year'].unique()
}

In [49]:
xt['engine'].nunique()

91

In [53]:
pickle.dump(unique_values,open('unique_values.pkl','wb'))