# Car Price Predictor

In [3]:
import pandas as pd

In [4]:
# Read the 'car_prices' CSV into a DataFrame and preview its first rows.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running the notebook elsewhere.
csv_path = "C:/Users/abder/Downloads/archive/car_prices.csv"
ds = pd.read_csv(csv_path)
ds.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639.0,white,black,kia motors america inc,20500.0,21500.0,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
1,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393.0,white,beige,kia motors america inc,20800.0,21500.0,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,wba3c1c51ek116351,ca,45.0,1331.0,gray,black,financial services remarketing (lease),31900.0,30000.0,Thu Jan 15 2015 04:30:00 GMT-0800 (PST)
3,2015,Volvo,S60,T5,Sedan,automatic,yv1612tb4f1310987,ca,41.0,14282.0,white,black,volvo na rep/world omni,27500.0,27750.0,Thu Jan 29 2015 04:30:00 GMT-0800 (PST)
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,wba6b2c57ed129731,ca,43.0,2641.0,gray,black,financial services remarketing (lease),66000.0,67000.0,Thu Dec 18 2014 12:30:00 GMT-0800 (PST)


In [5]:
# Derive 'saleyear' from 'saledate'.
# The raw strings end with a timezone suffix such as " GMT-0800 (PST)", which
# is stripped first so the fixed strptime format below can parse the rest.
tz_suffix = r' GMT[-+]\d{4} \([A-Z]+\)'
sale_text = ds['saledate'].str.replace(tz_suffix, '', regex=True)

# errors='coerce' turns values that do not match the format into NaT instead
# of raising; such rows end up with a missing 'saleyear' as well.
ds['saledate'] = pd.to_datetime(
    sale_text,
    format='%a %b %d %Y %H:%M:%S',
    errors='coerce',
)
ds['saleyear'] = ds['saledate'].dt.year
ds.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate,saleyear
0,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639.0,white,black,kia motors america inc,20500.0,21500.0,2014-12-16 12:30:00,2014.0
1,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393.0,white,beige,kia motors america inc,20800.0,21500.0,2014-12-16 12:30:00,2014.0
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,wba3c1c51ek116351,ca,45.0,1331.0,gray,black,financial services remarketing (lease),31900.0,30000.0,2015-01-15 04:30:00,2015.0
3,2015,Volvo,S60,T5,Sedan,automatic,yv1612tb4f1310987,ca,41.0,14282.0,white,black,volvo na rep/world omni,27500.0,27750.0,2015-01-29 04:30:00,2015.0
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,wba6b2c57ed129731,ca,43.0,2641.0,gray,black,financial services remarketing (lease),66000.0,67000.0,2014-12-18 12:30:00,2014.0


In [6]:
# Non-null entries per column; a column whose count is lower than the others
# contains missing values.
ds.count()

year            558837
make            548536
model           548438
trim            548186
body            545642
transmission    493485
vin             558833
state           558837
condition       547017
odometer        558743
color           558088
interior        558088
seller          558837
mmr             558799
sellingprice    558825
saledate        558799
saleyear        558799
dtype: int64

In [7]:
# Discard every row that has at least one missing value, then renumber the
# index from 0 so it is contiguous again.
ds = ds.dropna().reset_index(drop=True)

In [8]:
# Re-check the non-null counts: after dropping incomplete rows every column
# should report the same number of entries.
ds.count()

year            472325
make            472325
model           472325
trim            472325
body            472325
transmission    472325
vin             472325
state           472325
condition       472325
odometer        472325
color           472325
interior        472325
seller          472325
mmr             472325
sellingprice    472325
saledate        472325
saleyear        472325
dtype: int64

In [9]:
# Remove features that are not used for modelling: the vehicle identifier
# ('vin') and the raw sale timestamp ('saledate'), whose year has already been
# extracted into 'saleyear'.
ds = ds.drop(["vin", "saledate"], axis=1)

In [10]:
# Transform the categorical (string) features into integer codes.
from sklearn.preprocessing import LabelEncoder

categorical_features = ['make','model','trim','body','transmission','state','color','interior','seller']

# One fitted encoder is kept per column. Re-fitting a single shared encoder
# would overwrite its learned classes on every iteration, leaving only the
# mapping of the last column and making it impossible to encode new rows or
# decode the other columns later.
encoders = {}
for col in categorical_features:
    encoder = LabelEncoder()
    ds[col] = encoder.fit_transform(ds[col])
    encoders[col] = encoder
# `encoder` still refers to the encoder of the last column ('seller'), as it
# did when a single shared instance was used.

In [11]:
# Preview the frame after label encoding: the categorical columns now hold
# integer codes instead of strings.
ds.head()

Unnamed: 0,year,make,model,trim,body,transmission,state,condition,odometer,color,interior,seller,mmr,sellingprice,saleyear
0,2015,24,637,818,35,0,2,5.0,16639.0,17,1,5943,20500.0,21500.0,2014.0
1,2015,24,637,818,35,0,2,5.0,9393.0,17,0,5943,20800.0,21500.0,2014.0
2,2014,3,8,253,36,0,2,45.0,1331.0,7,1,4090,31900.0,30000.0,2015.0
3,2015,51,575,1212,36,0,2,41.0,14282.0,17,1,11541,27500.0,27750.0,2015.0
4,2014,3,33,335,36,0,2,43.0,2641.0,7,1,4090,66000.0,67000.0,2014.0


In [12]:
# Separate the regression target from the predictors: `y` is the selling
# price, `x` is every remaining column.
target_column = 'sellingprice'
y = ds[target_column]
x = ds.drop(target_column, axis=1)

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

# Standardise the features. The scaler learns its statistics from the
# training rows only and the same transformation is then applied to both sets.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

## Linear Regression Model

In [15]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the scaled training features.
# fit() returns the estimator itself, so it can be bound directly.
model = LinearRegression().fit(x_train_scaled, y_train)
model

In [16]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the linear model on the held-out rows.
yhat = model.predict(x_test_scaled)
r2 = r2_score(y_true=y_test, y_pred=yhat)
mse = mean_squared_error(y_true=y_test, y_pred=yhat)

print(f"R-squared: {r2}")
print(f"Mean Squared Error : {mse}")

R-squared: 0.9709519925772399
Mean Squared Error : 2652767.900415129


In [17]:
# Spot-check a single held-out row. The 4:5 slice selects row 4 while keeping
# the 2-D (1, n_features) shape that predict() expects.
prediction = model.predict(x_test_scaled[4:5])
print("Predicted value:", prediction)

Predicted value: [16576.47586324]


In [18]:
# iloc is positional, so entry 4 of y_test lines up with row 4 of
# x_test_scaled, the row the linear model was asked to predict.
print("True value for instance :", y_test.iloc[4])

True value for instance : 16200.0


## Neural Network

In [74]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from keras.regularizers import l2

In [80]:
# Seed Python, NumPy and the Keras backend so that weight initialisation is
# repeatable after a kernel restart; without a seed every run of the notebook
# yields a different network. The value mirrors the random_state used for the
# train/test split.
tf.keras.utils.set_random_seed(1)

# Fully connected regressor: two ReLU hidden layers (100 and 50 units) and a
# single linear output unit. Every layer carries an L2 penalty of 0.01 on its
# kernel weights. The input width is the number of feature columns.
network = Sequential([
    Input(shape=(x_train.shape[1],)),
    Dense(units=100, activation="relu", kernel_regularizer=l2(0.01)),
    Dense(units=50, activation="relu", kernel_regularizer=l2(0.01)),
    Dense(units=1, activation="linear", kernel_regularizer=l2(0.01))
])

In [82]:
from tensorflow.keras.losses import MeanSquaredError

# Adam with a 1e-3 learning rate; mean squared error is both the training
# objective and the metric reported alongside it.
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
network.compile(
    optimizer=adam,
    loss='mean_squared_error',
    metrics=['mean_squared_error'],
)
network.summary()

In [94]:
# Train for 10 epochs on the scaled training set in mini-batches of 128 rows.
network.fit(x=x_train_scaled, y=y_train, batch_size=128, epochs=10)

Epoch 1/10
[1m2953/2953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 2341637.2500 - mean_squared_error: 2341532.2500
Epoch 2/10
[1m2953/2953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 2339792.0000 - mean_squared_error: 2339682.5000
Epoch 3/10
[1m2953/2953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 2493095.0000 - mean_squared_error: 2492982.5000
Epoch 4/10
[1m2953/2953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 2721688.2500 - mean_squared_error: 2721573.7500
Epoch 5/10
[1m2953/2953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 2379171.0000 - mean_squared_error: 2379055.7500
Epoch 6/10
[1m2953/2953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 2472288.2500 - mean_squared_error: 2472168.0000
Epoch 7/10
[1m2953/2953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 2348770.7500 - mean_squared_error: 23

<keras.src.callbacks.history.History at 0x2432c29db50>

In [96]:
# Evaluate the model on the test set.
# evaluate() returns the total loss followed by the compiled metric. The
# metric is bound to `test_mse` rather than `mean_squared_error`, because that
# name would overwrite the sklearn function imported earlier in the notebook
# with a float and break re-running the linear-regression scoring cell.
# The loss also includes the L2 regularisation terms, so it is slightly larger
# than the plain MSE metric.
loss, test_mse = network.evaluate(x_test_scaled, y_test, verbose=1)

print(f"Test Loss: {loss}")
print(f"Test Mean Squared Error: {test_mse}")


[1m2953/2953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 2216383.2500 - mean_squared_error: 2216253.7500
Test Loss: 2200482.0
Test Mean Squared Error: 2200352.0


### Example

In [110]:
# Compare both models against the true price for one held-out row.
i = 15
sample = x_test_scaled[i].reshape(1, -1)  # single row as a (1, n_features) batch

prediction = model.predict(sample)
print("LR Predicted value:", prediction)

prediction2 = network.predict(sample)
print("NN Predicted value:", prediction2)

print("True value for instance :", y_test.iloc[i])

LR Predicted value: [21063.93718864]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
NN Predicted value: [[20741.264]]
True value for instance : 20800.0
