# Car Price Prediction

<hr>

**Problem Statement:**
- Consider a client that specializes in trading used cars across different states in Nigeria.
- As a Data Scientist, you are given the task of creating an automated system that predicts the selling price of cars based on various features (information),
- such as the car’s model name, manufacture year, the current price when bought new, kilometers driven, fuel type, and the number of previous owners.

In [2]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [9]:
# Load the car listings from the CSV file and preview the first rows.

csv_path = '../../data/car_data.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [10]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [11]:
# Count the missing (null) entries in each column.

data.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [13]:
# Number of rows, followed by the (rows, columns) shape of the frame.
len(data), data.shape

(301, (301, 9))

In [14]:
# List the column names available in the dataset.
data.columns

Index(['Car_Name', 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')

In [23]:
# Number of rows per distinct value of the categorical Fuel_Type column.
data['Fuel_Type'].value_counts()

Fuel_Type
Petrol    239
Diesel     60
CNG         2
Name: count, dtype: int64

In [24]:
# Number of rows per distinct value of the categorical Transmission column.
data['Transmission'].value_counts()

Transmission
Manual       261
Automatic     40
Name: count, dtype: int64

In [25]:
# Number of rows per distinct value of the categorical Seller_Type column.
data['Seller_Type'].value_counts()

Seller_Type
Dealer        195
Individual    106
Name: count, dtype: int64

## Label Encoding

In [32]:
# label encoding

from sklearn.preprocessing import LabelEncoder

In [33]:
# Instantiate scikit-learn's LabelEncoder, which maps each distinct label to an integer code.
encoder = LabelEncoder()

In [43]:
# Integer-encode each categorical column with its OWN fitted LabelEncoder.
# Re-fitting one shared encoder overwrites its `classes_` on every call, so only
# the mapping of the last column would survive. Keeping one encoder per column
# preserves every mapping, which is needed to decode values (`inverse_transform`)
# or to encode new rows consistently at prediction time.
# The encoded values written into `data` are the same as with a shared encoder.
encoders = {}
for column in ['Fuel_Type', 'Transmission', 'Seller_Type']:
    encoders[column] = LabelEncoder()
    data[column] = encoders[column].fit_transform(data[column])

In [44]:
# Preview the frame to confirm the categorical columns now hold integer codes.
data.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,2,0,1,0
1,sx4,2013,4.75,9.54,43000,1,0,1,0
2,ciaz,2017,7.25,9.85,6900,2,0,1,0
3,wagon r,2011,2.85,4.15,5200,2,0,1,0
4,swift,2014,4.6,6.87,42450,1,0,1,0


## Split data

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
# Target: the selling price. Features: every remaining column except the car name.
y = data['Selling_Price']
X = data.drop(columns=['Car_Name', 'Selling_Price'])

X

Unnamed: 0,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,5.59,27000,2,0,1,0
1,2013,9.54,43000,1,0,1,0
2,2017,9.85,6900,2,0,1,0
3,2011,4.15,5200,2,0,1,0
4,2014,6.87,42450,1,0,1,0
...,...,...,...,...,...,...,...
296,2016,11.60,33988,1,0,1,0
297,2015,5.90,60000,2,0,1,0
298,2009,11.00,87934,2,0,1,0
299,2017,12.50,9000,1,0,1,0


**Feature Scaling**

In [47]:
# Both scalers are imported, but only MinMaxScaler is instantiated here.
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scale = MinMaxScaler()

In [49]:
# Rescale every feature column with the MinMaxScaler. `fit_transform` returns a
# NumPy array, so X is no longer a DataFrame after this cell.
# NOTE(review): the scaler is fitted on the full dataset here, before the
# train/test split performed further down, so the test rows influence the
# min/max used for scaling (data leakage). Consider splitting first and fitting
# the scaler on the training rows only.
X = scale.fit_transform(X)
X

array([[0.73333333, 0.0571088 , 0.05305305, ..., 0.        , 1.        ,
        0.        ],
       [0.66666667, 0.09991331, 0.08508509, ..., 0.        , 1.        ,
        0.        ],
       [0.93333333, 0.10327265, 0.01281281, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.4       , 0.11573472, 0.17504304, ..., 0.        , 1.        ,
        0.        ],
       [0.93333333, 0.1319896 , 0.01701702, ..., 0.        , 1.        ,
        0.        ],
       [0.86666667, 0.06046814, 0.00993794, ..., 0.        , 1.        ,
        0.        ]])

In [50]:
# Display the target Series (Selling_Price).
y

0       3.35
1       4.75
2       7.25
3       2.85
4       4.60
       ...  
296     9.50
297     4.00
298     3.35
299    11.50
300     5.30
Name: Selling_Price, Length: 301, dtype: float64

In [51]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [52]:
# (rows, features) of the training split; the feature count must match the model's input_shape.
X_train.shape

(240, 7)

## ANN Algorithm

In [72]:
# Baseline network: a 7-unit dense layer feeding a single-output dense layer.
# No activation function is specified on either layer.
tf.random.set_seed(42)  # fix TensorFlow's global seed before the layers are created

# 1. Build the model layer by layer
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(7, input_shape=(7,)))
model.add(tf.keras.layers.Dense(1))

# 2. Compile: mean absolute error is both the loss and the reported metric; SGD optimizer
model.compile(optimizer='sgd', loss="mae", metrics=['mae'])

# 3. Train on the training split for 100 epochs
model.fit(X_train, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.src.callbacks.History at 0x2849c983730>

In [73]:
# Print the layer-by-layer architecture and parameter counts of the trained model.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 7)                 56        
                                                                 
 dense_4 (Dense)             (None, 1)                 8         
                                                                 
Total params: 64 (256.00 Byte)
Trainable params: 64 (256.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [74]:
# Predict selling prices for the held-out test features.
y_pred = model.predict(X_test)



In [75]:
# Inspect the raw predictions (one row per test sample).
y_pred

array([[0.9920654 ],
       [5.5089397 ],
       [4.594231  ],
       [0.04919904],
       [5.3001122 ],
       [4.9428926 ],
       [0.7022909 ],
       [0.53212845],
       [0.78888106],
       [4.845923  ],
       [5.297841  ],
       [0.45444015],
       [5.4026165 ],
       [1.0874687 ],
       [4.702056  ],
       [4.95923   ],
       [0.30317113],
       [5.772412  ],
       [0.9101463 ],
       [0.93507254],
       [0.40480074],
       [5.052738  ],
       [4.55548   ],
       [4.6419663 ],
       [0.558938  ],
       [4.841021  ],
       [5.085284  ],
       [4.693859  ],
       [0.93894076],
       [0.8207364 ],
       [0.41195992],
       [4.9818873 ],
       [0.04534194],
       [4.367766  ],
       [5.123747  ],
       [5.1044154 ],
       [5.2374315 ],
       [4.815611  ],
       [4.9353876 ],
       [4.925721  ],
       [4.1842256 ],
       [4.291062  ],
       [5.065605  ],
       [0.5778161 ],
       [4.921901  ],
       [0.4456558 ],
       [5.2920947 ],
       [5.273

In [76]:
# True selling prices of the test rows, for comparison with the predictions.
y_test

177     0.35
289    10.11
228     4.95
198     0.15
60      6.95
       ...  
234     5.50
296     9.50
281     2.10
285     7.40
182     0.30
Name: Selling_Price, Length: 61, dtype: float64

In [77]:
# Returns [loss, metric]; both are the MAE because the model was compiled with
# loss="mae" and metrics=['mae'].
model.evaluate(X_test,y_test)



[1.987656593322754, 1.987656593322754]

**Mean Absolute Error**

In [78]:
# Compare the shapes of the true values and the predictions before computing the error.
y_test.shape, y_pred.shape

((61,), (61, 1))

In [79]:
# The predictions carry a trailing singleton axis, (N, 1), while y_test is (N,);
# squeeze removes that axis so both have the same shape for the error computation.
y_pred = tf.squeeze(y_pred)
y_pred.shape

TensorShape([61])

In [80]:
# Mean absolute error between the true and predicted selling prices on the test set.
mae = tf.keras.losses.mae(y_test, y_pred)
mae

<tf.Tensor: shape=(), dtype=float32, numpy=1.9876565>

> The lower the **MAE**, the better the model: an MAE closer to zero means the predictions are closer to the true selling prices.

### Improve the model

In [82]:
# Improved network: same 7-unit first layer, plus a 100-unit ReLU hidden layer,
# trained with the Adam optimizer (learning rate 0.01) instead of SGD.
tf.random.set_seed(42)  # fix TensorFlow's global seed before the layers are created

# 1. Build the model layer by layer
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(7, input_shape=(7,)))
model.add(tf.keras.layers.Dense(100, activation='relu'))
model.add(tf.keras.layers.Dense(1))

# 2. Compile: mean absolute error is both the loss and the reported metric
adam = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=adam, loss="mae", metrics=['mae'])

# 3. Train on the training split for 100 epochs
model.fit(X_train, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.src.callbacks.History at 0x284a60e5160>

In [83]:
# Print the layer-by-layer architecture and parameter counts of the improved model.
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 7)                 56        
                                                                 
 dense_6 (Dense)             (None, 100)               800       
                                                                 
 dense_7 (Dense)             (None, 1)                 101       
                                                                 
Total params: 957 (3.74 KB)
Trainable params: 957 (3.74 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [84]:
# Predict selling prices for the test features with the improved model and display them.
y_pred = model.predict(X_test)
y_pred



array([[ 0.267463  ],
       [10.546837  ],
       [ 4.9684253 ],
       [ 0.22358902],
       [ 8.623726  ],
       [ 6.592498  ],
       [ 0.6804806 ],
       [ 0.55192155],
       [ 0.6676194 ],
       [ 6.5709243 ],
       [11.333195  ],
       [ 0.49798554],
       [ 8.49929   ],
       [ 0.68631816],
       [ 5.5809007 ],
       [ 2.7999148 ],
       [ 0.542683  ],
       [14.510087  ],
       [ 0.8280621 ],
       [ 1.3214428 ],
       [ 0.46013835],
       [ 8.606966  ],
       [ 5.355559  ],
       [ 2.7501228 ],
       [ 0.5023169 ],
       [ 3.4804268 ],
       [ 4.833906  ],
       [ 2.7234411 ],
       [ 1.0620644 ],
       [ 0.8300549 ],
       [ 0.4584751 ],
       [ 9.301717  ],
       [ 0.4314876 ],
       [ 2.1120777 ],
       [ 9.019449  ],
       [ 4.255405  ],
       [ 6.3653045 ],
       [ 4.46108   ],
       [ 2.6750178 ],
       [ 5.5665812 ],
       [ 1.4940587 ],
       [-0.7421697 ],
       [ 4.149384  ],
       [ 0.43694344],
       [ 6.6026864 ],
       [ 0

In [85]:
# Returns [loss, metric]; both are the MAE because the model was compiled with
# loss="mae" and metrics=['mae'].
model.evaluate(X_test, y_test)



[0.5717499256134033, 0.5717499256134033]

In [86]:
# Compare the shapes of the true values and the predictions before computing the error.
y_test.shape, y_pred.shape

((61,), (61, 1))

In [87]:
# The predictions carry a trailing singleton axis, (N, 1), while y_test is (N,);
# squeeze removes that axis so both have the same shape for the error computation.
y_pred = tf.squeeze(y_pred)
y_pred.shape

TensorShape([61])

In [88]:
# Mean absolute error between the true and predicted selling prices on the test set.
mae = tf.keras.losses.mae(y_test, y_pred)
mae

<tf.Tensor: shape=(), dtype=float32, numpy=0.57174987>