# Column name explanation

Dataset - https://www.kaggle.com/datasets/milanvaddoriya/old-car-price-prediction

Column names are self explanatory.<br>
Some steps to talk are:
- understand our problem statement, predict the price of a car based on provided metrics
- pre process, clean data, split data, feature selection.
- create ML model, test it using different metrics.


In [167]:
#my version
import tensorflow as tf
from tensorflow import keras
#layers for NN
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense
#oretrained model for transfer learning
from keras.models import Model
from keras.applications import vgg19

import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import datasets



In [168]:
df = pd.read_csv('car_price.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
0,0,Jeep Compass 2.0 Longitude Option BSIV,10.03 Lakh,"86,226 kms",Diesel,Manual,1st Owner,2017,1956 cc,5 Seats
1,1,Renault Duster RXZ Turbo CVT,12.83 Lakh,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats
2,2,Toyota Camry 2.5 G,16.40 Lakh,"60,343 kms",Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats
3,3,Honda Jazz VX CVT,7.77 Lakh,"26,696 kms",Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats
4,4,Volkswagen Polo 1.2 MPI Highline,5.15 Lakh,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats


In [169]:
#remove the first col literally called unnamed
df = df.drop(df.columns[0], axis=1)

# check for nulls
df.info() 

df.shape #5512 rows 9 cols


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5512 entries, 0 to 5511
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   car_name             5512 non-null   object
 1   car_prices_in_rupee  5512 non-null   object
 2   kms_driven           5512 non-null   object
 3   fuel_type            5512 non-null   object
 4   transmission         5512 non-null   object
 5   ownership            5512 non-null   object
 6   manufacture          5512 non-null   int64 
 7   engine               5512 non-null   object
 8   Seats                5512 non-null   object
dtypes: int64(1), object(8)
memory usage: 387.7+ KB


(5512, 9)

In [170]:
df.isnull().sum() #double checking no nulls

car_name               0
car_prices_in_rupee    0
kms_driven             0
fuel_type              0
transmission           0
ownership              0
manufacture            0
engine                 0
Seats                  0
dtype: int64

In [171]:
df.duplicated().sum() #115 duplicated rows
df.drop_duplicates()

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
0,Jeep Compass 2.0 Longitude Option BSIV,10.03 Lakh,"86,226 kms",Diesel,Manual,1st Owner,2017,1956 cc,5 Seats
1,Renault Duster RXZ Turbo CVT,12.83 Lakh,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats
2,Toyota Camry 2.5 G,16.40 Lakh,"60,343 kms",Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats
3,Honda Jazz VX CVT,7.77 Lakh,"26,696 kms",Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats
4,Volkswagen Polo 1.2 MPI Highline,5.15 Lakh,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats
...,...,...,...,...,...,...,...,...,...
5507,BMW X1 sDrive 20d xLine,28.90 Lakh,"45,000 kms",Diesel,Automatic,1st Owner,2018,2995 cc,7 Seats
5508,BMW M Series M4 Coupe,64.90 Lakh,"29,000 kms",Petrol,Automatic,2nd Owner,2015,1968 cc,5 Seats
5509,Jaguar XF 2.2 Litre Luxury,13.75 Lakh,"90,000 kms",Diesel,Automatic,2nd Owner,2013,2755 cc,5 Seats
5510,BMW 7 Series 730Ld,29.90 Lakh,"79,000 kms",Diesel,Automatic,3rd Owner,2015,2967 cc,6 Seats


In [172]:
df['ownership'].value_counts()

1st Owner    3736
2nd Owner    1314
3rd Owner     359
4th Owner      84
5th Owner      12
0th Owner       7
Name: ownership, dtype: int64

In [173]:
# rename engine column to include CC to avoid confusion
df = df.rename(columns={'engine':'engine (cc)'})
df

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine (cc),Seats
0,Jeep Compass 2.0 Longitude Option BSIV,10.03 Lakh,"86,226 kms",Diesel,Manual,1st Owner,2017,1956 cc,5 Seats
1,Renault Duster RXZ Turbo CVT,12.83 Lakh,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats
2,Toyota Camry 2.5 G,16.40 Lakh,"60,343 kms",Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats
3,Honda Jazz VX CVT,7.77 Lakh,"26,696 kms",Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats
4,Volkswagen Polo 1.2 MPI Highline,5.15 Lakh,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats
...,...,...,...,...,...,...,...,...,...
5507,BMW X1 sDrive 20d xLine,28.90 Lakh,"45,000 kms",Diesel,Automatic,1st Owner,2018,2995 cc,7 Seats
5508,BMW M Series M4 Coupe,64.90 Lakh,"29,000 kms",Petrol,Automatic,2nd Owner,2015,1968 cc,5 Seats
5509,Jaguar XF 2.2 Litre Luxury,13.75 Lakh,"90,000 kms",Diesel,Automatic,2nd Owner,2013,2755 cc,5 Seats
5510,BMW 7 Series 730Ld,29.90 Lakh,"79,000 kms",Diesel,Automatic,3rd Owner,2015,2967 cc,6 Seats


In [174]:
# lambda to remove the ' Seats' from the attribute
df['Seats'] = df['Seats'].apply(lambda x: x.rstrip(' Seats'))
df

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine (cc),Seats
0,Jeep Compass 2.0 Longitude Option BSIV,10.03 Lakh,"86,226 kms",Diesel,Manual,1st Owner,2017,1956 cc,5
1,Renault Duster RXZ Turbo CVT,12.83 Lakh,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5
2,Toyota Camry 2.5 G,16.40 Lakh,"60,343 kms",Petrol,Automatic,1st Owner,2016,2494 cc,5
3,Honda Jazz VX CVT,7.77 Lakh,"26,696 kms",Petrol,Automatic,1st Owner,2018,1199 cc,5
4,Volkswagen Polo 1.2 MPI Highline,5.15 Lakh,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5
...,...,...,...,...,...,...,...,...,...
5507,BMW X1 sDrive 20d xLine,28.90 Lakh,"45,000 kms",Diesel,Automatic,1st Owner,2018,2995 cc,7
5508,BMW M Series M4 Coupe,64.90 Lakh,"29,000 kms",Petrol,Automatic,2nd Owner,2015,1968 cc,5
5509,Jaguar XF 2.2 Litre Luxury,13.75 Lakh,"90,000 kms",Diesel,Automatic,2nd Owner,2013,2755 cc,5
5510,BMW 7 Series 730Ld,29.90 Lakh,"79,000 kms",Diesel,Automatic,3rd Owner,2015,2967 cc,6


In [175]:
# make new column called currency for our two currencies, Lakh and Crore
# 1 Crore = 100 Lakh
# 100,000 Lakh = 1,217.07 USD

df.loc[df['car_prices_in_rupee'].str.contains('Lakh'), 'Currency'] = 'Lakh'
df.loc[df['car_prices_in_rupee'].str.contains('Crore'), 'Currency'] = 'Crore'

In [176]:
# one row (240) has "99,999" in the car price. Remove since d/n have Lakh or Crore
df=df.dropna()

In [177]:
# Continue strip of the columns. Dont need the CC in 1500 'cc' in the engine col
# We want some of our cols to turn into ints
df['engine (cc)'] = df['engine (cc)'].apply(lambda x: x.rstrip(' cc'))
df['ownership'] = df['ownership'].apply(lambda x: x.rstrip(' th Owner'))
df['ownership'] = df['ownership'].apply(lambda x: x.rstrip(' rd Owner'))
df['ownership'] = df['ownership'].apply(lambda x: x.rstrip(' nd Owner'))
df['ownership'] = df['ownership'].apply(lambda x: x.rstrip(' st Owner'))
df['kms_driven'] = df['kms_driven'].apply(lambda x: x.rstrip(' kms'))

# some rows have , and . we can remove them now. Same with the Lakh and Crore words at the end
df['kms_driven'] = df['kms_driven'].apply(lambda x: x.replace(',', '')) #31,146 turn into int
df['car_prices_in_rupee'] = df['car_prices_in_rupee'].apply(lambda x: x.rstrip(' Lakh'))
df['car_prices_in_rupee'] = df['car_prices_in_rupee'].apply(lambda x: x.replace('.', ''))
df['car_prices_in_rupee'] = df['car_prices_in_rupee'].apply(lambda x: x.rstrip(' Crore'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['engine (cc)'] = df['engine (cc)'].apply(lambda x: x.rstrip(' cc'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ownership'] = df['ownership'].apply(lambda x: x.rstrip(' th Owner'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ownership'] = df['ownership'].apply(lambda x: x.rstrip(' 

In [178]:
# turn the car price col into a float
df['car_prices_in_rupee']=df['car_prices_in_rupee'].astype('float') 

# Right now our car price col went from 10.03 Lakh -> 1003
# If the row is Crore (100 times of 1 Lakh) multi it by 100
df['car_prices_in_rupee']=np.where(df['Currency'] == 'Crore',
                                           df['car_prices_in_rupee'] * 100,
                                           df['car_prices_in_rupee'])

# Now all rows are sort of in Lakh (still in that 10.03 state) multi by 100K now
df['car_prices_in_rupee']=df['car_prices_in_rupee']*100000
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['car_prices_in_rupee']=df['car_prices_in_rupee'].astype('float')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['car_prices_in_rupee']=np.where(df['Currency'] == 'Crore',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['car_prices_in_rupee']=df['car_prices_in_rupee']*100000


Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine (cc),Seats,Currency
0,Jeep Compass 2.0 Longitude Option BSIV,100300000.0,86226,Diesel,Manual,1,2017,1956,5,Lakh
1,Renault Duster RXZ Turbo CVT,128300000.0,13248,Petrol,Automatic,1,2021,1330,5,Lakh
2,Toyota Camry 2.5 G,164000000.0,60343,Petrol,Automatic,1,2016,2494,5,Lakh
3,Honda Jazz VX CVT,77700000.0,26696,Petrol,Automatic,1,2018,1199,5,Lakh
4,Volkswagen Polo 1.2 MPI Highline,51500000.0,69414,Petrol,Manual,1,2016,1199,5,Lakh
...,...,...,...,...,...,...,...,...,...,...
5507,BMW X1 sDrive 20d xLine,289000000.0,45000,Diesel,Automatic,1,2018,2995,7,Lakh
5508,BMW M Series M4 Coupe,649000000.0,29000,Petrol,Automatic,2,2015,1968,5,Lakh
5509,Jaguar XF 2.2 Litre Luxury,137500000.0,90000,Diesel,Automatic,2,2013,2755,5,Lakh
5510,BMW 7 Series 730Ld,299000000.0,79000,Diesel,Automatic,3,2015,2967,6,Lakh


In [179]:
# df=pd.get_dummies(df,columns='')

# Change the types of cols to ints for kms driven, ownership, engine, etc
df = df.astype({'kms_driven':'int64', 'ownership':'int64', 'engine (cc)': 'int64', 'Seats': 'int64'})
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5389 entries, 0 to 5511
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   car_name             5389 non-null   object 
 1   car_prices_in_rupee  5389 non-null   float64
 2   kms_driven           5389 non-null   int64  
 3   fuel_type            5389 non-null   object 
 4   transmission         5389 non-null   object 
 5   ownership            5389 non-null   int64  
 6   manufacture          5389 non-null   int64  
 7   engine (cc)          5389 non-null   int64  
 8   Seats                5389 non-null   int64  
 9   Currency             5389 non-null   object 
dtypes: float64(1), int64(5), object(4)
memory usage: 463.1+ KB


In [180]:
df
# CURRENT ISSUE - Rupee price SHOULD be 1003000 but there are two extra zeros

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine (cc),Seats,Currency
0,Jeep Compass 2.0 Longitude Option BSIV,100300000.0,86226,Diesel,Manual,1,2017,1956,5,Lakh
1,Renault Duster RXZ Turbo CVT,128300000.0,13248,Petrol,Automatic,1,2021,1330,5,Lakh
2,Toyota Camry 2.5 G,164000000.0,60343,Petrol,Automatic,1,2016,2494,5,Lakh
3,Honda Jazz VX CVT,77700000.0,26696,Petrol,Automatic,1,2018,1199,5,Lakh
4,Volkswagen Polo 1.2 MPI Highline,51500000.0,69414,Petrol,Manual,1,2016,1199,5,Lakh
...,...,...,...,...,...,...,...,...,...,...
5507,BMW X1 sDrive 20d xLine,289000000.0,45000,Diesel,Automatic,1,2018,2995,7,Lakh
5508,BMW M Series M4 Coupe,649000000.0,29000,Petrol,Automatic,2,2015,1968,5,Lakh
5509,Jaguar XF 2.2 Litre Luxury,137500000.0,90000,Diesel,Automatic,2,2013,2755,5,Lakh
5510,BMW 7 Series 730Ld,299000000.0,79000,Diesel,Automatic,3,2015,2967,6,Lakh
