In [1]:
import pandas as pd
import numpy as np

In [2]:
car = pd.read_csv('car_data.csv')

In [3]:
car

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel
...,...,...,...,...,...,...
887,Ta,Tara,zest,310000,,
888,Tata Zest XM Diesel,Tata,2018,260000,"27,000 kms",Diesel
889,Mahindra Quanto C8,Mahindra,2013,390000,"40,000 kms",Diesel
890,Honda Amaze 1.2 E i VTEC,Honda,2014,180000,Petrol,


In [4]:
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


## Quality
    year column has many non year values.
    year should be of int type.
    price has 'ask to price'.
    kms_drivan has kms with int.
    kms_driven has nan values.
    fuel_type has nan values.
    keep 1st 3 words of name.

## Cleaning

In [5]:
backup = car.copy()

In [6]:
car = car[car['year'].str.isnumeric()]

In [7]:
car['year']=car['year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  car['year']=car['year'].astype(int)


In [8]:
car = car[car['Price'] != "Ask For Price"]


In [9]:
# removing 'comma' from car price
car['Price']=car['Price'].str.replace(',','')

In [10]:
# convert car['price'] type from string to int
car['Price'] = car['Price'].astype("int")

In [11]:
car['kms_driven'] = car['kms_driven'].str.replace(' kms','')

In [12]:
car['kms_driven'] = car['kms_driven'].str.replace(',','')

In [13]:
car = car[car['kms_driven'].str.isnumeric()]

In [14]:
# car['fuel_type'].isna() will not be able to return all the tuples that are nan. Hence 
# below notation is used
# '~' sign represents removal

car = car[~car['fuel_type'].isna()]

In [15]:
# keeping only first 3 values of name
car['name'] = car['name'].str.split(' ').str.slice(0,3).str.join(' ')


In [16]:
# after performing all the aboe operation, we will have unbalanced indexcs. ie. though we have
# 816 rows. indexs are writen as 886. it happens due to dropping of index
car = car.reset_index(drop=True)

In [17]:
# now we will check for outliers. 
car.describe()
# when we describe it. we can easily see that, 75% car has its price
# less then 4.9 lakh, and there are, one car has its price as 86 lalh.
# which is outlier. So we will check how many car has its price grater than
# 60 lakh

Unnamed: 0,year,Price
count,816.0,816.0
mean,2012.444853,411717.6
std,4.002992,475184.4
min,1995.0,30000.0
25%,2010.0,175000.0
50%,2013.0,299999.0
75%,2015.0,491250.0
max,2019.0,8500003.0


In [18]:
# here we are keeping only those cars whose price is less than 60lakh.
# because car=mahindra XUV is sold at 80 lakh. whoes original price is 
# around 13 Lakh. that's a outlier in data.
car = car[car['Price'] < 6e06].reset_index(drop=True)

In [19]:
# storing clean data in new .csv file
car.to_csv('clean_car_data.csv')

## Model

In [20]:
x = car.drop(columns="Price")
y = car['Price']

In [21]:
# splitting into train and test dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

In [23]:
ohe = OneHotEncoder()
ohe.fit(x[['name','company','fuel_type']])

OneHotEncoder()

In [24]:
# ohe.categories_

In [25]:
columns_trans = make_column_transformer((OneHotEncoder(categories=ohe.categories_),['name','company','fuel_type']),
                                        remainder='passthrough')

In [26]:
lr = LinearRegression()

In [27]:
pipe = make_pipeline(columns_trans,lr)

In [28]:
pipe.fit(x_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories=[array(['Audi A3 Cabriolet', 'Audi A4 1.8', 'Audi A4 2.0', 'Audi A6 2.0',
       'Audi A8', 'Audi Q3 2.0', 'Audi Q5 2.0', 'Audi Q7', 'BMW 3 Series',
       'BMW 5 Series', 'BMW 7 Series', 'BMW X1', 'BMW X1 sDrive20d',
       'BMW X1 xDrive20d', 'Chevrolet Beat', 'Chevrolet Beat...
                                                                            array(['Audi', 'BMW', 'Chevrolet', 'Datsun', 'Fiat', 'Force', 'Ford',
       'Hindustan', 'Honda', 'Hyundai', 'Jaguar', 'Jeep', 'Land',
       'Mahindra', 'Maruti', 'Mercedes', 'Mini', 'Mitsubishi', 'Nissan',
       'Renault', 'Skoda', 'Tata', 'Toyota', 'Volkswagen', 'Volvo'],
      dtype=object),
                                                                            array(['Diesel', 'LPG', 'Pe

In [29]:
y_pred=pipe.predict(x_test)

In [30]:
r2_score(y_test,y_pred)

0.5093388033356202

In [31]:
scores = []
for i in range(1000):
    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=i)
    lr = LinearRegression()
    pipe = make_pipeline(columns_trans,lr)
    pipe.fit(x_train,y_train)
    y_pred = pipe.predict(x_test)
    scores.append(r2_score(y_test,y_pred))

In [32]:
scores[np.argmax(scores)]

0.8897737600934208

In [33]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=np.argmax(scores))
lr = LinearRegression()
pipe = make_pipeline(columns_trans,lr)
pipe.fit(x_train,y_train)
y_pred=pipe.predict(x_test)
r2_score(y_test,y_pred)


0.8897737600934208

In [34]:
import pickle

In [35]:
pickle.dump(pipe,open('LinearRegressionModel.pkl','wb'))

In [72]:
pipe.predict(pd.DataFrame([['Mahindra Jeep CL550','Mahindra',2022,100,'Diesel']], columns=['name','company','year','kms_driven','fuel_type']))

array([539138.92108862])

### Test Code For One Hot Encoding
Not a part of car price prediction

In [26]:
# for col in car.columns:
#     print(col,': ',len(car[col].unique()),' labels')

In [50]:
# let's examine how many columns we will obtain after 
# applying one hot encoding

# import pandas as pd 
# print('Shape dataset: ',car.shape)
# print("After applying One hot encoding shape: ")
# pd.get_dummies(car,drop_first=True).shape

In [41]:
# let's find the top 10 most frequent categories for the variable 'price'
# car.Price.value_counts().sort_values(ascending=False).head(20)

In [48]:
# let's make a list of top 10 most frequent categories of the variable 
# top_10 = [x for x in car.Price.value_counts().sort_values(ascending=False).head(10).index]