# **CAR PRICE PREDICTION PROJECT**

In [1]:
# importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the dataset from 'quikr_car.csv' into a DataFrame
car = pd.read_csv(filepath_or_buffer='quikr_car.csv')

In [3]:
# Preview the first five rows of the raw data
car.head(n=5)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [4]:
# (number of rows, number of columns) of the raw frame
car.shape

(892, 6)

In [5]:
# Column dtypes and non-null counts; used to spot wrongly typed and missing data
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


### Time to clean the data

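#### Quality of data
- `year` contains many non-numeric (rubbish) values
- `year` is stored as object (string) instead of int
- `Price` is also stored as object and contains unwanted values such as "Ask For Price" and comma separators
- `kms_driven` has the same problems: stored as object, with a " kms" suffix, comma separators and some non-numeric values
- `fuel_type` has some NaN values
- `name` is too detailed; keep only its first three words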

### Cleaning

In [6]:
# Keep an untouched deep copy of the raw data before any cleaning step
backup = car.copy(deep=True)

#### **Improving year column**

In [7]:
# Keep only the rows whose year is a purely numeric string
year_is_numeric = car['year'].str.isnumeric()
car = car.loc[year_is_numeric]

In [8]:
# Cast year from string to int.
# Rebinding via .assign (instead of writing a column into the filtered frame)
# avoids pandas' SettingWithCopyWarning, which the global warnings filter
# at the top of the notebook would otherwise silently hide.
car = car.assign(year=car['year'].astype(int))

car.head()

#### **Cleaning Price column**

In [9]:
# Discard listings whose Price is the placeholder text 'Ask For Price'
has_price = car['Price'].ne('Ask For Price')
car = car.loc[has_price]

In [10]:
# Remove the comma separators with the vectorised string accessor
# (literal replacement, no regex), then cast the result to int.
# .assign rebinds the frame rather than writing into a filtered slice.
price_digits = car['Price'].str.replace(",", "", regex=False)
car = car.assign(Price=price_digits.astype(int))

#### **Cleaning kms_driven column**

In [11]:
# Keep the text before the first space (drops the " kms" suffix) and remove
# the comma separators. The .str accessor is vectorised and propagates
# missing values, whereas a per-element lambda raises AttributeError on NaN.
kms_token = car['kms_driven'].str.split(" ").str[0]
car = car.assign(kms_driven=kms_token.str.replace(",", "", regex=False))

In [12]:
# Keep only rows whose kms_driven is a purely numeric string.
# .eq(True) turns a missing result (NaN input) into False, so rows without a
# value are dropped instead of making the boolean indexing raise.
kms_is_numeric = car['kms_driven'].str.isnumeric().eq(True)
car = car[kms_is_numeric]

In [13]:
# Convert the cleaned kms_driven strings to integers
car = car.astype({'kms_driven': int})

In [14]:
# Keep only the rows that have a fuel_type value
car = car.loc[car['fuel_type'].notna()]

In [15]:
# Preview the first five rows after cleaning year, Price, kms_driven and fuel_type
car.head(n=5)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40,Diesel
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,28000,Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,36000,Diesel
6,Ford Figo,Ford,2012,175000,41000,Diesel


#### **Keeping only the first three words of the name column**

In [16]:
# Shorten every name to its first three space-separated words
first_three_words = car['name'].str.split(' ').str[:3].str.join(' ')
car = car.assign(name=first_three_words)

In [17]:
# Preview the first five rows with the shortened names
car.head(n=5)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
3,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
4,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
6,Ford Figo,Ford,2012,175000,41000,Diesel


In [18]:
# Renumber the rows from 0: the filters above left gaps in the index,
# and drop=True discards the old index instead of keeping it as a column
car = car.reset_index(drop = True)

In [19]:
# Preview the first five rows after the index reset
car.head(n=5)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel


In [20]:
# Summary statistics of the numeric columns (year, Price, kms_driven)
car.describe()

Unnamed: 0,year,Price,kms_driven
count,816.0,816.0,816.0
mean,2012.444853,411717.6,46275.531863
std,4.002992,475184.4,34297.428044
min,1995.0,30000.0,0.0
25%,2010.0,175000.0,27000.0
50%,2013.0,299999.0,41000.0
75%,2015.0,491250.0,56818.5
max,2019.0,8500003.0,400000.0


In [21]:
# Remove the price outlier: keep only listings priced below 6,000,000,
# then renumber the rows from 0
is_below_cap = car['Price'].lt(6e6)
car = car.loc[is_below_cap].reset_index(drop=True)

In [22]:
# Preview the first five rows after removing the outlier
car.head(n=5)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel


In [23]:
# Save the cleaned data; with the default settings the row index is written
# as the first (unnamed) column of the file
car.to_csv(path_or_buf="Cleaned Car.csv")

## **Model building**

In [30]:
# Target is Price; the features are every other column
y = car['Price']
X = car.drop(columns=['Price'])

In [29]:
# importing sklearn library
from sklearn.model_selection import train_test_split

In [76]:
# Hold out 20% of the rows as the test set.
# Per the original author, random_state was found by fitting the model with
# 1000 different values.
# NOTE(review): if the seed was picked for the best test score, the R^2
# reported below is optimistic — confirm, and consider cross-validation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=686,
)

In [77]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [78]:
# Fit an encoder on the categorical columns of ALL rows (not just the training
# split) — presumably so that categories appearing only in the test split are
# still known to the encoder used in the pipeline.
categorical_cols = ['name','company','fuel_type']
ohe = OneHotEncoder()
ohe.fit(X[categorical_cols])

In [79]:
# One-hot encode the three categorical columns using the categories learned
# by `ohe`; all remaining columns are passed through unchanged.
encoder = OneHotEncoder(categories=ohe.categories_)
column_trans = make_column_transformer(
    (encoder, ['name','company','fuel_type']),
    remainder='passthrough',
)

In [80]:
# Linear regression estimator with default settings
lr = LinearRegression()

In [81]:
# Chain the column transformer and the regressor so raw feature rows can be
# passed directly to fit/predict
pipe = make_pipeline(column_trans,lr)

In [82]:
# Fit the whole pipeline (encoding + regression) on the training split
pipe.fit(X_train,y_train)

In [83]:
# Predict prices for the held-out test rows
y_pred = pipe.predict(X_test)

In [84]:
# Coefficient of determination (R^2) on the held-out test split
r2_score(y_true=y_test, y_pred=y_pred)

0.831409976163777

### **The following code is for Streamlit**

In [85]:
# Now dumping through pickle
import pickle

In [86]:
# Serialize the fitted pipeline to disk.
# The context manager guarantees the file is flushed and closed; the bare
# open(...) used before left the handle open until garbage collection.
with open('Car_price_prediction.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [88]:
# Convert the cleaned frame to a nested dict: {column -> {row index -> value}}
car_dict = car.to_dict(orient='dict')

In [89]:
# Serialize the cleaned data dict to disk.
# The context manager guarantees the file is flushed and closed; the bare
# open(...) used before left the handle open until garbage collection.
with open('Car_dict.pkl', 'wb') as dict_file:
    pickle.dump(car_dict, dict_file)