## Import necessary Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder

In [2]:
car = pd.read_csv("quikr_car.csv")

In [3]:
car.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [4]:
car.shape

(892, 6)

In [5]:
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


In [6]:
car["year"].unique() #The year column is with mixed dtypes

array(['2007', '2006', '2018', '2014', '2015', '2012', '2013', '2016',
       '2010', '2017', '2008', '2011', '2019', '2009', '2005', '2000',
       '...', '150k', 'TOUR', '2003', 'r 15', '2004', 'Zest', '/-Rs',
       'sale', '1995', 'ara)', '2002', 'SELL', '2001', 'tion', 'odel',
       '2 bs', 'arry', 'Eon', 'o...', 'ture', 'emi', 'car', 'able', 'no.',
       'd...', 'SALE', 'digo', 'sell', 'd Ex', 'n...', 'e...', 'D...',
       ', Ac', 'go .', 'k...', 'o c4', 'zire', 'cent', 'Sumo', 'cab',
       't xe', 'EV2', 'r...', 'zest'], dtype=object)

In [7]:
car["Price"].unique() #3rd entry is in Object data type which need to be removed and all entries should be in int.

array(['80,000', '4,25,000', 'Ask For Price', '3,25,000', '5,75,000',
       '1,75,000', '1,90,000', '8,30,000', '2,50,000', '1,82,000',
       '3,15,000', '4,15,000', '3,20,000', '10,00,000', '5,00,000',
       '3,50,000', '1,60,000', '3,10,000', '75,000', '1,00,000',
       '2,90,000', '95,000', '1,80,000', '3,85,000', '1,05,000',
       '6,50,000', '6,89,999', '4,48,000', '5,49,000', '5,01,000',
       '4,89,999', '2,80,000', '3,49,999', '2,84,999', '3,45,000',
       '4,99,999', '2,35,000', '2,49,999', '14,75,000', '3,95,000',
       '2,20,000', '1,70,000', '85,000', '2,00,000', '5,70,000',
       '1,10,000', '4,48,999', '18,91,111', '1,59,500', '3,44,999',
       '4,49,999', '8,65,000', '6,99,000', '3,75,000', '2,24,999',
       '12,00,000', '1,95,000', '3,51,000', '2,40,000', '90,000',
       '1,55,000', '6,00,000', '1,89,500', '2,10,000', '3,90,000',
       '1,35,000', '16,00,000', '7,01,000', '2,65,000', '5,25,000',
       '3,72,000', '6,35,000', '5,50,000', '4,85,000', '3,29,5

In [8]:
car["kms_driven"].unique()

array(['45,000 kms', '40 kms', '22,000 kms', '28,000 kms', '36,000 kms',
       '59,000 kms', '41,000 kms', '25,000 kms', '24,530 kms',
       '60,000 kms', '30,000 kms', '32,000 kms', '48,660 kms',
       '4,000 kms', '16,934 kms', '43,000 kms', '35,550 kms',
       '39,522 kms', '39,000 kms', '55,000 kms', '72,000 kms',
       '15,975 kms', '70,000 kms', '23,452 kms', '35,522 kms',
       '48,508 kms', '15,487 kms', '82,000 kms', '20,000 kms',
       '68,000 kms', '38,000 kms', '27,000 kms', '33,000 kms',
       '46,000 kms', '16,000 kms', '47,000 kms', '35,000 kms',
       '30,874 kms', '15,000 kms', '29,685 kms', '1,30,000 kms',
       '19,000 kms', nan, '54,000 kms', '13,000 kms', '38,200 kms',
       '50,000 kms', '13,500 kms', '3,600 kms', '45,863 kms',
       '60,500 kms', '12,500 kms', '18,000 kms', '13,349 kms',
       '29,000 kms', '44,000 kms', '42,000 kms', '14,000 kms',
       '49,000 kms', '36,200 kms', '51,000 kms', '1,04,000 kms',
       '33,333 kms', '33,600 kms', '5,

In [9]:
car["kms_driven"].isna().sum()

52

In [10]:
car["fuel_type"].unique()

array(['Petrol', 'Diesel', nan, 'LPG'], dtype=object)

### Quality 
1. Year has many Non-year values and year is in object dtype not in integer.
2. In Price there is one object value so we need to remove that and all entries are in string which should be in integer and there are commas also in price which should also be removed.
3. In Kms_driven there is "kms" in string and commas should be removed. plus need to change object dtype to integer data type.
4. There are 52 null values in kms_driven which should be handaled.
5. Fuel type has nan values.
6. Name is complicated so I will keep first 3 words of name.

### Data Cleaning 

In [11]:
#For Data security I am creating a data copy.
Backup = car.copy()

In [12]:
#1. first problem in data is :- Year has many Non-year values
car = car[car["year"].str.isnumeric()] #on String .str data we took the numeric values only by .isnumeric()

In [13]:
#2. year is in object dtype not in integer.
car["year"] = car["year"].astype(int) #data is converted to int with help of .astype

In [14]:
#3. Price has Ask for Price 
car = car[car["Price"]!="Ask For Price"] # Now we have only those entries in which "Ask for price" is not there.

In [15]:
#4. There are ' , ' in the Price.
car["Price"] = car["Price"].str.replace(',', '').astype(int)

In [16]:
#5. kms_driven has kms and there are '','' with kms.
car["kms_driven"] = car["kms_driven"].str.split( " " ).str.get(0).str.replace(',', '') 
#first we split the feature data from space. then i kept 0 index data. 
#then i replaced ',' with ' '

In [17]:
#6. kms_driven has some object data
car = car[car["kms_driven"]!= "Petrol"]

In [18]:
car["kms_driven"] = car["kms_driven"].astype(int)

In [19]:
#7. As there was one NaN value in fuel type by ~ that row is not showing 
car = car[~car["fuel_type"].isna()]

In [20]:
#8. Name is very confusing and mixed dtype so i am keeping only first 3 values of the name
car["name"] = car["name"].str.split(" ").str.slice(0,3).str.join( " ") 
#first split the name with space by .str.split()
# slice the values by .str.slice()
#then keep the first 3 values

In [21]:
#The index got currepted after cleaning so Reset_index and removed the old index values by drop=True
car = car.reset_index(drop=True)

In [22]:
#the car price must have outliers as the max price and upper quartile has too much difference.
car.describe()

Unnamed: 0,year,Price,kms_driven
count,816.0,816.0,816.0
mean,2012.444853,411717.6,46275.531863
std,4.002992,475184.4,34297.428044
min,1995.0,30000.0,0.0
25%,2010.0,175000.0,27000.0
50%,2013.0,299999.0,41000.0
75%,2015.0,491250.0,56818.5
max,2019.0,8500003.0,400000.0


In [23]:
car[car["Price"]>6e5] #There is 1 car which is more than 60million

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
6,Ford EcoSport Ambiente,Ford,2016,830000,24530,Diesel
14,Audi A8,Audi,2017,1000000,4000,Petrol
33,Toyota Innova 2.0,Toyota,2012,650000,82000,Diesel
34,Renault Lodgy 85,Renault,2018,689999,20000,Diesel
47,Mitsubishi Pajero Sport,Mitsubishi,2015,1475000,47000,Diesel
...,...,...,...,...,...,...
763,Mahindra Scorpio VLX,Mahindra,2014,650000,77000,Diesel
764,Toyota Innova 2.5,Toyota,2012,750000,75000,Diesel
771,Ford Endeavor 4x4,Ford,2019,2900000,9000,Diesel
777,Toyota Innova 2.5,Toyota,2011,750000,75000,Diesel


In [24]:
print(car['Price'].max())
print(car["Price"].min())
print(car["Price"].median())
print(car["Price"].mean())
print(car["Price"].mode())

8500003
30000
299999.0
411717.61519607843
0    250000
dtype: int32


In [25]:
car = car[car["Price"]<6e6].reset_index(drop= True)

In [26]:
#Save the cleaned data in CSV file 
car.to_csv("Cleaned Quikr Car data.csv")

#### Seperate Input and Output

In [27]:
X = car.drop(columns= "Price")
y = car["Price"]

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.2)

In [29]:
one_hot_encoder = OneHotEncoder()

In [30]:
one_hot_encoder.fit(X[["name", "company","fuel_type"]])

In [31]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [32]:
columns_trans = make_column_transformer((OneHotEncoder(categories=one_hot_encoder.categories_),["name", "company","fuel_type"]),remainder= "passthrough")

In [33]:
linear_model = LinearRegression()

In [34]:
pipe = make_pipeline(columns_trans, linear_model)

In [35]:
pipe.fit(X_train,y_train)

In [36]:
y_pred = pipe.predict(X_test)

In [37]:
r2_score(y_test,y_pred)

0.5015570684511671

In [38]:
score = []
for i in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.2,random_state= i)
    linear_model= LinearRegression()
    pipe = make_pipeline(columns_trans, linear_model)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(r2_score(y_test, y_pred), i)
    score.append(r2_score(y_test, y_pred))

0.7851453890306913 0
0.5132915312204238 1
0.6892725354000624 2
0.5995074944831891 3
0.8538385246570933 4
0.7471260027168454 5
0.6257480497733543 6
0.6859785458487092 7
0.7646070531666505 8
0.7310462860357275 9
0.7654765503858998 10
0.7341368163790868 11
0.5285834489676088 12
0.7808525437322886 13
0.7117950686830049 14
0.7415433943152754 15
0.518265788641995 16
0.7431801380636769 17
0.6922370979681447 18
0.6663243435300658 19
0.6627255577226918 20
0.6596918037505194 21
0.7959210157879429 22
0.6949705063229092 23
0.6238150045940181 24
0.5546159843674505 25
0.7016045504220352 26
0.8065710040237826 27
0.8203292146650685 28
0.5884485790500247 29
0.6060994654641244 30
0.6548877850685921 31
0.8020722175542976 32
0.819929347848787 33
0.5838847666489511 34
0.6876942993533651 35
0.6410199017674003 36
0.7368514998789212 37
0.7158048369498127 38
0.7967762282963207 39
0.3874238122753544 40
0.6735140333800669 41
0.642779520704603 42
0.7471072730423655 43
0.7504463752723509 44
0.7804008860194027 45
0

0.7923180749530958 369
0.7070913563475144 370
0.662657441935435 371
0.6810187903485755 372
0.706811586609255 373
0.5696166573926957 374
0.6745199676087248 375
0.734323958473701 376
0.657477682941859 377
0.794417070027259 378
0.7977718512498806 379
0.6499484254799506 380
0.7266728521681854 381
0.7800110730003508 382
0.8318933374337194 383
0.735884335256443 384
0.7926138428612797 385
0.7496064283424516 386
0.7755831849461698 387
0.5684336805692514 388
0.5892118189458166 389
0.7773472673215964 390
0.6225570553547541 391
0.8540875571489192 392
0.4429790006652613 393
0.696375824351753 394
0.7724095464508001 395
0.655347033992802 396
0.6801500783375971 397
0.6851389569347655 398
0.742251329238055 399
0.7175837428564928 400
0.6823496455117131 401
0.5332809363800869 402
0.48212360383695996 403
0.6798357050061195 404
0.6617649505130846 405
0.5517690657331147 406
0.7064504313299805 407
0.7733363009053829 408
0.7249508238342021 409
0.7768109489446889 410
0.675406845939017 411
0.761742414777364 41

0.6707596849676488 729
0.7458636665563099 730
0.5433205683759114 731
0.7930132562899577 732
0.7274352115883798 733
0.6333186826454318 734
0.5658053491110646 735
0.6443767027564705 736
0.5817087744360492 737
0.7517834080574579 738
0.6307134563151091 739
0.7357334033911005 740
0.6543756888620519 741
0.7940260908626962 742
0.7816052036329211 743
0.5806216386005132 744
0.6718501012081568 745
0.7537127734399642 746
0.5620691433785452 747
0.6342831062738941 748
0.5603178785045919 749
0.6011362860962559 750
0.7435332735978155 751
0.761690025482664 752
0.8062566068454544 753
0.8110391131436591 754
0.6359483851002365 755
0.7581442463064143 756
0.7789656524827451 757
0.5856868407140302 758
0.6881000152242465 759
0.7229543986055715 760
0.7564878979272142 761
0.746045910960323 762
0.6601494016645937 763
0.6284083032278097 764
0.705879832073135 765
0.690836804310306 766
0.6877501719795671 767
0.5380967303307058 768
0.7794772631135953 769
0.6982009518831176 770
0.7037738617862646 771
0.7992805680171

In [39]:
np.argmax(score)

661

In [40]:
score[np.argmax(score)]

0.8900345063168427

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.2,random_state= np.argmax(score))
linear_model= LinearRegression()
pipe = make_pipeline(columns_trans, linear_model)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(r2_score(y_test, y_pred), i)

0.8900345063168427 999


### Model Evaluation

In [42]:
import pickle

In [43]:
pickle.dump(pipe, open("LinearRegressionModel.pkl", "wb"))

### Let's predict one row

In [44]:
pipe.predict(pd.DataFrame([["Maruti Suzuki Swift","Maruti", 2019, 100, "Petrol"]], columns= ["name", "company", "year","kms_driven", "fuel_type"] ))

array([401379.05200387])