In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("quikr_car.csv")

In [3]:
df.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [4]:
df.shape

(892, 6)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


In [6]:
# Clean the data 

In [7]:
df.fuel_type.unique()

array(['Petrol', 'Diesel', nan, 'LPG'], dtype=object)

# Quality 

- will first 3 letters of name 
- company has some numbers and some general words
- date also is very mess
- date we will remove nan, words
- Price has ask for price 
- Price has commas
- Price has more
- kms_driven has kms in th end and also need to change them to int
- fuel_type has nan values

# Cleaning

In [8]:
backup = df.copy()

In [9]:
df = df[df['year'].str.isnumeric()]

In [10]:
df.year = df.year.astype(int)

In [11]:
df = df[df.Price != "Ask For Price"]

In [12]:
df.Price = df.Price.str.replace(",", "").astype(int)

In [13]:
df.kms_driven = df["kms_driven"].str.split(" ").str.get(0).str.replace(",","")

In [14]:
df = df[df.kms_driven.str.isnumeric()]

In [15]:
df.kms_driven = df.kms_driven.astype(int)

In [16]:
df = df[~df.fuel_type.isna()]

In [17]:
df.name = df.name.str.split(" ").str.slice(0,3).str.join(" ")

In [19]:
df = df.reset_index(drop=True)

In [21]:
df.describe()

Unnamed: 0,year,Price,kms_driven
count,816.0,816.0,816.0
mean,2012.444853,411717.6,46275.531863
std,4.002992,475184.4,34297.428044
min,1995.0,30000.0,0.0
25%,2010.0,175000.0,27000.0
50%,2013.0,299999.0,41000.0
75%,2015.0,491250.0,56818.5
max,2019.0,8500003.0,400000.0


In [22]:
df = df[df['Price']<6e6]

In [29]:
df = df.reset_index(drop = True)

# Model

In [30]:
x = df.iloc[:, [0,1,2,4,5]]
y = df.iloc[:, 3]

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=.2)

In [35]:
from sklearn.linear_model import LinearRegression

In [36]:
from sklearn.metrics import r2_score

In [37]:
from sklearn.preprocessing import OneHotEncoder

In [38]:
ohe = OneHotEncoder()

In [51]:
ohe.fit(x[['name', 'company', 'fuel_type']])

In [52]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [53]:
col_trans = make_column_transformer((OneHotEncoder(categories=ohe.categories_), ['name', 'company', 'fuel_type']), remainder='passthrough')

In [54]:
lr = LinearRegression()

In [55]:
pipe = make_pipeline(col_trans, lr)

In [56]:
pipe.fit(x_train, y_train)

In [57]:
y_pred = pipe.predict(x_test)

In [59]:
r2_score(y_test, y_pred)

0.5228513473668908

In [64]:
scores = []
for i in range(1000):
    x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=.2, random_state=i)
    lr = LinearRegression()
    pipe = make_pipeline(col_trans, lr)
    pipe.fit(x_train, y_train)
    y_pred = pipe.predict(x_test)
    scores.append(r2_score(y_test, y_pred))

In [65]:
np.argmax(scores)

433

In [66]:
scores[np.argmax(scores)]

0.8456515104452564

In [68]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=.2, random_state=np.argmax(scores))
lr = LinearRegression()
pipe = make_pipeline(col_trans, lr)
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)
r2_score(y_test, y_pred)

0.8456515104452564

In [69]:
import pickle

In [70]:
pickle.dump(pipe, open('build.pkl', 'wb'))

In [71]:
df.to_csv("Cleaned_car_data.csv")