# Import Modules

In [204]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

# Data Collection

In [205]:
data = pd.read_csv('Cleaned_Car_data.csv')
data.shape

(816, 7)

In [206]:
data.head(820)

Unnamed: 0.1,Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,4,Ford Figo,Ford,2012,175000,41000,Diesel
...,...,...,...,...,...,...,...
811,811,Maruti Suzuki Ritz,Maruti,2011,270000,50000,Petrol
812,812,Tata Indica V2,Tata,2009,110000,30000,Diesel
813,813,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
814,814,Tata Zest XM,Tata,2018,260000,27000,Diesel


In [207]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  816 non-null    int64 
 1   name        816 non-null    object
 2   company     816 non-null    object
 3   year        816 non-null    int64 
 4   Price       816 non-null    int64 
 5   kms_driven  816 non-null    int64 
 6   fuel_type   816 non-null    object
dtypes: int64(4), object(3)
memory usage: 44.8+ KB


In [208]:
data.isnull().sum()

Unnamed: 0    0
name          0
company       0
year          0
Price         0
kms_driven    0
fuel_type     0
dtype: int64

In [209]:
list(data.select_dtypes(['object']).columns) # Data Type Object Columns in dataset

['name', 'company', 'fuel_type']

In [210]:
data.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Unnamed: 0,816.0,,,,407.5,235.703203,0.0,203.75,407.5,611.25,815.0
name,816.0,254.0,Maruti Suzuki Swift,51.0,,,,,,,
company,816.0,25.0,Maruti,221.0,,,,,,,
year,816.0,,,,2012.444853,4.002992,1995.0,2010.0,2013.0,2015.0,2019.0
Price,816.0,,,,411717.615196,475184.422264,30000.0,175000.0,299999.0,491250.0,8500003.0
kms_driven,816.0,,,,46275.531863,34297.428044,0.0,27000.0,41000.0,56818.5,400000.0
fuel_type,816.0,3.0,Petrol,428.0,,,,,,,


In [211]:
data['company'].unique()

array(['Hyundai', 'Mahindra', 'Ford', 'Maruti', 'Skoda', 'Audi', 'Toyota',
       'Renault', 'Honda', 'Datsun', 'Mitsubishi', 'Tata', 'Volkswagen',
       'Chevrolet', 'Mini', 'BMW', 'Nissan', 'Hindustan', 'Fiat', 'Force',
       'Mercedes', 'Land', 'Jaguar', 'Jeep', 'Volvo'], dtype=object)

# Extracting Training Data

In [212]:
X=data[['name','company','year','kms_driven','fuel_type']]
y=data['Price']
X

Unnamed: 0,name,company,year,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,36000,Diesel
4,Ford Figo,Ford,2012,41000,Diesel
...,...,...,...,...,...
811,Maruti Suzuki Ritz,Maruti,2011,50000,Petrol
812,Tata Indica V2,Tata,2009,30000,Diesel
813,Toyota Corolla Altis,Toyota,2009,132000,Petrol
814,Tata Zest XM,Tata,2018,27000,Diesel


In [213]:
print(X.shape, y.shape)

(816, 5) (816,)


# Split dataset into training set and test set

In [214]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2)

In [215]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

### Creating an OneHotEncoder object to contain all the possible categories

In [216]:
one=OneHotEncoder(handle_unknown = 'ignore')
one.fit(X[['name','company','fuel_type']])

OneHotEncoder(handle_unknown='ignore')

### Creating a column transformer to transform categorical columns

In [217]:
column_trans=make_column_transformer((OneHotEncoder(categories=one.categories_),['name','company','fuel_type']),
                                    remainder='passthrough')

In [218]:
lr = LinearRegression()

In [219]:
model = make_pipeline(column_trans,lr)

In [220]:
model.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories=[array(['Audi A3 Cabriolet', 'Audi A4 1.8', 'Audi A4 2.0', 'Audi A6 2.0',
       'Audi A8', 'Audi Q3 2.0', 'Audi Q5 2.0', 'Audi Q7', 'BMW 3 Series',
       'BMW 5 Series', 'BMW 7 Series', 'BMW X1', 'BMW X1 sDrive20d',
       'BMW X1 xDrive20d', 'Chevrolet Beat', 'Chevrolet Beat...
                                                                            array(['Audi', 'BMW', 'Chevrolet', 'Datsun', 'Fiat', 'Force', 'Ford',
       'Hindustan', 'Honda', 'Hyundai', 'Jaguar', 'Jeep', 'Land',
       'Mahindra', 'Maruti', 'Mercedes', 'Mini', 'Mitsubishi', 'Nissan',
       'Renault', 'Skoda', 'Tata', 'Toyota', 'Volkswagen', 'Volvo'],
      dtype=object),
                                                                            array(['Diesel', 'LPG', 'Pe

In [221]:
y_pred=model.predict(X_test)
y_pred

array([ 3.93033654e+04,  2.77547332e+05,  3.31568873e+05,  3.75078114e+05,
        2.92790400e+05,  2.56888143e+05,  5.39247447e+05,  2.56495913e+05,
        1.33501792e+06,  5.45429543e+05,  3.24392034e+05,  2.72413297e+05,
        5.36735264e+05,  6.08545742e+05,  6.05649330e+05,  2.61515866e+05,
        6.36198176e+05,  4.59572525e+05,  3.50443939e+05,  3.18902950e+05,
        4.23977259e+05,  4.20845660e+05,  2.93444396e+05,  3.49212467e+05,
        3.99621785e+05,  3.87452468e+05,  3.14493872e+05,  1.08636168e+05,
        2.37111094e+05,  2.90630865e+05,  5.72453952e+05,  2.41815428e+05,
        3.88771648e+05,  3.62478608e+05,  3.02537935e+05,  2.67796120e+05,
        3.26059122e+05,  9.48014509e+05,  5.91709423e+05,  6.31033018e+05,
        3.29077096e+05, -2.98406622e+04,  8.46594796e+04,  1.22305977e+06,
        4.19585478e+05,  1.98851657e+05, -3.00113193e+03,  4.20845660e+05,
        2.49820100e+05,  3.31526634e+05,  2.69454076e+05,  4.13791246e+05,
        2.18413638e+05,  

## Finding the model with a random state of TrainTestSplit where the model was found to give almost 0.92 as r2_score

In [222]:
scores=[]
for i in range(1000):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=i)
    lr=LinearRegression()
    model=make_pipeline(column_trans,lr)
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    scores.append(r2_score(y_test,y_pred))

In [223]:
scores[np.argmax(scores)]

0.9210018258125283

In [224]:
input = pd.DataFrame(columns=X_test.columns,data=np.array(['Mahindra Quanto C8','Mahindra',2013,40000,'Diesel']).reshape(1,5))
model.predict(input)

array([353009.2356352])

In [225]:
import pickle

In [226]:
filename = 'car_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [227]:
loaded_model = pickle.load(open('car_model.sav', 'rb'))

In [228]:
for column in X.columns:
  print(column)

name
company
year
kms_driven
fuel_type
