In [406]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, confusion_matrix, mean_squared_error
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

In [407]:
df=pd.read_csv("./Car_Detail.csv",low_memory=False)
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [408]:
#  Data Exploration
print('Number of Rows: ', data.shape[0])
print('Number of Columns: ', data.shape[1], '\n')
print('Columns Names:', data.columns, '\n')
print('SubSet of Data:\n ', data.head().to_string(), '\n')
print('Data Information: '); print(data.info(), '\n')
print('Data Describe:\n ', data.describe(), '\n')

Number of Rows:  4340
Number of Columns:  10 

Columns Names: Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'car_maker', 'car_model'],
      dtype='object') 

SubSet of Data:
                         name  year  selling_price  km_driven    fuel seller_type transmission         owner car_maker car_model
0             Maruti 800 AC  2007          60000      70000  Petrol  Individual       Manual   First Owner    Maruti       800
1  Maruti Wagon R LXI Minor  2007         135000      50000  Petrol  Individual       Manual   First Owner    Maruti     Wagon
2      Hyundai Verna 1.6 SX  2012         600000     100000  Diesel  Individual       Manual   First Owner   Hyundai     Verna
3    Datsun RediGO T Option  2017         250000      46000  Petrol  Individual       Manual   First Owner    Datsun    RediGO
4     Honda Amaze VX i-DTEC  2014         450000     141000  Diesel  Individual       Manual  Second Owner     Honda     Amaze

In [409]:
df.shape

(4340, 8)

In [410]:
# Top ten columns with missing values %
missing = df.count()/len(df)
missing = (1-missing)*100
missing.sort_values(ascending=False).head(10)

owner            0.0
transmission     0.0
seller_type      0.0
fuel             0.0
km_driven        0.0
selling_price    0.0
year             0.0
name             0.0
dtype: float64

In [411]:
# Check for duplicates
print(df.duplicated().any())
duplicated = df.duplicated()
print('Number of duplicated data: ', duplicated[duplicated == True].size)

True
Number of duplicated data:  763


In [412]:
print('Categorical Data: ')
print('Fuel unique values: ', df['fuel'].unique())
print('Number of unique values: ', df['fuel'].unique().size)
print('Seller type unique values: ', df['seller_type'].unique())
print('Number of unique values: ', df['seller_type'].unique().size)
print('Transmission unique values: ', df['transmission'].unique())
print('Number of unique values: ', df['transmission'].unique().size)
print('Owner unique values: ', df['owner'].unique())
print('Number of unique values: ', df['owner'].unique().size)

Categorical Data: 
Fuel unique values:  ['Petrol' 'Diesel' 'CNG' 'LPG' 'Electric']
Number of unique values:  5
Seller type unique values:  ['Individual' 'Dealer' 'Trustmark Dealer']
Number of unique values:  3
Transmission unique values:  ['Manual' 'Automatic']
Number of unique values:  2
Owner unique values:  ['First Owner' 'Second Owner' 'Fourth & Above Owner' 'Third Owner'
 'Test Drive Car']
Number of unique values:  5


In [413]:
# Unique name cars
print('Unique name of Cars: ', df['name'].unique().size)

Unique name of Cars:  1491


In [414]:
# name splitting
name = df['name'].str.split(' ', expand=True)
df['car_maker'] = name[0]
df['car_model'] = name[1]
print('Data After Splitting car Name')
print(df.head().to_string())

Data After Splitting car Name
                       name  year  selling_price  km_driven    fuel seller_type transmission         owner car_maker car_model
0             Maruti 800 AC  2007          60000      70000  Petrol  Individual       Manual   First Owner    Maruti       800
1  Maruti Wagon R LXI Minor  2007         135000      50000  Petrol  Individual       Manual   First Owner    Maruti     Wagon
2      Hyundai Verna 1.6 SX  2012         600000     100000  Diesel  Individual       Manual   First Owner   Hyundai     Verna
3    Datsun RediGO T Option  2017         250000      46000  Petrol  Individual       Manual   First Owner    Datsun    RediGO
4     Honda Amaze VX i-DTEC  2014         450000     141000  Diesel  Individual       Manual  Second Owner     Honda     Amaze


In [415]:
data=df.copy()

In [416]:
df['fuel'].replace(['Petrol', 'Diesel', 'LPG', 'CNG', 'Electric'], [1, 2, 3, 4, 5], inplace=True)
df['seller_type'].replace(['Individual', 'Dealer', 'Trustmark Dealer'], [1, 2, 3], inplace=True)
df['transmission'].replace(['Manual', 'Automatic'], [0, 1], inplace=True)
df['owner'].replace(['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner', 'Test Drive Car'],
                      [1, 2, 3, 4, 0], inplace=True)
df['number_of_years'] = 2023 - df['year']
print('**********************************************************************************')
print('Data Types after Replacement:\n ', df.dtypes)
print(df.head().to_string())

**********************************************************************************
Data Types after Replacement:
  name               object
year                int64
selling_price       int64
km_driven           int64
fuel                int64
seller_type         int64
transmission        int64
owner               int64
car_maker          object
car_model          object
number_of_years     int64
dtype: object
                       name  year  selling_price  km_driven  fuel  seller_type  transmission  owner car_maker car_model  number_of_years
0             Maruti 800 AC  2007          60000      70000     1            1             0      1    Maruti       800               16
1  Maruti Wagon R LXI Minor  2007         135000      50000     1            1             0      1    Maruti     Wagon               16
2      Hyundai Verna 1.6 SX  2012         600000     100000     2            1             0      1   Hyundai     Verna               11
3    Datsun RediGO T Option  2017    

In [417]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,car_maker,car_model,number_of_years
0,Maruti 800 AC,2007,60000,70000,1,1,0,1,Maruti,800,16
1,Maruti Wagon R LXI Minor,2007,135000,50000,1,1,0,1,Maruti,Wagon,16
2,Hyundai Verna 1.6 SX,2012,600000,100000,2,1,0,1,Hyundai,Verna,11
3,Datsun RediGO T Option,2017,250000,46000,1,1,0,1,Datsun,RediGO,6
4,Honda Amaze VX i-DTEC,2014,450000,141000,2,1,0,2,Honda,Amaze,9


In [418]:
df['Current_Year']=2023  #Adding the Current_Year in dataset

df['No_of_total_years']=df['Current_Year']-df['year'] 
df.drop(['year','Current_Year'],axis=1,inplace=True)

df.head()

Unnamed: 0,name,selling_price,km_driven,fuel,seller_type,transmission,owner,car_maker,car_model,number_of_years,No_of_total_years
0,Maruti 800 AC,60000,70000,1,1,0,1,Maruti,800,16,16
1,Maruti Wagon R LXI Minor,135000,50000,1,1,0,1,Maruti,Wagon,16,16
2,Hyundai Verna 1.6 SX,600000,100000,2,1,0,1,Hyundai,Verna,11,11
3,Datsun RediGO T Option,250000,46000,1,1,0,1,Datsun,RediGO,6,6
4,Honda Amaze VX i-DTEC,450000,141000,2,1,0,2,Honda,Amaze,9,9


In [419]:
df.dtypes

name                 object
selling_price         int64
km_driven             int64
fuel                  int64
seller_type           int64
transmission          int64
owner                 int64
car_maker            object
car_model            object
number_of_years       int64
No_of_total_years     int64
dtype: object

In [420]:
df.drop('name',axis=1,inplace=True)

In [421]:
df=pd.get_dummies(df,drop_first=True) #drop_first drops the first feature 

In [432]:
df.dtypes

selling_price        int64
km_driven            int64
fuel                 int64
seller_type          int64
transmission         int64
                     ...  
car_model_Zen        uint8
car_model_Zest       uint8
car_model_i10        uint8
car_model_i20        uint8
car_model_redi-GO    uint8
Length: 220, dtype: object

In [422]:
x=df.drop('selling_price',axis=1)
y=df['selling_price']
print(x.shape)
print(type(x))
print(y.shape)
print(type(y))

(4340, 219)
<class 'pandas.core.frame.DataFrame'>
(4340,)
<class 'pandas.core.series.Series'>


In [423]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [424]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [425]:
def eval_reg_metrix(ytest,ypred):
    mae = mean_absolute_error(ytest, ypred)

    mse = mean_squared_error(ytest, ypred)

    r2 = r2_score(ytest, ypred)

    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)
    print("R-squared (R2):", r2)

def train_test_score(model):
    print('Training score:', model.score(x_train, y_train))
    print('Testing score:', model.score(x_test, y_test))


In [426]:
m1=LinearRegression()
m1.fit(x_train,y_train)

LinearRegression()

In [427]:
train_test_score(m1)

Training score: 0.8996176322052579
Testing score: 0.860165775148753


In [428]:
y_pred_train=m1.predict(x_train)
print(y_pred_train)

[323033.46271737 797164.32822323  93808.14242933 ... 198256.75851161
 451472.52243325 283115.90338353]


In [429]:
y_pred=m1.predict(x_test)
print(y_pred)

[ 6.62397777e+05  1.60061289e+06  1.76165435e+05  4.95000000e+06
  7.56357446e+04  2.42730284e+05  4.95423176e+05  2.95921896e+05
  1.05416509e+06  3.77428927e+05  4.80285832e+05  7.41046676e+05
  7.56324303e+05  2.92612801e+05  1.33665462e+05  6.10309141e+05
  4.37563986e+05  3.30263225e+05  1.36682762e+05  7.20736973e+05
 -1.64340337e+05  3.91176158e+05  3.76942406e+05  3.31680322e+05
  4.17899889e+05  5.78603103e+05  5.99013394e+05  7.12669771e+05
  5.38567391e+05  3.01749549e+05  5.86171705e+05  3.63935807e+05
  9.13884953e+05  7.36172303e+05  2.19491126e+05  2.03933670e+05
  1.72824659e+05  1.92002296e+05  1.34979538e+05  5.33716729e+05
  2.18202065e+05  6.15984024e+05  4.17342067e+05  1.70474799e+06
  5.67179380e+05  4.17438516e+05 -3.66205126e+04  2.93063976e+05
  9.26953783e+05  1.50578838e+05  3.98238729e+04  4.58600444e+05
  5.48269385e+05  1.06004168e+06  5.82298970e+05  4.58579226e+05
  4.95000000e+06  3.01861425e+05  5.38137450e+05  3.24522703e+05
  2.80153088e+05  1.15425

In [430]:
eval_reg_metrix(y_test,y_pred)

Mean Absolute Error (MAE): 121923.43428997787
Mean Squared Error (MSE): 51927607581.71311
R-squared (R2): 0.860165775148753


In [431]:
scores=f'''
{'metric'.ljust(20)}{'train'.center(20)}{'test'.center(20)}
{'r2_score'.ljust(20)}{r2_score(y_train,m1.predict(x_train))}\t{r2_score(y_test,m1.predict(x_test))}
{'mean_squared_error'.ljust(20)}{mean_squared_error(y_train,m1.predict(x_train))}\t{mean_squared_error(y_test,m1.predict(x_test))}'''

print(scores)


metric                     train                test        
r2_score            0.8996176322052579	0.860165775148753
mean_squared_error  32651893013.746338	51927607581.71311
