In [112]:
import numpy as np
import pandas as pd
import plotly.express as px

In [113]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


# Data Exploration

In [114]:
# Check data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  New_Price          824 non-null    object 
 12  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(9)
memory usage: 611.4+ KB


In [115]:
# Check Summary statistics for numerical columns
df.describe().round(2)

Unnamed: 0,Year,Kilometers_Driven,Seats,Price
count,6019.0,6019.0,5977.0,6019.0
mean,2013.36,58738.38,5.28,9.48
std,3.27,91268.84,0.81,11.19
min,1998.0,171.0,0.0,0.44
25%,2011.0,34000.0,5.0,3.5
50%,2014.0,53000.0,5.0,5.64
75%,2016.0,73000.0,5.0,9.95
max,2019.0,6500000.0,10.0,160.0


In [116]:
# Check Summary statistics for categorical columns
df.describe(include= 'object')

Unnamed: 0,Name,Location,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,New_Price
count,6019,6019,6019,6019,6019,6017,5983,5983,824
unique,1876,11,5,2,4,442,146,372,540
top,Mahindra XUV500 W8 2WD,Mumbai,Diesel,Manual,First,18.9 kmpl,1197 CC,74 bhp,95.13 Lakh
freq,49,790,3205,4299,4929,172,606,235,6


In [117]:
# Check duplicates
df.duplicated().sum()

0

In [118]:
# Check missing values
df.isna().sum()

Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  42
New_Price            5195
Price                   0
dtype: int64

In [119]:
# Check missing values
df.isna().mean().round(4) * 100

Name                  0.00
Location              0.00
Year                  0.00
Kilometers_Driven     0.00
Fuel_Type             0.00
Transmission          0.00
Owner_Type            0.00
Mileage               0.03
Engine                0.60
Power                 0.60
Seats                 0.70
New_Price            86.31
Price                 0.00
dtype: float64

### Data Cleaning

In [120]:
# Drop New_Price column
df.drop('New_Price', axis= 1, inplace= True)

In [121]:
df.duplicated().sum()

0

In [122]:
# Check each categorical column in depth

cat_cols = df.select_dtypes(include= 'object').columns

for col in cat_cols:

    print(col)
    print(df[col].nunique())
    print(df[col].unique())
    print('-' * 100)

Name
1876
['Maruti Wagon R LXI CNG' 'Hyundai Creta 1.6 CRDi SX Option'
 'Honda Jazz V' ... 'Volkswagen Polo IPL II 1.2 Petrol Highline'
 'Tata Bolt Revotron XT' 'Mahindra Xylo D4 BSIV']
----------------------------------------------------------------------------------------------------
Location
11
['Mumbai' 'Pune' 'Chennai' 'Coimbatore' 'Hyderabad' 'Jaipur' 'Kochi'
 'Kolkata' 'Delhi' 'Bangalore' 'Ahmedabad']
----------------------------------------------------------------------------------------------------
Fuel_Type
5
['CNG' 'Diesel' 'Petrol' 'LPG' 'Electric']
----------------------------------------------------------------------------------------------------
Transmission
2
['Manual' 'Automatic']
----------------------------------------------------------------------------------------------------
Owner_Type
4
['First' 'Second' 'Fourth & Above' 'Third']
----------------------------------------------------------------------------------------------------
Mileage
442
['26.6 km/kg' '19.67 k

In [123]:
def clean_cols(x):

    if type(x) == float:
        return x

    elif 'null' in x:
        return np.nan

    else:
        return float(x.split()[0])

for col in ['Mileage', 'Engine', 'Power']:

    df[col] = df[col].apply(clean_cols)

In [124]:
df[['Mileage', 'Engine', 'Power']]

Unnamed: 0,Mileage,Engine,Power
0,26.60,998.0,58.16
1,19.67,1582.0,126.20
2,18.20,1199.0,88.70
3,20.77,1248.0,88.76
4,15.20,1968.0,140.80
...,...,...,...
6014,28.40,1248.0,74.00
6015,24.40,1120.0,71.00
6016,14.00,2498.0,112.00
6017,18.90,998.0,67.10


In [125]:
df.dropna(inplace= True, ignore_index= True)

In [126]:
df.isna().sum()

Name                 0
Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
dtype: int64

In [127]:
# In depth check for numerical columns

for col in df.select_dtypes(include= 'number').columns:

    px.histogram(data_frame= df, x= col, title= col).show()

In [128]:
df[df.Kilometers_Driven > 1000000]

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
2269,BMW X5 xDrive 30d M Sport,Chennai,2017,6500000,Diesel,Automatic,First,15.97,2993.0,258.0,5.0,65.0


In [129]:
drop_idx = df[df.Kilometers_Driven > 1000000].index
drop_idx

Index([2269], dtype='int64')

In [130]:
df.drop(drop_idx, inplace= True)

In [131]:
df.reset_index(inplace= True, drop= True)

In [132]:
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74


# Feature Engineering

In [133]:
def extrat_brand(x):

    return x.split()[0]

df['brand'] = df.Name.apply(extrat_brand)
df['brand']

0          Maruti
1         Hyundai
2           Honda
3          Maruti
4            Audi
          ...    
5866       Maruti
5867      Hyundai
5868     Mahindra
5869       Maruti
5870    Chevrolet
Name: brand, Length: 5871, dtype: object

In [134]:
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,brand
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75,Maruti
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5,Hyundai
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5,Honda
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0,Maruti
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74,Audi


In [135]:
df.drop('Name', axis= 1, inplace= True)

In [136]:
df.duplicated().sum()

2

In [137]:
df.drop_duplicates(inplace= True, ignore_index= True)

In [138]:
df.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,brand
0,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75,Maruti
1,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5,Hyundai
2,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5,Honda
3,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0,Maruti
4,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74,Audi


In [139]:
df.brand.nunique()

30

In [140]:
df.brand.value_counts()

brand
Maruti           1173
Hyundai          1058
Honda             600
Toyota            394
Mercedes-Benz     316
Volkswagen        314
Ford              294
Mahindra          268
BMW               261
Audi              235
Tata              183
Skoda             172
Renault           145
Chevrolet         120
Nissan             89
Land               57
Jaguar             40
Mitsubishi         27
Mini               26
Fiat               23
Volvo              21
Porsche            16
Jeep               15
Datsun             13
Force               3
ISUZU               2
Ambassador          1
Isuzu               1
Bentley             1
Lamborghini         1
Name: count, dtype: int64

In [141]:
df[(df.brand == 'Ambassador') | (df.brand == 'Force') | (df.brand == 'ISUZU') | (df.brand == 'Isuzu') | (df.brand == 'Bentley') | (df.brand == 'Lamborghini')]

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,brand
1188,Chennai,2003,80000,Diesel,Manual,Third,12.8,1489.0,35.5,5.0,1.35,Ambassador
2819,Jaipur,2017,34429,Diesel,Automatic,First,13.8,2999.0,174.57,7.0,20.0,Isuzu
3528,Coimbatore,2018,20422,Diesel,Manual,First,12.4,2499.0,134.0,5.0,16.09,ISUZU
4867,Chennai,2014,50000,Diesel,Manual,First,17.0,2200.0,139.01,7.0,12.0,Force
5049,Kolkata,2014,41000,Diesel,Manual,First,17.0,2149.0,139.07,7.0,8.0,Force
5072,Kolkata,2014,41000,Diesel,Manual,First,17.0,2200.0,139.01,7.0,8.0,Force
5382,Hyderabad,2006,48000,Petrol,Automatic,First,8.6,5998.0,552.0,4.0,59.0,Bentley
5637,Delhi,2011,6500,Petrol,Automatic,Third,6.4,5204.0,560.0,2.0,120.0,Lamborghini
5830,Jaipur,2017,25000,Diesel,Manual,First,12.4,2499.0,134.0,5.0,8.0,ISUZU


In [142]:
drop_idx = df[(df.brand == 'Ambassador') | (df.brand == 'Force') | (df.brand == 'ISUZU') | (df.brand == 'Isuzu') | (df.brand == 'Bentley') | (df.brand == 'Lamborghini')].index
drop_idx

Index([1188, 2819, 3528, 4867, 5049, 5072, 5382, 5637, 5830], dtype='int64')

In [143]:
df.drop(drop_idx, inplace= True)

df.reset_index(inplace= True, drop= True)

In [144]:
df.brand.value_counts()

brand
Maruti           1173
Hyundai          1058
Honda             600
Toyota            394
Mercedes-Benz     316
Volkswagen        314
Ford              294
Mahindra          268
BMW               261
Audi              235
Tata              183
Skoda             172
Renault           145
Chevrolet         120
Nissan             89
Land               57
Jaguar             40
Mitsubishi         27
Mini               26
Fiat               23
Volvo              21
Porsche            16
Jeep               15
Datsun             13
Name: count, dtype: int64

In [145]:
df[df.Mileage == 0]

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,brand
14,Pune,2012,85000,Diesel,Automatic,Second,0.0,2179.0,115.0,5.0,17.5,Land
67,Coimbatore,2019,15369,Diesel,Automatic,First,0.0,1950.0,194.0,5.0,35.67,Mercedes-Benz
526,Kochi,2019,13190,Petrol,Automatic,First,0.0,1950.0,181.43,5.0,38.99,Mercedes-Benz
685,Pune,2014,120000,Diesel,Automatic,First,0.0,2987.0,165.0,5.0,30.0,Mercedes-Benz
932,Mumbai,2018,8682,Diesel,Automatic,First,0.0,1950.0,194.0,5.0,39.5,Mercedes-Benz
966,Pune,2008,93000,Petrol,Manual,First,0.0,1086.0,62.0,5.0,1.45,Hyundai
1029,Hyderabad,2010,58163,Petrol,Manual,First,0.0,1086.0,62.0,5.0,2.45,Hyundai
1225,Bangalore,2010,125000,Diesel,Automatic,Second,0.0,2179.0,115.0,5.0,11.0,Land
1272,Bangalore,2014,33000,Diesel,Automatic,Second,0.0,2987.0,165.0,5.0,43.0,Mercedes-Benz
1315,Kochi,2011,20842,Petrol,Manual,First,0.0,1086.0,62.0,5.0,2.78,Hyundai


In [146]:
drop_index = df[df.Mileage == 0].index
drop_index

Index([  14,   67,  526,  685,  932,  966, 1029, 1225, 1272, 1315, 1717, 2603,
       2959, 3007, 3122, 3182, 4119, 4187, 4295, 4566, 4582, 4881, 4887, 5169,
       5232, 5721, 5815, 5852],
      dtype='int64')

In [147]:
df.drop(drop_index, inplace= True)

df.reset_index(inplace= True, drop= True)

In [148]:
df

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,brand
0,Mumbai,2010,72000,CNG,Manual,First,26.60,998.0,58.16,5.0,1.75,Maruti
1,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.20,5.0,12.50,Hyundai
2,Chennai,2011,46000,Petrol,Manual,First,18.20,1199.0,88.70,5.0,4.50,Honda
3,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.00,Maruti
4,Coimbatore,2013,40670,Diesel,Automatic,Second,15.20,1968.0,140.80,5.0,17.74,Audi
...,...,...,...,...,...,...,...,...,...,...,...,...
5827,Delhi,2014,27365,Diesel,Manual,First,28.40,1248.0,74.00,5.0,4.75,Maruti
5828,Jaipur,2015,100000,Diesel,Manual,First,24.40,1120.0,71.00,5.0,4.00,Hyundai
5829,Jaipur,2012,55000,Diesel,Manual,Second,14.00,2498.0,112.00,8.0,2.90,Mahindra
5830,Kolkata,2013,46000,Petrol,Manual,First,18.90,998.0,67.10,5.0,2.65,Maruti


# Data Preprocessing

### 1) Split Data into input features and Target Variable

In [149]:
x = df.drop('Price', axis= 1)
y = df['Price']

### 2) Split Data into Train & Test

In [150]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state= 0)

### 3) Handling Numerical Columns

In [151]:
scaling_cols = x_train.select_dtypes(include= 'number').columns
scaling_cols

Index(['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats'], dtype='object')

In [152]:
from sklearn.preprocessing import RobustScaler

rc = RobustScaler()

x_train[scaling_cols] = rc.fit_transform(x_train[scaling_cols])
x_test[scaling_cols] = rc.transform(x_test[scaling_cols])

### 4) Handling Categorical Columns

In [153]:
x_train.select_dtypes(include= 'object')

Unnamed: 0,Location,Fuel_Type,Transmission,Owner_Type,brand
1185,Delhi,Petrol,Automatic,First,Toyota
5757,Mumbai,Diesel,Manual,First,Toyota
1371,Pune,Petrol,Automatic,First,Honda
1563,Chennai,Diesel,Manual,First,Hyundai
1843,Ahmedabad,Diesel,Automatic,First,Toyota
...,...,...,...,...,...
4931,Mumbai,Diesel,Manual,First,Hyundai
3264,Bangalore,Petrol,Manual,First,Hyundai
1653,Delhi,Petrol,Automatic,First,Maruti
2607,Pune,Diesel,Manual,First,Tata


In [154]:
nominal_cols = x_train.select_dtypes(include= 'object').drop('Owner_Type', axis = 1).columns
nominal_cols

Index(['Location', 'Fuel_Type', 'Transmission', 'brand'], dtype='object')

In [155]:
ordinal_cols = ['Owner_Type']
ordinal_cols

['Owner_Type']

In [156]:
for col in nominal_cols:
    print(col)
    print(df[col].nunique())

Location
11
Fuel_Type
4
Transmission
2
brand
24


In [157]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output= False, drop= 'first')

ohe_train_arr = ohe.fit_transform(x_train[nominal_cols])
ohe_test_arr = ohe.transform(x_test[nominal_cols])

In [158]:
ohe_train_df = pd.DataFrame(ohe_train_arr, columns= ohe.get_feature_names_out())
ohe_test_df = pd.DataFrame(ohe_test_arr, columns= ohe.get_feature_names_out())
ohe_train_df

Unnamed: 0,Location_Bangalore,Location_Chennai,Location_Coimbatore,Location_Delhi,Location_Hyderabad,Location_Jaipur,Location_Kochi,Location_Kolkata,Location_Mumbai,Location_Pune,...,brand_Mini,brand_Mitsubishi,brand_Nissan,brand_Porsche,brand_Renault,brand_Skoda,brand_Tata,brand_Toyota,brand_Volkswagen,brand_Volvo
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4660,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4661,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4662,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4663,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [159]:
x_train

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,brand
1185,Delhi,0.75,-0.814677,Petrol,Automatic,First,-0.737762,0.396104,0.714422,0.0,Toyota
5757,Mumbai,-0.50,1.344383,Diesel,Manual,First,-0.963287,1.300000,0.111058,3.0,Toyota
1371,Pune,0.50,-0.814677,Petrol,Automatic,First,0.087413,-0.381818,-0.068221,0.0,Honda
1563,Chennai,-0.50,1.903198,Diesel,Manual,First,0.667832,0.115584,0.528637,0.0,Hyundai
1843,Ahmedabad,0.75,-0.967081,Diesel,Automatic,First,-0.979021,1.638961,1.293035,2.0,Toyota
...,...,...,...,...,...,...,...,...,...,...,...
4931,Mumbai,-0.25,0.664279,Diesel,Manual,First,0.611888,0.115584,0.526733,0.0,Hyundai
3264,Bangalore,0.25,-0.078056,Petrol,Manual,First,0.069930,-0.384416,-0.176741,0.0,Hyundai
1653,Delhi,0.75,-0.230461,Petrol,Automatic,First,0.506993,-0.384416,-0.157068,0.0,Maruti
2607,Pune,-2.00,2.716020,Diesel,Manual,First,-0.262238,-0.114286,-0.364906,0.0,Tata


In [160]:
x_train.reset_index(inplace= True, drop= True)
x_test.reset_index(inplace= True, drop= True)

y_train.reset_index(inplace= True, drop= True)
y_test.reset_index(inplace= True, drop= True)

In [161]:
x_train = pd.concat([x_train, ohe_train_df], axis= 1).drop(nominal_cols, axis= 1)

x_test = pd.concat([x_test, ohe_test_df], axis= 1).drop(nominal_cols, axis= 1)

In [162]:
from sklearn.preprocessing import OrdinalEncoder

ord = OrdinalEncoder(categories= [['Fourth & Above', 'Third', 'Second', 'First']])

x_train['Owner_Type'] = ord.fit_transform(x_train[['Owner_Type']])
x_test['Owner_Type'] = ord.transform(x_test[['Owner_Type']])

In [163]:
x_train.head()

Unnamed: 0,Year,Kilometers_Driven,Owner_Type,Mileage,Engine,Power,Seats,Location_Bangalore,Location_Chennai,Location_Coimbatore,...,brand_Mini,brand_Mitsubishi,brand_Nissan,brand_Porsche,brand_Renault,brand_Skoda,brand_Tata,brand_Toyota,brand_Volkswagen,brand_Volvo
0,0.75,-0.814677,3.0,-0.737762,0.396104,0.714422,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.5,1.344383,3.0,-0.963287,1.3,0.111058,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.5,-0.814677,3.0,0.087413,-0.381818,-0.068221,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.5,1.903198,3.0,0.667832,0.115584,0.528637,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.75,-0.967081,3.0,-0.979021,1.638961,1.293035,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Machine Learning

### Linear Regression Without Scaling

In [164]:
# from sklearn.linear_model import LinearRegression

# le = LinearRegression()

# le.fit(x_train, y_train)

# round(le.score(x_test, y_test) * 100, 2)

### Gradient Descent Without Scaling

In [165]:
# from sklearn.linear_model import SGDRegressor

# sgd = SGDRegressor()

# sgd.fit(x_train, y_train)

# round(sgd.score(x_test, y_test) * 100, 2)

### Linear Regression With Scaling

In [166]:
from sklearn.linear_model import LinearRegression

le = LinearRegression()

le.fit(x_train, y_train)

print('Train Score :', round(le.score(x_train, y_train) * 100, 2))
print('Test Score :', round(le.score(x_test, y_test) * 100, 2))

Train Score : 79.47
Test Score : 72.11


In [167]:
from sklearn.metrics import root_mean_squared_error

le = LinearRegression()

le.fit(x_train, y_train)

y_pred = le.predict(x_test)

round(root_mean_squared_error(y_test, y_pred), 2)

6.6

In [168]:
le.predict(x_test)

array([ 3.32049066,  2.50672622,  7.45931392, ...,  4.28820638,
        8.74919087, 10.89018227])

### Gradient Descent With Scaling

In [169]:
from sklearn.linear_model import SGDRegressor

sgd = SGDRegressor()

sgd.fit(x_train, y_train)

print('Train Score :', round(sgd.score(x_train, y_train) * 100, 2))
print('Test Score :', round(sgd.score(x_test, y_test) * 100, 2))

Train Score : 78.08
Test Score : 69.76


In [170]:
from sklearn.linear_model import SGDRegressor

sgd = SGDRegressor()

sgd.fit(x_train, y_train)

y_pred = sgd.predict(x_test)

round(root_mean_squared_error(y_test, y_pred), 2)

6.8

### Polynomial with degree = 2

In [171]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree= 2)

x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

le_poly = LinearRegression()

le_poly.fit(x_train_poly, y_train)

print('Train Score :', round(le_poly.score(x_train_poly, y_train) * 100, 2))
print('Test Score :', round(le_poly.score(x_test_poly, y_test) * 100, 2))

Train Score : 95.85
Test Score : -4.311572802492705e+17


### Polynomial with degree = 3

In [172]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree= 3)

x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

le_poly = LinearRegression()

le_poly.fit(x_train_poly, y_train)

print('Train Score :', round(le_poly.score(x_train_poly, y_train) * 100, 2))
print('Test Score :', round(le_poly.score(x_test_poly, y_test) * 100, 2))

Train Score : 99.36
Test Score : -29738.05


In [173]:
x_train_poly.shape

(4665, 16215)

### Lasso Only

In [174]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha= 0.01)

lasso.fit(x_train, y_train)

print('Train Score :', round(lasso.score(x_train, y_train) * 100, 2))
print('Test Score :', round(lasso.score(x_test, y_test) * 100, 2))

Train Score : 78.59
Test Score : 70.86


In [175]:
lasso.coef_

array([ 3.3794252 , -0.76135511,  0.23013946, -0.41370474,  1.25860656,
        6.59566543, -0.14909651,  1.50075912,  0.69445334,  1.75201223,
       -0.77527105,  1.34240894,  0.41254673, -0.20399704, -1.30472775,
       -0.87803833, -0.        ,  0.        ,  0.        , -1.24306265,
       -0.77695225,  1.48794137, -3.31689495, -0.        , -0.12633072,
       -2.65669112, -4.50706875, -3.48312143,  5.31595327, -0.30567278,
       13.05585892, -5.4942245 , -1.79442826,  3.60923316,  6.77624173,
       -1.04425301, -2.90090193,  7.23175813, -3.54089445, -4.2388072 ,
       -3.6414336 , -1.97141187, -3.87610468, -0.06380655])

In [176]:
lasso.feature_names_in_

array(['Year', 'Kilometers_Driven', 'Owner_Type', 'Mileage', 'Engine',
       'Power', 'Seats', 'Location_Bangalore', 'Location_Chennai',
       'Location_Coimbatore', 'Location_Delhi', 'Location_Hyderabad',
       'Location_Jaipur', 'Location_Kochi', 'Location_Kolkata',
       'Location_Mumbai', 'Location_Pune', 'Fuel_Type_Diesel',
       'Fuel_Type_LPG', 'Fuel_Type_Petrol', 'Transmission_Manual',
       'brand_BMW', 'brand_Chevrolet', 'brand_Datsun', 'brand_Fiat',
       'brand_Ford', 'brand_Honda', 'brand_Hyundai', 'brand_Jaguar',
       'brand_Jeep', 'brand_Land', 'brand_Mahindra', 'brand_Maruti',
       'brand_Mercedes-Benz', 'brand_Mini', 'brand_Mitsubishi',
       'brand_Nissan', 'brand_Porsche', 'brand_Renault', 'brand_Skoda',
       'brand_Tata', 'brand_Toyota', 'brand_Volkswagen', 'brand_Volvo'],
      dtype=object)

In [177]:
lasso_df = pd.DataFrame({'Feature' : lasso.feature_names_in_,
                         'Coeff' : lasso.coef_})

lasso_df[lasso_df.Coeff != 0]

Unnamed: 0,Feature,Coeff
0,Year,3.379425
1,Kilometers_Driven,-0.761355
2,Owner_Type,0.230139
3,Mileage,-0.413705
4,Engine,1.258607
5,Power,6.595665
6,Seats,-0.149097
7,Location_Bangalore,1.500759
8,Location_Chennai,0.694453
9,Location_Coimbatore,1.752012


### Polynomial with Lasso

In [178]:
from sklearn.linear_model import Lasso

poly = PolynomialFeatures(degree= 2)

x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

lasso_poly = Lasso(alpha= 0.001)

lasso_poly.fit(x_train_poly, y_train)

print('Train Score :', round(lasso_poly.score(x_train_poly, y_train) * 100, 2))
print('Test Score :', round(lasso_poly.score(x_test_poly, y_test) * 100, 2))

Train Score : 95.31
Test Score : 87.77



Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.023e+03, tolerance: 5.305e+01



In [179]:
x_train_poly.shape

(4665, 1035)

In [180]:
lasso_poly.coef_

array([ 0.        ,  4.93585532, -1.51885828, ..., -0.        ,
        0.        ,  0.        ])

In [181]:
poly.get_feature_names_out()

array(['1', 'Year', 'Kilometers_Driven', ..., 'brand_Volkswagen^2',
       'brand_Volkswagen brand_Volvo', 'brand_Volvo^2'], dtype=object)

In [182]:
lasso_df = pd.DataFrame({'Feature' : poly.get_feature_names_out(),
                         'Coeff' : lasso_poly.coef_})

lasso_df[(lasso_df.Coeff != 0)]

Unnamed: 0,Feature,Coeff
1,Year,4.935855
2,Kilometers_Driven,-1.518858
3,Owner_Type,1.507781
4,Mileage,-1.136906
5,Engine,2.602957
...,...,...
930,brand_Land^2,-0.434859
944,brand_Mahindra^2,-0.113162
957,brand_Maruti^2,1.127667
969,brand_Mercedes-Benz^2,-1.831933


### Polynomial with Ridge

In [183]:
from sklearn.linear_model import Ridge

poly = PolynomialFeatures(degree= 2)

x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

ridge_poly = Ridge(alpha= 1)

ridge_poly.fit(x_train_poly, y_train)

print('Train Score :', round(ridge_poly.score(x_train_poly, y_train) * 100, 2))
print('Test Score :', round(ridge_poly.score(x_test_poly, y_test) * 100, 2))

Train Score : 95.42
Test Score : 88.0


In [184]:
ridge_poly.coef_

array([ 0.        ,  4.71968752, -2.08828337, ..., -1.08320059,
        0.        ,  0.10731031])

### ElasticNet with Ridge

In [185]:
from sklearn.linear_model import ElasticNet

poly = PolynomialFeatures(degree= 2)

x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

elastic_poly = ElasticNet(alpha= 0.001, l1_ratio= 0.5)

elastic_poly.fit(x_train_poly, y_train)

print('Train Score :', round(elastic_poly.score(x_train_poly, y_train) * 100, 2))
print('Test Score :', round(elastic_poly.score(x_test_poly, y_test) * 100, 2))

Train Score : 94.98
Test Score : 87.83



Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.028e+04, tolerance: 5.305e+01



In [186]:
y_train

0       18.00
1        8.40
2        6.35
3        5.00
4       30.00
        ...  
4660     5.50
4661     4.75
4662     6.20
4663     1.00
4664     3.29
Name: Price, Length: 4665, dtype: float64

In [187]:
np.log(y_train)

0       2.890372
1       2.128232
2       1.848455
3       1.609438
4       3.401197
          ...   
4660    1.704748
4661    1.558145
4662    1.824549
4663    0.000000
4664    1.190888
Name: Price, Length: 4665, dtype: float64

### Linear Regression With Target Scaling

In [188]:
from sklearn.linear_model import LinearRegression

le = LinearRegression()

le.fit(x_train, np.log(y_train))

print('Train Score :', round(le.score(x_train, np.log(y_train)) * 100, 2))
print('Test Score :', round(le.score(x_test, np.log(y_test)) * 100, 2))

Train Score : 92.7
Test Score : 92.39


### Gradient Descent With Target Scaling

In [189]:
from sklearn.linear_model import SGDRegressor

sgd = SGDRegressor()

sgd.fit(x_train, np.log(y_train))

print('Train Score :', round(sgd.score(x_train, np.log(y_train)) * 100, 2))
print('Test Score :', round(sgd.score(x_test, np.log(y_test)) * 100, 2))

Train Score : 90.42
Test Score : 89.82


### Polynomial With Target Scaling

In [190]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree= 2)

x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

le_poly = LinearRegression()

le_poly.fit(x_train_poly, np.log(y_train))

print('Train Score :', round(le_poly.score(x_train_poly, np.log(y_train)) * 100, 2))
print('Test Score :', round(le_poly.score(x_test_poly, np.log(y_test)) * 100, 2))

Train Score : 96.42
Test Score : -4.071766084656008e+18


### Polynomial With Lasso with Target Scaling

In [191]:
from sklearn.linear_model import Lasso

poly = PolynomialFeatures(degree= 2)

x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

lasso_poly = Lasso(alpha= 0.001)

lasso_poly.fit(x_train_poly, np.log(y_train))

print('Train Score :', round(lasso_poly.score(x_train_poly, np.log(y_train)) * 100, 2))
print('Test Score :', round(lasso_poly.score(x_test_poly, np.log(y_test)) * 100, 2))

Train Score : 94.46
Test Score : 93.59


### Polynomial with Ridge with Target Scaling

In [192]:
from sklearn.linear_model import Ridge

poly = PolynomialFeatures(degree= 2)

x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

ridge_poly = Ridge(alpha= 1)

ridge_poly.fit(x_train_poly, np.log(y_train))

print('Train Score :', round(ridge_poly.score(x_train_poly, np.log(y_train)) * 100, 2))
print('Test Score :', round(ridge_poly.score(x_test_poly, np.log(y_test)) * 100, 2))

Train Score : 96.22
Test Score : 93.43


### Polynomial with ElasticNet with Target Scaling

In [193]:
from sklearn.linear_model import ElasticNet

poly = PolynomialFeatures(degree= 2)

x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

elastic_poly = ElasticNet(alpha= 0.001, l1_ratio= 0.5)

elastic_poly.fit(x_train_poly, np.log(y_train))

print('Train Score :', round(elastic_poly.score(x_train_poly, np.log(y_train)) * 100, 2))
print('Test Score :', round(elastic_poly.score(x_test_poly, np.log(y_test)) * 100, 2))

Train Score : 95.0
Test Score : 93.91



Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.156e+00, tolerance: 3.403e-01



In [194]:
y_train

0       18.00
1        8.40
2        6.35
3        5.00
4       30.00
        ...  
4660     5.50
4661     4.75
4662     6.20
4663     1.00
4664     3.29
Name: Price, Length: 4665, dtype: float64

In [195]:
np.log(y_train)

0       2.890372
1       2.128232
2       1.848455
3       1.609438
4       3.401197
          ...   
4660    1.704748
4661    1.558145
4662    1.824549
4663    0.000000
4664    1.190888
Name: Price, Length: 4665, dtype: float64

In [196]:
np.exp(2.890372)

18.000004357869567