In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

In [112]:
data = pd.read_csv('Travel.csv')
data

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4883,204883,1,49.0,Self Enquiry,3,9.0,Small Business,Male,3,5.0,Deluxe,4.0,Unmarried,2.0,1,1,1,1.0,Manager,26576.0
4884,204884,1,28.0,Company Invited,1,31.0,Salaried,Male,4,5.0,Basic,3.0,Single,3.0,1,3,1,2.0,Executive,21212.0
4885,204885,1,52.0,Self Enquiry,3,17.0,Salaried,Female,4,4.0,Standard,4.0,Married,7.0,0,1,1,3.0,Senior Manager,31820.0
4886,204886,1,19.0,Self Enquiry,3,16.0,Small Business,Male,3,4.0,Basic,3.0,Single,3.0,0,5,0,2.0,Executive,20289.0


# Feature Engineering

### Data Cleaning

In [113]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CustomerID                4888 non-null   int64  
 1   ProdTaken                 4888 non-null   int64  
 2   Age                       4662 non-null   float64
 3   TypeofContact             4863 non-null   object 
 4   CityTier                  4888 non-null   int64  
 5   DurationOfPitch           4637 non-null   float64
 6   Occupation                4888 non-null   object 
 7   Gender                    4888 non-null   object 
 8   NumberOfPersonVisiting    4888 non-null   int64  
 9   NumberOfFollowups         4843 non-null   float64
 10  ProductPitched            4888 non-null   object 
 11  PreferredPropertyStar     4862 non-null   float64
 12  MaritalStatus             4888 non-null   object 
 13  NumberOfTrips             4748 non-null   float64
 14  Passport

In [114]:
data.describe()

Unnamed: 0,CustomerID,ProdTaken,Age,CityTier,DurationOfPitch,NumberOfPersonVisiting,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,MonthlyIncome
count,4888.0,4888.0,4662.0,4888.0,4637.0,4888.0,4843.0,4862.0,4748.0,4888.0,4888.0,4888.0,4822.0,4655.0
mean,202443.5,0.188216,37.622265,1.654255,15.490835,2.905074,3.708445,3.581037,3.236521,0.290917,3.078151,0.620295,1.187267,23619.853491
std,1411.188388,0.390925,9.316387,0.916583,8.519643,0.724891,1.002509,0.798009,1.849019,0.454232,1.365792,0.485363,0.857861,5380.698361
min,200000.0,0.0,18.0,1.0,5.0,1.0,1.0,3.0,1.0,0.0,1.0,0.0,0.0,1000.0
25%,201221.75,0.0,31.0,1.0,9.0,2.0,3.0,3.0,2.0,0.0,2.0,0.0,1.0,20346.0
50%,202443.5,0.0,36.0,1.0,13.0,3.0,4.0,3.0,3.0,0.0,3.0,1.0,1.0,22347.0
75%,203665.25,0.0,44.0,3.0,20.0,3.0,4.0,4.0,4.0,1.0,4.0,1.0,2.0,25571.0
max,204887.0,1.0,61.0,3.0,127.0,5.0,6.0,5.0,22.0,1.0,5.0,1.0,3.0,98678.0


In [115]:
data.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [116]:
cat_columns = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation']
for col in cat_columns:
    print(data[col].value_counts())

TypeofContact
Self Enquiry       3444
Company Invited    1419
Name: count, dtype: int64
Occupation
Salaried          2368
Small Business    2084
Large Business     434
Free Lancer          2
Name: count, dtype: int64
Gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64
ProductPitched
Basic           1842
Deluxe          1732
Standard         742
Super Deluxe     342
King             230
Name: count, dtype: int64
MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64
Designation
Executive         1842
Manager           1732
Senior Manager     742
AVP                342
VP                 230
Name: count, dtype: int64


In [117]:
data['Gender'] = data['Gender'].replace('Fe male', 'Female')
data['MaritalStatus'] = data['MaritalStatus'].replace('Single', 'Unmarried')

Selecting which columns have nan values

In [118]:
nan_cols = [feature for feature in data.columns if data[feature].isnull().sum() >= 1]
nan_cols

['Age',
 'TypeofContact',
 'DurationOfPitch',
 'NumberOfFollowups',
 'PreferredPropertyStar',
 'NumberOfTrips',
 'NumberOfChildrenVisiting',
 'MonthlyIncome']

Selecting which nan values will be replaced with medain or mode values of there columns

In [119]:
median_cols = ['Age', 'DurationOfPitch', 'NumberOfTrips', 'MonthlyIncome']
mode_cols = [feature for  feature in nan_cols if feature not in median_cols]
median_cols, mode_cols

(['Age', 'DurationOfPitch', 'NumberOfTrips', 'MonthlyIncome'],
 ['TypeofContact',
  'NumberOfFollowups',
  'PreferredPropertyStar',
  'NumberOfChildrenVisiting'])

In [120]:
for col in median_cols:
    data[col].fillna(data[col].median(), inplace=True)
for col in mode_cols:
    data[col].fillna(data[col].mode(), inplace=True)
data

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Unmarried,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,36.0,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4883,204883,1,49.0,Self Enquiry,3,9.0,Small Business,Male,3,5.0,Deluxe,4.0,Unmarried,2.0,1,1,1,1.0,Manager,26576.0
4884,204884,1,28.0,Company Invited,1,31.0,Salaried,Male,4,5.0,Basic,3.0,Unmarried,3.0,1,3,1,2.0,Executive,21212.0
4885,204885,1,52.0,Self Enquiry,3,17.0,Salaried,Female,4,4.0,Standard,4.0,Married,7.0,0,1,1,3.0,Senior Manager,31820.0
4886,204886,1,19.0,Self Enquiry,3,16.0,Small Business,Male,3,4.0,Basic,3.0,Unmarried,3.0,0,5,0,2.0,Executive,20289.0


In [121]:
data.drop(columns=['CustomerID'], axis=1, inplace=True)

In [122]:
x = data.drop('ProdTaken', axis=1)
y = data['ProdTaken']
x, y

(       Age    TypeofContact  CityTier  DurationOfPitch      Occupation  \
 0     41.0     Self Enquiry         3              6.0        Salaried   
 1     49.0  Company Invited         1             14.0        Salaried   
 2     37.0     Self Enquiry         1              8.0     Free Lancer   
 3     33.0  Company Invited         1              9.0        Salaried   
 4     36.0     Self Enquiry         1              8.0  Small Business   
 ...    ...              ...       ...              ...             ...   
 4883  49.0     Self Enquiry         3              9.0  Small Business   
 4884  28.0  Company Invited         1             31.0        Salaried   
 4885  52.0     Self Enquiry         3             17.0        Salaried   
 4886  19.0     Self Enquiry         3             16.0  Small Business   
 4887  36.0     Self Enquiry         1             14.0        Salaried   
 
       Gender  NumberOfPersonVisiting  NumberOfFollowups ProductPitched  \
 0     Female          

In [123]:
cat_cols = [feature for feature in x.columns if data[feature].dtype == 'O']
data[cat_cols]

Unnamed: 0,TypeofContact,Occupation,Gender,ProductPitched,MaritalStatus,Designation
0,Self Enquiry,Salaried,Female,Deluxe,Unmarried,Manager
1,Company Invited,Salaried,Male,Deluxe,Divorced,Manager
2,Self Enquiry,Free Lancer,Male,Basic,Unmarried,Executive
3,Company Invited,Salaried,Female,Basic,Divorced,Executive
4,Self Enquiry,Small Business,Male,Basic,Divorced,Executive
...,...,...,...,...,...,...
4883,Self Enquiry,Small Business,Male,Deluxe,Unmarried,Manager
4884,Company Invited,Salaried,Male,Basic,Unmarried,Executive
4885,Self Enquiry,Salaried,Female,Standard,Married,Senior Manager
4886,Self Enquiry,Small Business,Male,Basic,Unmarried,Executive


In [124]:
num_cols = [feature for feature in x.columns if feature not in cat_cols]
data[num_cols]

Unnamed: 0,Age,CityTier,DurationOfPitch,NumberOfPersonVisiting,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,MonthlyIncome
0,41.0,3,6.0,3,3.0,3.0,1.0,1,2,1,0.0,20993.0
1,49.0,1,14.0,3,4.0,4.0,2.0,0,3,1,2.0,20130.0
2,37.0,1,8.0,3,4.0,3.0,7.0,1,3,0,0.0,17090.0
3,33.0,1,9.0,2,3.0,3.0,2.0,1,5,1,1.0,17909.0
4,36.0,1,8.0,2,3.0,4.0,1.0,0,5,1,0.0,18468.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4883,49.0,3,9.0,3,5.0,4.0,2.0,1,1,1,1.0,26576.0
4884,28.0,1,31.0,4,5.0,3.0,3.0,1,3,1,2.0,21212.0
4885,52.0,3,17.0,4,4.0,4.0,7.0,0,1,1,3.0,31820.0
4886,19.0,3,16.0,3,4.0,3.0,3.0,0,5,0,2.0,20289.0


In [125]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=42)
x_train.shape, y_train.shape

((3421, 18), (3421,))

# Model Training

In [126]:
encoder = OneHotEncoder(drop='first')
scaler = StandardScaler()

transformer = ColumnTransformer(
    [
        ("OneHotEncoder", encoder, cat_cols),
        ("StandardScaler", scaler, num_cols)
    ]
)
transformer

In [127]:
x_train = transformer.fit_transform(x_train)
x_train

array([[ 1.        ,  0.        ,  0.        , ...,  0.78143477,
        -0.22165945,  0.4603181 ],
       [ 1.        ,  0.        ,  1.        , ..., -1.27969734,
        -0.22165945, -1.00566841],
       [ 1.        ,  0.        ,  0.        , ...,  0.78143477,
        -0.22165945, -1.2097217 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.78143477,
         0.94257055, -0.35739958],
       [ 1.        ,  0.        ,  0.        , ...,  0.78143477,
         0.94257055, -0.25076528],
       [ 0.        ,  0.        ,  0.        , ..., -1.27969734,
        -1.38588944, -1.07168012]])

In [128]:
x_test = transformer.transform(x_test)
x_test

array([[ 0.        ,  0.        ,  0.        , ..., -1.27969734,
        -0.22165945, -0.73033753],
       [ 1.        ,  0.        ,  0.        , ..., -1.27969734,
        -0.22165945, -0.66394968],
       [ 1.        ,  0.        ,  0.        , ...,  0.78143477,
        -1.38588944, -0.417017  ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.78143477,
        -0.22165945, -0.08075222],
       [ 1.        ,  0.        ,  0.        , ...,  0.78143477,
        -0.22165945,  1.97576639],
       [ 1.        ,  0.        ,  0.        , ..., -1.27969734,
        -1.38588944,  1.28837951]])

In [129]:
models = {
    "Decision Tree" : DecisionTreeClassifier(),
    "Random Forest" : RandomForestClassifier()
}

for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(x_train, y_train)
    print(list(models.keys())[i])
    pred = model.predict(x_test)
    score = accuracy_score(y_test, pred)
    cm = confusion_matrix(y_test, pred)
    print(score, '\n', cm, '\n', classification_report(y_test, pred))
    print(roc_auc_score(y_test, pred))
    print()
    print() 

Decision Tree
0.9107021131561008 
 [[1128   65]
 [  66  208]] 
               precision    recall  f1-score   support

           0       0.94      0.95      0.95      1193
           1       0.76      0.76      0.76       274

    accuracy                           0.91      1467
   macro avg       0.85      0.85      0.85      1467
weighted avg       0.91      0.91      0.91      1467

0.852319797358068


Random Forest
0.9175187457396047 
 [[1181   12]
 [ 109  165]] 
               precision    recall  f1-score   support

           0       0.92      0.99      0.95      1193
           1       0.93      0.60      0.73       274

    accuracy                           0.92      1467
   macro avg       0.92      0.80      0.84      1467
weighted avg       0.92      0.92      0.91      1467

0.7960655527070931




# Random Forest Regressor

In [150]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

In [131]:
data = pd.read_csv('cardekho_dataset.csv', index_col=0)
data

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.70,796,46.30,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.90,1197,82.00,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.00,1197,80.00,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.10,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19537,Hyundai i10,Hyundai,i10,9,10723,Dealer,Petrol,Manual,19.81,1086,68.05,5,250000
19540,Maruti Ertiga,Maruti,Ertiga,2,18000,Dealer,Petrol,Manual,17.50,1373,91.10,7,925000
19541,Skoda Rapid,Skoda,Rapid,6,67000,Dealer,Diesel,Manual,21.14,1498,103.52,5,425000
19542,Mahindra XUV500,Mahindra,XUV500,5,3800000,Dealer,Diesel,Manual,16.00,2179,140.00,7,1225000


In [132]:
data = data.reindex(index = [x for x in range(len(data))])
data

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9.0,120000.0,Individual,Petrol,Manual,19.70,796.0,46.30,5.0,120000.0
1,Hyundai Grand,Hyundai,Grand,5.0,20000.0,Individual,Petrol,Manual,18.90,1197.0,82.00,5.0,550000.0
2,Hyundai i20,Hyundai,i20,11.0,60000.0,Individual,Petrol,Manual,17.00,1197.0,80.00,5.0,215000.0
3,Maruti Alto,Maruti,Alto,9.0,37000.0,Individual,Petrol,Manual,20.92,998.0,67.10,5.0,226000.0
4,Ford Ecosport,Ford,Ecosport,6.0,30000.0,Dealer,Diesel,Manual,22.77,1498.0,98.59,5.0,570000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15406,Audi A4,Audi,A4,7.0,98300.0,Dealer,Diesel,Automatic,17.11,1968.0,174.33,5.0,1350000.0
15407,,,,,,,,,,,,,
15408,Maruti Ertiga,Maruti,Ertiga,3.0,11128.0,Dealer,CNG,Manual,17.50,1373.0,80.46,7.0,875000.0
15409,,,,,,,,,,,,,


In [133]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15411 entries, 0 to 15410
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   car_name           12150 non-null  object 
 1   brand              12150 non-null  object 
 2   model              12150 non-null  object 
 3   vehicle_age        12150 non-null  float64
 4   km_driven          12150 non-null  float64
 5   seller_type        12150 non-null  object 
 6   fuel_type          12150 non-null  object 
 7   transmission_type  12150 non-null  object 
 8   mileage            12150 non-null  float64
 9   engine             12150 non-null  float64
 10  max_power          12150 non-null  float64
 11  seats              12150 non-null  float64
 12  selling_price      12150 non-null  float64
dtypes: float64(7), object(6)
memory usage: 1.6+ MB


In [134]:
data.describe()

Unnamed: 0,vehicle_age,km_driven,mileage,engine,max_power,seats,selling_price
count,12150.0,12150.0,12150.0,12150.0,12150.0,12150.0,12150.0
mean,6.026255,55446.6,19.740071,1484.032016,100.189301,5.33177,765049.1
std,3.011387,43208.9,4.175324,523.047129,43.166001,0.814998,915413.8
min,0.0,100.0,4.0,793.0,38.4,0.0,40000.0
25%,4.0,30000.0,17.0,1197.0,74.0,5.0,380000.0
50%,6.0,50000.0,19.7,1248.0,88.5,5.0,550000.0
75%,8.0,70000.0,22.7,1582.0,117.3,5.0,800000.0
max,29.0,1325000.0,33.54,6592.0,626.0,9.0,39500000.0


In [135]:
data.isnull().sum()

car_name             3261
brand                3261
model                3261
vehicle_age          3261
km_driven            3261
seller_type          3261
fuel_type            3261
transmission_type    3261
mileage              3261
engine               3261
max_power            3261
seats                3261
selling_price        3261
dtype: int64

In [136]:
for col in data.columns:
    print(data[col].value_counts())

car_name
Maruti Swift Dzire       725
Hyundai i20              715
Maruti Alto              621
Maruti Swift             611
Maruti Wagon R           599
                        ... 
Jeep Wrangler              1
ISUZU MUX                  1
Hyundai Aura               1
Maserati Ghibli            1
Maserati Quattroporte      1
Name: count, Length: 119, dtype: int64
brand
Maruti           3963
Hyundai          2348
Honda            1169
Mahindra          832
Ford              634
Toyota            619
Volkswagen        461
Renault           420
Tata              348
BMW               326
Mercedes-Benz     260
Skoda             243
Datsun            145
Audi              141
Jaguar             46
Land Rover         41
Jeep               31
Kia                25
Volvo              20
Porsche            19
MG                 17
Mini               13
Lexus               8
Nissan              7
Isuzu               6
Maserati            2
Bentley             2
ISUZU               1
Ferrari    

In [137]:
data.drop(columns=['car_name', 'brand', 'model'], axis = 1, inplace=True)
data

Unnamed: 0,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,9.0,120000.0,Individual,Petrol,Manual,19.70,796.0,46.30,5.0,120000.0
1,5.0,20000.0,Individual,Petrol,Manual,18.90,1197.0,82.00,5.0,550000.0
2,11.0,60000.0,Individual,Petrol,Manual,17.00,1197.0,80.00,5.0,215000.0
3,9.0,37000.0,Individual,Petrol,Manual,20.92,998.0,67.10,5.0,226000.0
4,6.0,30000.0,Dealer,Diesel,Manual,22.77,1498.0,98.59,5.0,570000.0
...,...,...,...,...,...,...,...,...,...,...
15406,7.0,98300.0,Dealer,Diesel,Automatic,17.11,1968.0,174.33,5.0,1350000.0
15407,,,,,,,,,,
15408,3.0,11128.0,Dealer,CNG,Manual,17.50,1373.0,80.46,7.0,875000.0
15409,,,,,,,,,,


In [138]:
median_cols = ['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'selling_price']
mode_cols = ['seller_type', 'fuel_type', 'transmission_type', 'seats']

for col in median_cols:
    data[col].fillna(data[col].median(), inplace=True)
for col in mode_cols:
    data[col].fillna(data[col].mode().values[0], inplace = True)
data.isnull().sum()

vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [139]:
x = data.drop('selling_price', axis=1)
y = data['selling_price']
x, y

(       vehicle_age  km_driven seller_type fuel_type transmission_type  \
 0              9.0   120000.0  Individual    Petrol            Manual   
 1              5.0    20000.0  Individual    Petrol            Manual   
 2             11.0    60000.0  Individual    Petrol            Manual   
 3              9.0    37000.0  Individual    Petrol            Manual   
 4              6.0    30000.0      Dealer    Diesel            Manual   
 ...            ...        ...         ...       ...               ...   
 15406          7.0    98300.0      Dealer    Diesel         Automatic   
 15407          6.0    50000.0      Dealer    Petrol            Manual   
 15408          3.0    11128.0      Dealer       CNG            Manual   
 15409          6.0    50000.0      Dealer    Petrol            Manual   
 15410          4.0    46871.0      Dealer    Petrol            Manual   
 
        mileage  engine  max_power  seats  
 0        19.70   796.0      46.30    5.0  
 1        18.90  1197.

In [140]:
cat_cols = [feature for feature in x.columns if x[feature].dtype == 'O']
num_cols = [feature for feature in x.columns if feature not in cat_cols]
cat_cols, num_cols

(['seller_type', 'fuel_type', 'transmission_type'],
 ['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats'])

In [141]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=42)
x_train.shape, y_train.shape

((10787, 9), (10787,))

In [142]:
transformer = ColumnTransformer(
    [
        ("OneHotEncoder", OneHotEncoder(drop='first'), cat_cols),
        ("StandardScaler", StandardScaler(), num_cols)
    ]
)

x_train = transformer.fit_transform(x_train)
x_train

array([[ 1.        ,  0.        ,  1.        , ...,  2.27887662,
         0.93990193, -0.34783549],
       [ 0.        ,  0.        ,  0.        , ..., -0.38952901,
        -0.23960557, -0.34783549],
       [ 0.        ,  1.        ,  0.        , ..., -0.92278353,
         0.5342032 , -0.34783549],
       ...,
       [ 1.        ,  0.        ,  1.        , ..., -0.38952901,
        -0.23960557, -0.34783549],
       [ 1.        ,  0.        ,  1.        , ..., -0.38952901,
        -0.61549257, -0.34783549],
       [ 0.        ,  0.        ,  0.        , ..., -0.38952901,
        -0.23960557, -0.34783549]])

In [144]:
x_test = transformer.transform(x_test)

In [151]:
models = {
    "Linear Regression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "KNeighborsRegressor" : KNeighborsRegressor(),
    "DecisionTreeRegressor" : DecisionTreeRegressor(),
    "RF Regressor" : RandomForestRegressor()
}

for i in range(len(models)):
    reg = list(models.values())[i]
    reg.fit(x_train, y_train)
    pred = reg.predict(x_test)
    print(list(models.keys())[i])
    print("MSE :", mean_squared_error(y_test, pred))
    print("r2_score :", r2_score(y_test, pred))
    print()
    print()

Linear Regression
MSE : 191050831888.01028
r2_score : 0.6521209608688492


Lasso
MSE : 191050362837.13284
r2_score : 0.6521218149502835


Ridge
MSE : 191028042258.48877
r2_score : 0.6521624578586385


KNeighborsRegressor
MSE : 58180626894.46367
r2_score : 0.8940605472371973


DecisionTreeRegressor
MSE : 197267245721.2971
r2_score : 0.640801669297101


RF Regressor
MSE : 38781048011.33566
r2_score : 0.9293846900044347


