In [None]:
import numpy as np
from pycaret.regression import *
import pandas as pd

# Conversion factors applied to the raw listing data.
INR_PER_USD = 81.28   # exchange rate used to express prices in USD
BHP_TO_HP = 1.01387   # factor applied to the bhp figure in 'Max Power'

data = pd.read_csv("vehicle-dataset-from-cardekho/car details v4.csv")

# Normalise units and split the free-text spec columns into numeric features.
# The patterns are raw strings: in a plain literal `\d`, `\s` and `\.` are
# invalid escape sequences, which newer Python versions warn about.
data['Price'] /= INR_PER_USD  # INR -> USD

# First run of digits in the engine spec as a float (NaN when nothing matches).
# expand=False returns a Series rather than a one-column DataFrame.
data['Engine'] = data['Engine'].str.extract(r'(\d+)', expand=False).astype(float)

# 'Max Power' carries a magnitude followed by "@ <rpm>"; pull the two apart.
data['Max Power (hp)'] = pd.to_numeric(
    data['Max Power'].str.extract(r'(\d+\.?\d*)', expand=False),
    errors='coerce') * BHP_TO_HP  # bhp -> hp
data['Max Power (rpm)'] = pd.to_numeric(
    data['Max Power'].str.extract(r'@\s*(\d+)', expand=False),
    errors='coerce')

# Same layout for 'Max Torque': leading number, then the rpm after '@'.
data['Max Torque (Nm)'] = pd.to_numeric(
    data['Max Torque'].str.extract(r'(\d+\.?\d*)', expand=False),
    errors='coerce')
data['Max Torque (rpm)'] = pd.to_numeric(
    data['Max Torque'].str.extract(r'@\s*(\d+)', expand=False),
    errors='coerce')

# Encode ownership history as an ordinal number; unknown ownership becomes NaN.
# Everything goes through a single .map() assignment instead of calling
# replace(..., inplace=True) on the selected column: in-place edits of a
# column selection do not propagate to the frame under pandas Copy-on-Write.
# Any label that is missing from the mapping also comes out as NaN.
owner_mapping = {
    'First': 1,
    'Second': 2,
    'Third': 3,
    'Fourth': 4,
    '4 or More': 5,
    'UnRegistered Car': np.nan,
}
data['Owner'] = data['Owner'].map(owner_mapping)

# Rename columns to their correct, unit-bearing names.
column_renames = {
    'Make': 'Brand',
    'Price': 'Price (USD)',
    'Kilometer': 'Mileage (km)',
    'Engine': 'Engine Capacity (cc)',
}
data.rename(columns=column_renames, inplace=True)

# Drop the columns that are not used as features, including the raw
# 'Max Power' / 'Max Torque' text now that numeric columns were derived.
unused_columns = ['Location', 'Length', 'Width', 'Height',
                  'Max Power', 'Max Torque']
data.drop(columns=unused_columns, inplace=True)

In [None]:
# Preview the first rows of the cleaned frame to confirm the renamed columns
# and the numeric power/torque columns derived from the text specs.
data.head()

Unnamed: 0,Brand,Model,Price (USD),Year,Mileage (km),Fuel Type,Transmission,Color,Owner,Seller Type,Engine Capacity (cc),Drivetrain,Seating Capacity,Fuel Tank Capacity,Max Power (hp),Max Power (rpm),Max Torque (Nm),Max Torque (rpm)
0,Honda,Amaze 1.2 VX i-VTEC,6213.090551,2017,87150,Petrol,Manual,Grey,1.0,Corporate,1198.0,FWD,5.0,35.0,88.20669,6000.0,109.0,4500.0
1,Maruti Suzuki,Swift DZire VDI,5536.417323,2014,75000,Diesel,Manual,White,2.0,Individual,1248.0,FWD,5.0,42.0,75.02638,4000.0,190.0,2000.0
2,Hyundai,i10 Magna 1.2 Kappa2,2706.692913,2011,67000,Petrol,Manual,Maroon,1.0,Individual,1197.0,FWD,5.0,35.0,80.09573,6000.0,112.7619,4000.0
3,Toyota,Glanza G,9830.216535,2019,37500,Petrol,Manual,Red,1.0,Individual,1197.0,FWD,5.0,37.0,83.13734,6000.0,113.0,4200.0
4,Toyota,Innova 2.4 VX 7 STR [2016-2020],23991.141732,2018,69000,Diesel,Manual,Grey,1.0,Individual,2393.0,RWD,7.0,55.0,150.05276,3400.0,343.0,1400.0


In [None]:
# Summary statistics for the numeric columns; the `count` row shows which
# features still contain missing values after cleaning.
data.describe()

Unnamed: 0,Price (USD),Year,Mileage (km),Owner,Engine Capacity (cc),Seating Capacity,Fuel Tank Capacity,Max Power (hp),Max Power (rpm),Max Torque (Nm),Max Torque (rpm)
count,2059.0,2059.0,2059.0,2038.0,1979.0,1995.0,1946.0,1979.0,1975.0,1979.0,1979.0
mean,20952.16162,2016.425449,54224.71,1.230618,1692.575543,5.306266,52.00221,131.409489,4835.093671,245.851019,2619.545225
std,29772.153487,3.363564,57361.72,0.483203,643.736294,0.82217,15.110198,65.976371,1097.368548,140.465731,1206.31477
min,602.854331,1988.0,0.0,1.0,624.0,2.0,15.0,35.48545,2910.0,48.0,150.0
25%,5967.015256,2014.0,29000.0,1.0,1197.0,5.0,41.25,84.15121,4000.0,115.0,1600.0
50%,10150.098425,2017.0,50000.0,1.0,1498.0,5.0,50.0,117.60892,4200.0,200.0,1900.0
75%,23683.562992,2019.0,72000.0,1.0,1995.0,5.0,60.0,173.37177,6000.0,350.0,4000.0
max,430610.23622,2022.0,2000000.0,5.0,6592.0,8.0,105.0,669.1542,8250.0,780.0,6500.0


In [None]:
# Initialise the PyCaret regression experiment on the cleaned frame, with
# 'Price (USD)' as the target and a fixed session id for the run.
# NOTE(review): no feature scaling is requested here, yet a KNN model is
# selected further down -- confirm that unscaled distances are intended.
reg = setup(
    data=data,
    target='Price (USD)',
    session_id=123,
    experiment_name='cars',
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Price (USD)
2,Target type,Regression
3,Original data shape,"(2059, 18)"
4,Transformed data shape,"(2059, 45)"
5,Transformed train set shape,"(1441, 45)"
6,Transformed test set shape,"(618, 45)"
7,Numeric features,10
8,Categorical features,7
9,Rows with missing values,10.0%


In [None]:
# Rank the candidate regressors using only 2 cross-validation folds.
# NOTE(review): `best_model` is not referenced again in the visible cells;
# the KNN model is re-created by id below.
best_model = compare_models(fold=2)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,8933.9785,478810144.0,21880.7305,0.4568,0.5928,0.5531,1.16
lightgbm,Light Gradient Boosting Machine,11299.5526,590489654.4009,24299.4626,0.33,0.8002,0.9959,0.3
et,Extra Trees Regressor,10861.9375,595436383.5719,24401.3727,0.3244,0.7842,0.9517,0.205
gbr,Gradient Boosting Regressor,11021.7398,600152636.9406,24497.1623,0.319,0.7942,0.9768,0.19
ada,AdaBoost Regressor,11737.7015,645414207.5858,25403.1343,0.2677,0.849,1.072,0.12
rf,Random Forest Regressor,11629.1381,654217926.7622,25577.0322,0.2577,0.8392,1.0501,0.3
dt,Decision Tree Regressor,11909.6628,658256309.4422,25653.7423,0.2531,0.846,1.0745,1.69
lasso,Lasso Regression,12298.9878,663271162.0684,25754.0349,0.2474,0.8955,1.1438,1.09
llar,Lasso Least Angle Regression,12298.8631,663268726.0842,25753.9877,0.2474,0.8955,1.1437,1.085
ridge,Ridge Regression,12300.056,663359750.3274,25755.7597,0.2473,0.8951,1.1438,1.09


In [None]:
# List the estimators known to the experiment together with their short ids.
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Linear Regression,sklearn.linear_model._base.LinearRegression,True
lasso,Lasso Regression,sklearn.linear_model._coordinate_descent.Lasso,True
ridge,Ridge Regression,sklearn.linear_model._ridge.Ridge,True
en,Elastic Net,sklearn.linear_model._coordinate_descent.Elast...,True
lar,Least Angle Regression,sklearn.linear_model._least_angle.Lars,True
llar,Lasso Least Angle Regression,sklearn.linear_model._least_angle.LassoLars,True
omp,Orthogonal Matching Pursuit,sklearn.linear_model._omp.OrthogonalMatchingPu...,True
br,Bayesian Ridge,sklearn.linear_model._bayes.BayesianRidge,True
ard,Automatic Relevance Determination,sklearn.linear_model._bayes.ARDRegression,False
par,Passive Aggressive Regressor,sklearn.linear_model._passive_aggressive.Passi...,True


In [None]:
# Train a K Neighbors Regressor by its id and show its per-fold
# cross-validation scores.
knn_model = create_model(estimator='knn')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,5788.4873,96094488.0,9802.7793,0.7939,0.4763,0.4466
1,7380.8818,303727744.0,17427.7871,0.5589,0.5703,0.5121
2,11235.6621,810704000.0,28472.8652,0.3541,0.5678,0.4297
3,9983.2197,881151232.0,29684.1914,0.3519,0.6065,0.5511
4,6059.2153,166986528.0,12922.3271,0.7246,0.5246,0.5434
5,5973.6748,91787680.0,9580.5889,0.7535,0.5719,0.589
6,7234.4399,352723168.0,18780.9258,0.5744,0.5474,0.5308
7,6216.6953,117242432.0,10827.8545,0.7273,0.4553,0.4181
8,12507.8262,1410906624.0,37562.0352,0.2833,0.6015,0.4858
9,7396.4316,197816416.0,14064.7227,0.7361,0.5519,0.5371


In [None]:
# Try 50 hyperparameter candidates for the KNN model and keep the one with
# the best cross-validated R2.
tuned_knn_model = tune_model(
    knn_model,
    optimize='R2',
    n_iter=50,
)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,5022.201,75673647.4932,8699.0601,0.8377,0.4318,0.4013
1,6953.2895,273822609.1784,16547.5862,0.6024,0.5207,0.48
2,10502.5349,738896740.0227,27182.6551,0.4113,0.5032,0.3589
3,9715.408,850536874.8971,29163.9653,0.3744,0.5601,0.5003
4,5752.0365,142149612.1239,11922.6512,0.7656,0.4926,0.4978
5,5586.8526,88794614.4546,9423.0894,0.7615,0.5242,0.5272
6,6609.024,300634226.3328,17338.8069,0.6372,0.5077,0.5014
7,5846.3038,110668233.4799,10519.897,0.7426,0.4206,0.3758
8,11627.5411,1348605201.7732,36723.3604,0.315,0.5409,0.4019
9,7153.6513,200972143.3718,14176.4644,0.7319,0.4982,0.4746


Fitting 10 folds for each of 50 candidates, totalling 500 fits


In [None]:
# Open PyCaret's interactive plot selector for the tuned model (rendered as
# a toggle-button widget in the notebook).
evaluate_model(tuned_knn_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [None]:
# Score the hold-out split first: without a `data` argument predict_model
# evaluates on the test set kept aside by setup(), so these metrics are the
# ones to quote for generalisation performance.
holdout_predictions = predict_model(tuned_knn_model)

# Predictions for every row of the cleaned frame. This frame contains the
# rows the model was fitted on, so the metrics printed for this call are
# optimistic and must not be read as an estimate of out-of-sample error.
predictions = predict_model(tuned_knn_model, data=data)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,K Neighbors Regressor,2268.8557,142314849.6847,11929.5788,0.8394,0.2684,0.1297


In [None]:
# Inspect the first scored rows: `prediction_label` holds the model's price
# estimate next to the actual 'Price (USD)'.
predictions.head()

Unnamed: 0,Brand,Model,Year,Mileage (km),Fuel Type,Transmission,Color,Owner,Seller Type,Engine Capacity (cc),Drivetrain,Seating Capacity,Fuel Tank Capacity,Max Power (hp),Max Power (rpm),Max Torque (Nm),Max Torque (rpm),Price (USD),prediction_label
0,Honda,Amaze 1.2 VX i-VTEC,2017,87150,Petrol,Manual,Grey,1.0,Corporate,1198.0,FWD,5.0,35.0,88.206688,6000.0,109.0,4500.0,6213.090332,5797.223386
1,Maruti Suzuki,Swift DZire VDI,2014,75000,Diesel,Manual,White,2.0,Individual,1248.0,FWD,5.0,42.0,75.026382,4000.0,190.0,2000.0,5536.41748,5536.41748
2,Hyundai,i10 Magna 1.2 Kappa2,2011,67000,Petrol,Manual,Maroon,1.0,Individual,1197.0,FWD,5.0,35.0,80.095734,6000.0,112.761902,4000.0,2706.692871,2706.692871
3,Toyota,Glanza G,2019,37500,Petrol,Manual,Red,1.0,Individual,1197.0,FWD,5.0,37.0,83.137337,6000.0,113.0,4200.0,9830.216797,9830.216797
4,Toyota,Innova 2.4 VX 7 STR [2016-2020],2018,69000,Diesel,Manual,Grey,1.0,Individual,2393.0,RWD,7.0,55.0,150.052765,3400.0,343.0,1400.0,23991.142578,23991.142578


In [None]:
# Persist the transformation pipeline together with the tuned KNN model
# under the name 'deployment'.
save_model(model=tuned_knn_model, model_name='deployment')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['Year', 'Mileage (km)', 'Owner',
                                              'Engine Capacity (cc)',
                                              'Seating Capacity',
                                              'Fuel Tank Capacity',
                                              'Max Power (hp)',
                                              'Max Power (rpm)',
                                              'Max Torque (Nm)',
                                              'Max Torque (rpm)'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['Brand', 'Model', 'Fuel Ty...
                                                               handle_missing='return_nan',
                                                               use_cat_names=True))),
              