In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [2]:
cars_df = pd.read_csv( "https://drive.google.com/uc?export=download&id=10-R6GyVWjt_gjWEFD86mKHDvSWD9lp1z" )

In [3]:
cars_df.sample(5)

Unnamed: 0,index,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price,age,KM_Driven,make,mileage_new,engine_new,power_new
2553,4990,Maruti Swift Dzire VDI,Pune,2015,131000,Diesel,Manual,First,26.59 kmpl,1248 CC,74 bhp,5.0,,3.6,4,131,maruti,26.59,1248.0,74.0
1787,3487,Hyundai i20 Sportz Option 1.2,Coimbatore,2016,44238,Petrol,Manual,First,18.6 kmpl,1197 CC,81.83 bhp,5.0,,6.72,3,44,hyundai,18.6,1197.0,81.83
1057,2087,Maruti Ciaz ZDi Plus SHVS,Hyderabad,2016,46000,Diesel,Manual,First,28.09 kmpl,1248 CC,88.5 bhp,5.0,,8.45,3,46,maruti,28.09,1248.0,88.5
1244,2485,Maruti Vitara Brezza ZDi Plus Dual Tone,Kochi,2017,45362,Diesel,Manual,First,24.3 kmpl,1248 CC,88.5 bhp,5.0,11.75 Lakh,9.38,2,45,maruti,24.3,1248.0,88.5
1381,2734,Hyundai Grand i10 Sportz,Jaipur,2017,23000,Petrol,Manual,First,18.9 kmpl,1197 CC,82 bhp,5.0,,4.5,2,23,hyundai,18.9,1197.0,82.0


In [4]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3092 entries, 0 to 3091
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              3092 non-null   int64  
 1   Name               3092 non-null   object 
 2   Location           3092 non-null   object 
 3   Year               3092 non-null   int64  
 4   Kilometers_Driven  3092 non-null   int64  
 5   Fuel_Type          3092 non-null   object 
 6   Transmission       3092 non-null   object 
 7   Owner_Type         3092 non-null   object 
 8   Mileage            3092 non-null   object 
 9   Engine             3092 non-null   object 
 10  Power              3092 non-null   object 
 11  Seats              3091 non-null   float64
 12  New_Price          411 non-null    object 
 13  Price              3092 non-null   float64
 14  age                3092 non-null   int64  
 15  KM_Driven          3092 non-null   int64  
 16  make               3092 

# Feature Set Selection

In [5]:
cars_df.columns

Index(['index', 'Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats',
       'New_Price', 'Price', 'age', 'KM_Driven', 'make', 'mileage_new',
       'engine_new', 'power_new'],
      dtype='object')

In [6]:
x_features = ['KM_Driven', 'Fuel_Type', 'age',
              'Transmission', 'Owner_Type', 'Seats', 
              'make', 'mileage_new', 'engine_new', 
              'power_new', 'Location']

In [7]:
cat_vars = ['Fuel_Type', 
                'Transmission', 'Owner_Type',
                'make', 'Location']

In [8]:
num_vars = list(set(x_features) - set(cat_vars))

In [9]:
num_vars

['KM_Driven', 'engine_new', 'Seats', 'power_new', 'age', 'mileage_new']

In [10]:
cars_df[x_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3092 entries, 0 to 3091
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   KM_Driven     3092 non-null   int64  
 1   Fuel_Type     3092 non-null   object 
 2   age           3092 non-null   int64  
 3   Transmission  3092 non-null   object 
 4   Owner_Type    3092 non-null   object 
 5   Seats         3091 non-null   float64
 6   make          3092 non-null   object 
 7   mileage_new   3092 non-null   float64
 8   engine_new    3092 non-null   float64
 9   power_new     3092 non-null   float64
 10  Location      3092 non-null   object 
dtypes: float64(4), int64(2), object(5)
memory usage: 265.8+ KB


# Setting X and y variables

In [11]:
X = cars_df[x_features]
y = cars_df['Price']

# Data Splitting


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size = 0.8,
                                                    random_state = 80)

In [14]:
X_train.shape

(2473, 11)

In [15]:
X_test.shape

(619, 11)

# Defining Transformation 

1-Data imputation for Seats Column
Mean imputation
2-Categorical Encoding for categorical columns
OHE Encoding
3-Data scaling
Standard scaling

In [16]:
from sklearn.impute import SimpleImputer

In [17]:
imputed_num_vars = ['Seats']

In [18]:
imputed_num_vars

['Seats']

In [19]:
non_imputed_num_vars = list(set(num_vars) - set(imputed_num_vars))

In [20]:
non_imputed_num_vars

['KM_Driven', 'engine_new', 'power_new', 'age', 'mileage_new']

In [21]:
mean_imputer = SimpleImputer(strategy='mean')

# Encode Categorical Variables

In [22]:
from sklearn.preprocessing import OneHotEncoder

In [23]:
ohe_encoder = OneHotEncoder(handle_unknown='ignore')

# Scaling Numerical Vars

In [24]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Creating Pipelines

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [26]:
imputed_num_transformer = Pipeline( steps = [  
        ('imputation', mean_imputer),
        ('scaler', scaler)])

In [27]:
non_imputed_num_transformer = Pipeline( steps = [('scaler', scaler)])

In [28]:
cat_transformer = Pipeline( steps = [('ohencoder', ohe_encoder)])

In [29]:
preprocessor = ColumnTransformer(
    transformers=[  
        ('num_imputed', imputed_num_transformer, imputed_num_vars),
        ('num_not_imputed', non_imputed_num_transformer, non_imputed_num_vars),
        ('catvars', cat_transformer, cat_vars)])

# KNN (K-Nearest Neighbor)

In [30]:
from sklearn.neighbors import KNeighborsRegressor

In [31]:
#knn = KNeighborsRegressor(n_neighbors=20)
knn = KNeighborsRegressor(n_neighbors=20, weights='distance')

In [32]:
knn_v1 = Pipeline(steps=[('preprocessor', preprocessor),
                          ('knn', knn)])

In [33]:
knn_v1.fit(X_train, y_train)

In [34]:
from sklearn import set_config
set_config(display='diagram') 

In [35]:
knn_v1

# K Fold Cross Validation

In [36]:
from sklearn.model_selection import cross_val_score

In [37]:
scores = cross_val_score( knn_v1,
                          X_train,
                          y_train,
                          cv = 10,
                          scoring = 'r2')

In [38]:
scores


array([0.80997331, 0.74165817, 0.81740538, 0.81873032, 0.78002919,
       0.81578809, 0.80065021, 0.77858816, 0.80501332, 0.81670127])

In [39]:
scores.mean()

0.7984537415417394

In [40]:
scores.std()

0.023546023018284762

# Grid Search

In [41]:
from sklearn.model_selection import GridSearchCV

In [42]:
knn_params = { "knn__n_neighbors": [5, 10, 15, 20, 25],
               "knn__weights": ['uniform', 'distance'],
               "knn__metric": ['minkowski', 'euclidean']}

In [43]:
knn_grid_v1 = GridSearchCV(knn_v1,
                           param_grid=knn_params,
                           cv = 10,
                           scoring = 'r2')



In [44]:
knn_grid_v1.fit(X_train, y_train)

In [45]:
knn_grid_v1.best_params_

{'knn__metric': 'minkowski',
 'knn__n_neighbors': 10,
 'knn__weights': 'distance'}

In [46]:
knn_grid_v1.best_score_

0.8152344413594086

In [47]:
knn_grid_results = pd.DataFrame( knn_grid_v1.cv_results_ )
knn_grid_results[['param_knn__n_neighbors', 'param_knn__weights', 'mean_test_score', 'std_test_score']]

Unnamed: 0,param_knn__n_neighbors,param_knn__weights,mean_test_score,std_test_score
0,5,uniform,0.795774,0.029372
1,5,distance,0.808339,0.026951
2,10,uniform,0.799717,0.024221
3,10,distance,0.815234,0.024535
4,15,uniform,0.787124,0.024105
5,15,distance,0.808179,0.023971
6,20,uniform,0.773483,0.023387
7,20,distance,0.798454,0.023546
8,25,uniform,0.767427,0.023019
9,25,distance,0.794196,0.022889


# Building the final result


In [48]:
final_model = KNeighborsRegressor(n_neighbors = knn_grid_v1.best_params_['knn__n_neighbors'], 
                                  weights = knn_grid_v1.best_params_['knn__weights'], 
                                  metric = knn_grid_v1.best_params_['knn__metric'])
knn_final = Pipeline(steps=[('preprocessor', preprocessor),
                          ('knn', final_model)])

In [50]:
knn_final.score(X_test, y_test)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
final_rmse = np.sqrt(mean_squared_error(y_test, knn_final.predict(X_test)))
final_rmse

In [55]:
from sklearn.metrics import mean_squared_error

# Model Persistence

In [57]:
class CarPredictionModel():
    
    def __init__(self, model, features, rmse):
        self.model = model
        self.features = features
        self.rmse = rmse

In [None]:
my_model = CarPredictionModel(knn_final, list(X_train.columns), final_rmse)

In [59]:
from joblib import dump

In [None]:
dump(my_model, './cars_v1.pkl')