**Importing Libraries**

In [193]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score,mean_absolute_error,\
mean_squared_error




In [154]:
#Load dataset
# Read the car listings from car.csv (path is relative to the notebook's
# working directory) and preview the first rows.
data=pd.read_csv("car.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [155]:
# Column names, non-null counts and dtypes of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15411 entries, 0 to 15410
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         15411 non-null  int64  
 1   car_name           15411 non-null  object 
 2   brand              15411 non-null  object 
 3   model              15411 non-null  object 
 4   vehicle_age        15411 non-null  int64  
 5   km_driven          15411 non-null  int64  
 6   seller_type        15411 non-null  object 
 7   fuel_type          15411 non-null  object 
 8   transmission_type  15411 non-null  object 
 9   mileage            15411 non-null  float64
 10  engine             15411 non-null  int64  
 11  max_power          15411 non-null  float64
 12  seats              15411 non-null  int64  
 13  selling_price      15411 non-null  int64  
dtypes: float64(2), int64(6), object(6)
memory usage: 1.6+ MB


In [156]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

Unnamed: 0.1,Unnamed: 0,vehicle_age,km_driven,mileage,engine,max_power,seats,selling_price
count,15411.0,15411.0,15411.0,15411.0,15411.0,15411.0,15411.0,15411.0
mean,9811.857699,6.036338,55616.48,19.701151,1486.057751,100.588254,5.325482,774971.1
std,5643.418542,3.013291,51618.55,4.171265,521.106696,42.972979,0.807628,894128.4
min,0.0,0.0,100.0,4.0,793.0,38.4,0.0,40000.0
25%,4906.5,4.0,30000.0,17.0,1197.0,74.0,5.0,385000.0
50%,9872.0,6.0,50000.0,19.67,1248.0,88.5,5.0,556000.0
75%,14668.5,8.0,70000.0,22.7,1582.0,117.3,5.0,825000.0
max,19543.0,29.0,3800000.0,33.54,6592.0,626.0,9.0,39500000.0


**Data Cleaning**



Handling Missing Values


*   Handling missing values
*   Handling Duplicates
*   Check data type
*   Understanding of dataset




In [157]:
# Count the missing entries in every column (isna is the canonical
# spelling of isnull; both return the same boolean mask).

data.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
car_name,0
brand,0
model,0
vehicle_age,0
km_driven,0
seller_type,0
fuel_type,0
transmission_type,0
mileage,0


In [158]:
#Drop unnecessary columns

# Remove car_name and brand in a single call. The result is re-assigned
# rather than mutated with inplace=True, and errors="ignore" lets the cell
# be re-run without a KeyError once the columns are already gone.
data = data.drop(columns=["car_name", "brand"], errors="ignore")



In [159]:
# "Unnamed: 0" is a leftover row-number column from the CSV. Re-assign the
# result instead of using inplace=True; errors="ignore" keeps the cell
# re-runnable after the column has already been removed.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")


In [160]:
# Re-check the schema after the column drops above.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15411 entries, 0 to 15410
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   model              15411 non-null  object 
 1   vehicle_age        15411 non-null  int64  
 2   km_driven          15411 non-null  int64  
 3   seller_type        15411 non-null  object 
 4   fuel_type          15411 non-null  object 
 5   transmission_type  15411 non-null  object 
 6   mileage            15411 non-null  float64
 7   engine             15411 non-null  int64  
 8   max_power          15411 non-null  float64
 9   seats              15411 non-null  int64  
 10  selling_price      15411 non-null  int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 1.3+ MB


In [161]:
# Preview the remaining columns.
data.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [162]:
# List the distinct values of the model column.
data["model"].unique()

array(['Alto', 'Grand', 'i20', 'Ecosport', 'Wagon R', 'i10', 'Venue',
       'Swift', 'Verna', 'Duster', 'Cooper', 'Ciaz', 'C-Class', 'Innova',
       'Baleno', 'Swift Dzire', 'Vento', 'Creta', 'City', 'Bolero',
       'Fortuner', 'KWID', 'Amaze', 'Santro', 'XUV500', 'KUV100', 'Ignis',
       'RediGO', 'Scorpio', 'Marazzo', 'Aspire', 'Figo', 'Vitara',
       'Tiago', 'Polo', 'Seltos', 'Celerio', 'GO', '5', 'CR-V',
       'Endeavour', 'KUV', 'Jazz', '3', 'A4', 'Tigor', 'Ertiga', 'Safari',
       'Thar', 'Hexa', 'Rover', 'Eeco', 'A6', 'E-Class', 'Q7', 'Z4', '6',
       'XF', 'X5', 'Hector', 'Civic', 'D-Max', 'Cayenne', 'X1', 'Rapid',
       'Freestyle', 'Superb', 'Nexon', 'XUV300', 'Dzire VXI', 'S90',
       'WR-V', 'XL6', 'Triber', 'ES', 'Wrangler', 'Camry', 'Elantra',
       'Yaris', 'GL-Class', '7', 'S-Presso', 'Dzire LXI', 'Aura', 'XC',
       'Ghibli', 'Continental', 'CR', 'Kicks', 'S-Class', 'Tucson',
       'Harrier', 'X3', 'Octavia', 'Compass', 'CLS', 'redi-GO', 'Glanza',
       

In [163]:
# Numerical features: every column whose dtype is not "object".

numerical_features = [
    name for name, dtype in data.dtypes.items() if dtype != "object"
]
numerical_features

['vehicle_age',
 'km_driven',
 'mileage',
 'engine',
 'max_power',
 'seats',
 'selling_price']

In [164]:
# Categorical features: every column stored with the "object" dtype.

categorical_features = [
    name for name, dtype in data.dtypes.items() if dtype == "object"
]
categorical_features


['model', 'seller_type', 'fuel_type', 'transmission_type']

In [165]:
# Discrete features: numerical columns with at most 25 distinct values.

discrete_features = [
    name for name in numerical_features if len(pd.unique(data[name])) <= 25
]

discrete_features

['vehicle_age', 'seats']

In [166]:
# Continuous features: the numerical columns that were not classed as discrete.

continuos_features = list(
    filter(lambda name: name not in discrete_features, numerical_features)
)

continuos_features


['km_driven', 'mileage', 'engine', 'max_power', 'selling_price']

In [167]:
# Separate the target column (y) from the predictor columns (X).

y = data["selling_price"]
X = data.drop(columns=["selling_price"])


In [168]:
# Preview the predictor frame.
X.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [169]:
# Confirm that X and y have the same number of rows.
X.shape,y.shape

((15411, 10), (15411,))

In [170]:
# Same preview as the earlier X.head() cell; X is not modified in between.
X.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [171]:
# Preview the target values.
y.head()

Unnamed: 0,selling_price
0,120000
1,550000
2,215000
3,226000
4,570000


**Feature Encoding and Scaling**

-> One-hot encoding is applied to the columns that have few unique values and are not ordinal (`seller_type`, `fuel_type`, `transmission_type`); the high-cardinality `model` column is label-encoded instead.

-> One-hot encoding converts categorical variables into a numeric form that ML algorithms can consume, which helps them make better predictions.


In [172]:
# Replace each value of the model column with an integer code (X is
# modified in place).
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for the
# target y; OrdinalEncoder is the feature-side equivalent — consider switching.
label= LabelEncoder()

X['model']=label.fit_transform(X["model"])

In [173]:
# Check the integer codes assigned to the first rows.
X['model'].head()

Unnamed: 0,model
0,7
1,54
2,118
3,7
4,38


In [174]:
# Show the feature lists built earlier.
# NOTE(review): both lists were computed from `data` before X/y were
# separated and before label encoding, so numerical_features still contains
# the target selling_price and categorical_features still contains model.
print("Numerical feature:- ",numerical_features)
print("Categorical feature:- ",categorical_features)

Numerical feature:-  ['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats', 'selling_price']
Categorical feature:-  ['model', 'seller_type', 'fuel_type', 'transmission_type']


In [175]:
#column transformer with 3 types of transformers
# NOTE(review): only two transformers (one-hot encoding and standard
# scaling) are wired into the ColumnTransformer built in the next cell;
# label_encoder_columns is defined here but not used anywhere visible in
# this notebook.
# After label encoding, `model` holds integers, so select_dtypes picks it
# up and it is standardised together with the genuinely numeric columns.
num_features=X.select_dtypes(exclude="object").columns
print(num_features)

# Low-cardinality categoricals that will be one-hot encoded.
onhot_features=['seller_type', 'fuel_type', 'transmission_type']

label_encoder_columns=["model"]

Index(['model', 'vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power',
       'seats'],
      dtype='object')


In [176]:
# Standardise the numeric columns and one-hot encode the categorical ones,
# dropping the first level of each categorical.
numeric_scaler = StandardScaler()
OneHot_transformer = OneHotEncoder(drop="first")

preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", OneHot_transformer, onhot_features),
        ("StandardScaler", numeric_scaler, num_features),
    ],
    remainder="passthrough",
)
# remainder="passthrough": any column not named in either list above is
# kept unchanged instead of being dropped.



In [177]:
# Fit the preprocessor on the full feature frame and overwrite X with the
# transformed matrix.
# NOTE(review): this runs before the train/test split further down, so the
# scaler statistics and encoder categories are learned from rows that later
# end up in the test set — consider splitting first and fitting on the
# training rows only.
# NOTE(review): because X is overwritten, re-running this cell on its own
# presumably fails (the transformed X no longer has named columns) — confirm.
X=preprocessor.fit_transform(X)



In [178]:
# Wrap the transformed matrix in a DataFrame to inspect it.
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.519714,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.225693,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.536377,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.519714,0.983562,-0.360667,0.292211,-0.936610,-0.779312,-0.403022
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.666211,-0.012060,-0.496281,0.735736,0.022918,-0.046502,-0.403022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15406,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.508844,0.983562,-0.869744,0.026096,-0.767733,-0.757204,-0.403022
15407,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.556082,-1.339555,-0.728763,-0.527711,-0.216964,-0.220803,2.073444
15408,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.407551,-0.012060,0.220539,0.344954,0.022918,0.068225,-0.403022
15409,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.426247,-0.343933,72.541850,-0.887326,1.329794,0.917158,2.073444


In [179]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [180]:
# Row/column counts of the training and test matrices.
X_train.shape,X_test.shape

((12328, 14), (3083, 14))

**Model Training and Model Selection**

1. RandomForestRegressor
2. LinearRegression
3. Ridge
4. Lasso
5. KNeighborsRegressor
6. DecisionTreeRegressor



In [184]:
#create a function to evaluate model

def evaluate_model(true,predicted):
    """Score a set of regression predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)

    # RMSE is the square root of the MSE; compute the MSE once only.
    rmse = np.sqrt(mean_squared_error(true, predicted))

    r2_square = r2_score(true, predicted)

    return mae, rmse, r2_square


In [188]:
#Model Training

# Baseline regressors with default hyperparameters. The two tree-based
# models use randomness while fitting, so they get a fixed seed (the same
# value as the train/test split) to make the printed metrics repeatable.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=10),
    "Random Forest Regressor": RandomForestRegressor(random_state=10),
}

# Walk the name/estimator pairs directly rather than rebuilding
# list(models.keys()) / list(models.values()) on every iteration.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # train the model

    # Predict on both splits so train and test scores can be compared
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Score each split exactly once
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)

    print("Model performace for Training set")
    print(f"-> Root Mean Squared Error: {model_train_rmse:.4f}")
    print(f"-> Mean Absolute Error: {model_train_mae:.4f}")
    print(f"-> R2 Score:- {model_train_r2:.4f}")

    print("~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ")

    print("Model performace for Testing set")
    print(f"-> Root Mean Squared Error: {model_test_rmse:.4f}")
    print(f"-> Mean Absolute Error: {model_test_mae:.4f}")
    print(f"-> R2 Score:- {model_test_r2:.4f}")

    print("*"*32)
    print("\n")


Linear Regression
Model performace for Training set
-> Root Mean Squared Error: 478801.8476
-> Mean Absolute Error: 260031.9512
-> R2 Score:- 0.6763
~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 
Model performace for Testing set
-> Root Mean Squared Error: 753414.8310
-> Mean Absolute Error: 280915.9804
-> R2 Score:- 0.5123
********************************


Lasso
Model performace for Training set
-> Root Mean Squared Error: 478801.8527
-> Mean Absolute Error: 260030.0543
-> R2 Score:- 0.6763
~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 
Model performace for Testing set
-> Root Mean Squared Error: 753414.0977
-> Mean Absolute Error: 280914.8656
-> R2 Score:- 0.5123
********************************


Ridge
Model performace for Training set
-> Root Mean Squared Error: 478802.5439
-> Mean Absolute Error: 260001.4870
-> R2 Score:- 0.6763
~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 
Model performace for Testing set
-> Root Mean Squared Error: 753427.8974
-> Mean Absolute Error: 280894.89

In [201]:
#Parameter tuning for K-Neighbors regressor and Random Forest Regressor

# KNN: try every neighbourhood size from 1 to 49.
knn_params = {"n_neighbors": list(range(1, 50))}

# Random forest: candidate values for tree depth, features per split,
# minimum samples needed to split a node, and number of trees.
random_forest_params = {
    "max_depth": [5, 8, 15, None, 10],
    # 1.0 means "consider all features" and replaces the "auto" alias, which
    # meant the same for regressors but was deprecated in scikit-learn 1.1
    # and removed in 1.3.
    "max_features": [5, 7, 1.0, 8],
    "min_samples_split": [2, 8, 15, 20],
    "n_estimators": [100, 200, 300, 400, 500, 1000],
}




In [202]:
# Each entry pairs a display name with the estimator to tune and the
# search space to sample its hyperparameters from.
randomcv_models = [
    ("KNN", KNeighborsRegressor(), knn_params),
    ("Random Forest", RandomForestRegressor(), random_forest_params),
]


In [203]:
# Best hyperparameters found for each tuned model, keyed by display name.
model_params = {}

for name, model, params in randomcv_models:

    # Named `search` rather than `random` so it cannot be mistaken for (or
    # shadow) the standard-library random module.
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        # Upper bound on sampled candidates; the KNN space only has 49
        # combinations, so fewer fits are run for it.
        n_iter=100,
        cv=3,
        verbose=2,
        n_jobs=-1,
        # Fixed seed so the same candidates are sampled on every run.
        # The estimators themselves are still unseeded, so random-forest
        # CV scores can vary slightly between runs.
        random_state=10,
    )
    search.fit(X_train, y_train)
    model_params[name] = search.best_params_


for model_name, best_params in model_params.items():
    print(f"------------Best parameter for {model_name}")
    print(best_params)


Fitting 3 folds for each of 49 candidates, totalling 147 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
------------Best parameter for KNN
{'n_neighbors': 2}
------------Best parameter for Random Forest
{'n_estimators': 500, 'min_samples_split': 2, 'max_features': 7, 'max_depth': None}


In [205]:
#Model Training

# Refit the two tuned models with the best hyperparameters reported by the
# randomized search. The values are hard-coded, so update them here if the
# search is re-run and reports different winners.
models = {
    "K-Neighbors Regressor": KNeighborsRegressor(n_neighbors=2),
    # Fixed seed (same value as the train/test split) so the forest, and
    # therefore the printed metrics, are repeatable across runs.
    "Random Forest Regressor": RandomForestRegressor(
        n_estimators=500,
        min_samples_split=2,
        max_features=7,
        max_depth=None,
        random_state=10,
    ),
}

# Walk the name/estimator pairs directly rather than rebuilding
# list(models.keys()) / list(models.values()) on every iteration.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # train the model

    # Predict on both splits so train and test scores can be compared
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Score each split exactly once
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)

    print("Model performace for Training set")
    print(f"-> Root Mean Squared Error: {model_train_rmse:.4f}")
    print(f"-> Mean Absolute Error: {model_train_mae:.4f}")
    print(f"-> R2 Score:- {model_train_r2:.4f}")

    print("~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ")

    print("Model performace for Testing set")
    print(f"-> Root Mean Squared Error: {model_test_rmse:.4f}")
    print(f"-> Mean Absolute Error: {model_test_mae:.4f}")
    print(f"-> R2 Score:- {model_test_r2:.4f}")

    print("*"*32)
    print("\n")


K-Neighbors Regressor
Model performace for Training set
-> Root Mean Squared Error: 171346.4704
-> Mean Absolute Error: 62537.1867
-> R2 Score:- 0.9585
~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 
Model performace for Testing set
-> Root Mean Squared Error: 553096.5205
-> Mean Absolute Error: 120942.7506
-> R2 Score:- 0.7371
********************************


Random Forest Regressor
Model performace for Training set
-> Root Mean Squared Error: 88990.7140
-> Mean Absolute Error: 37812.3961
-> R2 Score:- 0.9888
~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 
Model performace for Testing set
-> Root Mean Squared Error: 533167.8824
-> Mean Absolute Error: 106976.6189
-> R2 Score:- 0.7557
********************************


