# Applying various Regression Model

###  The model uses lableencoding , XGboost , lightgbm
## Please install  xgboost from PIP 


In [1]:
import numpy as np
import pandas as pd
import glob
import os
import pickle
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
import matplotlib.pyplot as plt

## DATA

In this case we are reading used cars data.  With this data we are trying to predict the selling price of car in pounds

#### The features are:
- Model and make of the car 
- Purchase year of the car
- Transmission
- Mileage
- FuelType
- MPG
- EngineSize


In [2]:
path = r'D:\Data_Analytics\XGBModel\UsedCarsValuePredictionML\Data' # use your path
all_files = glob.glob(path + "/*.csv")

li = []
brands = ["Audi","BMW","Ford","Hyundi","Mercedes Benz","Skoda","Toyota","Volkswagen"]

for filename, brand in zip(all_files, brands):
    frame = pd.read_csv(filename, index_col=None, header=0)
    frame["make"] = brand
    li.append(frame)
    
df = pd.concat(li, axis=0, ignore_index=True)
df


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,make
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4,Audi
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0,Audi
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4,Audi
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0,Audi
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0,Audi
...,...,...,...,...,...,...,...,...,...,...
85550,Eos,2012,5990,Manual,74000,Diesel,125,58.9,2.0,Volkswagen
85551,Fox,2008,1799,Manual,88102,Petrol,145,46.3,1.2,Volkswagen
85552,Fox,2009,1590,Manual,70000,Petrol,200,42.0,1.4,Volkswagen
85553,Fox,2006,1250,Manual,82704,Petrol,150,46.3,1.2,Volkswagen


In [3]:
# remove unwanted feature
df =df.drop("tax", axis=1)
df['year'] = df['year'].astype(str)
df['model'] = df['model'].str.strip()

In [4]:
# identified outliers
index_list = [9434,10109,7221,7845,17753,14988,14306,33361,22488,43661,44279,62386]

In [5]:
#outliers
Filter_df  = df[df.index.isin(index_list)]
Filter_df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,make
7221,A5,2020,59995,Semi-Auto,2000,Diesel,40.9,3.0,Audi
7845,A6,2018,59950,Automatic,22000,Petrol,29.4,4.0,Audi
9434,A8,2020,78990,Automatic,250,Diesel,39.2,3.0,Audi
10109,S3,2003,4990,Manual,106000,Petrol,39.8,1.8,Audi
14306,2 Series,2015,123456,Semi-Auto,33419,Diesel,68.9,2.0,BMW
14988,5 Series,2020,54845,Semi-Auto,450,Diesel,60.1,3.0,BMW
17753,3 Series,2020,71990,Semi-Auto,150,Diesel,47.1,3.0,BMW
22488,Focus,2017,38015,Manual,197,Diesel,74.3,1.5,Ford
33361,Focus,2018,54995,Manual,11000,Petrol,36.7,2.3,Ford
43661,I10,2017,92000,Automatic,35460,Petrol,47.9,1.2,Hyundi


In [6]:
#remove outliers
df = df.drop(index_list)

In [7]:
df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,make
0,A1,2017,12500,Manual,15735,Petrol,55.4,1.4,Audi
1,A6,2016,16500,Automatic,36203,Diesel,64.2,2.0,Audi
2,A1,2016,11000,Manual,29946,Petrol,55.4,1.4,Audi
3,A4,2017,16800,Automatic,25952,Diesel,67.3,2.0,Audi
4,A3,2019,17300,Manual,1998,Petrol,49.6,1.0,Audi
...,...,...,...,...,...,...,...,...,...
85550,Eos,2012,5990,Manual,74000,Diesel,58.9,2.0,Volkswagen
85551,Fox,2008,1799,Manual,88102,Petrol,46.3,1.2,Volkswagen
85552,Fox,2009,1590,Manual,70000,Petrol,42.0,1.4,Volkswagen
85553,Fox,2006,1250,Manual,82704,Petrol,46.3,1.2,Volkswagen


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85543 entries, 0 to 85554
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         85543 non-null  object 
 1   year          85543 non-null  object 
 2   price         85543 non-null  int64  
 3   transmission  85543 non-null  object 
 4   mileage       85543 non-null  int64  
 5   fuelType      85543 non-null  object 
 6   mpg           85543 non-null  float64
 7   engineSize    85543 non-null  float64
 8   make          85543 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 6.5+ MB


In [9]:
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,make
0,A1,2017,12500,Manual,15735,Petrol,55.4,1.4,Audi
1,A6,2016,16500,Automatic,36203,Diesel,64.2,2.0,Audi
2,A1,2016,11000,Manual,29946,Petrol,55.4,1.4,Audi
3,A4,2017,16800,Automatic,25952,Diesel,67.3,2.0,Audi
4,A3,2019,17300,Manual,1998,Petrol,49.6,1.0,Audi


## Preliminary Data Analysis

In [10]:
df.isnull().sum(axis=0)

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
mpg             0
engineSize      0
make            0
dtype: int64

## Train Test Split

In [11]:
# Assign X (data) and y (target)
X = df.drop("price", axis=1)
y = df["price"]
print(X.shape, y.shape)

(85543, 8) (85543,)


In [12]:
# split to training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
cat_cols = X.columns[X.dtypes == 'O']
num_cols = X.columns[X.dtypes == 'float64']

In [14]:
categories = [
    X[column].unique() for column in X[cat_cols]]

for cat in categories:
    cat[cat == None] = 'missing'  # noqa

In [15]:
categories

[array(['A1', 'A6', 'A4', 'A3', 'Q3', 'Q5', 'A5', 'S4', 'Q2', 'A7', 'TT',
        'Q7', 'RS6', 'RS3', 'A8', 'Q8', 'RS4', 'RS5', 'R8', 'SQ5', 'S8',
        'SQ7', 'S3', 'S5', 'A2', 'RS7', '5 Series', '6 Series', '1 Series',
        '7 Series', '2 Series', '4 Series', 'X3', '3 Series', 'X5', 'X4',
        'i3', 'X1', 'M4', 'X2', 'X6', '8 Series', 'Z4', 'X7', 'M5', 'i8',
        'M2', 'M3', 'M6', 'Z3', 'Fiesta', 'Focus', 'Puma', 'Kuga',
        'EcoSport', 'C-MAX', 'Mondeo', 'Ka+', 'Tourneo Custom', 'S-MAX',
        'B-MAX', 'Edge', 'Tourneo Connect', 'Grand C-MAX', 'KA', 'Galaxy',
        'Mustang', 'Grand Tourneo Connect', 'Fusion', 'Ranger', 'Streetka',
        'Escort', 'Transit Tourneo', 'I20', 'Tucson', 'I10', 'IX35', 'I30',
        'I40', 'Ioniq', 'Kona', 'Veloster', 'I800', 'IX20', 'Santa Fe',
        'Accent', 'Terracan', 'Getz', 'Amica', 'SLK', 'S Class',
        'SL CLASS', 'G Class', 'GLE Class', 'GLA Class', 'A Class',
        'B Class', 'GLC Class', 'C Class', 'E Class', 'GL

In [16]:
cat_proc_lin = make_pipeline(
    SimpleImputer(missing_values=None,
                  strategy='constant',
                  fill_value='missing'),
    OneHotEncoder(categories=categories)
)

num_proc_lin = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)
# transformation to use for linear estimators
processor_lin = make_column_transformer(
    (cat_proc_lin, cat_cols),
    (num_proc_lin, num_cols),
    remainder='passthrough')

In [17]:
del df, X,y, cat_cols, num_cols, categories

## EVALUATE FUNCTION

A single function that will evaluate all models 

This will allow us to easily pick out the model we want to move forward with.

This function takes in a model ( pipeline ) and our train test split data. From there it simply performes predictions and generates results

In [18]:
def evaluate(pipeline, X_train, X_test, y_train, y_test):
    '''
    Evaluate a pipeline on training and test datasets
    '''    
    pipeline.fit(X_train, y_train)
    
    test_acc = pipeline.score(X_test, y_test)

    print(f"========== Predictor: {type(pipeline).__name__} ==========")
    print(f"Test result: f1: , acc: {test_acc:.3f}")
    print()


## Pick A Model For A Base Point To Evaluate Other Models Against

In this case we are choosing Logistric Regression

In [19]:
# try XGBRegressor
# evaluate(XGBRegressor(n_jobs=-1), X_train, X_test, y_train, y_test)
XGBrf = Pipeline(steps=[
    ('preprocess', processor_lin ),
    ('regressor',XGBRegressor(n_jobs=-1))
])
evaluate(XGBrf, X_train, X_test, y_train, y_test)

Test result: f1: , acc: 0.953



In [20]:
LGBrf =  Pipeline(steps=[
    ('preprocess', processor_lin),
    ('regressor',LGBMRegressor(n_jobs=-1))
])

evaluate(LGBrf, X_train, X_test, y_train, y_test)


Test result: f1: , acc: 0.945



In [21]:
rf =  Pipeline(steps=[
    ('preprocess', processor_lin),
    ('regressor',RandomForestRegressor(n_estimators=140, min_samples_split=5, min_samples_leaf=4, max_features="sqrt", max_depth=20,bootstrap=False))
])
evaluate(rf, X_train, X_test, y_train, y_test)


Test result: f1: , acc: 0.900



In [22]:
GBrf =  Pipeline(steps=[
    ('preprocess', processor_lin),
    ('regressor',GradientBoostingRegressor())
])
evaluate(GBrf, X_train, X_test, y_train, y_test)

Test result: f1: , acc: 0.890



## Tuning final model and finding the best parameters for the model

The RandomizedSearchCV function will try all our combinations above and select the most accurate model.  

That best model is found in the best_estimator_ property of the RandomizedSerachCV object. 

In [23]:
XGBF = XGBRegressor(n_jobs=-1)
clf = Pipeline(steps=[
    ('preprocess', processor_lin),
    ('regressor',XGBF)
])
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 40, num = 5)]
max_depth = [int(x) for x in np.linspace(2, 18, num = 10)]
max_depth.append(None)
gbm_param_grid = {
    'regressor__colsample_bytree': [0.3, 0.7],
    'regressor__n_estimators': n_estimators,
    'regressor__max_depth': max_depth
}

clf2 = GridSearchCV(estimator=clf, param_grid=gbm_param_grid, cv=4, verbose=3)

In [24]:
clf2.fit(X_train, y_train)


Fitting 4 folds for each of 110 candidates, totalling 440 fits
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=20 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=20, score=0.824, total=   0.6s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=20 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=20, score=0.819, total=   0.7s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=20 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.2s remaining:    0.0s


[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=20, score=0.820, total=   0.7s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=20, score=0.820, total=   0.7s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=25, score=0.835, total=   0.8s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=25, score=0.832, total=   0.7s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=25, score=0.833, total=   0.7s
[CV] regressor__colsample_bytree=0.3, regressor__m

[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=20, score=0.904, total=   0.7s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=20, score=0.903, total=   0.7s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=25, score=0.919, total=   0.7s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=25, score=0.915, total=   0.8s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=25, score=0.910, total=   0.8s
[CV] regressor__colsample_bytree=0.3, regressor__m

[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=20, score=0.941, total=   0.9s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=20, score=0.940, total=   0.9s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=25, score=0.948, total=   0.9s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=25, score=0.947, total=   0.9s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=25, score=0.946, total=   0.9s
[CV] regressor__colsample_bytree=0.3, regressor__m

[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=20, score=0.950, total=   1.2s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=20, score=0.947, total=   1.2s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=25, score=0.955, total=   1.2s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=25, score=0.954, total=   1.2s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=25, score=0.952, total=   1.2s
[CV] regressor__colsample_bytree=0.3, reg

[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=20, score=0.955, total=   1.5s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=20, score=0.952, total=   1.4s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=25, score=0.959, total=   1.5s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=25, score=0.958, total=   1.6s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=25, score=0.957, total=   1.6s
[CV] regressor__colsample_bytree=0.3, reg

[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=20, score=0.923, total=   1.0s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=20, score=0.921, total=   0.9s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=20, score=0.917, total=   0.9s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=25, score=0.930, total=   0.9s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=25, score=0.928, total=   1.1s
[CV] regressor__colsamp

[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=20, score=0.866, total=   0.8s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=20, score=0.864, total=   0.8s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=20, score=0.856, total=   0.8s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=20, score=0.852, total=   0.8s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=25, score=0.876, total=   0.9s
[CV] regressor__colsample_bytree=0.7, regressor__m

[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=20, score=0.935, total=   1.2s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=20, score=0.936, total=   1.3s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=20, score=0.931, total=   1.2s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=20, score=0.931, total=   1.2s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=25, score=0.939, total=   1.4s
[CV] regressor__colsample_bytree=0.7, regressor__m

[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=20, score=0.953, total=   1.3s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=20, score=0.956, total=   1.5s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=20, score=0.953, total=   1.4s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=20, score=0.950, total=   1.4s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=25, score=0.955, total=   1.6s
[CV] regressor__colsample_bytree=0.7, reg

[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=20, score=0.960, total=   1.8s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=20, score=0.962, total=   1.8s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=20, score=0.957, total=   1.8s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=20, score=0.957, total=   1.8s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=25, score=0.961, total=   2.0s
[CV] regressor__colsample_bytree=0.7, reg

[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=20, score=0.962, total=   2.7s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=20, score=0.962, total=   2.6s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=20, score=0.959, total=   2.5s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=20, score=0.958, total=   2.7s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=25, score=0.963, total=   2.9s
[CV] regressor__colsample_bytree=0.7, reg

[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=None, regressor__n_estimators=40, score=0.933, total=   1.5s


[Parallel(n_jobs=1)]: Done 440 out of 440 | elapsed: 11.2min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocess',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                  

In [25]:
# Print the best parameters and lowest RMSE
print("Best parameters found: ", clf2.best_params_)
print("Best Score found: ", np.sqrt(np.abs(clf2.best_score_)))

Best parameters found:  {'regressor__colsample_bytree': 0.7, 'regressor__max_depth': 16, 'regressor__n_estimators': 40}
Best Score found:  0.980431134859264


In [26]:
best_random = clf2.best_estimator_
predictions = best_random.predict(X_test)
errors = abs(predictions - y_test)
mape = 100 * np.mean(errors / y_test)
accuracy = 100 - mape
print('Model Performance')
print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
print('Accuracy = {:0.2f}%.'.format(accuracy))
clf2.best_params_
    

Model Performance
Average Error: 1198.8742 degrees.
Accuracy = 92.41%.


{'regressor__colsample_bytree': 0.7,
 'regressor__max_depth': 16,
 'regressor__n_estimators': 40}

In [27]:
## Using RandomizedSearchCV for best score

XGBF = XGBRegressor(n_jobs=-1)
clf = Pipeline(steps=[
    ('preprocess', processor_lin),
    ('regressor',XGBF)
])
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 150, num = 15)]
max_depth = [int(x) for x in np.linspace(2, 30, num = 20)]
max_depth.append(None)
gbm_param_grid = {
    'regressor__colsample_bytree': [0.3, 0.7],
    'regressor__n_estimators': n_estimators,
    'regressor__max_depth': max_depth
}

randomized_acc = RandomizedSearchCV(param_distributions=gbm_param_grid, estimator=clf, n_iter=20, cv=4, verbose=3)

# Fit randomized_mse to the data
randomized_acc.fit(X_train, y_train)
# Print the best parameters and lowest RMSE
print("Best parameters found: ", randomized_acc.best_params_)
print("Best Score found: ", np.sqrt(np.abs(randomized_acc.best_score_)))

Fitting 4 folds for each of 20 candidates, totalling 80 fits
[CV] regressor__n_estimators=29, regressor__max_depth=16, regressor__colsample_bytree=0.7 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  regressor__n_estimators=29, regressor__max_depth=16, regressor__colsample_bytree=0.7, score=0.962, total=   3.0s
[CV] regressor__n_estimators=29, regressor__max_depth=16, regressor__colsample_bytree=0.7 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.9s remaining:    0.0s


[CV]  regressor__n_estimators=29, regressor__max_depth=16, regressor__colsample_bytree=0.7, score=0.963, total=   3.0s
[CV] regressor__n_estimators=29, regressor__max_depth=16, regressor__colsample_bytree=0.7 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.9s remaining:    0.0s


[CV]  regressor__n_estimators=29, regressor__max_depth=16, regressor__colsample_bytree=0.7, score=0.959, total=   2.8s
[CV] regressor__n_estimators=29, regressor__max_depth=16, regressor__colsample_bytree=0.7 
[CV]  regressor__n_estimators=29, regressor__max_depth=16, regressor__colsample_bytree=0.7, score=0.959, total=   2.8s
[CV] regressor__n_estimators=131, regressor__max_depth=10, regressor__colsample_bytree=0.7 
[CV]  regressor__n_estimators=131, regressor__max_depth=10, regressor__colsample_bytree=0.7, score=0.963, total=   6.1s
[CV] regressor__n_estimators=131, regressor__max_depth=10, regressor__colsample_bytree=0.7 
[CV]  regressor__n_estimators=131, regressor__max_depth=10, regressor__colsample_bytree=0.7, score=0.966, total=   6.1s
[CV] regressor__n_estimators=131, regressor__max_depth=10, regressor__colsample_bytree=0.7 
[CV]  regressor__n_estimators=131, regressor__max_depth=10, regressor__colsample_bytree=0.7, score=0.962, total=   6.1s
[CV] regressor__n_estimators=131, r

[CV]  regressor__n_estimators=85, regressor__max_depth=25, regressor__colsample_bytree=0.3, score=0.962, total=   8.2s
[CV] regressor__n_estimators=85, regressor__max_depth=25, regressor__colsample_bytree=0.3 
[CV]  regressor__n_estimators=85, regressor__max_depth=25, regressor__colsample_bytree=0.3, score=0.961, total=   7.4s
[CV] regressor__n_estimators=85, regressor__max_depth=25, regressor__colsample_bytree=0.3 
[CV]  regressor__n_estimators=85, regressor__max_depth=25, regressor__colsample_bytree=0.3, score=0.958, total=   7.8s
[CV] regressor__n_estimators=57, regressor__max_depth=24, regressor__colsample_bytree=0.7 
[CV]  regressor__n_estimators=57, regressor__max_depth=24, regressor__colsample_bytree=0.7, score=0.960, total=   7.8s
[CV] regressor__n_estimators=57, regressor__max_depth=24, regressor__colsample_bytree=0.7 
[CV]  regressor__n_estimators=57, regressor__max_depth=24, regressor__colsample_bytree=0.7, score=0.961, total=   7.9s
[CV] regressor__n_estimators=57, regresso

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed: 11.9min finished


Best parameters found:  {'regressor__n_estimators': 131, 'regressor__max_depth': 10, 'regressor__colsample_bytree': 0.7}
Best Score found:  0.9812864156759563


In [28]:
## Using RandomizedSearchCV for mean squared error

XGBF = XGBRegressor(n_jobs=-1)
clf = Pipeline(steps=[
    ('preprocess', processor_lin),
    ('regressor',XGBF)
])
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 150, num = 15)]
max_depth = [int(x) for x in np.linspace(2, 30, num = 20)]
max_depth.append(None)
gbm_param_grid = {
    'regressor__colsample_bytree': [0.3, 0.7],
    'regressor__n_estimators': n_estimators,
    'regressor__max_depth': max_depth
}
randomized_mse = RandomizedSearchCV(param_distributions=gbm_param_grid, estimator=clf, scoring="neg_mean_squared_error", n_iter=5, cv=4, verbose=3)
# Fit randomized_mse to the data
randomized_mse.fit(X_train, y_train)
# Print the best parameters and lowest RMSE
print("Best parameters found: ", randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] regressor__n_estimators=66, regressor__max_depth=27, regressor__colsample_bytree=0.7 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  regressor__n_estimators=66, regressor__max_depth=27, regressor__colsample_bytree=0.7, score=-3971719.868, total=  14.0s
[CV] regressor__n_estimators=66, regressor__max_depth=27, regressor__colsample_bytree=0.7 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.9s remaining:    0.0s


[CV]  regressor__n_estimators=66, regressor__max_depth=27, regressor__colsample_bytree=0.7, score=-4292266.657, total=  11.3s
[CV] regressor__n_estimators=66, regressor__max_depth=27, regressor__colsample_bytree=0.7 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   25.2s remaining:    0.0s


[CV]  regressor__n_estimators=66, regressor__max_depth=27, regressor__colsample_bytree=0.7, score=-4249608.513, total=  12.5s
[CV] regressor__n_estimators=66, regressor__max_depth=27, regressor__colsample_bytree=0.7 
[CV]  regressor__n_estimators=66, regressor__max_depth=27, regressor__colsample_bytree=0.7, score=-4407589.002, total=  13.0s
[CV] regressor__n_estimators=57, regressor__max_depth=13, regressor__colsample_bytree=0.7 
[CV]  regressor__n_estimators=57, regressor__max_depth=13, regressor__colsample_bytree=0.7, score=-3630335.528, total=   5.3s
[CV] regressor__n_estimators=57, regressor__max_depth=13, regressor__colsample_bytree=0.7 
[CV]  regressor__n_estimators=57, regressor__max_depth=13, regressor__colsample_bytree=0.7, score=-3948448.678, total=   4.5s
[CV] regressor__n_estimators=57, regressor__max_depth=13, regressor__colsample_bytree=0.7 
[CV]  regressor__n_estimators=57, regressor__max_depth=13, regressor__colsample_bytree=0.7, score=-3993099.441, total=   4.5s
[CV] r

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  2.1min finished


Best parameters found:  {'regressor__n_estimators': 57, 'regressor__max_depth': 13, 'regressor__colsample_bytree': 0.7}
Lowest RMSE found:  1980.8383386122337


# Evaluate Our Model Further

Now we are going to shuffle the data over and over and apply our new model to the results to further determine if we want to use this model.

In [29]:
path = r'D:\Data_Analytics\XGBModel\UsedCarsValuePredictionML\Data' # use your path
all_files = glob.glob(path + "/*.csv")

li = []
brands = ["Audi","BMW","Ford","Hyundi","Mercedes Benz","Skoda","Toyota","Volkswagen"]

for filename, brand in zip(all_files, brands):
    frame = pd.read_csv(filename, index_col=None, header=0)
    frame["make"] = brand
    li.append(frame)
    
df = pd.concat(li, axis=0, ignore_index=True)
# remove unwanted feature
df =df.drop("tax", axis=1)
df['year'] = df['year'].astype(str)
df['model'] = df['model'].str.strip()

In [30]:
# identified outliers
index_list = [9434,10109,7221,7845,17753,14988,14306,33361,22488,43661,44279,62386]
#remove outliers
df = df.drop(index_list)

In [31]:
# Assign X (data) and y (target)
X = df.drop("price", axis=1)
y = df["price"]
print(X.shape, y.shape)

(85543, 8) (85543,)


In [32]:
# evaluate model with kfold
kfold = KFold(n_splits=10)
results = cross_val_score(randomized_mse.best_estimator_, X, y, cv=kfold, n_jobs=-1)
print("Results: %.2f (%.2f) accuracy" % (results.mean(), results.std()))

Results: 0.89 (0.07) accuracy


## Save The Model For Future Use

In [33]:
# save model
with open(f'best_xgb_model.pickle', 'wb') as f:
    pickle.dump(randomized_mse.best_estimator_, f)

## Test the model

In [34]:
values = ['Fox',2008,'Manual', 88102,'Petrol',46.3,1.2,'Volkswagen']


In [35]:
features = [np.array(values)]

In [36]:
df_deploy = pd.DataFrame(features,columns=['model','year','transmission','mileage','fuelType','mpg','engineSize','make'])
df_deploy

Unnamed: 0,model,year,transmission,mileage,fuelType,mpg,engineSize,make
0,Fox,2008,Manual,88102,Petrol,46.3,1.2,Volkswagen


In [37]:
randomized_mse.predict(df_deploy)

array([1853.6312], dtype=float32)

In [38]:
predictions = best_random.predict(X_test[:15])

In [39]:
predictions

array([ 9544.972 ,  3385.7935, 12618.46  , 14504.157 , 18482.67  ,
       23891.766 , 17772.76  , 18916.656 , 34410.508 ,  8342.992 ,
       19615.963 , 24243.139 , 53981.31  , 10876.209 , 13859.147 ],
      dtype=float32)

In [40]:
y_test[:15].values

array([ 8998,  2495, 10490, 13995, 19690, 23485, 16700, 19470, 35444,
        7490, 21495, 26699, 57970, 11500, 16000], dtype=int64)