# Applying various Regression Model

###  The model uses lableencoding , XGboost , lightgbm
## Please install  xgboost from anaconda env


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import os
from tensorflow.keras.utils import to_categorical
from sklearn import preprocessing
import pickle
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

## DATA

In this case we are reading used cars data.  With this data we are trying to predict the selling price of car in pounds

#### The features are:
- Model and make of the car 
- Purchase year of the car
- Transmission
- Mileage
- FuelType
- MPG
- EngineSize


In [2]:
path = r'D:\RiceBootCamp\Homework\Oct 22\Project3\Data' # use your path
all_files = glob.glob(path + "/*.csv")

li = []
brands = ["Audi","BMW","Ford","Hyundi","Mercedes Benz","Skoda","Toyota","Volkswagen"]

for filename, brand in zip(all_files, brands):
    frame = pd.read_csv(filename, index_col=None, header=0)
    frame["make"] = brand
    li.append(frame)
    
df = pd.concat(li, axis=0, ignore_index=True)
df


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,make
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4,Audi
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0,Audi
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4,Audi
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0,Audi
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0,Audi
...,...,...,...,...,...,...,...,...,...,...
85550,Eos,2012,5990,Manual,74000,Diesel,125,58.9,2.0,Volkswagen
85551,Fox,2008,1799,Manual,88102,Petrol,145,46.3,1.2,Volkswagen
85552,Fox,2009,1590,Manual,70000,Petrol,200,42.0,1.4,Volkswagen
85553,Fox,2006,1250,Manual,82704,Petrol,150,46.3,1.2,Volkswagen


In [3]:
# remove unwanted feature
df =df.drop("tax", axis=1)

## Scale the numerical features

In [4]:
ss = preprocessing.StandardScaler()
df[['mileage', 'mpg', 'engineSize', 'year']] = ss.fit_transform(df[['mileage', 'mpg', 'engineSize', 'year']])


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85555 entries, 0 to 85554
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         85555 non-null  object 
 1   year          85555 non-null  float64
 2   price         85555 non-null  int64  
 3   transmission  85555 non-null  object 
 4   mileage       85555 non-null  float64
 5   fuelType      85555 non-null  object 
 6   mpg           85555 non-null  float64
 7   engineSize    85555 non-null  float64
 8   make          85555 non-null  object 
dtypes: float64(4), int64(1), object(4)
memory usage: 5.9+ MB


In [6]:
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,make
0,A1,-0.050914,12500,Manual,-0.340355,Petrol,-0.020518,-0.517336,Audi
1,A6,-0.521019,16500,Automatic,0.620029,Diesel,0.502066,0.508836,Audi
2,A1,-0.521019,11000,Manual,0.326443,Petrol,-0.020518,-0.517336,Audi
3,A4,-0.050914,16800,Automatic,0.13904,Diesel,0.686158,0.508836,Audi
4,A3,0.889294,17300,Manual,-0.984912,Petrol,-0.364948,-1.201451,Audi


## Encode the catagorical data

In [7]:
df['model'] = df['model'].str.strip()

In [8]:

le = preprocessing.LabelEncoder()
for column_name in df.columns:
    if(df[column_name].dtype == object):
        df[column_name] = le.fit_transform(df[column_name])
        # Step 2: Convert encoded labels to one-hot-encoding
        df[column_name] = to_categorical(df[column_name])

    else:
        pass

In [9]:
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,make
0,0.0,-0.050914,12500,0.0,-0.340355,0.0,-0.020518,-0.517336,1.0
1,0.0,-0.521019,16500,1.0,0.620029,1.0,0.502066,0.508836,1.0
2,0.0,-0.521019,11000,0.0,0.326443,0.0,-0.020518,-0.517336,1.0
3,0.0,-0.050914,16800,1.0,0.13904,1.0,0.686158,0.508836,1.0
4,0.0,0.889294,17300,0.0,-0.984912,0.0,-0.364948,-1.201451,1.0


## Preliminary Data Analysis

In [10]:
df.isnull().sum(axis=0)

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
mpg             0
engineSize      0
make            0
dtype: int64

## Train Test Split

In [11]:
# Assign X (data) and y (target)
X = df.drop("price", axis=1)
y = df["price"]
print(X.shape, y.shape)

(85555, 8) (85555,)


In [12]:
# split to training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
del df,le

In [14]:
del X,y

In [15]:
X_train.shape

(68444, 8)

In [16]:
X_train

Unnamed: 0,model,year,transmission,mileage,fuelType,mpg,engineSize,make
11514,0.0,0.419190,0.0,-0.370478,1.0,0.187328,0.508836,0.0
11986,0.0,0.889294,0.0,-0.834811,0.0,-1.041932,0.508836,0.0
80902,0.0,-0.050914,0.0,-0.492145,1.0,-0.364948,0.508836,0.0
9497,0.0,0.889294,0.0,-0.998566,1.0,-0.364948,-0.175279,1.0
80198,0.0,-0.050914,1.0,-0.083179,0.0,-0.317440,0.166779,0.0
...,...,...,...,...,...,...,...,...
6265,0.0,0.889294,0.0,-0.970366,0.0,-1.261654,0.508836,1.0
54886,0.0,1.359398,1.0,-1.077722,1.0,0.116067,0.508836,0.0
76820,0.0,0.889294,0.0,-0.784511,1.0,0.116067,0.508836,0.0
860,0.0,1.359398,0.0,-1.052572,0.0,-0.465902,-1.201451,1.0


## EVALUATE FUNCTION

A single function that will evaluate all models 

This will allow us to easily pick out the model we want to move forward with.

This function takes in a model ( pipeline ) and our train test split data. From there it simply performes predictions and generates results

In [17]:
def evaluate(pipeline, X_train, X_test, y_train, y_test):
    '''
    Evaluate a pipeline on training and test datasets
    '''    
    pipeline.fit(X_train, y_train)
    
    test_acc = pipeline.score(X_test, y_test)

    print(f"========== Predictor: {type(pipeline).__name__} ==========")
    print(f"Test result: f1: , acc: {test_acc:.3f}")
    print()


## Pick A Model For A Base Point To Evaluate Other Models Against

In this case we are choosing Logistric Regression

In [18]:
# # try LogisticRegression to establish a baseline performance
# pipeline = Pipeline([
#     ('lgr', LogisticRegression()),
# ])
# evaluate(pipeline, X_train, X_test, y_train, y_test)

In [19]:
# try XGBRegressor
evaluate(XGBRegressor(n_jobs=-1), X_train, X_test, y_train, y_test)


Test result: f1: , acc: 0.909



In [20]:
evaluate(LGBMRegressor(n_jobs=-1), X_train, X_test, y_train, y_test)


Test result: f1: , acc: 0.896



In [21]:
evaluate(RandomForestRegressor(n_estimators=140, min_samples_split=5, min_samples_leaf=4, max_features="sqrt", max_depth=20,bootstrap=False), X_train, X_test, y_train, y_test)


Test result: f1: , acc: 0.908



In [22]:
evaluate(GradientBoostingRegressor(), X_train, X_test, y_train, y_test)

Test result: f1: , acc: 0.861



In [23]:
def algorithm_pipeline(X_train_data, X_test_data, y_train_data, y_test_data, 
                       model, param_grid, cv=10, scoring_fit='neg_mean_squared_error',
                       do_probabilities = False):
    gs = GridSearchCV(
        estimator=model,
        param_grid=param_grid, 
        cv=cv, 
        n_jobs=-1, 
        scoring=scoring_fit,
        verbose=2
    )
    fitted_model = gs.fit(X_train_data, y_train_data)
    
    if do_probabilities:
      pred = fitted_model.predict_proba(X_test_data)
    else:
      pred = fitted_model.predict(X_test_data)
    
    return fitted_model, pred

## Tuning Final Model

In [24]:
# param_grid = {
#     'n_estimators': [400, 700, 1000],
#     'colsample_bytree': [0.7, 0.8],
#     'max_depth': [15,20,25],
#     'reg_alpha': [1.1, 1.2, 1.3],
#     'reg_lambda': [1.1, 1.2, 1.3],
#     'subsample': [0.7, 0.8, 0.9]
# }

## Finding the best parameters for the model

The RandomizedSearchCV function will try all our combinations above and select the most accurate model.  

That best model is found in the best_estimator_ property of the RandomizedSerachCV object. 

In [25]:
# model = XGBRegressor()
# # xgb_grid = GridSearchCV(predictor,parameters,cv = 2,n_jobs = 5,verbose=True)
# # xgb_grid.fit(X_train, y_train)
# # # evaluate(rs.best_estimator_, X_train, X_test, y_train, y_test)

# # print(xgb_grid.best_score_)
# # print(xgb_grid.best_params_)

# model, pred = algorithm_pipeline(X_train, X_test, y_train, y_test, model, 
#                                  param_grid, cv=5)

# # Root Mean Squared Error
# print(np.sqrt(-model.best_score_))
# print(model.best_params_)

In [40]:
clf = XGBRegressor()
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 40, num = 5)]
max_depth = [int(x) for x in np.linspace(2, 18, num = 10)]
max_depth.append(None)
gbm_param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': n_estimators,
    'max_depth': max_depth
}

clf2 = GridSearchCV(estimator=clf, param_grid=gbm_param_grid, cv=4, verbose=3)

In [41]:
clf2.fit(X_train, y_train)


Fitting 4 folds for each of 110 candidates, totalling 440 fits
[CV] colsample_bytree=0.3, max_depth=2, n_estimators=20 ..............
[CV]  colsample_bytree=0.3, max_depth=2, n_estimators=20, score=0.762, total=   0.2s
[CV] colsample_bytree=0.3, max_depth=2, n_estimators=20 ..............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  colsample_bytree=0.3, max_depth=2, n_estimators=20, score=0.770, total=   0.2s
[CV] colsample_bytree=0.3, max_depth=2, n_estimators=20 ..............
[CV]  colsample_bytree=0.3, max_depth=2, n_estimators=20, score=0.776, total=   0.2s
[CV] colsample_bytree=0.3, max_depth=2, n_estimators=20 ..............

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s



[CV]  colsample_bytree=0.3, max_depth=2, n_estimators=20, score=0.770, total=   0.2s
[CV] colsample_bytree=0.3, max_depth=2, n_estimators=25 ..............
[CV]  colsample_bytree=0.3, max_depth=2, n_estimators=25, score=0.785, total=   0.2s
[CV] colsample_bytree=0.3, max_depth=2, n_estimators=25 ..............
[CV]  colsample_bytree=0.3, max_depth=2, n_estimators=25, score=0.792, total=   0.2s
[CV] colsample_bytree=0.3, max_depth=2, n_estimators=25 ..............
[CV]  colsample_bytree=0.3, max_depth=2, n_estimators=25, score=0.794, total=   0.2s
[CV] colsample_bytree=0.3, max_depth=2, n_estimators=25 ..............
[CV]  colsample_bytree=0.3, max_depth=2, n_estimators=25, score=0.788, total=   0.2s
[CV] colsample_bytree=0.3, max_depth=2, n_estimators=30 ..............
[CV]  colsample_bytree=0.3, max_depth=2, n_estimators=30, score=0.792, total=   0.2s
[CV] colsample_bytree=0.3, max_depth=2, n_estimators=30 ..............
[CV]  colsample_bytree=0.3, max_depth=2, n_estimators=30, score

[CV]  colsample_bytree=0.3, max_depth=5, n_estimators=40, score=0.864, total=   0.4s
[CV] colsample_bytree=0.3, max_depth=5, n_estimators=40 ..............
[CV]  colsample_bytree=0.3, max_depth=5, n_estimators=40, score=0.864, total=   0.4s
[CV] colsample_bytree=0.3, max_depth=5, n_estimators=40 ..............
[CV]  colsample_bytree=0.3, max_depth=5, n_estimators=40, score=0.868, total=   0.4s
[CV] colsample_bytree=0.3, max_depth=5, n_estimators=40 ..............
[CV]  colsample_bytree=0.3, max_depth=5, n_estimators=40, score=0.856, total=   0.5s
[CV] colsample_bytree=0.3, max_depth=7, n_estimators=20 ..............
[CV]  colsample_bytree=0.3, max_depth=7, n_estimators=20, score=0.848, total=   0.3s
[CV] colsample_bytree=0.3, max_depth=7, n_estimators=20 ..............
[CV]  colsample_bytree=0.3, max_depth=7, n_estimators=20, score=0.852, total=   0.3s
[CV] colsample_bytree=0.3, max_depth=7, n_estimators=20 ..............
[CV]  colsample_bytree=0.3, max_depth=7, n_estimators=20, score=

[CV]  colsample_bytree=0.3, max_depth=10, n_estimators=30, score=0.865, total=   0.5s
[CV] colsample_bytree=0.3, max_depth=10, n_estimators=30 .............
[CV]  colsample_bytree=0.3, max_depth=10, n_estimators=30, score=0.865, total=   0.5s
[CV] colsample_bytree=0.3, max_depth=10, n_estimators=30 .............
[CV]  colsample_bytree=0.3, max_depth=10, n_estimators=30, score=0.853, total=   0.5s
[CV] colsample_bytree=0.3, max_depth=10, n_estimators=35 .............
[CV]  colsample_bytree=0.3, max_depth=10, n_estimators=35, score=0.866, total=   0.6s
[CV] colsample_bytree=0.3, max_depth=10, n_estimators=35 .............
[CV]  colsample_bytree=0.3, max_depth=10, n_estimators=35, score=0.869, total=   0.6s
[CV] colsample_bytree=0.3, max_depth=10, n_estimators=35 .............
[CV]  colsample_bytree=0.3, max_depth=10, n_estimators=35, score=0.869, total=   0.6s
[CV] colsample_bytree=0.3, max_depth=10, n_estimators=35 .............
[CV]  colsample_bytree=0.3, max_depth=10, n_estimators=35,

[CV]  colsample_bytree=0.3, max_depth=16, n_estimators=20, score=0.843, total=   0.6s
[CV] colsample_bytree=0.3, max_depth=16, n_estimators=20 .............
[CV]  colsample_bytree=0.3, max_depth=16, n_estimators=20, score=0.835, total=   0.6s
[CV] colsample_bytree=0.3, max_depth=16, n_estimators=25 .............
[CV]  colsample_bytree=0.3, max_depth=16, n_estimators=25, score=0.848, total=   0.7s
[CV] colsample_bytree=0.3, max_depth=16, n_estimators=25 .............
[CV]  colsample_bytree=0.3, max_depth=16, n_estimators=25, score=0.853, total=   0.7s
[CV] colsample_bytree=0.3, max_depth=16, n_estimators=25 .............
[CV]  colsample_bytree=0.3, max_depth=16, n_estimators=25, score=0.851, total=   0.7s
[CV] colsample_bytree=0.3, max_depth=16, n_estimators=25 .............
[CV]  colsample_bytree=0.3, max_depth=16, n_estimators=25, score=0.841, total=   0.7s
[CV] colsample_bytree=0.3, max_depth=16, n_estimators=30 .............
[CV]  colsample_bytree=0.3, max_depth=16, n_estimators=30,

[CV]  colsample_bytree=0.3, max_depth=None, n_estimators=35, score=0.855, total=   0.4s
[CV] colsample_bytree=0.3, max_depth=None, n_estimators=40 ...........
[CV]  colsample_bytree=0.3, max_depth=None, n_estimators=40, score=0.870, total=   0.5s
[CV] colsample_bytree=0.3, max_depth=None, n_estimators=40 ...........
[CV]  colsample_bytree=0.3, max_depth=None, n_estimators=40, score=0.870, total=   0.5s
[CV] colsample_bytree=0.3, max_depth=None, n_estimators=40 ...........
[CV]  colsample_bytree=0.3, max_depth=None, n_estimators=40, score=0.873, total=   0.5s
[CV] colsample_bytree=0.3, max_depth=None, n_estimators=40 ...........
[CV]  colsample_bytree=0.3, max_depth=None, n_estimators=40, score=0.861, total=   0.5s
[CV] colsample_bytree=0.7, max_depth=2, n_estimators=20 ..............
[CV]  colsample_bytree=0.7, max_depth=2, n_estimators=20, score=0.811, total=   0.2s
[CV] colsample_bytree=0.7, max_depth=2, n_estimators=20 ..............
[CV]  colsample_bytree=0.7, max_depth=2, n_estima

[CV]  colsample_bytree=0.7, max_depth=5, n_estimators=30, score=0.881, total=   0.5s
[CV] colsample_bytree=0.7, max_depth=5, n_estimators=30 ..............
[CV]  colsample_bytree=0.7, max_depth=5, n_estimators=30, score=0.885, total=   0.5s
[CV] colsample_bytree=0.7, max_depth=5, n_estimators=30 ..............
[CV]  colsample_bytree=0.7, max_depth=5, n_estimators=30, score=0.885, total=   0.5s
[CV] colsample_bytree=0.7, max_depth=5, n_estimators=30 ..............
[CV]  colsample_bytree=0.7, max_depth=5, n_estimators=30, score=0.878, total=   0.5s
[CV] colsample_bytree=0.7, max_depth=5, n_estimators=35 ..............
[CV]  colsample_bytree=0.7, max_depth=5, n_estimators=35, score=0.882, total=   0.6s
[CV] colsample_bytree=0.7, max_depth=5, n_estimators=35 ..............
[CV]  colsample_bytree=0.7, max_depth=5, n_estimators=35, score=0.887, total=   0.6s
[CV] colsample_bytree=0.7, max_depth=5, n_estimators=35 ..............
[CV]  colsample_bytree=0.7, max_depth=5, n_estimators=35, score=

[CV]  colsample_bytree=0.7, max_depth=10, n_estimators=20, score=0.914, total=   0.6s
[CV] colsample_bytree=0.7, max_depth=10, n_estimators=20 .............
[CV]  colsample_bytree=0.7, max_depth=10, n_estimators=20, score=0.911, total=   0.7s
[CV] colsample_bytree=0.7, max_depth=10, n_estimators=20 .............
[CV]  colsample_bytree=0.7, max_depth=10, n_estimators=20, score=0.906, total=   0.6s
[CV] colsample_bytree=0.7, max_depth=10, n_estimators=25 .............
[CV]  colsample_bytree=0.7, max_depth=10, n_estimators=25, score=0.910, total=   0.8s
[CV] colsample_bytree=0.7, max_depth=10, n_estimators=25 .............
[CV]  colsample_bytree=0.7, max_depth=10, n_estimators=25, score=0.915, total=   0.8s
[CV] colsample_bytree=0.7, max_depth=10, n_estimators=25 .............
[CV]  colsample_bytree=0.7, max_depth=10, n_estimators=25, score=0.912, total=   0.8s
[CV] colsample_bytree=0.7, max_depth=10, n_estimators=25 .............
[CV]  colsample_bytree=0.7, max_depth=10, n_estimators=25,

[CV]  colsample_bytree=0.7, max_depth=14, n_estimators=35, score=0.915, total=   1.5s
[CV] colsample_bytree=0.7, max_depth=14, n_estimators=35 .............
[CV]  colsample_bytree=0.7, max_depth=14, n_estimators=35, score=0.910, total=   1.5s
[CV] colsample_bytree=0.7, max_depth=14, n_estimators=40 .............
[CV]  colsample_bytree=0.7, max_depth=14, n_estimators=40, score=0.910, total=   1.6s
[CV] colsample_bytree=0.7, max_depth=14, n_estimators=40 .............
[CV]  colsample_bytree=0.7, max_depth=14, n_estimators=40, score=0.917, total=   1.6s
[CV] colsample_bytree=0.7, max_depth=14, n_estimators=40 .............
[CV]  colsample_bytree=0.7, max_depth=14, n_estimators=40, score=0.916, total=   1.6s
[CV] colsample_bytree=0.7, max_depth=14, n_estimators=40 .............
[CV]  colsample_bytree=0.7, max_depth=14, n_estimators=40, score=0.910, total=   1.7s
[CV] colsample_bytree=0.7, max_depth=16, n_estimators=20 .............
[CV]  colsample_bytree=0.7, max_depth=16, n_estimators=20,

[CV]  colsample_bytree=0.7, max_depth=None, n_estimators=25, score=0.884, total=   0.5s
[CV] colsample_bytree=0.7, max_depth=None, n_estimators=30 ...........
[CV]  colsample_bytree=0.7, max_depth=None, n_estimators=30, score=0.890, total=   0.6s
[CV] colsample_bytree=0.7, max_depth=None, n_estimators=30 ...........
[CV]  colsample_bytree=0.7, max_depth=None, n_estimators=30, score=0.894, total=   0.5s
[CV] colsample_bytree=0.7, max_depth=None, n_estimators=30 ...........
[CV]  colsample_bytree=0.7, max_depth=None, n_estimators=30, score=0.893, total=   0.6s
[CV] colsample_bytree=0.7, max_depth=None, n_estimators=30 ...........
[CV]  colsample_bytree=0.7, max_depth=None, n_estimators=30, score=0.885, total=   0.6s
[CV] colsample_bytree=0.7, max_depth=None, n_estimators=35 ...........
[CV]  colsample_bytree=0.7, max_depth=None, n_estimators=35, score=0.892, total=   0.7s
[CV] colsample_bytree=0.7, max_depth=None, n_estimators=35 ...........
[CV]  colsample_bytree=0.7, max_depth=None, n_

[Parallel(n_jobs=1)]: Done 440 out of 440 | elapsed:  5.1min finished


GridSearchCV(cv=4,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_para

In [42]:
# Print the best parameters and lowest RMSE
print("Best parameters found: ", clf2.best_params_)
print("Best Score found: ", np.sqrt(np.abs(clf2.best_score_)))

Best parameters found:  {'colsample_bytree': 0.7, 'max_depth': 12, 'n_estimators': 40}
Best Score found:  0.9562784278856985


In [43]:
best_random = clf2.best_estimator_
predictions = best_random.predict(X_test)
errors = abs(predictions - y_test)
mape = 100 * np.mean(errors / y_test)
accuracy = 100 - mape
print('Model Performance')
print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
print('Accuracy = {:0.2f}%.'.format(accuracy))
clf2.best_params_
    

Model Performance
Average Error: 1814.5112 degrees.
Accuracy = 89.18%.


{'colsample_bytree': 0.7, 'max_depth': 12, 'n_estimators': 40}

In [26]:
## Using RandomizedSearchCV for best score

clf = XGBRegressor()
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 150, num = 15)]
max_depth = [int(x) for x in np.linspace(2, 30, num = 20)]
max_depth.append(None)
gbm_param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': n_estimators,
    'max_depth': max_depth
}
# randomized_mse = RandomizedSearchCV(param_distributions=gbm_param_grid, estimator=clf, scoring="neg_mean_squared_error", n_iter=5, cv=4, verbose=3)
randomized_mse = RandomizedSearchCV(param_distributions=gbm_param_grid, estimator=clf, n_iter=20, cv=4, verbose=3)

# Fit randomized_mse to the data
randomized_mse.fit(X_train, y_train)
# Print the best parameters and lowest RMSE
print("Best parameters found: ", randomized_mse.best_params_)
print("Best Score found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

Fitting 4 folds for each of 20 candidates, totalling 80 fits
[CV] n_estimators=38, max_depth=24, colsample_bytree=0.7 .............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=38, max_depth=24, colsample_bytree=0.7, score=0.902, total=   6.6s
[CV] n_estimators=38, max_depth=24, colsample_bytree=0.7 .............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.6s remaining:    0.0s


[CV]  n_estimators=38, max_depth=24, colsample_bytree=0.7, score=0.909, total=   6.4s
[CV] n_estimators=38, max_depth=24, colsample_bytree=0.7 .............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   12.9s remaining:    0.0s


[CV]  n_estimators=38, max_depth=24, colsample_bytree=0.7, score=0.907, total=   7.0s
[CV] n_estimators=38, max_depth=24, colsample_bytree=0.7 .............
[CV]  n_estimators=38, max_depth=24, colsample_bytree=0.7, score=0.903, total=   6.9s
[CV] n_estimators=150, max_depth=7, colsample_bytree=0.3 .............
[CV]  n_estimators=150, max_depth=7, colsample_bytree=0.3, score=0.887, total=   4.2s
[CV] n_estimators=150, max_depth=7, colsample_bytree=0.3 .............
[CV]  n_estimators=150, max_depth=7, colsample_bytree=0.3, score=0.893, total=   4.2s
[CV] n_estimators=150, max_depth=7, colsample_bytree=0.3 .............
[CV]  n_estimators=150, max_depth=7, colsample_bytree=0.3, score=0.892, total=   4.6s
[CV] n_estimators=150, max_depth=7, colsample_bytree=0.3 .............
[CV]  n_estimators=150, max_depth=7, colsample_bytree=0.3, score=0.880, total=   4.1s
[CV] n_estimators=29, max_depth=19, colsample_bytree=0.7 .............
[CV]  n_estimators=29, max_depth=19, colsample_bytree=0.7,

[CV]  n_estimators=66, max_depth=3, colsample_bytree=0.7, score=0.858, total=   1.2s
[CV] n_estimators=85, max_depth=13, colsample_bytree=0.7 .............
[CV]  n_estimators=85, max_depth=13, colsample_bytree=0.7, score=0.911, total=   6.8s
[CV] n_estimators=85, max_depth=13, colsample_bytree=0.7 .............
[CV]  n_estimators=85, max_depth=13, colsample_bytree=0.7, score=0.917, total=   7.2s
[CV] n_estimators=85, max_depth=13, colsample_bytree=0.7 .............
[CV]  n_estimators=85, max_depth=13, colsample_bytree=0.7, score=0.916, total=   7.9s
[CV] n_estimators=85, max_depth=13, colsample_bytree=0.7 .............
[CV]  n_estimators=85, max_depth=13, colsample_bytree=0.7, score=0.911, total=   7.2s
[CV] n_estimators=131, max_depth=7, colsample_bytree=0.7 .............
[CV]  n_estimators=131, max_depth=7, colsample_bytree=0.7, score=0.907, total=   3.2s
[CV] n_estimators=131, max_depth=7, colsample_bytree=0.7 .............
[CV]  n_estimators=131, max_depth=7, colsample_bytree=0.7, 

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  6.6min finished


Best parameters found:  {'n_estimators': 85, 'max_depth': 13, 'colsample_bytree': 0.7}
Lowest RMSE found:  0.9560462273536471


In [27]:
## Using RandomizedSearchCV for mean squared error

clf = XGBRegressor()
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 150, num = 15)]
max_depth = [int(x) for x in np.linspace(2, 30, num = 20)]
max_depth.append(None)
gbm_param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': n_estimators,
    'max_depth': max_depth
}
randomized_mse = RandomizedSearchCV(param_distributions=gbm_param_grid, estimator=clf, scoring="neg_mean_squared_error", n_iter=5, cv=4, verbose=3)
# Fit randomized_mse to the data
randomized_mse.fit(X_train, y_train)
# Print the best parameters and lowest RMSE
print("Best parameters found: ", randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] n_estimators=47, max_depth=9, colsample_bytree=0.7 ..............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=47, max_depth=9, colsample_bytree=0.7, score=-9578440.080, total=   1.2s
[CV] n_estimators=47, max_depth=9, colsample_bytree=0.7 ..............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV]  n_estimators=47, max_depth=9, colsample_bytree=0.7, score=-8647901.746, total=   1.2s
[CV] n_estimators=47, max_depth=9, colsample_bytree=0.7 ..............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.2s remaining:    0.0s


[CV]  n_estimators=47, max_depth=9, colsample_bytree=0.7, score=-9000863.846, total=   1.2s
[CV] n_estimators=47, max_depth=9, colsample_bytree=0.7 ..............
[CV]  n_estimators=47, max_depth=9, colsample_bytree=0.7, score=-9416982.071, total=   1.2s
[CV] n_estimators=20, max_depth=16, colsample_bytree=0.7 .............
[CV]  n_estimators=20, max_depth=16, colsample_bytree=0.7, score=-9641024.538, total=   1.0s
[CV] n_estimators=20, max_depth=16, colsample_bytree=0.7 .............
[CV]  n_estimators=20, max_depth=16, colsample_bytree=0.7, score=-8713500.834, total=   1.0s
[CV] n_estimators=20, max_depth=16, colsample_bytree=0.7 .............
[CV]  n_estimators=20, max_depth=16, colsample_bytree=0.7, score=-8914574.425, total=   1.0s
[CV] n_estimators=20, max_depth=16, colsample_bytree=0.7 .............
[CV]  n_estimators=20, max_depth=16, colsample_bytree=0.7, score=-9395728.931, total=   1.0s
[CV] n_estimators=94, max_depth=13, colsample_bytree=0.3 .............
[CV]  n_estimators

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:   49.8s finished


Best parameters found:  {'n_estimators': 47, 'max_depth': 9, 'colsample_bytree': 0.7}
Lowest RMSE found:  3026.7221437833177


# Evaluate Our Model Further

Now we are going to shuffle the data over and over and apply our new model to the results to further determine if we want to use this model.

In [29]:
path = r'D:\RiceBootCamp\Homework\Oct 22\Project3\Data' # use your path
all_files = glob.glob(path + "/*.csv")

li = []
brands = ["Audi","BMW","Ford","Hyundi","Mercedes Benz","Skoda","Toyota","Volkswagen"]

for filename, brand in zip(all_files, brands):
    frame = pd.read_csv(filename, index_col=None, header=0)
    frame["make"] = brand
    li.append(frame)
    
df = pd.concat(li, axis=0, ignore_index=True)
# remove unwanted feature
df =df.drop("tax", axis=1)
ss = preprocessing.StandardScaler()
df[['mileage', 'mpg', 'engineSize', 'year']] = ss.fit_transform(df[['mileage', 'mpg', 'engineSize', 'year']])
df['model'] = df['model'].str.strip()


In [30]:

le = preprocessing.LabelEncoder()
for column_name in df.columns:
    if(df[column_name].dtype == object):
        df[column_name] = le.fit_transform(df[column_name])
        # Step 2: Convert encoded labels to one-hot-encoding
        df[column_name] = to_categorical(df[column_name])

    else:
        pass

In [31]:
# Assign X (data) and y (target)
X = df.drop("price", axis=1)
y = df["price"]
print(X.shape, y.shape)

(85555, 8) (85555,)


In [32]:
# evaluate model with kfold
kfold = KFold(n_splits=10)
results = cross_val_score(randomized_mse.best_estimator_, X, y, cv=kfold, n_jobs=-1)
print("Results: %.2f (%.2f) accuracy" % (results.mean(), results.std()))

Results: 0.79 (0.08) accuracy


## Save The Model For Future Use

In [28]:
# save model
with open(f'best_xgb_model.pickle', 'wb') as f:
    pickle.dump(randomized_mse.best_estimator_, f)

## Test the model

In [138]:
values = ['Fox',2008,'Manual', 88102,'Petrol',46.3,1.2,'Volkswagen']


In [139]:
features = [np.array(values)]

In [140]:
df = pd.DataFrame(features,columns=['model','year','transmission','mileage','fuelType','mpg','engineSize','make'])
df

Unnamed: 0,model,year,transmission,mileage,fuelType,mpg,engineSize,make
0,Fox,2008,Manual,88102,Petrol,46.3,1.2,Volkswagen


In [141]:
df['year'] = df['year'].astype(int)
df['mileage'] = df['mileage'].astype(int)
df['mpg'] = df['mpg'].astype(float)
df['engineSize'] = df['engineSize'].astype(float)
df['model'] = df['model'].astype(str)
df['transmission'] = df['transmission'].astype(str)
df['fuelType'] = df['fuelType'].astype(str)
df['make'] = df['make'].astype(str)



In [132]:
ss = preprocessing.StandardScaler()
df[['mileage', 'mpg', 'engineSize', 'year']] = ss.fit_transform(df[['mileage', 'mpg', 'engineSize', 'year']])



In [143]:
df

Unnamed: 0,model,year,transmission,mileage,fuelType,mpg,engineSize,make
0,1.0,2008,1.0,88102,1.0,46.3,1.2,1.0


In [142]:

le = preprocessing.LabelEncoder()
for column_name in df.columns:
    if(df[column_name].dtype == object):
        df[column_name] = le.fit_transform(df[column_name])
        # Step 2: Convert encoded labels to one-hot-encoding
        df[column_name] = to_categorical(df[column_name])

    else:
        pass

In [136]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         1 non-null      float32
 1   year          1 non-null      float64
 2   transmission  1 non-null      float32
 3   mileage       1 non-null      float64
 4   fuelType      1 non-null      float32
 5   mpg           1 non-null      float64
 6   engineSize    1 non-null      float64
 7   make          1 non-null      float32
dtypes: float32(4), float64(4)
memory usage: 176.0 bytes


In [144]:
randomized_mse.predict(df)

array([23397.049], dtype=float32)

In [50]:
predictions = best_random.predict(X_test[:5])

In [51]:
predictions

array([13026.961, 11919.921, 21014.73 , 15248.218, 18660.814],
      dtype=float32)

In [56]:
y_test[:5].values

array([14644, 10095, 23025, 14299, 16995], dtype=int64)