# Applying various Regression Model

###  The model uses lableencoding , XGboost , lightgbm
## Please install  xgboost from PIP 


In [68]:
import numpy as np
import pandas as pd
import glob
import os
import pickle
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

## DATA

In this case we are reading used cars data.  With this data we are trying to predict the selling price of car in pounds

#### The features are:
- Model and make of the car 
- Purchase year of the car
- Transmission
- Mileage
- FuelType
- MPG
- EngineSize


In [2]:
path = r'D:\RiceBootCamp\Homework\Oct 22\Project3\Data' # use your path
all_files = glob.glob(path + "/*.csv")

li = []
brands = ["Audi","BMW","Ford","Hyundi","Mercedes Benz","Skoda","Toyota","Volkswagen"]

for filename, brand in zip(all_files, brands):
    frame = pd.read_csv(filename, index_col=None, header=0)
    frame["make"] = brand
    li.append(frame)
    
df = pd.concat(li, axis=0, ignore_index=True)
df


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,make
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4,Audi
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0,Audi
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4,Audi
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0,Audi
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0,Audi
...,...,...,...,...,...,...,...,...,...,...
85550,Eos,2012,5990,Manual,74000,Diesel,125,58.9,2.0,Volkswagen
85551,Fox,2008,1799,Manual,88102,Petrol,145,46.3,1.2,Volkswagen
85552,Fox,2009,1590,Manual,70000,Petrol,200,42.0,1.4,Volkswagen
85553,Fox,2006,1250,Manual,82704,Petrol,150,46.3,1.2,Volkswagen


In [3]:
# remove unwanted feature
df =df.drop("tax", axis=1)
df['year'] = df['year'].astype(str)
df['model'] = df['model'].str.strip()

In [None]:
df.info()

In [None]:
df.head()

## Preliminary Data Analysis

In [5]:
df.isnull().sum(axis=0)

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
mpg             0
engineSize      0
make            0
dtype: int64

## Train Test Split

In [6]:
# Assign X (data) and y (target)
X = df.drop("price", axis=1)
y = df["price"]
print(X.shape, y.shape)

(85555, 8) (85555,)


In [7]:
# split to training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
del df,X,y

## EVALUATE FUNCTION

A single function that will evaluate all models 

This will allow us to easily pick out the model we want to move forward with.

This function takes in a model ( pipeline ) and our train test split data. From there it simply performes predictions and generates results

In [18]:
def evaluate(pipeline, X_train, X_test, y_train, y_test):
    '''
    Evaluate a pipeline on training and test datasets
    '''    
    pipeline.fit(X_train, y_train)
    
    test_acc = pipeline.score(X_test, y_test)

    print(f"========== Predictor: {type(pipeline).__name__} ==========")
    print(f"Test result: f1: , acc: {test_acc:.3f}")
    print()


## Pick A Model For A Base Point To Evaluate Other Models Against

In this case we are choosing Logistric Regression

In [29]:
# try XGBRegressor
# evaluate(XGBRegressor(n_jobs=-1), X_train, X_test, y_train, y_test)
XGBrf = Pipeline(steps=[
    ('preprocess', processor_lin),
    ('regressor',XGBRegressor(n_jobs=-1))
])
evaluate(XGBrf, X_train, X_test, y_train, y_test)

Test result: f1: , acc: 0.948



In [30]:
LGBrf =  Pipeline(steps=[
    ('preprocess', processor_lin),
    ('regressor',LGBMRegressor(n_jobs=-1))
])

evaluate(LGBrf, X_train, X_test, y_train, y_test)


Test result: f1: , acc: 0.941



In [31]:
rf =  Pipeline(steps=[
    ('preprocess', processor_lin),
    ('regressor',RandomForestRegressor(n_estimators=140, min_samples_split=5, min_samples_leaf=4, max_features="sqrt", max_depth=20,bootstrap=False))
])
evaluate(rf, X_train, X_test, y_train, y_test)


Test result: f1: , acc: 0.901



In [32]:
GBrf =  Pipeline(steps=[
    ('preprocess', processor_lin),
    ('regressor',GradientBoostingRegressor())
])
evaluate(GBrf, X_train, X_test, y_train, y_test)

Test result: f1: , acc: 0.886



## Tuning final model and finding the best parameters for the model

The RandomizedSearchCV function will try all our combinations above and select the most accurate model.  

That best model is found in the best_estimator_ property of the RandomizedSerachCV object. 

In [40]:
XGBF = XGBRegressor(n_jobs=-1)
clf = Pipeline(steps=[
    ('preprocess', processor_lin),
    ('regressor',XGBF)
])
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 40, num = 5)]
max_depth = [int(x) for x in np.linspace(2, 18, num = 10)]
max_depth.append(None)
gbm_param_grid = {
    'regressor__colsample_bytree': [0.3, 0.7],
    'regressor__n_estimators': n_estimators,
    'regressor__max_depth': max_depth
}

clf2 = GridSearchCV(estimator=clf, param_grid=gbm_param_grid, cv=4, verbose=3)

In [42]:
clf2.fit(X_train, y_train)


Fitting 4 folds for each of 110 candidates, totalling 440 fits
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=20 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=20, score=0.816, total=   0.6s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=20 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=20, score=0.820, total=   0.6s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=20 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.1s remaining:    0.0s


[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=20, score=0.816, total=   0.6s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=20, score=0.812, total=   0.6s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=25, score=0.830, total=   0.7s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=25, score=0.834, total=   0.6s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=2, regressor__n_estimators=25, score=0.830, total=   0.6s
[CV] regressor__colsample_bytree=0.3, regressor__m

[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=20, score=0.905, total=   0.7s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=20, score=0.896, total=   0.7s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=25, score=0.914, total=   0.7s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=25, score=0.914, total=   0.7s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=5, regressor__n_estimators=25, score=0.912, total=   0.7s
[CV] regressor__colsample_bytree=0.3, regressor__m

[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=20, score=0.938, total=   0.9s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=20, score=0.932, total=   0.9s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=25, score=0.946, total=   0.9s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=25, score=0.947, total=   0.9s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=9, regressor__n_estimators=25, score=0.942, total=   0.9s
[CV] regressor__colsample_bytree=0.3, regressor__m

[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=20, score=0.946, total=   1.0s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=20, score=0.940, total=   1.0s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=25, score=0.953, total=   1.0s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=25, score=0.955, total=   1.0s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=12, regressor__n_estimators=25, score=0.948, total=   1.0s
[CV] regressor__colsample_bytree=0.3, reg

[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=20, score=0.950, total=   1.2s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=20, score=0.945, total=   1.2s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=25, score=0.956, total=   1.2s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=25, score=0.959, total=   1.3s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=16, regressor__n_estimators=25, score=0.952, total=   1.2s
[CV] regressor__colsample_bytree=0.3, reg

[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=20, score=0.921, total=   0.7s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=20, score=0.915, total=   0.8s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=20, score=0.912, total=   0.8s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=25, score=0.927, total=   0.8s
[CV] regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.3, regressor__max_depth=None, regressor__n_estimators=25, score=0.927, total=   0.8s
[CV] regressor__colsamp

[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=20, score=0.861, total=   0.7s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=20, score=0.864, total=   0.7s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=20, score=0.859, total=   0.7s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=20, score=0.848, total=   0.7s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=3, regressor__n_estimators=25, score=0.870, total=   0.8s
[CV] regressor__colsample_bytree=0.7, regressor__m

[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=20, score=0.935, total=   1.0s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=20, score=0.935, total=   1.0s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=20, score=0.931, total=   1.0s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=20, score=0.927, total=   1.0s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=7, regressor__n_estimators=25, score=0.938, total=   1.0s
[CV] regressor__colsample_bytree=0.7, regressor__m

[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=20, score=0.954, total=   1.2s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=20, score=0.954, total=   1.1s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=20, score=0.949, total=   1.2s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=20, score=0.944, total=   1.2s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=10, regressor__n_estimators=25, score=0.955, total=   1.3s
[CV] regressor__colsample_bytree=0.7, reg

[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=20, score=0.961, total=   1.5s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=20, score=0.961, total=   1.4s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=20, score=0.952, total=   1.5s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=20, score=0.951, total=   1.5s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=14, regressor__n_estimators=25, score=0.962, total=   1.6s
[CV] regressor__colsample_bytree=0.7, reg

[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=20, score=0.961, total=   1.8s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=20, score=0.963, total=   1.8s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=20, score=0.954, total=   1.8s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=20 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=20, score=0.953, total=   1.8s
[CV] regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=25 
[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=18, regressor__n_estimators=25, score=0.961, total=   2.0s
[CV] regressor__colsample_bytree=0.7, reg

[CV]  regressor__colsample_bytree=0.7, regressor__max_depth=None, regressor__n_estimators=40, score=0.928, total=   1.2s


[Parallel(n_jobs=1)]: Done 440 out of 440 | elapsed:  8.9min finished


GridSearchCV(cv=4,
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(fill_value='missing',
                                                                                                        missing_values=None,
                                                                                                        strategy='constant')),
                                                                                         ('onehotencoder',
                                                                                          OneHotEncoder(categories=[array(['A1', 'A6', 'A4', 'A3', 'Q3', 'Q5', 'A5', 

In [43]:
# Print the best parameters and lowest RMSE
print("Best parameters found: ", clf2.best_params_)
print("Best Score found: ", np.sqrt(np.abs(clf2.best_score_)))

Best parameters found:  {'regressor__colsample_bytree': 0.7, 'regressor__max_depth': 18, 'regressor__n_estimators': 40}
Best Score found:  0.979077215127871


In [44]:
best_random = clf2.best_estimator_
predictions = best_random.predict(X_test)
errors = abs(predictions - y_test)
mape = 100 * np.mean(errors / y_test)
accuracy = 100 - mape
print('Model Performance')
print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
print('Accuracy = {:0.2f}%.'.format(accuracy))
clf2.best_params_
    

Model Performance
Average Error: 1194.8005 degrees.
Accuracy = 92.53%.


{'regressor__colsample_bytree': 0.7,
 'regressor__max_depth': 18,
 'regressor__n_estimators': 40}

In [45]:
## Using RandomizedSearchCV for best score

XGBF = XGBRegressor(n_jobs=-1)
clf = Pipeline(steps=[
    ('preprocess', processor_lin),
    ('regressor',XGBF)
])
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 150, num = 15)]
max_depth = [int(x) for x in np.linspace(2, 30, num = 20)]
max_depth.append(None)
gbm_param_grid = {
    'regressor__colsample_bytree': [0.3, 0.7],
    'regressor__n_estimators': n_estimators,
    'regressor__max_depth': max_depth
}

randomized_acc = RandomizedSearchCV(param_distributions=gbm_param_grid, estimator=clf, n_iter=20, cv=4, verbose=3)

# Fit randomized_mse to the data
randomized_acc.fit(X_train, y_train)
# Print the best parameters and lowest RMSE
print("Best parameters found: ", randomized_acc.best_params_)
print("Best Score found: ", np.sqrt(np.abs(randomized_acc.best_score_)))

Fitting 4 folds for each of 20 candidates, totalling 80 fits
[CV] regressor__n_estimators=47, regressor__max_depth=13, regressor__colsample_bytree=0.3 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  regressor__n_estimators=47, regressor__max_depth=13, regressor__colsample_bytree=0.3, score=0.958, total=   1.5s
[CV] regressor__n_estimators=47, regressor__max_depth=13, regressor__colsample_bytree=0.3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV]  regressor__n_estimators=47, regressor__max_depth=13, regressor__colsample_bytree=0.3, score=0.961, total=   1.5s
[CV] regressor__n_estimators=47, regressor__max_depth=13, regressor__colsample_bytree=0.3 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.9s remaining:    0.0s


[CV]  regressor__n_estimators=47, regressor__max_depth=13, regressor__colsample_bytree=0.3, score=0.955, total=   1.5s
[CV] regressor__n_estimators=47, regressor__max_depth=13, regressor__colsample_bytree=0.3 
[CV]  regressor__n_estimators=47, regressor__max_depth=13, regressor__colsample_bytree=0.3, score=0.950, total=   1.5s
[CV] regressor__n_estimators=57, regressor__max_depth=21, regressor__colsample_bytree=0.3 
[CV]  regressor__n_estimators=57, regressor__max_depth=21, regressor__colsample_bytree=0.3, score=0.961, total=   2.6s
[CV] regressor__n_estimators=57, regressor__max_depth=21, regressor__colsample_bytree=0.3 
[CV]  regressor__n_estimators=57, regressor__max_depth=21, regressor__colsample_bytree=0.3, score=0.964, total=   2.6s
[CV] regressor__n_estimators=57, regressor__max_depth=21, regressor__colsample_bytree=0.3 
[CV]  regressor__n_estimators=57, regressor__max_depth=21, regressor__colsample_bytree=0.3, score=0.957, total=   2.6s
[CV] regressor__n_estimators=57, regresso

[CV]  regressor__n_estimators=112, regressor__max_depth=28, regressor__colsample_bytree=0.7, score=0.961, total=   9.8s
[CV] regressor__n_estimators=112, regressor__max_depth=28, regressor__colsample_bytree=0.7 
[CV]  regressor__n_estimators=112, regressor__max_depth=28, regressor__colsample_bytree=0.7, score=0.951, total=   9.7s
[CV] regressor__n_estimators=112, regressor__max_depth=28, regressor__colsample_bytree=0.7 
[CV]  regressor__n_estimators=112, regressor__max_depth=28, regressor__colsample_bytree=0.7, score=0.951, total=   9.8s
[CV] regressor__n_estimators=85, regressor__max_depth=6, regressor__colsample_bytree=0.3 
[CV]  regressor__n_estimators=85, regressor__max_depth=6, regressor__colsample_bytree=0.3, score=0.948, total=   1.4s
[CV] regressor__n_estimators=85, regressor__max_depth=6, regressor__colsample_bytree=0.3 
[CV]  regressor__n_estimators=85, regressor__max_depth=6, regressor__colsample_bytree=0.3, score=0.948, total=   1.4s
[CV] regressor__n_estimators=85, regress

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  5.2min finished


Best parameters found:  {'regressor__n_estimators': 94, 'regressor__max_depth': 15, 'regressor__colsample_bytree': 0.3}
Best Score found:  0.979408283569919


In [46]:
## Using RandomizedSearchCV for mean squared error

XGBF = XGBRegressor(n_jobs=-1)
clf = Pipeline(steps=[
    ('preprocess', processor_lin),
    ('regressor',XGBF)
])
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 150, num = 15)]
max_depth = [int(x) for x in np.linspace(2, 30, num = 20)]
max_depth.append(None)
gbm_param_grid = {
    'regressor__colsample_bytree': [0.3, 0.7],
    'regressor__n_estimators': n_estimators,
    'regressor__max_depth': max_depth
}
randomized_mse = RandomizedSearchCV(param_distributions=gbm_param_grid, estimator=clf, scoring="neg_mean_squared_error", n_iter=5, cv=4, verbose=3)
# Fit randomized_mse to the data
randomized_mse.fit(X_train, y_train)
# Print the best parameters and lowest RMSE
print("Best parameters found: ", randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] regressor__n_estimators=140, regressor__max_depth=22, regressor__colsample_bytree=0.7 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  regressor__n_estimators=140, regressor__max_depth=22, regressor__colsample_bytree=0.7, score=-4360019.919, total=   9.2s
[CV] regressor__n_estimators=140, regressor__max_depth=22, regressor__colsample_bytree=0.7 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.1s remaining:    0.0s


[CV]  regressor__n_estimators=140, regressor__max_depth=22, regressor__colsample_bytree=0.7, score=-3878118.448, total=   9.3s
[CV] regressor__n_estimators=140, regressor__max_depth=22, regressor__colsample_bytree=0.7 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   18.4s remaining:    0.0s


[CV]  regressor__n_estimators=140, regressor__max_depth=22, regressor__colsample_bytree=0.7, score=-4946091.860, total=   9.1s
[CV] regressor__n_estimators=140, regressor__max_depth=22, regressor__colsample_bytree=0.7 
[CV]  regressor__n_estimators=140, regressor__max_depth=22, regressor__colsample_bytree=0.7, score=-4966964.286, total=   9.3s
[CV] regressor__n_estimators=66, regressor__max_depth=3, regressor__colsample_bytree=0.7 
[CV]  regressor__n_estimators=66, regressor__max_depth=3, regressor__colsample_bytree=0.7, score=-9684335.409, total=   1.1s
[CV] regressor__n_estimators=66, regressor__max_depth=3, regressor__colsample_bytree=0.7 
[CV]  regressor__n_estimators=66, regressor__max_depth=3, regressor__colsample_bytree=0.7, score=-9539876.683, total=   1.1s
[CV] regressor__n_estimators=66, regressor__max_depth=3, regressor__colsample_bytree=0.7 
[CV]  regressor__n_estimators=66, regressor__max_depth=3, regressor__colsample_bytree=0.7, score=-10012931.620, total=   1.1s
[CV] reg

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  1.6min finished


Best parameters found:  {'regressor__n_estimators': 66, 'regressor__max_depth': 19, 'regressor__colsample_bytree': 0.7}
Lowest RMSE found:  2101.7926037254456


# Evaluate Our Model Further

Now we are going to shuffle the data over and over and apply our new model to the results to further determine if we want to use this model.

In [47]:
path = r'D:\RiceBootCamp\Homework\Oct 22\Project3\Data' # use your path
all_files = glob.glob(path + "/*.csv")

li = []
brands = ["Audi","BMW","Ford","Hyundi","Mercedes Benz","Skoda","Toyota","Volkswagen"]

for filename, brand in zip(all_files, brands):
    frame = pd.read_csv(filename, index_col=None, header=0)
    frame["make"] = brand
    li.append(frame)
    
df = pd.concat(li, axis=0, ignore_index=True)
# remove unwanted feature
df =df.drop("tax", axis=1)
df['year'] = df['year'].astype(str)
df['model'] = df['model'].str.strip()

In [48]:
# Assign X (data) and y (target)
X = df.drop("price", axis=1)
y = df["price"]
print(X.shape, y.shape)

(85555, 8) (85555,)


In [49]:
# evaluate model with kfold
kfold = KFold(n_splits=10)
results = cross_val_score(randomized_mse.best_estimator_, X, y, cv=kfold, n_jobs=-1)
print("Results: %.2f (%.2f) accuracy" % (results.mean(), results.std()))

Results: 0.88 (0.07) accuracy


## Save The Model For Future Use

In [50]:
# save model
with open(f'best_xgb_model.pickle', 'wb') as f:
    pickle.dump(randomized_mse.best_estimator_, f)

## Test the model

In [57]:
values = ['Fox',2008,'Manual', 88102,'Petrol',46.3,1.2,'Volkswagen']


In [58]:
features = [np.array(values)]

In [59]:
df_deploy = pd.DataFrame(features,columns=['model','year','transmission','mileage','fuelType','mpg','engineSize','make'])
df_deploy

Unnamed: 0,model,year,transmission,mileage,fuelType,mpg,engineSize,make
0,Fox,2008,Manual,88102,Petrol,46.3,1.2,Volkswagen


In [60]:
randomized_mse.predict(df_deploy)

array([2191.308], dtype=float32)

In [65]:
predictions = best_random.predict(X_test[:15])

In [66]:
predictions

array([13041.969, 10966.173, 20531.328, 14382.786, 17913.36 , 19401.182,
       10256.589, 18317.13 , 17751.086, 24753.834, 39171.41 ,  8818.764,
       13973.687, 20089.11 , 14364.556], dtype=float32)

In [67]:
y_test[:15].values

array([14644, 10095, 23025, 14299, 16995, 19691,  9917, 17199, 18498,
       17299, 25945, 11500, 14499, 20000, 15000], dtype=int64)

Unnamed: 0,model,year,transmission,mileage,fuelType,mpg,engineSize,make
13058,1 Series,2015,Manual,31900,Diesel,62.8,2.0,BMW
37741,Fiesta,2018,Manual,22790,Petrol,58.9,1.0,Ford
61979,Kodiaq,2018,Manual,15263,Diesel,52.3,2.0,Skoda
36235,Fiesta,2019,Manual,4216,Petrol,58.9,1.0,Ford
6651,Q3,2016,Automatic,29985,Diesel,51.4,2.0,Audi
70977,T-Roc,2018,Manual,25349,Diesel,56.5,2.0,Volkswagen
62538,Octavia,2016,Automatic,52013,Diesel,74.3,1.6,Skoda
20212,5 Series,2016,Automatic,14005,Diesel,62.8,2.0,BMW
23491,Focus,2018,Manual,26904,Diesel,67.3,2.0,Ford
37267,Kuga,2019,Automatic,24277,Diesel,37.2,2.0,Ford
