# XGBRegressor Pipeline

In [1]:
# Import Dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import os
import pickle
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split, cross_val_score

The per-brand CSV files (one file per make, each covering several models) are concatenated into a single DataFrame, with a `make` column recording the brand.

In [2]:
# Read every brand CSV, tag its rows with the brand name, and concatenate
# everything into a single DataFrame.

path = r'D:\Data_Analytics\Project 3_Branch\UsedCarsValuePredictionML\Data'

# glob.glob() makes no ordering guarantee, yet the files are paired with
# `brands` purely by position. Sort (case-insensitively) so the pairing is
# deterministic on every OS.
# NOTE(review): assumes the CSV file names sort in the same order as `brands`
# (alphabetical by make) - confirm against the Data folder.
all_files = sorted(glob.glob(path + "/*.csv"), key=str.casefold)

brands = ["Audi","BMW","Ford","Hyundi","Mercedes Benz","Skoda","Toyota","Volkswagen"]

# zip() silently stops at the shorter input, so a missing or extra CSV would
# drop data or shift brand labels without any error. Fail loudly instead.
if len(all_files) != len(brands):
    raise ValueError(
        "Expected %d CSV files in %r but found %d"
        % (len(brands), path, len(all_files))
    )

li = []
for filename, brand in zip(all_files, brands):
    df = pd.read_csv(filename, index_col=None, header=0)
    df["make"] = brand  # record which brand file the rows came from
    li.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index
frame = pd.concat(li, axis=0, ignore_index=True)
frame


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,make
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4,Audi
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0,Audi
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4,Audi
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0,Audi
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0,Audi
...,...,...,...,...,...,...,...,...,...,...
85550,Eos,2012,5990,Manual,74000,Diesel,125,58.9,2.0,Volkswagen
85551,Fox,2008,1799,Manual,88102,Petrol,145,46.3,1.2,Volkswagen
85552,Fox,2009,1590,Manual,70000,Petrol,200,42.0,1.4,Volkswagen
85553,Fox,2006,1250,Manual,82704,Petrol,150,46.3,1.2,Volkswagen


In [3]:
# Discard the "tax" column; it is not used as a model feature
frame = frame.drop(columns=["tax"])

In [4]:
# Keep every row except those whose year is 2060
# (presumably a data-entry error - confirm against the raw data)
valid_year_mask = frame['year'].ne(2060)
frame_filtered = frame[valid_year_mask]
frame_filtered

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,make
0,A1,2017,12500,Manual,15735,Petrol,55.4,1.4,Audi
1,A6,2016,16500,Automatic,36203,Diesel,64.2,2.0,Audi
2,A1,2016,11000,Manual,29946,Petrol,55.4,1.4,Audi
3,A4,2017,16800,Automatic,25952,Diesel,67.3,2.0,Audi
4,A3,2019,17300,Manual,1998,Petrol,49.6,1.0,Audi
...,...,...,...,...,...,...,...,...,...
85550,Eos,2012,5990,Manual,74000,Diesel,58.9,2.0,Volkswagen
85551,Fox,2008,1799,Manual,88102,Petrol,46.3,1.2,Volkswagen
85552,Fox,2009,1590,Manual,70000,Petrol,42.0,1.4,Volkswagen
85553,Fox,2006,1250,Manual,82704,Petrol,46.3,1.2,Volkswagen


In [5]:
# Row labels flagged as outliers.
# NOTE(review): these are labels of the concatenated frame's index, so they
# only point at the intended rows while the CSV load order stays the same.
index_list = [9434,10109,7221,7845,17753,14988,14306,33361,22488,43661,44279,62386]

# Preview the flagged rows before they are removed
outlier_mask = frame_filtered.index.isin(index_list)
frame_filtered.loc[outlier_mask]


Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,make
7221,A5,2020,59995,Semi-Auto,2000,Diesel,40.9,3.0,Audi
7845,A6,2018,59950,Automatic,22000,Petrol,29.4,4.0,Audi
9434,A8,2020,78990,Automatic,250,Diesel,39.2,3.0,Audi
10109,S3,2003,4990,Manual,106000,Petrol,39.8,1.8,Audi
14306,2 Series,2015,123456,Semi-Auto,33419,Diesel,68.9,2.0,BMW
14988,5 Series,2020,54845,Semi-Auto,450,Diesel,60.1,3.0,BMW
17753,3 Series,2020,71990,Semi-Auto,150,Diesel,47.1,3.0,BMW
22488,Focus,2017,38015,Manual,197,Diesel,74.3,1.5,Ford
33361,Focus,2018,54995,Manual,11000,Petrol,36.7,2.3,Ford
43661,I10,2017,92000,Automatic,35460,Petrol,47.9,1.2,Hyundi


In [6]:
# Remove the outlier rows listed in index_list.
# A boolean mask is used instead of DataFrame.drop(index_list): drop() raises
# KeyError once the labels are gone, so re-running this cell would fail.
# The mask form yields the same result and is safe to execute repeatedly.
frame_filtered = frame_filtered.loc[~frame_filtered.index.isin(index_list)]

In [7]:
# Display the frame after outlier removal
frame_filtered

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,make
0,A1,2017,12500,Manual,15735,Petrol,55.4,1.4,Audi
1,A6,2016,16500,Automatic,36203,Diesel,64.2,2.0,Audi
2,A1,2016,11000,Manual,29946,Petrol,55.4,1.4,Audi
3,A4,2017,16800,Automatic,25952,Diesel,67.3,2.0,Audi
4,A3,2019,17300,Manual,1998,Petrol,49.6,1.0,Audi
...,...,...,...,...,...,...,...,...,...
85550,Eos,2012,5990,Manual,74000,Diesel,58.9,2.0,Volkswagen
85551,Fox,2008,1799,Manual,88102,Petrol,46.3,1.2,Volkswagen
85552,Fox,2009,1590,Manual,70000,Petrol,42.0,1.4,Volkswagen
85553,Fox,2006,1250,Manual,82704,Petrol,46.3,1.2,Volkswagen


# Pipeline

Categorical and numeric features are preprocessed by separate pipelines, which are then combined with a `ColumnTransformer`.

In [8]:
# Numeric columns used as model inputs
num_features = ['year','mileage','mpg','engineSize']

# Numeric preprocessing: constant-value imputation followed by standardisation
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('scaler', StandardScaler()),
])

In [9]:
# Categorical columns used as model inputs
cat_features = ['model','transmission','fuelType','make']

# Categorical preprocessing: constant-value imputation, then one-hot encoding.
# handle_unknown='ignore' lets categories unseen during fit pass through
# transform without raising.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [10]:
# Route each column group through its own pipeline and join the results
column_pipelines = [
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

# Adriana's Step

In [11]:
# transformer = Pipeline(steps=[('preprocessor', preprocessor)])

In [12]:
# Define X, y

# X = frame_filtered.drop('price', axis = 1)
# y = frame_filtered['price']

In [13]:
# transformed_axis = transformer.fit(X,y)
# transformed_axis

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                           

In [14]:
# pkl_filename = "axis_transformed_v3.pkl"
# with open(pkl_filename, 'wb') as file:
#     pickle.dump(transformed_axis, file)

# Continue ...

In [None]:
# Define the regression model: an XGBoost regressor with default hyperparameters
regressor_model = XGBRegressor()

In [None]:
# Full modelling pipeline: run "preprocessor" first, then fit the XGBoost regressor
regression = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', regressor_model),
])

In [None]:
# Split the cleaned frame into the target (price) and the predictor columns
y = frame_filtered['price']
X = frame_filtered.drop(columns=['price'])

In [None]:
# Hold out a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [None]:
# Display the training features
X_train

In [None]:
# List the names of every parameter exposed by the pipeline; the
# "regressor__..." keys used in the search grid must match names shown here

regression.get_params().keys()

In [None]:
# Candidate numbers of boosting rounds: 5 evenly spaced values from 20 to 40
n_estimators = np.linspace(20, 40, num=5).astype(int).tolist()

# Candidate tree depths: 10 evenly spaced values from 2 to 18 (truncated to
# integers), plus None so the regressor's own default depth is also tried
max_depth = np.linspace(2, 18, num=10).astype(int).tolist() + [None]

# Search space; the "regressor__" prefix targets the 'regressor' pipeline step
gbm_param_grid = {
    'regressor__colsample_bytree': [0.3, 0.7],
    'regressor__n_estimators': n_estimators,
    'regressor__max_depth': max_depth,
}


In [None]:
# Exhaustive grid search over gbm_param_grid using 4-fold cross-validation.
# No `scoring` is given, so candidates are ranked by the estimator's own score.
search = GridSearchCV(
    estimator=regression,
    param_grid=gbm_param_grid,
    cv=4,
    verbose=3,
)

In [None]:
# Run the grid search on the training split only; the test split stays unseen
search.fit(X_train, y_train)

In [None]:
# Print the best parameters and the best cross-validated score.
# GridSearchCV was created without `scoring`, so best_score_ is the
# estimator's default score (R^2 for a regressor), not a (negative) MSE.
# Taking sqrt(abs(...)) of it does not produce an RMSE, so the raw value is
# reported under an accurate label instead.
print("Best parameters found: ", search.best_params_)
print("Best CV score (R^2) found: ", search.best_score_)

In [None]:
# Score the refitted best pipeline on the held-out test split.
# For a regressor this default score is R^2, not a classification accuracy,
# so the label says so.
print('Test R^2: %.3f' % search.score(X_test, y_test))

In [None]:
# best_score_ is the mean cross-validation score of the best candidate,
# computed on the training folds; it is not a test-set metric
print('Best CV R^2: %.3f' % search.best_score_)

In [None]:
# Show the hyperparameter combination selected by the grid search
search.best_params_

# Saving the Model

In [None]:
# Serialise the fitted grid-search object to disk with pickle
pkl_filename = "regression_XGB_model.pkl"
with open(pkl_filename, mode='wb') as model_file:
    pickle.dump(search, model_file)

# Real Predictions

In [None]:
# One sample car to price; the order follows the column list used when the
# prediction DataFrame is built
values = [
    'Yaris',      # model
    2015,         # year
    'Automatic',  # transmission
    36021,        # mileage
    'Hybrid',     # fuelType
    78,           # mpg
    1.5,          # engineSize
    'Toyota',     # make
]

In [None]:
# Wrap the sample as a single row.
# The plain Python list is kept on purpose: np.array(values) on this mixed
# str/int/float list would coerce every element to a string, so the numeric
# features (year, mileage, mpg, engineSize) would reach the model as text.
features = [values]

In [None]:
# Build a one-row DataFrame with the same column names the pipeline was
# trained on
df = pd.DataFrame(features,columns=['model','year','transmission','mileage','fuelType','mpg','engineSize','make'])

# Enforce numeric dtypes on the numeric features. If `features` holds strings
# (e.g. because it was built from a NumPy array of mixed values), these
# columns would otherwise be object dtype rather than numbers.
df = df.astype({
    'year': 'int64',
    'mileage': 'int64',
    'mpg': 'float64',
    'engineSize': 'float64',
})
df

In [None]:
# Reload the pickled grid-search object saved earlier.
# NOTE: pickle.load can execute arbitrary code - only load files you created.
pkl_filename = "regression_XGB_model.pkl"
with open(pkl_filename, mode='rb') as model_file:
    model_linear = pickle.load(model_file)

In [None]:
# Predict the price for the sample row using the reloaded object
# (despite its name, model_linear holds whatever was unpickled above)
model_linear.predict(df)