In [24]:
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

import preprocessing
import data
import metrics
import parameters
import functions

from time import time

import shap
import matplotlib.pyplot as plt

import joblib

import pandas as pd
import numpy as np

In [25]:
# Extract the data used to train the model and pass it straight through the
# training-set cleaning step; `df` is only ever bound to the cleaned result.
df = preprocessing.training_set_preprocessing.training_set_cleaning(
    data.data_extraction.BDD_Promo('csv')
)

In [26]:
# Parameter grid for the grid search.
# Only max_depth, learning_rate and n_estimators hold several values
# (3 x 2 x 3 = 18 candidates); every other entry is a single-value list,
# i.e. a setting held fixed across all candidates.
# NOTE(review): `silent` and `nthread` are documented by XGBoost as deprecated
# aliases of `verbosity` and `n_jobs` — confirm against the installed version.
xgb_grid = dict(
    max_depth=[5, 7, 9],
    learning_rate=[0.1, 0.3],
    n_estimators=[100, 250, 500],
    verbosity=[1],
    silent=[0],
    objective=['reg:squarederror'],
    booster=['gbtree'],
    n_jobs=[-1],
    nthread=[-1],
    gamma=[0],
    reg_alpha=[0],
    reg_lambda=[1],
    importance_type=['gain'],
)

In [27]:
# Build the feature matrix and the target from the cleaned frame.
X, feature_names = preprocessing.training_set_preprocessing.preco_features(df)
y, _ = preprocessing.training_set_preprocessing.preco_target(df)

# Two-stage split: first set the validation part aside, then split what is left
# into train and test. This could later be replaced by a scikit-learn CV scheme.
# NOTE(review): test_size=0.999 sends 99.9% of the rows to validation, leaving
# only 0.1% for train + test — confirm this is intended and not a debug setting.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.999, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.15, random_state=38
)

# Column names used to select the model inputs.
_, features = preprocessing.training_set_preprocessing.preco_features(df)

# DMatrix versions of the train / validation data, each paired with a label
# for the evaluation watchlist.
dtrain = xgb.DMatrix(data=X_train[features], label=y_train)
dvalid = xgb.DMatrix(data=X_valid[features], label=y_valid)
watchlist = list(zip((dtrain, dvalid), ('train', 'eval')))

# Hyper-parameter search over xgb_grid with 3-fold cross-validation.
print("Performing Grid Search on the model")
gbm = xgb.XGBRegressor()
reg = GridSearchCV(
    estimator=gbm,
    param_grid=xgb_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
)
reg.fit(X_train, y_train)


Performing Grid Search on the model
Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:    2.4s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,...
             param_grid={'booster': ['gbtree'], 'gamma': [0],
                         'importance_type': ['gain'],
                         'learning_rate': [0.1, 0.3], 'max_depth': [5, 7, 9],
                         'n_estimators': [100, 250, 500], 'n_jobs': [-1],
                         'nthread': [-1], 'objective': ['reg:squarederror'],
                  

In [None]:
def _print_regression_metrics(title, y_true, y_pred, feature_names):
    """Print RMSPE, R squared and adjusted R squared for one data split.

    Parameters
    ----------
    title : str
        Header line printed before the three metrics.
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions, aligned with ``y_true``.
    feature_names
        Passed unchanged as the first argument of ``metrics.adjusted_r2``.
    """
    rmspe = metrics.rmspe(y_true, y_pred)
    r_squared = r2_score(y_true, y_pred)
    adjusted_r_squared = metrics.adjusted_r2(feature_names, y_true, y_pred)
    print(title)
    print('RMSPE: {:.6f}'.format(rmspe))
    print('R Squarred: {:.6f}'.format(r_squared))
    print('R Squarred (adj): {:.6f}'.format(adjusted_r_squared))


def train_validate_model(df):
    """Train an XGBoost regressor on ``df`` and report its performance.

    The function is interactive: it asks on stdin whether to run a grid search
    first (answer ``'Y'``) and, at the end, whether to save the tuned
    hyperparameters. Any other answer means "no".

    Steps:
      1. Build features and target with ``preprocessing.training_set_preprocessing``.
      2. Split into validation, train and test sets.
      3. Either grid-search over the module-level ``xgb_grid`` and train with the
         best parameters, or train directly with ``parameters.xgb_params``.
      4. Print RMSPE / R squared / adjusted R squared on train, validation and test.
      5. Plot gain-based feature importances.
      6. Optionally persist the tuned parameters via ``functions.save_obj``.

    Parameters
    ----------
    df
        Cleaned training frame accepted by ``preco_features`` / ``preco_target``.
        The target returned by ``preco_target`` must expose a ``Ventes`` column.

    Returns
    -------
    tuple
        ``(gbm, watchlist)``: the object returned by ``xgb.train`` and the list of
        ``(DMatrix, name)`` pairs used for evaluation during training.
    """
    # Features / target
    X, feature_names = preprocessing.training_set_preprocessing.preco_features(df)
    y, _ = preprocessing.training_set_preprocessing.preco_target(df)

    # Split train, validation and testing data.
    # This part could later be replaced by a scikit-learn cross-validation scheme.
    # NOTE(review): test_size=0.999 sends 99.9% of the rows to validation, leaving
    # only 0.1% for train + test — confirm this is intended and not a debug setting.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.999, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.15, random_state=38)

    # Data matrices for native XGBoost training. `feature_names` is reused for
    # column selection (preco_features is assumed deterministic for a given df).
    dtrain = xgb.DMatrix(X_train[feature_names], y_train)
    dvalid = xgb.DMatrix(X_valid[feature_names], y_valid)
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

    htuning = input('Do you want to perform hyperparameter tuning ? (Y/n)')
    tuned = htuning == 'Y'

    if tuned:
        # Model tuning
        print("Performing Grid Search on the model")
        reg = GridSearchCV(xgb.XGBRegressor(), param_grid=xgb_grid, cv=3, verbose=2, n_jobs=-1)
        reg.fit(X_train, y_train)
        # NOTE(review): xgb.train takes its number of rounds from num_boost_round,
        # so the tuned `n_estimators` entry does not control boosting length here.
        booster_params = reg.best_params_
    else:
        # Model training with the stored parameters
        print("Training a XGBoost model")
        booster_params = parameters.xgb_params

    gbm = xgb.train(
        booster_params,
        dtrain,
        num_boost_round=parameters.num_boost_round,
        evals=watchlist,
        early_stopping_rounds=parameters.early_stopping_rounds,
        verbose_eval=50,
    )

    # Performance metrics on each split
    _print_regression_metrics(
        "\nPerformance on  training set",
        y_train.Ventes.values,
        gbm.predict(dtrain),
        feature_names,
    )
    _print_regression_metrics(
        "\nPerformance on Validation set",
        y_valid.Ventes.values,
        gbm.predict(xgb.DMatrix(X_valid[feature_names])),
        feature_names,
    )
    _print_regression_metrics(
        "\nPerformance on test set",
        y_test.Ventes.values,
        gbm.predict(xgb.DMatrix(X_test[feature_names])),
        feature_names,
    )

    # ------------------- Plot feature importances ----------------
    xgb.plot_importance(booster=gbm, show_values=False, importance_type='gain')
    plt.show()

    # ------------------- SHAP analysis on training data (currently disabled) --------------------
    # SHAP_Analysis(gbm, X_train, y_train, feature_names)

    ask_save_params = input('Do you want to save these hyperparameters ? (Y/n)')

    # Tuned parameters only exist when the grid search was actually run.
    if tuned and ask_save_params == 'Y':
        functions.save_obj(reg.best_params_, 'xgb_params')

    return gbm, watchlist

In [None]:
# Run the interactive train / validate / report workflow on `df`;
# the returned (model, watchlist) pair is displayed as the cell output.
train_validate_model(df=df)