#### Model Training

### Librairies

In [57]:

# reload modules before executing user code.
#%reload_ext autoreload
#%autoreload 2

import sys
from pathlib import Path
import dill
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import optuna
import pandas as pd
import plotly.express as px
import pendulum
import seaborn as sns
from loguru import logger
from sklearn import set_config
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (r2_score,
                             mean_squared_error,
                             mean_absolute_percentage_error,
                             max_error,
                            )
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, LabelEncoder
from ydata_profiling import ProfileReport
from yellowbrick.regressor import ResidualsPlot


sys.path.append(str(Path.cwd().parent))
from settings.params import (DATA_DIR_INPUT,
                             DATA_DIR_OUTPUT,
                             MODEL_PARAMS,
                             REPORT_DIR,
                             TIMEZONE,
                            HOME_DIR 
                            )

set_config(display="diagram", print_changed_only=False)
pd.set_option("display.max_columns", None)

In [58]:
HOME_DIR

PosixPath('/home/dosecurity/Desktop/PythonProjects/ProjectFolder/mlops-project-dic3')

### Test and Train ingestion

In [27]:
train = pd.read_csv(str(DATA_DIR_OUTPUT) + '/train.csv')
train_data.describe()

Unnamed: 0,City Group,Type,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P13,P14,P15,P16,P19,P20,P21,P22,P23,P24,P26,P27,P29,P30,P31,P32,P33,P35,P36,P37,Year,Month,Years Old,revenue
count,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0
mean,0.430657,1.430657,4.014599,4.408759,4.317518,4.372263,2.007299,3.357664,5.423358,5.153285,5.445255,5.489051,3.262774,5.080292,1.416058,1.386861,1.941606,4.905109,4.547445,2.270073,2.226277,3.423358,1.372263,1.470803,1.145985,3.135036,2.729927,1.941606,2.525547,1.138686,2.029197,2.211679,1.116788,2008.678832,7.058394,6.321168,4453533.0
std,0.496985,0.511567,2.910391,1.5149,1.032337,1.016462,1.20962,2.134235,2.296809,1.858567,1.834793,1.847561,1.910767,1.036527,2.729583,2.398677,3.505807,5.604467,3.708041,2.05263,1.23069,4.559609,2.304112,2.612024,2.067039,1.680887,5.536647,3.512093,5.230117,1.69854,3.436272,4.168211,1.790768,4.027359,3.590769,4.027359,2576072.0
min,0.0,0.0,1.0,1.0,0.0,3.0,1.0,1.0,1.0,1.0,4.0,4.0,1.0,3.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1996.0,1.0,1.0,1149870.0
25%,0.0,1.0,2.0,4.0,4.0,4.0,1.0,2.0,5.0,4.0,4.0,5.0,2.0,5.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2007.0,4.0,4.0,2999068.0
50%,0.0,1.0,3.0,5.0,4.0,4.0,2.0,3.0,5.0,5.0,5.0,5.0,3.0,5.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2010.0,8.0,5.0,3939804.0
75%,1.0,2.0,4.0,5.0,5.0,5.0,2.0,4.0,5.0,5.0,5.0,5.0,4.0,5.0,2.0,2.0,3.0,5.0,5.0,3.0,3.0,5.0,2.0,2.5,2.0,3.0,4.0,3.0,3.0,2.0,4.0,3.0,2.0,2011.0,10.0,8.0,5166635.0
max,1.0,2.0,12.0,7.5,7.5,7.5,8.0,10.0,10.0,10.0,10.0,10.0,10.0,7.5,15.0,10.0,15.0,25.0,15.0,15.0,5.0,25.0,10.0,12.5,12.5,7.5,25.0,15.0,25.0,6.0,15.0,20.0,8.0,2014.0,12.0,19.0,19696940.0


In [28]:
test_data = pd.read_csv(str(DATA_DIR_OUTPUT) + '/test.csv')
test_data.head()

Unnamed: 0,City Group,Type,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P13,P14,P15,P16,P19,P20,P21,P22,P23,P24,P26,P27,P29,P30,P31,P32,P33,P35,P36,P37,Year,Month,Years Old
0,1,1,1,4.0,4.0,4.0,1,2,5,4,5,5,5,4.0,0,0,0,5,5,3,1,4,0,0.0,0.0,3.0,0,0,0,0,0,0,0,2011,1,4
1,1,2,3,4.0,4.0,4.0,2,2,5,3,4,4,2,5.0,0,0,0,5,5,3,2,1,0,0.0,0.0,3.0,0,0,0,0,0,0,0,2011,3,4
2,0,1,3,4.0,4.0,4.0,2,2,5,4,4,5,4,5.0,0,0,0,5,5,5,5,5,0,0.0,0.0,3.0,0,0,0,0,0,0,0,2013,10,2
3,1,2,2,4.0,4.0,4.0,2,3,5,4,5,4,3,5.0,0,0,0,4,4,3,2,2,0,0.0,0.0,3.0,0,4,0,0,0,0,0,2013,5,2
4,1,1,2,4.0,4.0,4.0,1,2,5,4,5,4,3,4.0,0,0,0,1,5,3,1,1,0,0.0,0.0,3.0,0,0,0,0,0,0,0,2013,7,2


In [30]:
train_data['revenue'].describe()

count    1.370000e+02
mean     4.453533e+06
std      2.576072e+06
min      1.149870e+06
25%      2.999068e+06
50%      3.939804e+06
75%      5.166635e+06
max      1.969694e+07
Name: revenue, dtype: float64

### Splitting data test and data train

In [31]:
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

( len(train_data), len(val_data))

(109, 28)

In [32]:

y_train = train_data['revenue']    
X_train = train_data.drop(['revenue'],axis=1)
y_val = val_data['revenue']
X_val = val_data.drop(['revenue'],axis=1)
X_test = test_data

Normalisation des variables

In [33]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# transform data
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)
X_val_scale = scaler.transform(X_val)


As there is high dimensionality in data we can use Principal Component Analysis(PCA). It helps in reducing the dimensions of data and focus on the ones with the largest variance components.


In [34]:
from sklearn.decomposition import PCA
#https://www.geeksforgeeks.org/principal-component-analysis-with-python/

pca = PCA(n_components = 5)
X_train_PCA = pca.fit_transform(X_train) 
X_test_PCA = pca.transform(X_test) 

### Feature Engineering Pipeline

#### model fitting

In [35]:
def predict_model(model,X,y_act):
    y_pred = model.predict(X)
    rms = mean_squared_error(y_act, y_pred,squared=False)
    return rms

In [47]:
def testDataPred(model,X):
  y_test = model.predict(X)
  dataFrame = pd.DataFrame({'Id': test['Id'], 'Prediction': y_test}) 
  return dataFrame

### Linear avec le Lasso Regression

#### linear model avec le train set

In [37]:
from sklearn import linear_model
model_LR1 = linear_model.LassoCV(max_iter=10000,alphas=(0.0001,0.01,0.1,1),n_alphas=300,cv=5)
model_LR1.fit(X_train_scale,y_train)
LR_mse1 = predict_model(model_LR1,X_train_scale,y_train)
print('Train MSE score :',LR_mse1)

Train MSE score : 1813352.129708278


#### linear model avec le val set

In [40]:
model_LR1 = linear_model.LassoCV(max_iter=10000,alphas=(0.0001,0.01,0.1,1),n_alphas=300,cv=5)
model_LR1.fit(X_val_scale,y_val)
LR_mse_val = predict_model(model_LR1,X_val_scale,y_val)
print('Train MSE score :',LR_mse_val)

Train MSE score : 43922.64942492647


#### linear model avec l'analyse en composante princiapale

In [41]:
model_LR2 = linear_model.LassoCV(max_iter=100000,alphas=(0.0001,0.01,0.1,1),n_alphas=1000,cv=5)
model_LR2.fit(X_train_PCA,y_train)
LR_rmse_PCA = predict_model(model_LR2,X_train_PCA,y_train)
print('Train RMSE score :',LR_rmse_PCA)

Train RMSE score : 2166023.1916953875


### Random Forest

#### Random Forest  avec les données normalisés

In [42]:
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV

params = {
    "n_estimators": randint(10,1000),
    "max_depth": randint(1,10),
    "min_samples_split": uniform(0.1,0.8),
    'max_features':['auto', 'sqrt', 'log2']
}

RF_model = RandomForestRegressor()
model_rf1 = RandomizedSearchCV(RF_model, params, cv=6, n_iter=100, scoring='neg_mean_squared_error', return_train_score=True, verbose=3, n_jobs=-1)
model_rf1.fit(X_train_scale, y_train)
RF_rmse1 = predict_model(model_rf1, X_train_scale, y_train)
print('Train RMSE score :',RF_rmse1)

Fitting 6 folds for each of 100 candidates, totalling 600 fits
[CV 4/6] END max_depth=1, max_features=sqrt, min_samples_split=0.58646375180942, n_estimators=130;, score=(train=-4680747940027.475, test=-3691055049924.634) total time=   0.2s
[CV 6/6] END max_depth=5, max_features=log2, min_samples_split=0.4701338374992735, n_estimators=971;, score=(train=-3558522282463.330, test=-8055841723250.839) total time=   1.6s
[CV 3/6] END max_depth=3, max_features=log2, min_samples_split=0.17747635871100506, n_estimators=343;, score=(train=-4028018492795.938, test=-2255146380144.991) total time=   0.6s
[CV 1/6] END max_depth=7, max_features=log2, min_samples_split=0.5632360173495173, n_estimators=785;, score=(train=-4761172940337.626, test=-2495111415261.723) total time=   1.3s
[CV 4/6] END max_depth=2, max_features=sqrt, min_samples_split=0.39210521638520035, n_estimators=897;, score=(train=-4337333202418.617, test=-3738796062232.966) total time=   1.7s
[CV 1/6] END max_depth=8, max_features=log

210 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
127 fits failed with the following error:
Traceback (most recent call last):
  File "/home/dosecurity/Desktop/notebooks/notebook_python3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/dosecurity/Desktop/notebooks/notebook_python3.10/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/home/dosecurity/Desktop/notebooks/notebook_python3.10/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/home/dosecurity/Deskto

Train RMSE score : 1984332.333300725


Train RMSE score : 1574549.756777705


174 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "/home/dosecurity/Desktop/notebooks/notebookEnv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/dosecurity/Desktop/notebooks/notebookEnv/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/home/dosecurity/Desktop/notebooks/notebookEnv/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/home/dosecurity/Desktop/notebooks/notebookEnv/

#### Random Forest avec les données de validation

In [53]:
model_rf1.fit(X_train_scale, y_train)
RF_rmse_val = predict_model(model_rf1, X_val_scale, y_val)
print('Train RMSE score :',RF_rmse_val)

Fitting 6 folds for each of 100 candidates, totalling 600 fits
[CV 2/2] END colsample_bytree=0.1507683216206429, learning_rate=0.7467638493177157, max_depth=3, n_estimators=426, reg_alpha=1, reg_lambda=0.01;, score=(train=-0.206, test=-8193866792987.103) total time=   0.1s
[CV 1/2] END colsample_bytree=0.7846810561439995, learning_rate=0.09061115855413915, max_depth=3, n_estimators=453, reg_alpha=0.1, reg_lambda=0.0001;, score=(train=-4035868.528, test=-6203948899451.883) total time=   0.1s
[CV 2/2] END colsample_bytree=0.7846810561439995, learning_rate=0.09061115855413915, max_depth=3, n_estimators=453, reg_alpha=0.1, reg_lambda=0.0001;, score=(train=-305118.693, test=-8008812338530.051) total time=   0.1s
[CV 1/2] END colsample_bytree=0.630246332514264, learning_rate=0.27821018312313206, max_depth=2, n_estimators=488, reg_alpha=0.1, reg_lambda=1;, score=(train=-27427177.852, test=-5579658948550.213) total time=   0.1s
[CV 2/2] END colsample_bytree=0.630246332514264, learning_rate=0.2

234 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
222 fits failed with the following error:
Traceback (most recent call last):
  File "/home/dosecurity/Desktop/notebooks/notebook_python3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/dosecurity/Desktop/notebooks/notebook_python3.10/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/home/dosecurity/Desktop/notebooks/notebook_python3.10/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/home/dosecurity/Deskto

Train RMSE score : 3415142.9799212944


#### Random Forest Avec les données transformés avec le PCA

In [44]:
model_rf2 = RandomizedSearchCV(RF_model, params, cv=2, n_iter=100, scoring='neg_mean_squared_error', return_train_score=True, verbose=3, n_jobs=-1)
model_rf2.fit(X_train_PCA, y_train)
RF_rmse_PCA = predict_model(model_rf2, X_train_PCA, y_train)
print('Train RMSE score :',RF_rmse_PCA)

Fitting 2 folds for each of 100 candidates, totalling 200 fits
[CV 5/6] END max_depth=8, max_features=auto, min_samples_split=0.695211463834761, n_estimators=45;, score=(train=nan, test=nan) total time=   0.0s
[CV 6/6] END max_depth=8, max_features=auto, min_samples_split=0.695211463834761, n_estimators=45;, score=(train=nan, test=nan) total time=   0.0s
[CV 1/6] END max_depth=8, max_features=sqrt, min_samples_split=0.3004682417015181, n_estimators=601;, score=(train=-4122629696927.730, test=-2526748556848.209) total time=   1.5s
[CV 1/6] END max_depth=8, max_features=sqrt, min_samples_split=0.11772210237973964, n_estimators=288;, score=(train=-3193326961599.251, test=-2597415479282.350) total time=   0.8s
[CV 6/6] END max_depth=8, max_features=sqrt, min_samples_split=0.11772210237973964, n_estimators=288;, score=(train=-2245297739002.042, test=-7458964468565.710) total time=   0.7s
[CV 2/6] END max_depth=4, max_features=sqrt, min_samples_split=0.12792900461987938, n_estimators=287;, s

86 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/dosecurity/Desktop/notebooks/notebook_python3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/dosecurity/Desktop/notebooks/notebook_python3.10/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/home/dosecurity/Desktop/notebooks/notebook_python3.10/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/home/dosecurity/Desktop/

[CV 1/2] END max_depth=3, max_features=auto, min_samples_split=0.5844016177062263, n_estimators=981;, score=(train=nan, test=nan) total time=   0.0s
[CV 2/2] END max_depth=3, max_features=auto, min_samples_split=0.5844016177062263, n_estimators=981;, score=(train=nan, test=nan) total time=   0.0s
[CV 1/2] END max_depth=7, max_features=log2, min_samples_split=0.4403517996085353, n_estimators=370;, score=(train=-2605510091131.716, test=-3117076975518.376) total time=   0.5s
[CV 2/2] END max_depth=7, max_features=log2, min_samples_split=0.4403517996085353, n_estimators=370;, score=(train=-2221358820667.545, test=-3215582623051.255) total time=   0.5s
[CV 1/2] END max_depth=9, max_features=sqrt, min_samples_split=0.5461470797335548, n_estimators=471;, score=(train=-2720754054992.526, test=-3134919612305.695) total time=   0.7s
[CV 1/2] END max_depth=1, max_features=auto, min_samples_split=0.53505229843905, n_estimators=75;, score=(train=nan, test=nan) total time=   0.0s
[CV 2/2] END max_de

[CV 5/6] END max_depth=2, max_features=log2, min_samples_split=0.1946679227744438, n_estimators=515;, score=(train=-2638465293657.306, test=-3194539264251.246) total time=   1.1s
[CV 6/6] END max_depth=2, max_features=log2, min_samples_split=0.1946679227744438, n_estimators=515;, score=(train=-2287854146585.569, test=-4785881190529.443) total time=   1.1s
[CV 5/6] END max_depth=7, max_features=sqrt, min_samples_split=0.5310839757313688, n_estimators=321;, score=(train=-2749381563991.745, test=-3211081343006.219) total time=   0.5s
[CV 6/6] END max_depth=7, max_features=sqrt, min_samples_split=0.5310839757313688, n_estimators=321;, score=(train=-2507390925987.331, test=-4688368380052.326) total time=   0.5s
[CV 2/6] END max_depth=1, max_features=sqrt, min_samples_split=0.6575765931959449, n_estimators=684;, score=(train=-3293195088259.891, test=-2310646405930.267) total time=   1.1s
[CV 4/6] END max_depth=1, max_features=sqrt, min_samples_split=0.6575765931959449, n_estimators=684;, sco

[CV 2/2] END max_depth=1, max_features=auto, min_samples_split=0.5046281731915508, n_estimators=349;, score=(train=nan, test=nan) total time=   0.0s
[CV 1/2] END max_depth=6, max_features=auto, min_samples_split=0.49192027327094456, n_estimators=311;, score=(train=nan, test=nan) total time=   0.0s
[CV 2/2] END max_depth=6, max_features=auto, min_samples_split=0.49192027327094456, n_estimators=311;, score=(train=nan, test=nan) total time=   0.0s
[CV 1/2] END max_depth=3, max_features=log2, min_samples_split=0.7618380834920387, n_estimators=11;, score=(train=-3235121321914.016, test=-3225947496379.226) total time=   0.0s
[CV 2/2] END max_depth=3, max_features=log2, min_samples_split=0.7618380834920387, n_estimators=11;, score=(train=-3186864135720.639, test=-3266429943444.203) total time=   0.0s
[CV 1/2] END max_depth=1, max_features=log2, min_samples_split=0.1082136888086569, n_estimators=656;, score=(train=-2825833970947.045, test=-3145440476020.299) total time=   1.0s
[CV 1/2] END max

### XGBOOST

In [46]:
!pip install xgboost

[CV 2/2] END max_depth=6, max_features=auto, min_samples_split=0.5021877512812062, n_estimators=203;, score=(train=nan, test=nan) total time=   0.0s
[CV 1/2] END max_depth=8, max_features=auto, min_samples_split=0.6896433450835935, n_estimators=757;, score=(train=nan, test=nan) total time=   0.0s
[CV 2/2] END max_depth=8, max_features=auto, min_samples_split=0.6896433450835935, n_estimators=757;, score=(train=nan, test=nan) total time=   0.0s
[CV 1/2] END max_depth=2, max_features=log2, min_samples_split=0.6855660074771801, n_estimators=299;, score=(train=-7627113943124.164, test=-2557444368418.425) total time=   0.5s
[CV 2/2] END max_depth=2, max_features=log2, min_samples_split=0.6855660074771801, n_estimators=299;, score=(train=-2476735121243.913, test=-7632738312297.536) total time=   0.5s
[CV 1/2] END max_depth=6, max_features=sqrt, min_samples_split=0.3648761339215313, n_estimators=499;, score=(train=-4765710535274.133, test=-2853104938534.225) total time=   1.0s
[CV 2/2] END max

#### XGBOOST avec les données normalisés : data train

In [51]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
import xgboost as xgb
params = {
    "learning_rate": uniform(0.001,1),
    "n_estimators": randint(100,1000),
    "max_depth": randint(1,10),     
    "colsample_bytree": uniform(0.1,0.8),
    "reg_alpha": [0.0001,0.001,0.01,0.1,1,10],
    "reg_lambda": [0.0001,0.001,0.01,0.1,1,10]
}
xgb_model = xgb.XGBRegressor()
rand_xgb = RandomizedSearchCV(xgb_model, params, cv=2, n_iter=100, scoring='neg_mean_squared_error', return_train_score=True, verbose=3, n_jobs=-1)
rand_xgb.fit(X_train_scale, y_train)
XG_rmse1 = predict_model(rand_xgb, X_train_scale, y_train)
print('Train RMSE score :',XG_rmse1)

Fitting 2 folds for each of 100 candidates, totalling 200 fits
[CV 2/2] END colsample_bytree=0.8144556082806934, learning_rate=0.4549085720849121, max_depth=5, n_estimators=151, reg_alpha=0.0001, reg_lambda=0.1;, score=(train=-0.141, test=-18508661731373.750) total time=   0.0s
[CV 1/2] END colsample_bytree=0.6082163781408114, learning_rate=0.5758565199563193, max_depth=8, n_estimators=658, reg_alpha=0.1, reg_lambda=10;, score=(train=-21.040, test=-29657987127240.969) total time=   0.2s
[CV 2/2] END colsample_bytree=0.6082163781408114, learning_rate=0.5758565199563193, max_depth=8, n_estimators=658, reg_alpha=0.1, reg_lambda=10;, score=(train=-6.224, test=-14430637614692.732) total time=   0.2s
[CV 1/2] END colsample_bytree=0.2367958397464176, learning_rate=0.32988014690679546, max_depth=2, n_estimators=782, reg_alpha=0.01, reg_lambda=0.01;, score=(train=-0.787, test=-18190984186208.895) total time=   0.1s
[CV 2/2] END colsample_bytree=0.2367958397464176, learning_rate=0.32988014690679

#### XGBOOST avec les données normalisés : data validation

In [52]:
rand_xgb.fit(X_val_scale, y_val)
XG_rmse_val = predict_model(rand_xgb, X_val_scale, y_val)
print('Train RMSE score :',XG_rmse_val)

Fitting 2 folds for each of 100 candidates, totalling 200 fits
[CV 2/2] END colsample_bytree=0.8236330580275067, learning_rate=0.30218960431155373, max_depth=5, n_estimators=352, reg_alpha=1, reg_lambda=0.001;, score=(train=-0.346, test=-15895223016136.178) total time=   0.1s
[CV 1/2] END colsample_bytree=0.8068740339435363, learning_rate=0.3671996395330991, max_depth=3, n_estimators=528, reg_alpha=10, reg_lambda=0.0001;, score=(train=-14.952, test=-37473829229147.273) total time=   0.1s
[CV 2/2] END colsample_bytree=0.8068740339435363, learning_rate=0.3671996395330991, max_depth=3, n_estimators=528, reg_alpha=10, reg_lambda=0.0001;, score=(train=-16.626, test=-15925237506158.879) total time=   0.1s
[CV 1/2] END colsample_bytree=0.7541084186710867, learning_rate=0.45093937441389687, max_depth=3, n_estimators=859, reg_alpha=0.001, reg_lambda=0.0001;, score=(train=-0.194, test=-35714683981586.516) total time=   0.3s
[CV 2/2] END colsample_bytree=0.7541084186710867, learning_rate=0.450939

In [55]:
print('Linear Regression Train MSE score :',LR_mse1)
print('Linear Regression with validation Train MSE score :',LR_mse_val)
print('Linear Regression with PCA Train RMSE score :',LR_rmse_PCA)
print('Random Forest Train RMSE score :',RF_rmse1)
print('Random Forest with valdiation Train RMSE score :',RF_rmse_val)
print('Random Forest Train RMSE score :',RF_rmse_PCA)
print('XGBoost Train RMSE score :',XG_rmse1)
print('XGBoost Train RMSE score :',XG_rmse_val)

Linear Regression Train MSE score : 1813352.129708278
Linear Regression with validation Train MSE score : 43922.64942492647
Linear Regression with PCA Train RMSE score : 2166023.1916953875
Random Forest Train RMSE score : 3411777.095141441
Random Forest with valdiation Train RMSE score : 3415142.9799212944
Random Forest Train RMSE score : 1992971.44700496
XGBoost Train RMSE score : 1478749.4143190868
XGBoost Train RMSE score : 1311307.4224615647


En se basant sur le score des **Root Mean Squared Error**, nous pourrons dire que le meilleur modele est le XGBoost pour prédire nos données

### Engeristrement du model

In [59]:
import pickle
pickle_out = open(str(HOME_DIR) + "/models/bestmodel.pkl","wb")
pickle.dump(rand_xgb,pickle_out)
pickle_out.close()