### Importing librairies

In [227]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE

import pickle

### Importing the dataset

In [3]:
complet_df = pd.read_csv("complet_df.csv")

In [4]:
# time_slot -> int, car les catégorie n'ont pas d'ordre entre elles (ex: pays)
time_slot_dict = {time_slot:i for i,time_slot in enumerate(complet_df["time_slot"].unique())}
complet_df["time_slot"] = complet_df["time_slot"].apply(lambda x : time_slot_dict[x])

### Train test split

In [5]:
X = complet_df.drop(columns=["ongoing_incident"])
y = complet_df["ongoing_incident"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=123)

### XGBoost classifier

In [132]:
# fit model no training data
model2 = xgb.XGBClassifier(random_state=123, enable_categorical = True, tree_method="hist")
model2.fit(X_train, y_train)

In [133]:
y_pred = model2.predict_proba(X_validation)[:,1] #ATTENTION !!!!!!!

In [134]:
roc_auc_score(y_validation, y_pred)

0.6784851242014226

In [135]:
y_pred_train = model2.predict_proba(X_train)[:,1] #ATTENTION !!!!!!!

In [136]:
roc_auc_score(y_train, y_pred_train) #Model overfit fortement, il faut le simplifier

0.9987449230162071

#### Gird search on XGBoost model to simplify model


In [168]:
model = xgb.XGBClassifier(random_state=123)
max_depth = [4, 6, 7, 8]
eta = [0.9, 0.1, 0.12]
gamma = [5, 6,7]
param_grid = dict(eta = eta, gamma = gamma,max_depth=max_depth)
grid_search1 = GridSearchCV(model, param_grid, scoring="roc_auc", n_jobs=-1, verbose=1, cv=3, return_train_score=True)

In [169]:
grid_result1 = grid_search1.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [170]:
grid_result1.best_params_

{'eta': 0.1, 'gamma': 6, 'max_depth': 6}

In [171]:
grid_result1.best_score_

0.674841473645443

In [173]:
df = pd.DataFrame(grid_result1.cv_results_)
df.sort_values(by=["mean_test_score"], ascending=False).head(1).mean_train_score
# df[df["param_max_depth"] == 6][df['param_eta'] == 1][df['param_min_samples_split'] == 3]

17    0.839499
Name: mean_train_score, dtype: float64

#### Grid search 2

In [13]:
model = xgb.XGBClassifier(random_state=123)
max_depth = [3, 4, 5]
eta = [0.08, 0.1, 0.15]
gamma = [3]
param_grid = dict(eta = eta, gamma = gamma,max_depth=max_depth)

grid_search2 = GridSearchCV(model, param_grid, scoring="roc_auc", n_jobs=-1, verbose=1)

In [14]:
grid_result2 = grid_search2.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [15]:
grid_result2.best_score_

0.6744970493377279

In [40]:
grid_result2.best_params_

{'eta': 0.08, 'gamma': 3, 'max_depth': 4, 'n_estimators': 300}

In [49]:
grid_result2.best_score_

0.6790422607164192

#### Grid search 3

In [44]:
model = xgb.XGBClassifier(random_state=123)
n_estimators = [10,50,200,300]
param_grid = dict(n_estimators=n_estimators)

grid_search3 = GridSearchCV(model, param_grid, scoring="roc_auc", n_jobs=-1, verbose=1)

In [45]:
grid_result3 = grid_search3.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [46]:
grid_result3.best_score_

0.6633066254542526

In [47]:
grid_result3.best_params_

{'n_estimators': 200}

In [50]:
grid_result3.best_score_

0.6633066254542526

overfit de fou, random forest avec nb arbe faible arbres et max_depth faible, voir logistique regression, decision tree simple

analyser corrélation entre les variables (corr)
analysr corrélations entre les variables et la target (select kbest)
si overfit model -> marche sur train mais généralise pas -> simplifier le modèle (changer model ou hyper paramètres)


### Logistic regression

Ne pas oublier le scaler !!!

In [66]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [68]:
# Grid search on logistic regression

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=123, max_iter=1000)
penalty = [None, 'l1', 'l2']
solver = ['lbfgs','liblinear', 'saga']
C = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = dict(C=C, penalty=penalty, solver=solver)

grid_search4 = GridSearchCV(model, param_grid, scoring="roc_auc", n_jobs=-1, verbose=1, return_train_score=True)

In [69]:
grid_result4 = grid_search4.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


60 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/dogmael/anaconda3/envs/bootcamp/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/dogmael/anaconda3/envs/bootcamp/lib/python3.10/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/dogmael/anaconda3/envs/bootcamp/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1227, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "/Users/dogmael/anacond

In [95]:
print(grid_result4.best_score_)
print(grid_result4.best_params_)
# pd.DataFrame(grid_result4.cv_results_[grid_result4.cv_results_["param_C" == 0.01 and "param_penalty" == 'l1' and "param_solver" == 'liblinear']
# 
df = pd.DataFrame(grid_result4.cv_results_)

df[df["param_C"] == 0.01][df["param_penalty"] == 'l1'][df["param_solver"] == 'liblinear']

0.6144212397561495
{'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}


  df[df["param_C"] == 0.01][df["param_penalty"] == 'l1'][df["param_solver"] == 'liblinear']
  df[df["param_C"] == 0.01][df["param_penalty"] == 'l1'][df["param_solver"] == 'liblinear']


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,param_solver,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
13,0.050839,0.006173,0.005866,0.000124,0.01,l1,liblinear,"{'C': 0.01, 'penalty': 'l1', 'solver': 'liblin...",0.606968,0.588574,...,0.597519,0.007367,39,0.596936,0.605929,0.602405,0.604821,0.602665,0.602551,0.003102


In [99]:
sorted(zip(grid_result4.best_estimator_.coef_[0], X_train.columns))

[(-0.13223375965423642, 'weekday_number'),
 (-0.11050832793333883, 'pressure'),
 (-0.09100955413755292, 'dew_point'),
 (-0.07706968529516922, 'holiday_departure_zone_B'),
 (-0.07047139260017717, 'holiday_day_zone_C'),
 (-0.060511502197269175, 'last_day_holidays_zone_C'),
 (-0.0574646764556341, 'holiday_day_zone_A'),
 (-0.022246386719048526, 'humidity'),
 (-0.011932345198162077, 'clouds'),
 (-0.0023321388924338036, 'holiday_departure_zone_A'),
 (-0.0015480544638712028, 'holiday_day_zone_B'),
 (0.0, 'accumulated'),
 (0.0, 'convective'),
 (0.0, 'first_day_holidays_zone_A'),
 (0.0, 'first_day_holidays_zone_B'),
 (0.0, 'ground_pressure'),
 (0.0, 'ice'),
 (0.0, 'month_number'),
 (0.0, 'public_holiday'),
 (0.0, 'rate'),
 (0.0, 'snow_depth'),
 (0.0, 'temperature'),
 (0.0015495647649374643, 'last_day_holidays_zone_A'),
 (0.0025214196331707767, 'holiday_departure_zone_C'),
 (0.007444524138430472, 'wind_speed'),
 (0.011459307063099772, 'first_day_holidays_zone_C'),
 (0.015095138537976012, 'last_d

In [None]:
grid_result4 = grid_search4.fit(X_train_scaled, y_train)

In [None]:
print(grid_result4.best_score_)
print(grid_result4.best_params_)
print(grid_result4.best_params_)

### Random forest

In [203]:
# Grid search on random forest

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=123, min_samples_split=2,min_samples_leaf=1)
n_estimators = [500,2000]
max_depth = [30,40,50]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

grid_search5 = GridSearchCV(model, param_grid, scoring="roc_auc", n_jobs=12, verbose=1, return_train_score=True)

grid_result5 = grid_search5.fit(X_train, y_train)

print(grid_result5.best_score_)
print(grid_result5.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
0.6943449511733075
{'max_depth': 50, 'n_estimators': 4000}


In [200]:
df = pd.DataFrame(grid_search5.cv_results_)
df.sort_values(by=["mean_test_score"], ascending=False).head(1).mean_train_score

5    0.680155
Name: mean_train_score, dtype: float64

In [212]:
model = RandomForestClassifier(random_state=123, min_samples_split=2,min_samples_leaf=1, n_estimators=500, max_depth=40)
model.fit(X_train, y_train)

In [213]:
y_pred = model.predict_proba(X_validation)[:,1]
roc_auc_score(y_validation, y_pred)

0.7031233796092474

In [None]:
SVC, decision tree

### Features selection

In [141]:
# Recurcive feature elimination
model = xgb.XGBClassifier(random_state=123,max_depth=4, eta=0.1, gamma=4)
rfe = RFE(model, n_features_to_select=5)
fit = rfe.fit(X_train, y_train)

In [142]:
y_predict_validation = fit.predict_proba(X_validation)[:,1]
roc_auc_score(y_validation, y_predict_validation)

0.6915490201833426

In [143]:
y_predict_train = fit.predict_proba(X_train)[:,1]
print(roc_auc_score(y_train, y_predict_train))

0.7034529371719123


In [144]:
selected_features = X_train.columns[rfe.support_]
selected_features

Index(['month_number', 'weekday_number', 'time_slot', 'holiday_day_zone_A',
       'dew_point'],
      dtype='object')

In [10]:
#Without RFE
model = xgb.XGBClassifier(random_state=123,max_depth=4, eta=0.1, gamma=4)
model.fit(X_train, y_train)
y_predict_validation = model.predict_proba(X_validation)[:,1]
print(roc_auc_score(y_validation, y_predict_validation))

0.6897121696848738


In [11]:
y_predict_train = model.predict_proba(X_train)[:,1]
print(roc_auc_score(y_train, y_predict_train))

0.8157900700972858


In [None]:
model = xgb.XGBClassifier(random_state=123)
max_depth = [3, 4, 5]
eta = [0.09, 0.1, 0.12]
gamma = [3, 4, 5, 6]
param_grid = dict(eta = eta, gamma = gamma,max_depth=max_depth)
grid_search1 = GridSearchCV(model, param_grid, scoring="roc_auc", n_jobs=-1, verbose=1)
grid_result1 = grid_search1.fit(X_train, y_train)
grid_result1.best_params_
grid_result1.best_score_

In [86]:
grid_result1.best_params_

{'eta': 0.12, 'gamma': 5, 'max_depth': 5}

### Decision tree

In [126]:
# Grid search on decision tree classifier

from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(random_state=123)
max_depth = [1,2,3,4,5, 6, 7]
min_samples_split = [1, 2, 3, 4]
min_samples_leaf = [1, 2, 3]

param_grid = dict(max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)

grid_search6 = GridSearchCV(model, param_grid, scoring="roc_auc", n_jobs=-1, verbose=1, return_train_score=True)

In [127]:
grid_result6 = grid_search6.fit(X_train, y_train)

Fitting 5 folds for each of 84 candidates, totalling 420 fits


105 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
105 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/dogmael/anaconda3/envs/bootcamp/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/dogmael/anaconda3/envs/bootcamp/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/dogmael/anaconda3/envs/bootcamp/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/dogmael/anaconda3/envs/bootcamp/lib/python3.10/site-packages/skl

In [131]:
print(grid_result6.best_score_)
print(grid_result6.best_params_)

df = pd.DataFrame(grid_result6.cv_results_)
df[df["param_max_depth"] == 6][df['param_min_samples_leaf'] == 1][df['param_min_samples_split'] == 3]

0.638272595544547
{'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 3}


  df[df["param_max_depth"] == 6][df['param_min_samples_leaf'] == 1][df['param_min_samples_split'] == 3]
  df[df["param_max_depth"] == 6][df['param_min_samples_leaf'] == 1][df['param_min_samples_split'] == 3]


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
62,0.187108,0.005882,0.010112,0.000867,6,1,3,"{'max_depth': 6, 'min_samples_leaf': 1, 'min_s...",0.651548,0.614355,...,0.638273,0.024974,1,0.694956,0.689709,0.698096,0.681156,0.691518,0.691087,0.005741


In [216]:
model = RandomForestClassifier(random_state=123, min_samples_split=2,min_samples_leaf=1, n_estimators=500, max_depth=40)
rfe = RFE(model)
fit = rfe.fit(X_train, y_train)

In [218]:
y_pred = fit.predict_proba(X_validation)[:,1]
roc_auc_score(y_validation, y_pred)

0.693551211685402

In [229]:
filename = 'random_forest_RFE.pickle'
pickle.dump(fit, open(filename, 'wb'))

In [224]:
model = xgb.XGBClassifier(random_state=123, eta=0.1, gamma=6, max_depth=6)
rfe = RFE(model)
XBG_RFE = rfe.fit(X_train, y_train)

In [226]:
y_predict_validation = XBG_RFE.predict_proba(X_validation)[:,1]
print(roc_auc_score(y_validation, y_predict_validation))

0.6937772103352803


In [228]:
filename = 'XGBoost_RFE.pickle'
pickle.dump(fit, open(filename, 'wb'))

In [230]:
# Voting classifier 

from sklearn.ensemble import VotingClassifier

model1 = XBG_RFE
model2 = fit

model = VotingClassifier(estimators=[('xgb', model1), ('rf', model2)], voting='soft')
model.fit(X_train, y_train)

y_pred = model.predict_proba(X_validation)[:,1]
roc_auc_score(y_validation, y_pred)

0.7106718323500925

In [232]:
# save model to pickle

filename = 'voting_classifier.pickle'
pickle.dump(model, open(filename, 'wb'))

In [231]:
y_test_pred = model.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_test_pred)

0.7212396890593706