# Importation des modules

In [128]:
# Pour manipuler et visualiser les données
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pour séparer et évaluer les données
from sklearn.model_selection import train_test_split, cross_validate, KFold, learning_curve, GridSearchCV, RandomizedSearchCV

# Pour préparer les données
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# Pour créer des arbres de classification
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

# Pour faire un modèle de régression logistique 
from sklearn.linear_model import LogisticRegression

# Pour utiliser xgboost
import xgboost as xgb

# Pour utiliser les métriques
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# Pour exporter notre modèle
import pickle

# Création du dataset et Dummy Classifier

On commence par créer un dataset avec les variables :
<p style='color: #FFA07A'> NAICS, NoEmp, RetainedJob, UrbanRural, Term, UrbanRural, MIS_Status </p>

In [129]:
df = pd.read_csv("archive/SBAnational_clean.csv")

  df = pd.read_csv("archive/SBAnational_clean.csv")


In [130]:
df['NAICS'] = df['NAICS'].astype(str).apply(lambda x : x[:2])
df[['NAICS']] = df[['NAICS']].astype(int)

In [131]:
df["NAICS"].unique()

array([45, 72, 62,  0, 33, 81, 23, 44, 42, 61, 53, 54, 31, 51, 71, 52, 21,
       32, 56, 48, 11, 92, 22, 49, 55])

In [132]:
# Dataframe contenant les variables explicatives
X = df[['NAICS', 'NoEmp',  'RetainedJob', 'UrbanRural','Term']]

# On transforme les valeurs de MIS_Status en 0 ou 1
y = df['MIS_Status'].astype('category').cat.codes

In [133]:
y.value_counts()

1    729148
0    155977
dtype: int64

On crée maintenant notre jeu de données d'entraînement et de test :

In [134]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2, random_state=42, stratify=y)

On sépare les variables numériques et catégorielle :

In [135]:
var_num = ['NoEmp','RetainedJob','Term']

var_cat = ['NAICS', 'UrbanRural']

Puis on crée un transformateur de colonne :

In [136]:
preprocessor = make_column_transformer(
    (StandardScaler(), var_num),
    (OneHotEncoder(), var_cat)
)

On commence notre modélisation par un Dummy Classifier qui servira de point de comparaison :

In [137]:
dummy = make_pipeline(preprocessor, DummyClassifier())

dummy.fit(X_train, y_train)

y_pred = dummy.predict(X_test)

dummy_report = (classification_report(y_test, y_pred))
print(dummy_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     31195
           1       0.82      1.00      0.90    145830

    accuracy                           0.82    177025
   macro avg       0.41      0.50      0.45    177025
weighted avg       0.68      0.82      0.74    177025



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Random Forest Classifier

Nous allons maintenant utiliser un Random Forest Classififier sans paramètres pour le comparer avec notre Dummy Classifier :

In [138]:
# # Entraînement du modèle
# rf_pipe = make_pipeline(preprocessor, RandomForestClassifier())
# rf_pipe.fit(X_train, y_train)

On affiche l'importance des features :

In [139]:
# importances = rf_pipe[-1].feature_importances_
# importances
# # features = pd.DataFrame({"feature": X_train.columns, "importance": importances})
# # features

In [140]:
# rf_pipe.get_feature_names_out

In [141]:
# y_pred = rf_pipe.predict(X_test)

# rfc_report = classification_report(y_test, y_pred)
# print(rfc_report)

En utilisant un Random Forest Classifier sans paramètres, on obtient de bons résultats, proches de notre Dummy Classifier pour les classes positives. <br>
On va essayer d'affiner notre modèle avec en cherchant les meilleurs hyperparamètres :

In [142]:
# # On définit les paramètres à rechercher avec un intervalle de recherche
# param_dist = {'n_estimators': np.arange(30, 100, 1), 
#               'max_depth': np.arange(1, 11),
#               'min_samples_split': np.arange(2, 10),
#               'min_samples_leaf': np.arange(1, 10),
#               'criterion': ['gini','entropy']}

# # On crée un objet Random Search 
# random_search = RandomizedSearchCV(rf_pipe[-1], param_distributions=param_dist,
#                                   cv=5, n_jobs=-1, verbose=1)

# # On entrâine le modèle
# random_search.fit(X_train, y_train)

# # On récupère les meilleurs hyperparamètres trouvés
# best_params = random_search.best_params_
# print(best_params)

# # Entraînement du modèle
# rf_pipe = make_pipeline(preprocessor, RandomForestClassifier(**best_params, class_weight="balanced"))
# rf_pipe.fit(X_train, y_train)

# y_pred = rf_pipe.predict(X_test)

# print(f"resultats après grid search :\n {classification_report(y_test, y_pred)}")
# print(f"resultats avant grid search :\n {rfc_report}")

# Régression logistique

In [143]:
# # Entraînement du modèle
# rf_pipe_log = make_pipeline(preprocessor, LogisticRegression())
# rf_pipe_log.fit(X_train, y_train)

In [144]:
# y_pred = rf_pipe_log.predict(X_test)

# log_report = classification_report(y_test, y_pred)
# print(log_report)

In [145]:
# # #On définit les paramètres à rechercher avec un intervalle de recherche
# param_dist = {'C': np.logspace(-4, 4, 20),
#               'penalty': ['l1', 'l2', 'elasticnet'],
#               'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}

# # On crée un objet Random Search 
# random_search = RandomizedSearchCV(rf_pipe_log[-1], param_distributions=param_dist,
#                                    n_iter=50, cv=5, n_jobs=-1, verbose=1)

# # On entrâine le modèle
# random_search.fit(X_train, y_train) 

# # On récupère les meilleurs hyperparamètres trouvés
# best_params = random_search.best_params_
# print(best_params)

# # Entraînement du modèle
# rf_pipe_log_opti = make_pipeline(preprocessor, LogisticRegression(**best_params, class_weight="balanced"))
# rf_pipe_log_opti.fit(X_train, y_train)

# y_pred = rf_pipe_log_opti.predict(X_test)

In [146]:
# print(f"resultats après grid search :\n {classification_report(y_test, y_pred)}")
# print(f"resultats avant grid search :\n {log_report}")

# XGBoost

In [147]:
# Entraînement du modèle
xgb_pipe = make_pipeline(preprocessor, xgb.XGBClassifier(class_weight="balanced"))
xgb_pipe.fit(X_train, y_train)

Parameters: { "class_weight" } are not used.



In [148]:
y_pred = xgb_pipe.predict(X_test)

xgb_report = classification_report(y_test, y_pred)
print(xgb_report)

              precision    recall  f1-score   support

           0       0.82      0.78      0.80     31195
           1       0.95      0.96      0.96    145830

    accuracy                           0.93    177025
   macro avg       0.89      0.87      0.88    177025
weighted avg       0.93      0.93      0.93    177025



In [149]:
xgb_pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                    ['NoEmp', 'RetainedJob', 'Term']),
                                   ('onehotencoder', OneHotEncoder(),
                                    ['NAICS', 'UrbanRural'])])),
  ('xgbclassifier',
   XGBClassifier(base_score=None, booster=None, callbacks=None,
                 class_weight='balanced', colsample_bylevel=None,
                 colsample_bynode=None, colsample_bytree=None,
                 early_stopping_rounds=None, enable_categorical=False,
                 eval_metric=None, feature_types=None, gamma=None, gpu_id=None,
                 grow_policy=None, importance_type=None,
                 interaction_constraints=None, learning_rate=None, max_bin=None,
                 max_cat_threshold=None, max_cat_to_onehot=None,
                 max_delta_step=None, max_depth=None, max_leaves=None,
                 min_child_weight=

In [150]:
param_dist = {'xgbclassifier__n_estimators': np.arange(90, 100, 1), 
              'xgbclassifier__learning_rate': np.linspace(0.01, 1, 20),
              'xgbclassifier__max_depth': np.arange(1, 11),
              'xgbclassifier__subsample': np.linspace(0.1, 1, 10),
              'xgbclassifier__colsample_bytree': np.linspace(0.1, 1, 10)
             }

random_search = RandomizedSearchCV(xgb_pipe, param_distributions=param_dist,
                                   n_iter=50, cv=5, n_jobs=-1, verbose=2)

random_search.fit(X_train, y_train)

# On récupère les meilleurs hyperparamètres trouvés
best_params = random_search.best_params_

# Entraînement du modèle
xgb_pipe_opti = make_pipeline(preprocessor, xgb.XGBClassifier(**best_params))
xgb_pipe_opti.fit(X_train, y_train)

y_pred = xgb_pipe_opti.predict(X_test)
xgb_opti_report = classification_report(y_test, y_pred)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

[CV] END xgbclassifier__colsample_bytree=0.1, xgbclassifier__learning_rate=0.21842105263157896, xgbclassifier__max_depth=8, xgbclassifier__n_estimators=94, xgbclassifier__subsample=0.7000000000000001; total time=  23.9s
[CV] END xgbclassifier__colsample_bytree=0.1, xgbclassifier__learning_rate=0.21842105263157896, xgbclassifier__max_depth=8, xgbclassifier__n_estimators=94, xgbclassifier__subsample=0.7000000000000001; total time=  24.2s
[CV] END xgbclassifier__colsample_bytree=0.1, xgbclassifier__learning_rate=0.21842105263157896, xgbclassifier__max_depth=8, xgbcla

In [153]:
print(best_params)

{'xgbclassifier__subsample': 0.9, 'xgbclassifier__n_estimators': 97, 'xgbclassifier__max_depth': 7, 'xgbclassifier__learning_rate': 0.32263157894736844, 'xgbclassifier__colsample_bytree': 0.6}


In [151]:
print(f"resultats après random search :\n {xgb_opti_report}")
print(f"resultats avant random search :\n {xgb_report}")

resultats après random search :
               precision    recall  f1-score   support

           0       0.82      0.78      0.80     31195
           1       0.95      0.96      0.96    145830

    accuracy                           0.93    177025
   macro avg       0.89      0.87      0.88    177025
weighted avg       0.93      0.93      0.93    177025

resultats avant random search :
               precision    recall  f1-score   support

           0       0.82      0.78      0.80     31195
           1       0.95      0.96      0.96    145830

    accuracy                           0.93    177025
   macro avg       0.89      0.87      0.88    177025
weighted avg       0.93      0.93      0.93    177025



# Modèle au format pkl

In [154]:
with open('xgb_pipe.pkl', 'wb') as f:
    pickle.dump(xgb_pipe, f)

In [168]:
pd.DataFrame(np.array([61, 500, 50, 1, 20]).reshape(1, 5), columns=['NAICS', 'NoEmp',  'RetainedJob', 'UrbanRural','Term'])

Unnamed: 0,NAICS,NoEmp,RetainedJob,UrbanRural,Term
0,61,500,50,1,20


In [180]:
xgb_pipe.predict(pd.DataFrame([[11, 500, 50, 1, 20]], columns=['NAICS', 'NoEmp',  'RetainedJob', 'UrbanRural','Term']))

array([0])