# Importation des modules

In [262]:
# Pour manipuler et visualiser les données
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pour séparer et évaluer les données
from sklearn.model_selection import train_test_split, cross_validate, KFold, learning_curve, GridSearchCV, RandomizedSearchCV

# Pour préparer les données
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# Pour créer des arbres de classification
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

# Pour faire un modèle de régression logistique 
from sklearn.linear_model import LogisticRegression

# Pour utiliser xgboost
import xgboost as xgb

# Pour utiliser les métriques
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# Pour exporter notre modèle
import pickle

# Création du dataset et Dummy Classifier

On commence par créer un dataset avec les variables :
<p style='color: #FFA07A'> NAICS, NoEmp, RetainedJob, UrbanRural, Term, UrbanRural, MIS_Status </p>

In [263]:
df = pd.read_csv("archive/SBAnational_clean.csv")

  df = pd.read_csv("archive/SBAnational_clean.csv")


In [264]:
df['NAICS'] = df['NAICS'].astype(str).apply(lambda x : x[:2])
df[['NAICS']] = df[['NAICS']].astype(int)

In [265]:
# Dataframe contenant les variables explicatives
X = df[['NAICS', 'NoEmp',  'RetainedJob', 'UrbanRural','Term']]

# On transforme les valeurs de MIS_Status en 0 ou 1
y = df['MIS_Status'].astype('category').cat.codes

On crée maintenant notre jeu de données d'entraînement et de test :

In [266]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2, random_state=42, stratify=y)

On sépare les variables numériques et catégorielle :

In [267]:
var_num = ['NoEmp','RetainedJob','Term']

var_cat = ['NAICS', 'UrbanRural']

Puis on crée un transformateur de colonne :

In [268]:
preprocessor = make_column_transformer(
    (StandardScaler(), var_num),
    (OneHotEncoder(), var_cat)
)

On commence notre modélisation par un Dummy Classifier qui servira de point de comparaison :

In [269]:
dummy = make_pipeline(preprocessor, DummyClassifier())

dummy.fit(X_train, y_train)

y_pred = dummy.predict(X_test)

dummy_report = (classification_report(y_test, y_pred))
print(dummy_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     31195
           1       0.82      1.00      0.90    145830

    accuracy                           0.82    177025
   macro avg       0.41      0.50      0.45    177025
weighted avg       0.68      0.82      0.74    177025



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Random Forest Classifier

Nous allons maintenant utiliser un Random Forest Classififier sans paramètres pour le comparer avec notre Dummy Classifier :

In [270]:
# Entraînement du modèle
rf_pipe = make_pipeline(preprocessor, RandomForestClassifier())
rf_pipe.fit(X_train, y_train)

On affiche l'importance des features :

In [271]:
importances = rf_pipe[-1].feature_importances_
importances
# features = pd.DataFrame({"feature": X_train.columns, "importance": importances})
# features

array([6.83181909e-02, 5.60392210e-02, 8.03281105e-01, 7.53226855e-03,
       1.46009933e-03, 4.44540750e-04, 1.98061515e-04, 1.73356505e-03,
       9.61547173e-04, 1.07031528e-03, 1.72235142e-03, 1.84008272e-03,
       1.71667905e-03, 1.43854913e-03, 1.43877014e-03, 3.80181446e-04,
       8.62659048e-04, 8.32988074e-04, 1.21123662e-03, 1.58985365e-03,
       4.29154195e-05, 1.19002597e-03, 6.84663850e-04, 3.10485523e-03,
       1.01089066e-03, 2.70614307e-03, 1.72562163e-03, 7.11977225e-05,
       1.88969466e-02, 1.37922902e-02, 2.70218401e-03])

In [272]:
rf_pipe.get_feature_names_out

<bound method Pipeline.get_feature_names_out of Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['NoEmp', 'RetainedJob',
                                                   'Term']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['NAICS', 'UrbanRural'])])),
                ('randomforestclassifier', RandomForestClassifier())])>

In [273]:
y_pred = rf_pipe.predict(X_test)

rfc_report = classification_report(y_test, y_pred)
print(rfc_report)

              precision    recall  f1-score   support

           0       0.78      0.72      0.75     31195
           1       0.94      0.96      0.95    145830

    accuracy                           0.92    177025
   macro avg       0.86      0.84      0.85    177025
weighted avg       0.91      0.92      0.91    177025



En utilisant un Random Forest Classifier sans paramètres, on obtient de bons résultats, proches de notre Dummy Classifier pour les classes positives. <br>
On va essayer d'affiner notre modèle avec en cherchant les meilleurs hyperparamètres :

In [276]:
# On définit les paramètres à rechercher avec un intervalle de recherche
param_dist = {'n_estimators': np.arange(30, 100, 1), 
              'max_depth': np.arange(1, 11),
              'min_samples_split': np.arange(2, 10),
              'min_samples_leaf': np.arange(1, 10),
              'criterion': ['gini','entropy']}

# On crée un objet Random Search 
random_search = RandomizedSearchCV(rf_pipe[-1], param_distributions=param_dist,
                                  cv=5, n_jobs=-1, verbose=1)

# On entrâine le modèle
random_search.fit(X_train, y_train)

# On récupère les meilleurs hyperparamètres trouvés
best_params = random_search.best_params_
print(best_params)

# Entraînement du modèle
rf_pipe = make_pipeline(preprocessor, RandomForestClassifier(**best_params, class_weight="balanced"))
rf_pipe.fit(X_train, y_train)

y_pred = rf_pipe.predict(X_test)

print(f"resultats après grid search :\n {classification_report(y_test, y_pred)}")
print(f"resultats avant grid search :\n {rfc_report}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 34, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_depth': 9, 'criterion': 'gini'}
resultats après grid search :               precision    recall  f1-score   support

           0       0.53      0.81      0.64     31195
           1       0.95      0.84      0.90    145830

    accuracy                           0.84    177025
   macro avg       0.74      0.83      0.77    177025
weighted avg       0.88      0.84      0.85    177025

resultats avant grid search :               precision    recall  f1-score   support

           0       0.78      0.72      0.75     31195
           1       0.94      0.96      0.95    145830

    accuracy                           0.92    177025
   macro avg       0.86      0.84      0.85    177025
weighted avg       0.91      0.92      0.91    177025



# Régression logistique

In [277]:
# Entraînement du modèle
rf_pipe_log = make_pipeline(preprocessor, LogisticRegression())
rf_pipe_log.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [278]:
y_pred = rf_pipe_log.predict(X_test)

log_report = classification_report(y_test, y_pred)
print(log_report)

              precision    recall  f1-score   support

           0       0.68      0.20      0.31     31195
           1       0.85      0.98      0.91    145830

    accuracy                           0.84    177025
   macro avg       0.76      0.59      0.61    177025
weighted avg       0.82      0.84      0.81    177025



In [279]:
# #On définit les paramètres à rechercher avec un intervalle de recherche
param_dist = {'C': np.logspace(-4, 4, 20),
              'penalty': ['l1', 'l2', 'elasticnet'],
              'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}

# On crée un objet Random Search 
random_search = RandomizedSearchCV(rf_pipe_log[-1], param_distributions=param_dist,
                                   n_iter=50, cv=5, n_jobs=-1, verbose=1)

# On entrâine le modèle
random_search.fit(X_train, y_train) 

# On récupère les meilleurs hyperparamètres trouvés
best_params = random_search.best_params_
print(best_params)

# Entraînement du modèle
rf_pipe_log_opti = make_pipeline(preprocessor, LogisticRegression(**best_params, class_weight="balanced"))
rf_pipe_log_opti.fit(X_train, y_train)

y_pred = rf_pipe_log_opti.predict(X_test)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


135 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/home/apprenant/Documents/Projet_Classification/env/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/apprenant/Documents/Projet_Classification/env/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/apprenant/Documents/Projet_Classification/env/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError

{'solver': 'liblinear', 'penalty': 'l2', 'C': 0.0018329807108324356}
resultats après grid search :               precision    recall  f1-score   support

           0       0.32      0.83      0.46     31195
           1       0.94      0.62      0.75    145830

    accuracy                           0.66    177025
   macro avg       0.63      0.73      0.61    177025
weighted avg       0.83      0.66      0.70    177025

resultats avant grid search :               precision    recall  f1-score   support

           0       0.68      0.20      0.31     31195
           1       0.85      0.98      0.91    145830

    accuracy                           0.84    177025
   macro avg       0.76      0.59      0.61    177025
weighted avg       0.82      0.84      0.81    177025



In [None]:
print(f"resultats après grid search :\n {classification_report(y_test, y_pred)}")
print(f"resultats avant grid search :\n {log_report}")

# XGBoost

In [280]:
# Entraînement du modèle
xgb_pipe = make_pipeline(preprocessor, xgb.XGBClassifier())
xgb_pipe.fit(X_train, y_train)

In [281]:
y_pred = xgb_pipe.predict(X_test)

xgb_report = classification_report(y_test, y_pred)
print(xgb_report)

              precision    recall  f1-score   support

           0       0.82      0.78      0.80     31195
           1       0.95      0.96      0.96    145830

    accuracy                           0.93    177025
   macro avg       0.89      0.87      0.88    177025
weighted avg       0.93      0.93      0.93    177025



In [1]:
xgb_pipe.get_params()

NameError: name 'xgb_pipe' is not defined

In [282]:
param_dist = {'n_estimators': np.arange(20, 50, 5), 
              'learning_rate': np.linspace(0.01, 1, 20),
              'max_depth': np.arange(1, 11),
              'subsample': np.linspace(0.1, 1, 10),
              'colsample_bytree': np.linspace(0.1, 1, 10)
             }

random_search = RandomizedSearchCV(xgb_pipe, param_distributions=param_dist,
                                   n_iter=50, cv=5, n_jobs=-1, verbose=1)

random_search.fit(X_train, y_train)

# On récupère les meilleurs hyperparamètres trouvés
best_params = random_search.best_params_
print(best_params)

# Entraînement du modèle
rf_pipe_log_opti = make_pipeline(preprocessor, xgb.XGBClassifier(**best_params, class_weight="balanced"))
rf_pipe_log_opti.fit(X_train, y_train)

y_pred = rf_pipe_log_opti.predict(X_test)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
{'subsample': 0.9, 'n_estimators': 20, 'max_depth': 8, 'learning_rate': 0.6352631578947369, 'colsample_bytree': 1.0}
Parameters: { "class_weight" } are not used.

resultats après grid search :               precision    recall  f1-score   support

           0       0.82      0.77      0.80     31195
           1       0.95      0.96      0.96    145830

    accuracy                           0.93    177025
   macro avg       0.89      0.87      0.88    177025
weighted avg       0.93      0.93      0.93    177025

resultats avant grid search :               precision    recall  f1-score   support

           0       0.82      0.78      0.80     31195
           1       0.95      0.96      0.96    145830

    accuracy                           0.93    177025
   macro avg       0.89      0.87      0.88    177025
weighted avg       0.93      0.93      0.93    177025



In [None]:
print(f"resultats après grid search : {classification_report(y_test, y_pred)}")
print(f"resultats avant grid search : {xgb_report}")