# Importation des modules

In [240]:
# Standard library: model export
import pickle

# Data manipulation and visualisation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Splitting and evaluating the data
from sklearn.model_selection import train_test_split, cross_validate, KFold, learning_curve, GridSearchCV, RandomizedSearchCV

# Preparing the data
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# Classifiers
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# Création du dataset et Dummy Classifier

On commence par créer un dataset avec les variables :
<p style='color: #FFA07A'> NAICS, NoEmp, RetainedJob, UrbanRural, Term, MIS_Status </p>

In [241]:
# Read the CSV file SBAnational_clean.csv from the local "archive" folder.
csv_path = "archive/SBAnational_clean.csv"
df = pd.read_csv(csv_path)

  df = pd.read_csv("archive/SBAnational_clean.csv")


In [242]:
# Explanatory variables kept for the model.
feature_cols = ['NAICS', 'NoEmp', 'RetainedJob', 'UrbanRural', 'Term']
X = df[feature_cols]

# Target: MIS_Status encoded as integer category codes.
# NOTE(review): the reports further down show two classes (0 and 1); which raw
# label maps to which code is not visible here — confirm before interpreting
# precision/recall. A missing MIS_Status would be encoded as -1.
status_as_category = df['MIS_Status'].astype('category')
y = status_as_category.cat.codes

On crée maintenant notre jeu de données d'entraînement et de test :

In [243]:
# Hold out 20% of the rows for testing. The split is shuffled, stratified on y
# and seeded so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

On sépare les variables numériques et catégorielles (ici, toutes les variables retenues sont traitées comme numériques) :

In [244]:
# Columns handled as numeric features by the preprocessing step.
var_num = [
    'NAICS',
    'NoEmp',
    'RetainedJob',
    'UrbanRural',
    'Term',
]


Puis on crée un transformateur de colonne :

In [245]:
# Column transformer that applies a StandardScaler to the columns in var_num.
scaler = StandardScaler()
preprocessor = make_column_transformer((scaler, var_num))

On commence notre modélisation par un Dummy Classifier qui servira de point de comparaison :

In [246]:
# Baseline: the shared preprocessor followed by a DummyClassifier with its
# default settings, fitted on the training split.
dummy = make_pipeline(preprocessor, DummyClassifier())
dummy.fit(X_train, y_train)

# Score the baseline on the held-out test split.
y_pred = dummy.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print every metric with the same label/value layout.
for label, value in (
    ("Accuracy: ", accuracy),
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1-score: ", f1),
):
    print(label, value)

Accuracy:  0.8237819517017371
Precision:  0.8237819517017371
Recall:  1.0
F1-score:  0.9033776772854687


In [247]:
# Per-class report for the dummy baseline; y_pred must still hold the dummy
# pipeline's test-set predictions when this cell runs.
# The baseline never predicts class 0 (its recall is 0.00 in the report), so
# that class's precision is undefined. zero_division=0 reports it as 0
# explicitly instead of emitting an UndefinedMetricWarning for each average.
dummy_report = classification_report(y_test, y_pred, zero_division=0)
print(dummy_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     31195
           1       0.82      1.00      0.90    145830

    accuracy                           0.82    177025
   macro avg       0.41      0.50      0.45    177025
weighted avg       0.68      0.82      0.74    177025



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Random Forest Classifier

Nous allons maintenant utiliser un Random Forest Classifier sans paramètres pour le comparer avec notre Dummy Classifier :

In [248]:
# Train a Random Forest with default hyper-parameters behind the shared
# preprocessor.
# random_state is fixed (same seed as the train/test split) so that the fitted
# forest, and every metric derived from it, is reproducible after a kernel
# restart.
rf_pipe = make_pipeline(preprocessor, RandomForestClassifier(random_state=42))
rf_pipe.fit(X_train, y_train)

On affiche l'importance des features :

In [249]:
# Importance learned by the forest (the last step of the pipeline) for each
# input column.
# NOTE(review): pairing with X_train.columns assumes the preprocessor emits the
# columns in the same order as X_train — true while var_num lists them in that
# order; confirm if the preprocessing changes.
forest = rf_pipe[-1]
importances = forest.feature_importances_
features = pd.DataFrame(
    {
        "feature": X_train.columns,
        "importance": importances,
    }
)
features

Unnamed: 0,feature,importance
0,NAICS,0.18079
1,NoEmp,0.060546
2,RetainedJob,0.048282
3,UrbanRural,0.032524
4,Term,0.677858


In [250]:
# Score the fitted Random Forest pipeline on the held-out test split.
y_pred = rf_pipe.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print every metric with the same label/value layout.
for label, value in (
    ("Accuracy: ", accuracy),
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1-score: ", f1),
):
    print(label, value)

Accuracy:  0.9174579861601468
Precision:  0.9439219454105038
Recall:  0.9566344373585682
F1-score:  0.9502356755578564


In [None]:
# Per-class precision / recall / F1 for the predictions currently held in y_pred.
rf_report = classification_report(y_test, y_pred)
print(rf_report)

En utilisant un Random Forest Classifier sans paramètres, on obtient de bons résultats, nettement meilleurs que ceux de notre Dummy Classifier (accuracy d'environ 0,92 contre 0,82). <br>
On va essayer d'affiner notre modèle avec une cross-validation puis en cherchant les meilleurs hyperparamètres :

In [None]:
# scoring = {'recall': 'recall', 'f1': 'f1', 'precision': 'precision', 'accuracy': 'accuracy'}
# cross_val = cross_validate(rf_pipe, X_train, y_train, cv=5, scoring=scoring)

In [None]:
# # On affiche les différentes métriques avec leur écart-type
# print("Accuracy: %0.3f (+/- %0.3f)" % (cross_val['test_accuracy'].mean(), cross_val['test_accuracy'].std() * 2))
# print("Precision: %0.3f (+/- %0.3f)" % (cross_val['test_precision'].mean(), cross_val['test_precision'].std() * 2))
# print("Recall: %0.3f (+/- %0.3f)" % (cross_val['test_recall'].mean(), cross_val['test_recall'].std() * 2))
# print("F1: %0.3f (+/- %0.3f)" % (cross_val['test_f1'].mean(), cross_val['test_f1'].std() * 2))

Les résultats obtenus sont quasiment les mêmes que ceux obtenus sans cross validation. On recherche maintenant des hyperparamètres susceptibles d'améliorer le modèle :

In [None]:
# NOTE(review): everything below is wrapped in a triple-quoted string, so this
# cell only evaluates a string literal: the randomized search never runs and
# `random_search` is not defined by this cell.
# Inside the disabled code, np.arange(20, 25, 5) contains the single value 20,
# and the search is given rf_pipe[-1] (the last pipeline step) rather than the
# whole pipeline.
""" # # On définit les paramètres à rechercher avec un intervalle de recherche
param_dist = {'n_estimators': np.arange(20, 25, 5), 
              'max_depth': np.arange(1, 11),
      'min_samples_split': np.arange(2, 10),
               'min_samples_leaf': np.arange(1, 10),
               'criterion': ['gini']}

 # On crée un objet Random Search 
random_search = RandomizedSearchCV(rf_pipe[-1], param_distributions=param_dist,
                                    n_iter=50, cv=5, n_jobs=3)

 # On entrâine le modèle
random_search.fit(X_train, y_train)  """

In [None]:
 # Retrieve the best hyper-parameters found by the randomized search.
# The search cells in this notebook are disabled (wrapped in string literals),
# so `random_search` may not exist on a fresh kernel. Fall back to None instead
# of raising NameError so that "Restart & Run All" still completes.
_search = globals().get("random_search")
best_params = _search.best_params_ if _search is not None else None

In [None]:
# Display the hyper-parameters stored in best_params by the previous cell.
print(best_params)

'columntransformer__standardscaler__with_std': True,
 'randomforestclassifier__bootstrap': True,
  'randomforestclassifier__ccp_alpha': 0.0,
   'randomforestclassifier__class_weight': None,
    'randomforestclassifier__criterion': 'gini', 
    'randomforestclassifier__max_depth': None,
     'randomforestclassifier__max_features': 'sqrt', 'randomforestclassifier__max_leaf_nodes': None,
      'randomforestclassifier__max_samples': None,
       'randomforestclassifier__min_impurity_decrease': 0.0,
        'randomforestclassifier__min_samples_leaf': 1,
         'randomforestclassifier__min_samples_split': 2,
          'randomforestclassifier__min_weight_fraction_leaf': 0.0,
           'randomforestclassifier__n_estimators': 100, 'randomforestclassifier__n_jobs': None,
            'randomforestclassifier__oob_score': False, 'randomforestclassifier__random_state': None,
             'randomforestclassifier__verbose': 0, 'randomforestclassifier__warm_start': False}

In [261]:
# Refit the pipeline with the retained hyper-parameters written out explicitly.
# Plain keyword arguments replace the unpacked dict literal, and random_state
# is fixed so that the fitted forest is reproducible after a kernel restart.
rf_pipe = make_pipeline(
    preprocessor,
    RandomForestClassifier(
        n_estimators=100,
        min_samples_split=2,
        min_samples_leaf=1,
        max_depth=None,
        criterion='gini',
        random_state=42,
    ),
)
rf_pipe.fit(X_train, y_train)

In [262]:
# NOTE(review): this evaluation code is wrapped in a triple-quoted string, so
# the cell only echoes the string. It does not recompute y_pred or any metric
# for the pipeline fitted in the previous cell.
""" y_pred = rf_pipe.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

precision = precision_score(y_test, y_pred)
print("Precision: ", precision)

recall = recall_score(y_test, y_pred)
print("Recall: ", recall)

f1 = f1_score(y_test, y_pred)
print("F1-score: ", f1) """

' y_pred = rf_pipe.predict(X_test)\n\naccuracy = accuracy_score(y_test, y_pred)\nprint("Accuracy: ", accuracy)\n\nprecision = precision_score(y_test, y_pred)\nprint("Precision: ", precision)\n\nrecall = recall_score(y_test, y_pred)\nprint("Recall: ", recall)\n\nf1 = f1_score(y_test, y_pred)\nprint("F1-score: ", f1) '

In [263]:
# Make predictions on the test set with the pipeline that was just refitted.
# The preceding evaluation cell is disabled, so without this line the report
# would describe predictions left in y_pred by an earlier model.
y_pred = rf_pipe.predict(X_test)

# Generate the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.73      0.76     31195
           1       0.94      0.96      0.95    145830

    accuracy                           0.92    177025
   macro avg       0.86      0.85      0.85    177025
weighted avg       0.92      0.92      0.92    177025



Cherchons autour de cette zone :

In [None]:
# NOTE(review): everything below is wrapped in a triple-quoted string, so this
# cell only evaluates a string literal: the randomized search never runs and
# `random_search` is not defined by this cell.
# Inside the disabled code, np.arange(20, 25, 5) contains the single value 20,
# max_depth is drawn from 1..99, and the search is given rf_pipe[-1] (the last
# pipeline step) rather than the whole pipeline.
""" # # On définit les paramètres à rechercher avec un intervalle de recherche
param_dist = {'n_estimators': np.arange(20, 25, 5), 
              'max_depth': np.arange(1, 100),
      'min_samples_split': np.arange(2, 10),
               'min_samples_leaf': np.arange(1, 10),
               'criterion': ['gini']}

 # On crée un objet Random Search 
random_search = RandomizedSearchCV(rf_pipe[-1], param_distributions=param_dist,
                                    n_iter=50, cv=5, n_jobs=3)

 # On entrâine le modèle
random_search.fit(X_train, y_train)  """