In [49]:
# Setup cell: make the project's helper directory importable, pull in the
# analysis / modelling libraries, and load the dataset used by every section.
import os
import sys
# Must run before the `utils` / `pickle_job` imports below — presumably those
# modules live in ../Module (TODO confirm).
sys.path.append('../Module')
# Directory containing the CSV input, relative to the notebook's working directory.
folder_path = '../Dataset'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# NOTE(review): OneHotEncoder is imported a second time on the next line;
# the duplicate is harmless but redundant.
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn import set_config

from sklearn.ensemble import RandomForestClassifier

# Project-local helpers (not part of any installed package).
from utils import generate_description
from pickle_job import sauvegarder_modele, charger_modele

# Load SBA_Cleaned_0.csv from the dataset directory into the working DataFrame.
df = pd.read_csv(folder_path + '/SBA_Cleaned_0.csv')

In [50]:
# Keep only complete rows: any record with at least one missing value is discarded.
df = df.dropna()

In [51]:
# Display the (rows, columns) shape of the frame once incomplete rows are gone.
df.shape

(675026, 13)

In [52]:
# Show the project helper's per-feature summary; the rendered table lists each
# column's dtype, null count / percentage, number of unique values and a sample.
generate_description(df)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,NAICS,object,0,0.0,20,"[54, 23, 42]"
1,Term,int64,0,0.0,377,"[109, 84, 31]"
2,NoEmp,int64,0,0.0,499,"[1, 4, 7]"
3,NewExist,float64,0,0.0,2,"[2.0, 1.0]"
4,CreateJob,int64,0,0.0,216,"[0, 15]"
5,RetainedJob,int64,0,0.0,323,"[2, 7, 38, 1]"
6,UrbanRural,int64,0,0.0,3,"[1, 0]"
7,RevLineCr,object,0,0.0,2,"[N, Y]"
8,LowDoc,object,0,0.0,2,[N]
9,GrAppv,int64,0,0.0,16880,"[150000, 142200, 205000, 50000]"


# RandomForest avec toutes les features de SBA_Cleaned_0.csv

In [53]:
# Separate the `Approve` target from the predictors, then hold out 5% of the
# rows as a stratified, seeded test split.
X = df.drop(columns='Approve')
y = df['Approve']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [54]:
# Partition the predictor columns by dtype: object/bool columns are handled as
# categorical, every remaining column as numerical.
_categorical_dtypes = ['object', 'bool']
categorical_columns_selector = X.select_dtypes(include=_categorical_dtypes).columns
numerical_columns_selector = X.select_dtypes(exclude=_categorical_dtypes).columns

In [55]:
# Categorical branch: fill gaps with the literal 'missing', then one-hot encode
# into a dense array, ignoring categories that were not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Numerical columns are standardised; any column claimed by neither selector
# is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns_selector),
        ('cat', categorical_transformer, categorical_columns_selector),
    ],
    remainder='passthrough',
)

In [56]:
# Global scikit-learn setting: transformers return pandas DataFrames.
set_config(transform_output="pandas")

# End-to-end estimator: preprocessing followed by a seeded random forest that
# uses every available core.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42, n_jobs=-1)),
])

In [57]:
# Search space addressed to the pipeline's 'model' step. Only `max_features`
# takes several values; every other hyper-parameter is pinned to a single one.
param_grid = dict(
    model__n_estimators=[70],
    model__max_depth=[25],
    model__min_samples_split=[3],
    model__min_samples_leaf=[80],
    model__max_features=[None, 'sqrt', 'log2'],
    model__max_samples=[0.7],
)

In [58]:
# 5-fold cross-validated grid search that ranks candidates by macro-averaged F1.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [59]:
# Take the best pipeline found by the grid search and predict on the held-out split.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

In [60]:
# Reuse the test-set predictions already stored in `predictions` by the
# preceding cell instead of running a second, identical inference pass on X_test.
y_pred = predictions

In [61]:
# Score the held-out split: overall accuracy, macro-averaged F1, then the
# per-class precision / recall / F1 breakdown.
accuracy = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average="macro")

print(f'Accuracy : {accuracy}')
print(f"Score F1 : {f1_macro}")
print(classification_report(y_test, y_pred))

Accuracy : 0.9343742593031524
Score F1 : 0.8968982747295433
              precision    recall  f1-score   support

       False       0.84      0.83      0.83      6714
        True       0.96      0.96      0.96     27038

    accuracy                           0.93     33752
   macro avg       0.90      0.90      0.90     33752
weighted avg       0.93      0.93      0.93     33752



# RandomForest sans CreateJob & RetainedJob

In [62]:
# Second experiment: remove the job-count columns flagged as possibly leaking
# the target.
# errors='ignore' keeps this cell re-runnable: `df` is overwritten here, so a
# second execution would otherwise raise KeyError on the already-removed columns.
maybe_leaking_feature = ['RetainedJob', 'CreateJob']
df = df.drop(columns=maybe_leaking_feature, errors='ignore')

In [63]:
# Re-display the per-feature summary to confirm which columns remain after the drop.
generate_description(df)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,NAICS,object,0,0.0,20,"[56, 51, 23, 44-45]"
1,Term,int64,0,0.0,377,"[60, 83, 120, 84]"
2,NoEmp,int64,0,0.0,499,"[8, 5, 1]"
3,NewExist,float64,0,0.0,2,[1.0]
4,UrbanRural,int64,0,0.0,3,"[1, 0]"
5,RevLineCr,object,0,0.0,2,"[N, Y]"
6,LowDoc,object,0,0.0,2,[N]
7,GrAppv,int64,0,0.0,16880,"[425000, 100000, 50000, 145000, 97000]"
8,SBA_Appv,int64,0,0.0,28240,"[24000, 17500, 443775, 202800, 72000]"
9,Franchise,int64,0,0.0,2,[0]


In [64]:
# Rebuild target / predictors from the reduced frame and redo the same
# stratified 95/5 split with the same seed.
X = df.drop(columns='Approve')
y = df['Approve']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [65]:
# Recompute the dtype-based column groups for the reduced predictor set:
# object/bool columns are categorical, the rest numerical.
_categorical_dtypes = ['object', 'bool']
categorical_columns_selector = X.select_dtypes(include=_categorical_dtypes).columns
numerical_columns_selector = X.select_dtypes(exclude=_categorical_dtypes).columns

In [66]:
# Categorical branch: constant-fill missing entries with 'missing', then dense
# one-hot encoding that ignores categories unseen at fit time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Standardise the numerical columns; columns matched by neither selector are
# passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns_selector),
        ('cat', categorical_transformer, categorical_columns_selector),
    ],
    remainder='passthrough',
)

In [67]:
# Global scikit-learn setting: transformers return pandas DataFrames.
set_config(transform_output="pandas")

# Preprocessing followed by a seeded random forest running on all cores.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42, n_jobs=-1)),
])

In [68]:
# Same search space as the first experiment: only `max_features` varies, all
# other forest hyper-parameters are fixed to one value.
param_grid = dict(
    model__n_estimators=[70],
    model__max_depth=[25],
    model__min_samples_split=[3],
    model__min_samples_leaf=[80],
    model__max_features=[None, 'sqrt', 'log2'],
    model__max_samples=[0.7],
)

In [69]:
# 5-fold cross-validated grid search scored with macro-averaged F1.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [70]:
# Take the best pipeline found by the grid search and predict on the held-out split.
# NOTE(review): the following cell repeats these two statements verbatim; one of
# the two cells could be deleted.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

In [71]:
# NOTE(review): verbatim repeat of the preceding cell — re-selects the best
# pipeline and recomputes the same test-set predictions.
best_model = grid_search.best_estimator_

predictions = best_model.predict(X_test)

In [72]:
# Reuse the test-set predictions already stored in `predictions` by the
# preceding cell rather than running yet another identical inference pass.
y_pred = predictions

In [73]:
# Held-out metrics for the reduced feature set: accuracy, macro F1, then the
# per-class report.
accuracy = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average="macro")

print(f'Accuracy : {accuracy}')
print(f"Score F1 : {f1_macro}")
print(classification_report(y_test, y_pred))

Accuracy : 0.933278027968713
Score F1 : 0.8954631831168669
              precision    recall  f1-score   support

       False       0.83      0.83      0.83      6714
        True       0.96      0.96      0.96     27038

    accuracy                           0.93     33752
   macro avg       0.89      0.90      0.90     33752
weighted avg       0.93      0.93      0.93     33752



# RandomForest sans aucune feature potentiellement fuyante (ni CreateJob / RetainedJob, ni SBA_Appv / GrAppv)

In [74]:
# Third experiment: additionally remove the approved-amount columns flagged as
# possibly leaking the target.
# errors='ignore' keeps this cell re-runnable: `df` is overwritten here, so a
# second execution would otherwise raise KeyError on the already-removed columns.
maybe_leaking_fea = ['SBA_Appv', 'GrAppv']
df = df.drop(columns=maybe_leaking_fea, errors='ignore')

In [75]:
# Re-display the per-feature summary to confirm which columns remain after the drop.
generate_description(df)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,NAICS,object,0,0.0,20,"[42, 31-33, 72, 62]"
1,Term,int64,0,0.0,377,"[87, 84, 240]"
2,NoEmp,int64,0,0.0,499,"[20, 3, 52, 5, 51]"
3,NewExist,float64,0,0.0,2,"[2.0, 1.0]"
4,UrbanRural,int64,0,0.0,3,"[1, 2]"
5,RevLineCr,object,0,0.0,2,"[N, Y]"
6,LowDoc,object,0,0.0,2,[N]
7,Franchise,int64,0,0.0,2,[0]
8,Approve,bool,0,0.0,2,[True]


In [76]:
# Rebuild target / predictors from the further-reduced frame and redo the same
# stratified 95/5 split with the same seed.
X = df.drop(columns='Approve')
y = df['Approve']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [77]:
# Recompute the dtype-based column groups for the remaining predictors:
# object/bool columns are categorical, the rest numerical.
_categorical_dtypes = ['object', 'bool']
categorical_columns_selector = X.select_dtypes(include=_categorical_dtypes).columns
numerical_columns_selector = X.select_dtypes(exclude=_categorical_dtypes).columns

In [78]:
# Categorical branch: constant-fill missing entries with 'missing', then dense
# one-hot encoding that ignores categories unseen at fit time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Standardise the numerical columns; columns matched by neither selector are
# passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns_selector),
        ('cat', categorical_transformer, categorical_columns_selector),
    ],
    remainder='passthrough',
)

In [79]:
# Global scikit-learn setting: transformers return pandas DataFrames.
set_config(transform_output="pandas")

# Preprocessing followed by a seeded random forest running on all cores.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42, n_jobs=-1)),
])

In [80]:
# Same search space as the earlier experiments: only `max_features` varies,
# all other forest hyper-parameters are fixed to one value.
param_grid = dict(
    model__n_estimators=[70],
    model__max_depth=[25],
    model__min_samples_split=[3],
    model__min_samples_leaf=[80],
    model__max_features=[None, 'sqrt', 'log2'],
    model__max_samples=[0.7],
)

In [81]:
# 5-fold cross-validated grid search scored with macro-averaged F1.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [82]:
# Take the best pipeline found by the grid search and predict on the held-out split.
best_model = grid_search.best_estimator_

predictions = best_model.predict(X_test)

In [83]:
# Reuse the test-set predictions already stored in `predictions` by the
# preceding cell instead of running a second, identical inference pass on X_test.
y_pred = predictions

In [84]:
# Held-out metrics for the smallest feature set: accuracy, macro F1, then the
# per-class report.
accuracy = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average="macro")

print(f'Accuracy : {accuracy}')
print(f"Score F1 : {f1_macro}")
print(classification_report(y_test, y_pred))

Accuracy : 0.928152405783361
Score F1 : 0.8881241108183575
              precision    recall  f1-score   support

       False       0.81      0.83      0.82      6714
        True       0.96      0.95      0.96     27038

    accuracy                           0.93     33752
   macro avg       0.89      0.89      0.89     33752
weighted avg       0.93      0.93      0.93     33752

