In [1]:
import os
import sys
# Make the sibling Module directory importable; the data_to_csv and utils
# imports further down presumably resolve from there — TODO confirm.
sys.path.append('../Module')
# Directory the input CSV is read from.
folder_path = '../Dataset'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score
from sklearn import set_config

from data_to_csv import df_to_csv
from utils import generate_description

# Load the CSV that all experiments in this notebook start from.
df_1 = pd.read_csv(filepath_or_buffer=folder_path + '/SBA_Cleaned_1.csv')

In [2]:
# Drop every row that contains a missing value. Rebinding the name instead of
# using inplace=True avoids mutating the loaded frame in place and keeps the
# step chainable.
df_1 = df_1.dropna()

In [3]:
# Display (rows, columns) of the frame once rows with missing values are gone.
df_1.shape

(460816, 13)

# RandomForest avec toutes les features de SBA_Cleaned_1.csv

In [4]:
# Per-column summary from the project helper; the rendered table lists dtype,
# null count and percentage, unique count and a sample of values.
generate_description(df_1)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,NAICS,object,0,0.0,20,"[44-45, 42, 62, 31-33, 72]"
1,Term,int64,0,0.0,373,"[240, 84, 47, 60, 180]"
2,NoEmp,int64,0,0.0,432,"[32, 6, 2, 14, 3500]"
3,NewExist,float64,0,0.0,2,"[1.0, 2.0]"
4,CreateJob,int64,0,0.0,192,"[0, 6, 4, 1]"
5,RetainedJob,int64,0,0.0,276,"[3, 2, 1, 20]"
6,FranchiseCode,int64,0,0.0,2,"[1, 0]"
7,UrbanRural,int64,0,0.0,3,"[1, 0]"
8,RevLineCr,object,0,0.0,2,"[Y, N]"
9,LowDoc,object,0,0.0,2,[N]


In [5]:
# Predictors are every column except the 'Approve' target.
X = df_1.drop(columns='Approve')
y = df_1['Approve']

# Hold out 5% of the rows as a test set, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [6]:
# Partition the predictors by dtype: object columns are treated as categorical,
# everything else as numerical.
numerical_columns = X.select_dtypes(exclude='object')
categorical_columns = X.select_dtypes(include='object')

# Column labels of each group, used to route columns in the preprocessor.
numerical_columns_selector = numerical_columns.columns
categorical_columns_selector = categorical_columns.columns

In [7]:
# Numerical columns are standardised; categorical columns are one-hot encoded
# into a dense output, ignoring categories that were not seen during fit.
numerical_preprocessor = StandardScaler()
categorical_preprocessor = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

preprocessor = ColumnTransformer([
    ("num", numerical_preprocessor, numerical_columns_selector),
    ("cat", categorical_preprocessor, categorical_columns_selector),
])

In [8]:
# Ask scikit-learn transformers to output pandas DataFrames (global setting).
set_config(transform_output="pandas")

# Preprocessing followed by a random forest with a fixed seed, using all cores.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42, n_jobs=-1)),
])

In [9]:
# Search space for the 'model' step of the pipeline. Only max_features has
# several candidates; every other hyper-parameter is pinned to a single value.
param_grid = dict([
    ('model__n_estimators', [70]),
    ('model__max_depth', [25]),
    ('model__min_samples_split', [3]),
    ('model__min_samples_leaf', [80]),
    ('model__max_features', [None, 'sqrt', 'log2']),
    ('model__max_samples', [0.7]),
])

In [10]:
# 5-fold cross-validated search over param_grid, scored with macro-averaged F1.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [11]:
# Report the winning hyper-parameter combination.
print("Meilleurs hyperparamètres trouvés :")
print(grid_search.best_params_)

# best_score_ is the mean cross-validated value of the search's scoring metric
# ('f1_macro' in the GridSearchCV call), so it is labelled as an F1 score
# rather than as "précision".
print("Meilleur F1-score macro (validation croisée) :", grid_search.best_score_)

Meilleurs hyperparamètres trouvés :
{'model__max_depth': 25, 'model__max_features': None, 'model__max_samples': 0.7, 'model__min_samples_leaf': 80, 'model__min_samples_split': 3, 'model__n_estimators': 70}
Meilleure précision trouvée : 0.903349400100861


In [12]:
# Take the best pipeline found by the search and predict the held-out test rows.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X=X_test)

In [13]:
# Macro-averaged F1 of the predictions on the held-out test set.
f1_macro = f1_score(y_true=y_test, y_pred=predictions, average='macro')
print("F1-score macro sur l'ensemble de test :", f1_macro)

F1-score macro sur l'ensemble de test : 0.9055009977133641


# RandomForest sans CreateJob & RetainedJob

In [14]:
# Job-count columns suspected of leaking the target; the following cells
# retrain the model without them.
maybe_leaking_feature = ['RetainedJob', 'CreateJob']
# df_1 is rebound here, so running this cell a second time would raise KeyError
# on the already-removed columns. errors='ignore' makes the cell safe to re-run.
# NOTE(review): this also silences a misspelled column name — check the
# description table in the next cell to confirm the columns are gone.
df_1 = df_1.drop(columns=maybe_leaking_feature, errors='ignore')

In [15]:
# Per-column summary from the project helper; the rendered table lists dtype,
# null count and percentage, unique count and a sample of values.
generate_description(df_1)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,NAICS,object,0,0.0,20,"[44-45, 54, 71, 81, 42]"
1,Term,int64,0,0.0,373,"[84, 60, 19]"
2,NoEmp,int64,0,0.0,432,"[2, 11, 1, 0]"
3,NewExist,float64,0,0.0,2,"[1.0, 2.0]"
4,FranchiseCode,int64,0,0.0,2,"[1, 0]"
5,UrbanRural,int64,0,0.0,3,"[1, 2]"
6,RevLineCr,object,0,0.0,2,"[N, Y]"
7,LowDoc,object,0,0.0,2,[N]
8,GrAppv,int64,0,0.0,12824,"[30000, 100000, 1333300, 40000, 10000]"
9,SBA_Appv,int64,0,0.0,22309,"[16000, 40000, 10000, 331406, 5000]"


In [16]:
# Predictors are every remaining column except the 'Approve' target.
X = df_1.drop(columns='Approve')
y = df_1['Approve']

# Hold out 5% of the rows as a test set, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [17]:
# Partition the predictors by dtype: object columns are treated as categorical,
# everything else as numerical.
numerical_columns = X.select_dtypes(exclude='object')
categorical_columns = X.select_dtypes(include='object')

# Column labels of each group, used to route columns in the preprocessor.
numerical_columns_selector = numerical_columns.columns
categorical_columns_selector = categorical_columns.columns

In [18]:
# Numerical columns are standardised; categorical columns are one-hot encoded
# into a dense output, ignoring categories that were not seen during fit.
numerical_preprocessor = StandardScaler()
categorical_preprocessor = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

preprocessor = ColumnTransformer([
    ("num", numerical_preprocessor, numerical_columns_selector),
    ("cat", categorical_preprocessor, categorical_columns_selector),
])

In [19]:
# Ask scikit-learn transformers to output pandas DataFrames (global setting).
set_config(transform_output="pandas")

# Preprocessing followed by a random forest with a fixed seed, using all cores.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42, n_jobs=-1)),
])

In [20]:
# Search space for the 'model' step of the pipeline. Only max_features has
# several candidates; every other hyper-parameter is pinned to a single value.
param_grid = dict([
    ('model__n_estimators', [70]),
    ('model__max_depth', [25]),
    ('model__min_samples_split', [3]),
    ('model__min_samples_leaf', [80]),
    ('model__max_features', [None, 'sqrt', 'log2']),
    ('model__max_samples', [0.7]),
])

In [21]:
# 5-fold cross-validated search over param_grid, scored with macro-averaged F1.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [22]:
# Report the winning hyper-parameter combination.
print("Meilleurs hyperparamètres trouvés :")
print(grid_search.best_params_)

# best_score_ is the mean cross-validated value of the search's scoring metric
# ('f1_macro' in the GridSearchCV call), so it is labelled as an F1 score
# rather than as "précision".
print("Meilleur F1-score macro (validation croisée) :", grid_search.best_score_)

Meilleurs hyperparamètres trouvés :
{'model__max_depth': 25, 'model__max_features': None, 'model__max_samples': 0.7, 'model__min_samples_leaf': 80, 'model__min_samples_split': 3, 'model__n_estimators': 70}
Meilleure précision trouvée : 0.9032996536636932


In [23]:
# Take the best pipeline found by the search and predict the held-out test rows.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X=X_test)

In [24]:
# Macro-averaged F1 of the predictions on the held-out test set.
f1_macro = f1_score(y_true=y_test, y_pred=predictions, average='macro')
print("F1-score macro sur l'ensemble de test :", f1_macro)

F1-score macro sur l'ensemble de test : 0.9057997796101414


# RandomForest sans les features potentiellement fuitées (SBA_Appv, GrAppv)

In [25]:
# Further columns suspected of leaking the target; the following cells retrain
# the model without them.
maybe_leaking_fea = ['SBA_Appv', 'GrAppv']
# df_1 is rebound here, so running this cell a second time would raise KeyError
# on the already-removed columns. errors='ignore' makes the cell safe to re-run.
# NOTE(review): this also silences a misspelled column name — check the
# description table in the next cell to confirm the columns are gone.
df_1 = df_1.drop(columns=maybe_leaking_fea, errors='ignore')

In [26]:
# Per-column summary from the project helper; the rendered table lists dtype,
# null count and percentage, unique count and a sample of values.
generate_description(df_1)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,NAICS,object,0,0.0,20,"[44-45, 48-49, 42]"
1,Term,int64,0,0.0,373,"[60, 240, 84]"
2,NoEmp,int64,0,0.0,432,"[4, 2, 3, 1]"
3,NewExist,float64,0,0.0,2,"[1.0, 2.0]"
4,FranchiseCode,int64,0,0.0,2,"[1, 0]"
5,UrbanRural,int64,0,0.0,3,[1]
6,RevLineCr,object,0,0.0,2,"[Y, N]"
7,LowDoc,object,0,0.0,2,[N]
8,Approve,bool,0,0.0,2,"[True, False]"


In [27]:
# Predictors are every remaining column except the 'Approve' target.
X = df_1.drop(columns='Approve')
y = df_1['Approve']

# Hold out 5% of the rows as a test set, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [28]:
# Partition the predictors by dtype: object columns are treated as categorical,
# everything else as numerical.
numerical_columns = X.select_dtypes(exclude='object')
categorical_columns = X.select_dtypes(include='object')

# Column labels of each group, used to route columns in the preprocessor.
numerical_columns_selector = numerical_columns.columns
categorical_columns_selector = categorical_columns.columns

In [29]:
# Numerical columns are standardised; categorical columns are one-hot encoded
# into a dense output, ignoring categories that were not seen during fit.
numerical_preprocessor = StandardScaler()
categorical_preprocessor = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

preprocessor = ColumnTransformer([
    ("num", numerical_preprocessor, numerical_columns_selector),
    ("cat", categorical_preprocessor, categorical_columns_selector),
])

In [30]:
# Ask scikit-learn transformers to output pandas DataFrames (global setting).
set_config(transform_output="pandas")

# Preprocessing followed by a random forest with a fixed seed, using all cores.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42, n_jobs=-1)),
])

In [31]:
# Search space for the 'model' step of the pipeline. Only max_features has
# several candidates; every other hyper-parameter is pinned to a single value.
param_grid = dict([
    ('model__n_estimators', [70]),
    ('model__max_depth', [25]),
    ('model__min_samples_split', [3]),
    ('model__min_samples_leaf', [80]),
    ('model__max_features', [None, 'sqrt', 'log2']),
    ('model__max_samples', [0.7]),
])

In [32]:
# 5-fold cross-validated search over param_grid, scored with macro-averaged F1.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [33]:
# Report the winning hyper-parameter combination.
print("Meilleurs hyperparamètres trouvés :")
print(grid_search.best_params_)

# best_score_ is the mean cross-validated value of the search's scoring metric
# ('f1_macro' in the GridSearchCV call), so it is labelled as an F1 score
# rather than as "précision".
print("Meilleur F1-score macro (validation croisée) :", grid_search.best_score_)

Meilleurs hyperparamètres trouvés :
{'model__max_depth': 25, 'model__max_features': None, 'model__max_samples': 0.7, 'model__min_samples_leaf': 80, 'model__min_samples_split': 3, 'model__n_estimators': 70}
Meilleure précision trouvée : 0.8971149412917802


In [34]:
# Take the best pipeline found by the search and predict the held-out test rows.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X=X_test)

In [35]:
# Macro-averaged F1 of the predictions on the held-out test set.
f1_macro = f1_score(y_true=y_test, y_pred=predictions, average='macro')
print("F1-score macro sur l'ensemble de test :", f1_macro)

F1-score macro sur l'ensemble de test : 0.8977395360163898
