In [1]:
import os
import sys
sys.path.append('../Module')
folder_path = '../Dataset'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn import set_config

from loader_csv import charger_csv
from data_to_csv import df_to_csv
from utils import generate_description
from currency import currency_cleaning
from naics_engineering import naicsEngineering


# partie1 = "../Dataset/SBAnational_part_part1.csv"
# partie2 = "../Dataset/SBAnational_part_part2.csv"
# partie3 = "../Dataset/SBAnational_part_part3.csv"
# partie4 = "../Dataset/SBAnational_part_part4.csv"


# df = charger_csv(partie1, partie2, partie3, partie4)

df_0 = pd.read_csv(folder_path + '/SBA_Cleaned_0.csv')
df_1 = pd.read_csv(folder_path + '/SBA_Cleaned_1.csv')


In [2]:
generate_description(df_0)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,NAICS,object,0,0.0,20,"[23, 11, 31-33, 44-45, 56]"
1,Term,int64,0,0.0,377,"[43, 36, 84, 59]"
2,NoEmp,int64,0,0.0,499,"[6, 18, 90, 7]"
3,NewExist,float64,0,0.0,2,"[1.0, 2.0]"
4,CreateJob,int64,0,0.0,216,"[0, 4]"
5,RetainedJob,int64,0,0.0,323,"[0, 5, 2, 4]"
6,FranchiseCode,int64,0,0.0,2,"[0, 1]"
7,UrbanRural,int64,0,0.0,3,[1]
8,RevLineCr,object,0,0.0,2,[N]
9,LowDoc,object,0,0.0,2,"[N, Y]"


In [3]:
df_0.shape

(675026, 13)

In [4]:
leaking_feature = ['ChgOffPrinGr', 'BalanceGross', 'DisbursementGross', 'DisbursementDate', 'ChgOffDate']
useless_feature = ['Name', 'Zip', 'LoanNr_ChkDgt', 'City', 'State', 'Bank', 'BankState', 'ApprovalDate', 'ApprovalFY']
df = df_0.drop(leaking_feature, axis=1)
df = df_0.drop(useless_feature, axis=1)
df.dropna(inplace=True)

KeyError: "['ChgOffPrinGr', 'BalanceGross', 'DisbursementGross', 'DisbursementDate', 'ChgOffDate'] not found in axis"

In [None]:
df.shape

(674400, 13)

In [None]:
df.head()

Unnamed: 0,NAICS,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,GrAppv,SBA_Appv,Approve
0,44-45,84,4,2.0,0,0,1,0,N,Y,60000,48000,True
1,72,60,2,2.0,0,0,1,0,N,Y,40000,32000,True
2,62,180,7,1.0,0,0,1,0,N,N,287000,215250,True
5,31-33,120,19,1.0,0,0,1,0,N,N,517000,387750,True
7,81,84,1,2.0,0,0,1,0,N,Y,45000,36000,True


In [None]:
df_to_csv(df, folder_path, 'SBA_Cleaned_maybe.csv')

Le DataFrame a été converti avec succès en fichier CSV : ../Dataset/SBA_Cleaned_maybe.csv


In [None]:
y = df['Approve']
X = df.drop('Approve', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

In [None]:
numerical_columns_selector = X.select_dtypes(exclude='object').columns
categorical_columns_selector = X.select_dtypes(include='object').columns

numerical_columns = X[numerical_columns_selector]
categorical_columns = X[categorical_columns_selector]

In [None]:
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
numerical_preprocessor = StandardScaler()

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector)
    ]
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(n_jobs=-1, random_state=42))
])

set_config(transform_output="pandas")


In [None]:
param_grid = {
    'model__n_estimators': [70],
    'model__max_depth': [25],
    'model__min_samples_split': [3],
    'model__min_samples_leaf': [80],
    'model__max_features': [None, 'sqrt', 'log2'],
    'model__max_samples': [0.7]
}

In [None]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

In [None]:
print("Meilleurs hyperparamètres trouvés :")
print(grid_search.best_params_)

print("Meilleure précision trouvée :", grid_search.best_score_)

Meilleurs hyperparamètres trouvés :
{'model__max_depth': 25, 'model__max_features': None, 'model__max_samples': 0.7, 'model__min_samples_leaf': 80, 'model__min_samples_split': 3, 'model__n_estimators': 70}
Meilleure précision trouvée : 0.9058280358051691


In [None]:
best_model = grid_search.best_estimator_

predictions = best_model.predict(X_test)

In [None]:
f1_macro = f1_score(y_test, predictions, average='macro')

print("F1-score macro sur l'ensemble de test :", f1_macro)

F1-score macro sur l'ensemble de test : 0.9065352017150367


In [None]:
maybe_leaking_feature = ['RetainedJob', 'CreateJob']
df = df.drop(maybe_leaking_feature, axis=1)

In [None]:
generate_description(df)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,NAICS,object,0,0.0,20,"[56, 81, 48-49, 44-45, 72]"
1,Term,int64,0,0.0,377,"[84, 120, 108]"
2,NoEmp,int64,0,0.0,499,"[2, 4, 1]"
3,NewExist,float64,0,0.0,2,"[2.0, 1.0]"
4,FranchiseCode,int64,0,0.0,2,"[1, 0]"
5,UrbanRural,int64,0,0.0,3,"[1, 2]"
6,RevLineCr,object,0,0.0,2,"[Y, N]"
7,LowDoc,object,0,0.0,2,[N]
8,GrAppv,int64,0,0.0,16868,"[1000000, 195000, 44000, 205000, 100000]"
9,SBA_Appv,int64,0,0.0,28200,"[37500, 337500, 72250, 9000, 12500]"


In [None]:
y = df['Approve']
X = df.drop('Approve', axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

In [None]:
numerical_columns_selector = X.select_dtypes(exclude='object').columns
categorical_columns_selector = X.select_dtypes(include='object').columns

numerical_columns = X[numerical_columns_selector]
categorical_columns = X[categorical_columns_selector]

In [None]:
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("nums", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector)
    ]
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(n_jobs=-1, random_state=42))
])

set_config(transform_output="pandas")

In [None]:
param_grid = {
    'model__n_estimators': [70],
    'model__max_depth': [25],
    'model__min_samples_split': [3],
    'model__min_samples_leaf': [80],
    'model__max_features': [None, 'sqrt', 'log2'],
    'model__max_samples': [0.7]
}

In [None]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

In [None]:
print("Meilleurs hyperparamètres trouvés :")
print(grid_search.best_params_)

print("Meilleure précision trouvée :", grid_search.best_score_)

Meilleurs hyperparamètres trouvés :
{'model__max_depth': 25, 'model__max_features': None, 'model__max_samples': 0.7, 'model__min_samples_leaf': 80, 'model__min_samples_split': 3, 'model__n_estimators': 70}
Meilleure précision trouvée : 0.9056260242709107


In [None]:
best_model = grid_search.best_estimator_

predictions = best_model.predict(X_test)

In [None]:
f1_macro = f1_score(y_test, predictions, average='macro')

print("F1-score macro sur l'ensemble de test :", f1_macro)

F1-score macro sur l'ensemble de test : 0.9054926582537084


In [None]:
maybe_leaking_fea = ['SBA_Appv', 'GrAppv']
df = df_0.drop(maybe_leaking_fea, axis=1)

In [None]:
generate_description(df)

In [None]:
y = df['Approve']
X = df.drop('Approve', axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

In [None]:
numerical_columns_selector = X.select_dtypes(exclude='object').columns
categorical_columns_selector = X.select_dtypes(include='object').columns

numerical_columns = X[numerical_columns_selector]
categorical_columns = X[categorical_columns_selector]

In [None]:
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("nums", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector)
    ]
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(n_jobs=-1, random_state=42))
])

set_config(transform_output="pandas")

In [None]:
param_grid = {
    'model__n_estimators': [70],
    'model__max_depth': [25],
    'model__min_samples_split': [3],
    'model__min_samples_leaf': [80],
    'model__max_features': [None, 'sqrt', 'log2'],
    'model__max_samples': [0.7]
}

In [None]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

In [None]:
print("Meilleurs hyperparamètres trouvés :")
print(grid_search.best_params_)

print("Meilleure précision trouvée :", grid_search.best_score_)

In [None]:
best_model = grid_search.best_estimator_

predictions = best_model.predict(X_test)

In [None]:
f1_macro = f1_score(y_test, predictions, average='macro')

print("F1-score macro sur l'ensemble de test :", f1_macro)