In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, ConfusionMatrixDisplay
from xgboost import XGBClassifier
import pandas as pd
import numpy as np

# Load the prepared SBA loan dataset (this step only reads the CSV; no rows are dropped here)
df = pd.read_csv('data/SBA_model.csv')

# The target is the MIS_Status column; every remaining column goes into the feature frame
y = df['MIS_Status']
X = df.drop(columns=['MIS_Status'])

# Stratified hold-out split: 5% of the rows are set aside for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
    stratify=y,
)
# Column groups: which features are treated as categories and which as numbers
categorical_features = ['ApprovalDate', 'FranchiseCode', 'Naics', 'LowDoc', 'UrbanRural', 'RevLineCr', 'NewExist']
numerical_features = ['Term', 'NoEmp', 'ApprovalFY', 'GrAppv', 'CreateJob', 'RetainedJob']

# Categorical columns are one-hot encoded; the encoder is configured to ignore unknown categories
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Numerical columns go through a standard scaler
numerical_transformer = StandardScaler()

# Route each column group to its transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by an XGBoost classifier built with its default arguments
pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier()),
])

# Hyper-parameter grid for the 'classifier' step of the pipeline.
# Every list holds a single value, so the search evaluates exactly one
# configuration (with 5-fold cross-validation, scored on accuracy).
param_grid = {
    'classifier__n_estimators': [1000],
    'classifier__learning_rate': [0.1],
    'classifier__max_depth': [8],
    'classifier__gamma': [3],
    'classifier__min_child_weight': [5],
    'classifier__subsample': [0.5],
    'classifier__colsample_bytree': [1],
    'classifier__reg_alpha': [1]
}
grid_search = GridSearchCV(pipeline_xgb, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Train the model. This is the only fit call: the original cell fitted a second
# time after predicting, which repeated the whole (expensive) search without
# changing anything that was used afterwards.
grid_search.fit(X_train, y_train)

# Predict the classes for the held-out test data
y_pred = grid_search.predict(X_test)

KeyboardInterrupt: 

In [2]:
import pickle

# Persist the grid-search object to disk so it can be reloaded for inference
with open('modele_usba.pkl', 'wb') as model_file:
    pickle.dump(grid_search, model_file)

In [4]:

# Single loan application used to try out the saved model.
# Categorical values must be spelled exactly as in the training data: the
# one-hot encoder is set to ignore unknown categories, so a mismatched value
# is silently encoded as "no category" instead of raising an error.
real_values = pd.DataFrame({
    # The training CSV stores dates as ISO 'YYYY-MM-DD' strings, so the date is
    # given in that format ('7-Feb-06' could never match a known category).
    'ApprovalDate': ['2006-02-07'],
    'Term': [126],
    'NoEmp': [7],
    'FranchiseCode': ['No franchise'],
    'Naics': ['Accommodation and food services'],
    'ApprovalFY': [2006],
    # NOTE(review): the leading space looks unintentional (displayed training
    # rows read 'Existing business') - confirm against the CSV before changing.
    'NewExist': [' Existing business'],
    'LowDoc': ['Yes Loan Program'],
    'GrAppv': [13730000],
    'CreateJob': [0],
    'RetainedJob': [0],
    'UrbanRural': ['Urban Zone'],
    'RevLineCr': ['No']
})

# One-row frame (index 0) that is passed to the model
input_data = pd.DataFrame(real_values, index=[0])

# Reload the persisted model.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open('modele_usba.pkl', 'rb') as file:
    model = pickle.load(file)

# Predict the class for the single application
y_pred_input = model.predict(input_data)

print("Prédictions:", y_pred_input)

Prédictions: [1]


In [3]:
# Re-read the dataset from the same CSV path used by the training cell, for inspection
df = pd.read_csv('data/SBA_model.csv')

In [5]:
# Display the last rows of the dataset to check column names and value formats
df.tail()

Unnamed: 0,ApprovalDate,Term,NoEmp,FranchiseCode,Naics,ApprovalFY,NewExist,LowDoc,GrAppv,CreateJob,RetainedJob,UrbanRural,RevLineCr,MIS_Status
249574,2006-02-06,7,6,No franchise,Wholesale trade,2006,Existing business,Yes Loan Program,8500000,1,7,Urban Zone,Yes,0
249575,2006-02-06,60,1,No franchise,Construction,2006,Existing business,Yes Loan Program,5000000,0,1,Urban Zone,Yes,0
249576,2006-02-06,84,5,No franchise,Construction,2006,Existing business,Yes Loan Program,1300000,1,5,Rural Zone,Yes,1
249577,2006-02-06,84,2,No franchise,Educational services,2006,Existing business,Yes Loan Program,3000000,0,2,Rural Zone,Yes,1
249578,2006-02-06,84,3,No franchise,Information,2006,Existing business,Yes Loan Program,1000000,0,3,Urban Zone,Yes,1
