In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt


df_model = pd.read_csv('loan_project/SBAnational.csv')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  df_model = pd.read_csv('loan_project/SBAnational.csv')


In [2]:
df_model = df_model.drop(['Name','City','Bank','CreateJob','ApprovalDate','RetainedJob','ChgOffDate', 'DisbursementDate','DisbursementGross','BalanceGross','ChgOffPrinGr'],axis=1)
df_model

Unnamed: 0,LoanNr_ChkDgt,State,Zip,BankState,NAICS,ApprovalFY,Term,NoEmp,NewExist,FranchiseCode,UrbanRural,RevLineCr,LowDoc,MIS_Status,GrAppv,SBA_Appv
0,1000014003,IN,47711,OH,451120,1997,84,4,2.0,1,0,N,Y,P I F,"$60,000.00","$48,000.00"
1,1000024006,IN,46526,IN,722410,1997,60,2,2.0,1,0,N,Y,P I F,"$40,000.00","$32,000.00"
2,1000034009,IN,47401,IN,621210,1997,180,7,1.0,1,0,N,N,P I F,"$287,000.00","$215,250.00"
3,1000044001,OK,74012,OK,0,1997,60,2,1.0,1,0,N,Y,P I F,"$35,000.00","$28,000.00"
4,1000054004,FL,32801,FL,0,1997,240,14,1.0,1,0,N,N,P I F,"$229,000.00","$229,000.00"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899159,9995573004,OH,43221,IL,451120,1997,60,6,1.0,1,0,0,N,P I F,"$70,000.00","$56,000.00"
899160,9995603000,OH,43221,IL,451130,1997,60,6,1.0,1,0,Y,N,P I F,"$85,000.00","$42,500.00"
899161,9995613003,CA,93455,CA,332321,1997,108,26,1.0,1,0,N,N,P I F,"$300,000.00","$225,000.00"
899162,9995973006,HI,96830,HI,0,1997,60,6,1.0,1,0,N,Y,CHGOFF,"$75,000.00","$60,000.00"


In [3]:
df_model.isnull().sum()

LoanNr_ChkDgt       0
State              14
Zip                 0
BankState        1566
NAICS               0
ApprovalFY          0
Term                0
NoEmp               0
NewExist          136
FranchiseCode       0
UrbanRural          0
RevLineCr        4528
LowDoc           2582
MIS_Status       1997
GrAppv              0
SBA_Appv            0
dtype: int64

In [4]:
df_model['ApprovalFY'] = pd.to_numeric(df_model['ApprovalFY'], errors='coerce').astype('Int64')
df_model['GrAppv'] = pd.to_numeric(df_model['GrAppv'].str.replace('[\$,]', '', regex=True))
df_model['SBA_Appv'] = pd.to_numeric(df_model['SBA_Appv'].str.replace('[\$,]', '', regex=True))


df_model['RevLineCr'] = df_model['RevLineCr'].replace(('0','T','`',',','1','C','3','2','R','7','A','5','.','4','-','Q'), np.nan)
df_model['MIS_Status'] = df_model['MIS_Status'].replace(('CHGOFF','P I F'), (0,1))

condition = (df_model['GrAppv'] > 150000)
df_model.loc[condition, 'LowDoc'] = 'N'
df_model.loc[~condition, 'LowDoc'] = 'Y'

imputer = SimpleImputer(strategy='most_frequent')
df_model[['RevLineCr','NewExist', 'LowDoc']] = imputer.fit_transform(df_model[['RevLineCr','NewExist', 'LowDoc']])

imputer = SimpleImputer(strategy='most_frequent')
df_model[['ApprovalFY']] = imputer.fit_transform(df_model[['ApprovalFY']])


  df_model['MIS_Status'] = df_model['MIS_Status'].replace(('CHGOFF','P I F'), (0,1))


In [5]:
# Dictionnaire de correspondance des codes NAICS aux catégories
df_model['NAICS_Category'] = df_model['NAICS'].astype(str).str[:2]
df_model['NAICS_Category'] = df_model['NAICS_Category'].astype(int)



naics_categories = {
    '0' : 'Inconnue',
    '11': 'Agriculture, Forestry, Fishing and Hunting',
    '21': 'Mining, Quarrying, and Oil and Gas Extraction',
    '22': 'Utilities',
    '23': 'Construction',
    '31': 'Manufacturing',
    '32': 'Manufacturing',
    '33': 'Manufacturing',
    '42': 'Wholesale Trade',
    '44': 'Retail Trade',
    '45': 'Retail Trade',
    '48': 'Transportation and Warehousing',
    '49': 'Transportation and Warehousing',
    '51': 'Information',
    '52': 'Finance and Insurance',
    '53': 'Real Estate and Rental and Leasing',
    '54': 'Professional, Scientific, and Technical Services',
    '55': 'Management of Companies and Enterprises',
    '56': 'Administrative and Support and Waste Management and Remediation Services',
    '61': 'Educational Services',
    '62': 'Health Care and Social Assistance',
    '71': 'Arts, Entertainment, and Recreation',
    '72': 'Accommodation and Food Services',
    '81': 'Other Services (except Public Administration)',
    '92': 'Public Administration'
}

# Appliquer la correspondance à la colonne contenant les deux premiers chiffres des codes NAICS
df_model['NAICS_Category'] = df_model['NAICS'].astype(str).str[:2].map(naics_categories)


df_model = df_model.drop('NAICS', axis=1)

df_model

Unnamed: 0,LoanNr_ChkDgt,State,Zip,BankState,ApprovalFY,Term,NoEmp,NewExist,FranchiseCode,UrbanRural,RevLineCr,LowDoc,MIS_Status,GrAppv,SBA_Appv,NAICS_Category
0,1000014003,IN,47711,OH,1997.0,84,4,2.0,1,0,N,Y,1.0,60000.0,48000.0,Retail Trade
1,1000024006,IN,46526,IN,1997.0,60,2,2.0,1,0,N,Y,1.0,40000.0,32000.0,Accommodation and Food Services
2,1000034009,IN,47401,IN,1997.0,180,7,1.0,1,0,N,N,1.0,287000.0,215250.0,Health Care and Social Assistance
3,1000044001,OK,74012,OK,1997.0,60,2,1.0,1,0,N,Y,1.0,35000.0,28000.0,Inconnue
4,1000054004,FL,32801,FL,1997.0,240,14,1.0,1,0,N,N,1.0,229000.0,229000.0,Inconnue
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899159,9995573004,OH,43221,IL,1997.0,60,6,1.0,1,0,N,Y,1.0,70000.0,56000.0,Retail Trade
899160,9995603000,OH,43221,IL,1997.0,60,6,1.0,1,0,Y,Y,1.0,85000.0,42500.0,Retail Trade
899161,9995613003,CA,93455,CA,1997.0,108,26,1.0,1,0,N,N,1.0,300000.0,225000.0,Manufacturing
899162,9995973006,HI,96830,HI,1997.0,60,6,1.0,1,0,N,Y,0.0,75000.0,60000.0,Inconnue


In [6]:
colonne_a_tester = ['NewExist','FranchiseCode','UrbanRural','RevLineCr','LowDoc','GrAppv','SBA_Appv']

for i in colonne_a_tester:
    nombre_de_zeros = (df_model[i] == 0).sum()
    print(f"Nombre de zéros dans la colonne {i}:", nombre_de_zeros)

Nombre de zéros dans la colonne NewExist: 1034
Nombre de zéros dans la colonne FranchiseCode: 208835
Nombre de zéros dans la colonne UrbanRural: 323167
Nombre de zéros dans la colonne RevLineCr: 0
Nombre de zéros dans la colonne LowDoc: 0
Nombre de zéros dans la colonne GrAppv: 0
Nombre de zéros dans la colonne SBA_Appv: 0


In [7]:
nombre_de_zeros = (df_model['NAICS'] >= 1).sum()
print(f"Nombre de NAICS :", nombre_de_zeros)
nombre_de_zeros = (df_model['NAICS'] == 0).sum()
print(f"Nombre de non NAICS :", nombre_de_zeros)

KeyError: 'NAICS'

In [None]:
df_model = df_model.dropna()

df_model.isnull().sum()

LoanNr_ChkDgt     0
State             0
Zip               0
BankState         0
NAICS             0
ApprovalFY        0
Term              0
NoEmp             0
NewExist          0
FranchiseCode     0
UrbanRural        0
RevLineCr         0
LowDoc            0
MIS_Status        0
GrAppv            0
SBA_Appv          0
NAICS_Category    0
dtype: int64

In [None]:
# X = df_model.drop('MIS_Status', axis=1)
# y = df_model.MIS_Status


# X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=y)


# num_col = list(X.select_dtypes(include=[float,int]).columns)
# cat_col = list(X.select_dtypes(include=[object]).columns)

# onehotscale_pipeline = make_pipeline(OneHotEncoder(), RobustScaler(with_centering=False))
# scale_pipeline = make_pipeline(RobustScaler(with_centering=False))

# preprocessing = ColumnTransformer(
#     transformers=[
#         ('categorical', onehotscale_pipeline, cat_col),
#         ('numerical', scale_pipeline, num_col)]
# )

# my_final_pipeline = make_pipeline(preprocessing)
# my_final_pipeline.fit(X_train)

# feature_names = my_final_pipeline.get_feature_names_out(X.columns)

# model = make_pipeline(
#     my_final_pipeline,
#     RandomForestClassifier(random_state=42,max_depth=30)
# )

# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)


# classes = model.classes_
# print(classes)

# report = classification_report(y_test, y_pred)

# print(report)


# print("Confusion Matrix:")
# display(confusion_matrix(y_test, y_pred))

# # faire une feature importance

In [None]:
# X = df_model.drop('MIS_Status', axis=1)
# y = df_model.MIS_Status


# X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.95, random_state=42, stratify=y)


# num_col = list(X.select_dtypes(include=[float,int]).columns)
# cat_col = list(X.select_dtypes(include=[object]).columns)

# onehotscale_pipeline = make_pipeline(OneHotEncoder(), RobustScaler(with_centering=False))
# scale_pipeline = make_pipeline(RobustScaler(with_centering=False))


# xgb_pipeline = make_pipeline(
#     OneHotEncoder()
#     XGBClassifier(random_state=42)
# )

# # Entraîner le modèle
# xgb_pipeline.fit(X_train, y_train)

# # Prédire sur l'ensemble de test
# y_pred_xgb = xgb_pipeline.predict(X_test)

# # Évaluer les performances
# report_xgb = classification_report(y_test, y_pred_xgb)
# print(report_xgb)

# # Afficher la matrice de confusion
# display(confusion_matrix(y_test, y_pred_xgb))

In [None]:
# from sklearn.model_selection import learning_curve
# import matplotlib.pyplot as plt

# train_sizes, train_scores, validation_scores = learning_curve(
#     estimator = model,
#     X = X_train,
#     y = y_train,
#     train_sizes = np.linspace(0.1, 1.0, 10),
#     cv = 5,
#     scoring = 'neg_log_loss'
# )

# #recuperation des moyennes et des écarts types(facultatif) à chaque cv
# train_scores_mean = -train_scores.mean(axis = 1)
# train_scores_std = train_scores.std(axis = 1)
# validation_scores_mean = -validation_scores.mean(axis = 1)
# validation_scores_std = validation_scores.std(axis = 1)


# plt.figure()
# plt.title("Learning Curve (Arbre de décision)")
# plt.xlabel("Training examples")
# plt.ylabel("loss")
# plt.gca().invert_yaxis() #inverser y car métrique de perte

# # afficher les points
# plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
# plt.plot(train_sizes, validation_scores_mean, 'o-', color="g", label="Cross-validation score")

# # relier les points
# plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
# plt.fill_between(train_sizes, validation_scores_mean - validation_scores_std, validation_scores_mean + validation_scores_std, alpha=0.1, color="g")


# plt.legend()
# plt.show()