In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date, datetime
import numpy as np
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelBinarizer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import roc_auc_score, f1_score, classification_report, ConfusionMatrixDisplay, confusion_matrix
from lightgbm import LGBMClassifier
import numpy as np
import shap
import cloudpickle

pd.set_option("display.max_columns", 100)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_data = pd.read_csv("dataset.csv")
base_data.head(1)

Unnamed: 0,LoanNr_ChkDgt,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,ChgOffDate,DisbursementDate,DisbursementGross,BalanceGross,MIS_Status,ChgOffPrinGr,GrAppv,SBA_Appv,cat_activites,SBA_loan_float,bank_loan_float,crisis
0,1000014003,ABC HOBBYCRAFT,EVANSVILLE,IN,47711,FIFTH THIRD BANK,OH,451120,1997-02-28,1997,84,4,2.0,0,0,1,0,N,Y,,28-Feb-99,"$60,000.00",$0.00,P I F,$0.00,"$60,000.00","$48,000.00",45,48000.0,60000.0,2


In [3]:
data = base_data[["State", "Term", "NoEmp", "UrbanRural", "cat_activites", "bank_loan_float", "MIS_Status", "SBA_loan_float", "FranchiseCode", "LowDoc", "Bank"]]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data, data["MIS_Status"], test_size=0.1, random_state=42, stratify=data["MIS_Status"])

X_train = X_train.drop("MIS_Status", axis=1)
X_test = X_test.drop("MIS_Status", axis=1)


In [6]:
preprocessor = make_pipeline(make_column_transformer((OneHotEncoder(), ["State", "LowDoc"]),remainder="passthrough"))

In [7]:
model = make_pipeline(preprocessor, RandomForestClassifier(10, max_depth=5, random_state=42))

model.fit(X_train, y_train)



ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.

In [None]:
y_pred = model.predict(X_test)


In [None]:
y_pred_encoded = []
for y in y_pred:
    if y == "P I F":
        y_pred_encoded.append(1)
    else :
        y_pred_encoded.append(0)


In [None]:
probabilities = model.predict_proba(X_test)[:,1]

In [None]:
print("Train score : ", model.score(X_train, y_train))
print("Test score : ", model.score(X_test, y_test))
print("ROC-AUC score :", roc_auc_score(y_test, y_pred_encoded))

print("F1-Score : ", f1_score(y_test, y_pred, pos_label="P I F"))

In [None]:
print(classification_report(y_test, y_pred=y_pred))

In [None]:
feature_names = model[0].get_feature_names_out()  # Extract feature names
importances = model[-1].feature_importances_  # Extract importance values

# Create a DataFrame
df = pd.DataFrame({"Feature": feature_names, "Importance": importances})

# Separate features containing "state" and those that don't
df_state = df[df["Feature"].str.contains("State", case=False, na=False)]
df_other = df[~df["Feature"].str.contains("State", case=False, na=False)]

# Aggregate "state" features into a single row
aggregated_row = pd.DataFrame([{"Feature": "Aggregated_state", "Importance": df_state["Importance"].sum()}])

# Combine back the aggregated row with other features
df_final = pd.concat([df_other, aggregated_row], ignore_index=True)
plt.figure(figsize=(25, 10))
sns.barplot(df_final, x="Feature", y="Importance")
plt.tight_layout()
plt.show()

### LIGHTGBM

In [6]:
X_train_lgb = X_train.copy()
X_test_lgb = X_test.copy()

for col in ["State", "LowDoc", "Bank"]:
    X_train_lgb[col] = X_train_lgb[col].astype("category").cat.codes 
    X_test_lgb[col] = X_train_lgb[col].astype("category").cat.codes 

# X_train_lgb[["State", "LowDoc", "Bank"]] = X_train_lgb[["State", "LowDoc", "Bank"]].astype("category")
# X_test_lgb[["State", "LowDoc", "Bank"]] = X_test_lgb[["State", "LowDoc", "Bank"]].astype("category")



In [9]:
# preprocessor_gbm = make_pipeline(make_column_transformer((OneHotEncoder(), ["State", "LowDoc", "cat_activites"]),remainder="passthrough"))

In [7]:
lgb = LGBMClassifier(learning_rate=0.6, n_estimators=100, max_depth=13, min_child_samples=60, num_leaves=70, boosting_type="dart")
# lgb = LGBMClassifier(learning_rate=0.27785, n_estimators=192, max_depth=16, subsample=0.90, boosting_type="dart")

lgb.fit(X_train_lgb, y_train)

[LightGBM] [Info] Number of positive: 667210, number of negative: 142037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015713 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1600
[LightGBM] [Info] Number of data points in the train set: 809247, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.824483 -> initscore=1.547017
[LightGBM] [Info] Start training from score 1.547017


In [5]:
single = pd.read_csv("single.csv")
single = single[["State", "Term", "NoEmp", "UrbanRural", "cat_activites", "bank_loan_float", "SBA_loan_float", "FranchiseCode", "LowDoc", "Bank"]]
single[["State", "LowDoc", "Bank"]] = single[["State", "LowDoc", "Bank"]].astype("category")
single.dtypes


# lgb.predict(single)

State              category
Term                  int64
NoEmp                 int64
UrbanRural            int64
cat_activites         int64
bank_loan_float     float64
SBA_loan_float      float64
FranchiseCode         int64
LowDoc             category
Bank               category
dtype: object

In [12]:
# cat_b = CatBoostClassifier()
# cat_b.fit(X_test_lgb, y_train)

In [13]:
y_pred = lgb.predict(X_test_lgb)
lgb_proba = lgb.predict_proba(X_test_lgb)[:,1]
y_pred_encoded = []
for y in y_pred:
    if y == "P I F":
        y_pred_encoded.append(1)
    else :
        y_pred_encoded.append(0)

In [10]:
metadata = {
    "model": lgb,
    "categorical_features": ["State", "LowDoc", "Bank"]
}
with open("lightgbm_model.pickle", "wb") as f:
    cloudpickle.dump(metadata, f)

In [None]:
print("Train score : ", lgb.score(X_train_lgb, y_train))
print("Test score : ", lgb.score(X_test_lgb, y_test))
print("ROC-AUC score :", roc_auc_score(y_test, lgb_proba))

print("F1-Score : ", f1_score(y_test, y_pred, pos_label="P I F"))

In [None]:
print(classification_report(y_test, y_pred=y_pred))

In [None]:
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

In [None]:
lgb.get_params()



In [None]:
param = {
    "learning_rate": np.linspace(0, 1, 11),
    "max_depth": [x for x in range(1, 15, 1)],
    "n_estimators": [x for x in range(10, 101, 10)],
    "min_child_samples": [x for x in range(20, 100, 10)],
    "num_leaves": [x for x in range (10, 80, 5)]
}

In [None]:
# search = RandomizedSearchCV(lgb, param, n_iter=200, scoring='f1_macro', cv=5)

# search_grid = search.fit(X_train_lgb, y_train)



In [None]:

boost = lgb.booster_

feature_names = boost.feature_name()  # Extract feature names
importances = lgb.feature_importances_  # Extract importance values

# Create a DataFrame
df = pd.DataFrame({"Feature": feature_names, "Importance": importances})

# Separate features containing "state" and those that don't
df_state = df[df["Feature"].str.contains("State", case=False, na=False)]
df_other = df[~df["Feature"].str.contains("State", case=False, na=False)]

# Aggregate "state" features into a single row
aggregated_row = pd.DataFrame([{"Feature": "Aggregated_state", "Importance": df_state["Importance"].sum()}])

# Combine back the aggregated row with other features
df_final = pd.concat([df_other, aggregated_row], ignore_index=True)
plt.figure(figsize=(25, 10))
sns.barplot(df_final, x="Feature", y="Importance", palette="Set2")
plt.tight_layout()
plt.show()

In [None]:
# print(search_grid.best_params_)

In [None]:
# SHAP pour interprétabilité
explainer = shap.Explainer(lgb)
shap_values = explainer(X_test_lgb)
shap.summary_plot(shap_values, X_test_lgb)