In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def _keep_column(column_name):
    """Column filter for read_excel: keep everything except the 'Unnamed: *' columns."""
    # These names are what pandas gives to header-less columns; presumably they are
    # index columns written by an earlier export -- verify against the source files.
    return column_name not in ['Unnamed: 0', 'Unnamed: 0.1']


# Load the three datasets with the same column filter.
final_train = pd.read_excel('../data/final_train.xlsx', usecols=_keep_column)
final_test = pd.read_excel('../data/final_test.xlsx', usecols=_keep_column)
final_oot = pd.read_excel('../data/final_oot.xlsx', usecols=_keep_column)

In [9]:
def data_structure(df, name):
    """Print a structural summary of a DataFrame.

    Prints the row/column counts and the dtypes, then for every column its
    number of distinct values and up to ten of them. Columns whose dtype is
    'object' additionally get their full value counts.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to describe.
    name : str
        Label used in the header line of the report.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    n_rows, n_cols = df.shape
    print(f"Data Structure for {name}:")
    print(f"Number of rows: {n_rows}")
    print(f"Number of columns: {n_cols}")
    print(f"Columns and their data types:\n{df.dtypes}")

    for col in df.columns:
        series = df[col]
        distinct = series.unique()
        print(f"\nColumn: {col}")
        print(f"  Unique values: {series.nunique()}")
        # Only the first ten distinct values are shown to keep the report short.
        print(f"  Sample values: {distinct[:10]}")
        if series.dtype != 'object':
            continue
        # Object (string-like) columns are treated as categorical.
        print(f"  Value counts:\n{series.value_counts()}")

# Print the structural summary of the training set only.
data_structure(final_train, "final_train")
# The same summary for the other two datasets is currently disabled:
# data_structure(final_oot, "final_oot")
# data_structure(final_test, "final_test")

Data Structure for final_train:
Number of rows: 14718
Number of columns: 8
Columns and their data types:
DDefaut_NDB                    int64
CODNAF2                        int64
CODETAJUR                      int64
CRTOC_AG_MVTAFF_2T_IND_0010    int64
CRTOC_AG_NBJCRE_2T_IND_0009    int64
CRTOC_IND_0164                 int64
CRTOC_AG_LIGDEB_2T_IND_0015    int64
CRTOC_IND_0015                 int64
dtype: object

Column: DDefaut_NDB
  Unique values: 2
  Sample values: [0 1]

Column: CODNAF2
  Unique values: 2
  Sample values: [1 0]

Column: CODETAJUR
  Unique values: 2
  Sample values: [1 0]

Column: CRTOC_AG_MVTAFF_2T_IND_0010
  Unique values: 3
  Sample values: [2 1 0]

Column: CRTOC_AG_NBJCRE_2T_IND_0009
  Unique values: 4
  Sample values: [3 2 0 1]

Column: CRTOC_IND_0164
  Unique values: 2
  Sample values: [1 0]

Column: CRTOC_AG_LIGDEB_2T_IND_0015
  Unique values: 2
  Sample values: [1 0]

Column: CRTOC_IND_0015
  Unique values: 3
  Sample values: [0 1 2]


In [14]:
# X_* : the explanatory variables (already encoded)
# y_* : the target variable (DDefaut_NDB)
target_col = 'DDefaut_NDB'

# Every column of the training frame except the target is an explanatory variable.
variables = list(final_train.columns)
variables.remove(target_col)

# The same column list is applied to the train, test and out-of-time frames.
X_train, y_train = final_train[variables], final_train[target_col]
X_test, y_test = final_test[variables], final_test[target_col]
X_oot, y_oot = final_oot[variables], final_oot[target_col]

# Disabled: the intercept column is not added here.
# X_train = sm.add_constant(X_train)

In [22]:
# For each explanatory variable, print the mean of DDefaut_NDB per level
# (i.e. the observed default rate of every modality) on the training set.
for feature in variables:
    grouped_by_level = final_train.groupby(feature)
    taux_defaut = grouped_by_level['DDefaut_NDB'].mean()
    print(taux_defaut)

CODNAF2
0    0.024540
1    0.065481
Name: DDefaut_NDB, dtype: float64
CODETAJUR
0    0.049131
1    0.062654
Name: DDefaut_NDB, dtype: float64
CRTOC_AG_MVTAFF_2T_IND_0010
0    0.021304
1    0.055508
2    0.201160
Name: DDefaut_NDB, dtype: float64
CRTOC_AG_NBJCRE_2T_IND_0009
0    0.018286
1    0.061114
2    0.141677
3    0.350485
Name: DDefaut_NDB, dtype: float64
CRTOC_IND_0164
0    0.027149
1    0.425000
Name: DDefaut_NDB, dtype: float64
CRTOC_AG_LIGDEB_2T_IND_0015
0    0.022641
1    0.227068
Name: DDefaut_NDB, dtype: float64
CRTOC_IND_0015
0    0.022745
1    0.097493
2    0.494382
Name: DDefaut_NDB, dtype: float64


In [20]:
# Reverse the ordinal coding of five explanatory variables in `final_train`
# (each mapping sends code k to max_code - k).
#
# NOTE(review): only `final_train` is recoded here; `final_test` / `final_oot`
# keep their original codes in this cell -- confirm they are recoded elsewhere
# before scoring them with a model fitted on the recoded data.
# NOTE(review): this cell mutates `final_train` in place and is NOT idempotent:
# running it a second time restores the original codes. Frames derived from
# `final_train` before this cell ran (e.g. via `final_train[variables]`) are
# presumably unaffected -- verify the intended execution order.
CODNAF2 = {0: 1, 1: 0}
CRTOC_AG_MVTAFF_2T_IND_0010 = {0: 2, 1: 1, 2: 0}
CRTOC_AG_NBJCRE_2T_IND_0009 = {0: 3, 1: 2, 2: 1, 3: 0}
CRTOC_IND_0164 = {0: 1, 1: 0}
CRTOC_AG_LIGDEB_2T_IND_0015 = {0: 1, 1: 0}

# Column name -> recoding table; the per-column dicts above keep their original names.
recodings = {
    'CODNAF2': CODNAF2,
    'CRTOC_AG_MVTAFF_2T_IND_0010': CRTOC_AG_MVTAFF_2T_IND_0010,
    'CRTOC_AG_NBJCRE_2T_IND_0009': CRTOC_AG_NBJCRE_2T_IND_0009,
    'CRTOC_IND_0164': CRTOC_IND_0164,
    'CRTOC_AG_LIGDEB_2T_IND_0015': CRTOC_AG_LIGDEB_2T_IND_0015,
}

# Validate every column BEFORE mutating anything: Series.map() silently turns a
# code that is missing from the dict into NaN (and the column into float), which
# would only surface later as an obscure error when fitting the model.
for column, mapping in recodings.items():
    unexpected = set(final_train[column].unique()) - set(mapping)
    if unexpected:
        raise ValueError(
            f"Column {column!r} contains codes without a recoding: {sorted(unexpected, key=str)}"
        )

for column, mapping in recodings.items():
    final_train[column] = final_train[column].map(mapping)

In [28]:
import statsmodels.api as sm
from itertools import combinations
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_auc_score

def stepwise_selection(X, y, threshold_in=0.05, threshold_out=0.1):
    """
    Bidirectional stepwise feature selection for a logistic regression.

    Each iteration performs a forward step followed by a backward step:

    * forward: every remaining feature is added in turn to the current model;
      the one with the smallest p-value is kept if that p-value is below
      ``threshold_in``;
    * backward: the current model is refitted and the selected feature with
      the largest p-value is dropped if that p-value exceeds ``threshold_out``.

    The loop stops when an iteration changes nothing.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate explanatory variables (one column per feature).
    y : array-like
        Binary target passed to ``sm.Logit``.
    threshold_in : float, default 0.05
        Maximum p-value for a feature to enter the model.
    threshold_out : float, default 0.1
        p-value above which a selected feature is removed.

    Returns
    -------
    list
        Names of the selected features, in the order they were added.
    """
    selected_features = []
    remaining_features = list(X.columns)
    # Selected sets already visited, used to stop if add/remove starts cycling
    # (possible e.g. when threshold_in >= threshold_out).
    seen_states = {frozenset()}

    while True:
        changed = False

        # Forward step: add the best remaining feature.
        if remaining_features:
            # Explicit float dtype: without it recent pandas builds an object-dtype
            # Series (older versions emit a deprecation warning), and numeric
            # reductions such as idxmin are not guaranteed to work on object dtype.
            new_pval = pd.Series(index=remaining_features, dtype=float)
            for new_column in remaining_features:
                model = sm.Logit(y, sm.add_constant(X[selected_features + [new_column]])).fit(disp=0)
                new_pval[new_column] = model.pvalues[new_column]
            best_pval = new_pval.min()
            if best_pval < threshold_in:
                best_feature = new_pval.idxmin()
                selected_features.append(best_feature)
                remaining_features.remove(best_feature)
                changed = True

        # Backward step: drop the worst selected feature (nothing to do when the
        # selection is still empty).
        if selected_features:
            model = sm.Logit(y, sm.add_constant(X[selected_features])).fit(disp=0)
            # Select by label rather than by position so the intercept is excluded
            # even if add_constant did not put it first (or skipped it).
            pvals = model.pvalues[selected_features]
            worst_pval = pvals.max()
            if worst_pval > threshold_out:
                worst_feature = pvals.idxmax()
                selected_features.remove(worst_feature)
                remaining_features.append(worst_feature)
                changed = True

        if not changed:
            break

        # A selected set seen before means the procedure is cycling: stop here
        # instead of looping forever.
        state = frozenset(selected_features)
        if state in seen_states:
            break
        seen_states.add(state)

    return selected_features

def iterative_selection(X, y, variables, threshold=0.05):
    """
    Add variables one at a time, in the order given by ``variables``.

    Each candidate is fitted together with the variables kept so far; it is
    retained when its own p-value in that model is below ``threshold``.

    Parameters
    ----------
    X : pandas.DataFrame
        Explanatory variables.
    y : array-like
        Binary target passed to ``sm.Logit``.
    variables : iterable
        Candidate column names, in the order in which they should be tried.
    threshold : float, default 0.05
        p-value below which a candidate is kept.

    Returns
    -------
    tuple
        ``(kept, summaries)``: the list of retained variables and a dict mapping
        each retained variable to the summary of the model in which it entered.
    """
    kept = []
    summaries = {}
    for candidate in variables:
        trial_columns = kept + [candidate]
        fit = sm.Logit(y, sm.add_constant(X[trial_columns])).fit(disp=0)
        if not (fit.pvalues[candidate] < threshold):
            # Not significant given the variables already kept: skip it.
            continue
        kept.append(candidate)
        summaries[candidate] = fit.summary()
    return kept, summaries


def evaluate_model(model, X, y):
    """
    Compute the main evaluation metrics of a fitted model: AUC-ROC, Gini,
    AUC-PR, RP (AUC-PR divided by the prevalence of the positive class),
    AIC and BIC.

    Parameters
    ----------
    model : fitted statsmodels results object
        Must expose ``predict``, ``aic`` and ``bic``.
    X : pandas.DataFrame or array-like
        Explanatory variables, without the intercept column. When a DataFrame
        is given it may contain more columns than the model uses, in any order.
    y : pandas.Series or array-like with ``.mean()``
        Observed binary target.

    Returns
    -------
    dict
        Keys ``"AUC-ROC"``, ``"Gini"``, ``"AUC-PR"``, ``"RP"``, ``"AIC"``, ``"BIC"``.

    Raises
    ------
    ValueError
        If ``X`` is a DataFrame that lacks a column the model was fitted on.
    """
    exog_names = getattr(getattr(model, "model", None), "exog_names", None)
    if isinstance(X, pd.DataFrame) and exog_names is not None:
        # predict() on a non-formula model uses the exog values positionally, so
        # the design matrix must have exactly the fitted columns in the fitted
        # order; otherwise predictions are silently wrong or the shapes mismatch.
        exog_names = list(exog_names)
        design = X if "const" in X.columns else sm.add_constant(X, has_constant="add")
        missing = [col for col in exog_names if col not in design.columns]
        if missing:
            raise ValueError(f"X is missing columns used by the model: {missing}")
        design = design[exog_names]
    else:
        # No column names to align on: keep the original behaviour.
        design = sm.add_constant(X)

    y_pred_prob = model.predict(design)
    auc_roc = roc_auc_score(y, y_pred_prob)
    gini = 2 * auc_roc - 1

    auc_pr = average_precision_score(y, y_pred_prob)

    # A random classifier has an AUC-PR equal to the prevalence of the positive
    # class, so RP measures the lift over random.
    prevalence = y.mean()
    auc_pr_random = prevalence
    rp = auc_pr / auc_pr_random

    return {
        "AUC-ROC": auc_roc,
        "Gini": gini,
        "AUC-PR": auc_pr,
        "RP": rp,
        "AIC": model.aic,
        "BIC": model.bic
    }


# Make sure the inputs are pandas containers with the expected columns.
X_train = pd.DataFrame(X_train, columns=variables)
y_train = pd.Series(y_train)

# Stepwise method
selected_stepwise = stepwise_selection(X_train, y_train)

# Iterative method: candidates are tried by decreasing absolute correlation with the target.
ordered_vars = X_train.corrwith(y_train).abs().sort_values(ascending=False).index.tolist()
selected_iterative, iterative_results = iterative_selection(X_train, y_train, ordered_vars)

# Final model (here built on the variables chosen by the stepwise method)
final_model = sm.Logit(y_train, sm.add_constant(X_train[selected_stepwise])).fit(disp=0)
# Evaluate on exactly the columns (and column order) the model was fitted on:
# predict() matches exog columns by position, so passing the full X_train would
# misalign the coefficients or fail on a shape mismatch.
evaluation_results = evaluate_model(final_model, X_train[selected_stepwise], y_train)

# Display the results
print("\n=== Modèle final (Stepwise) ===")
print(final_model.summary())
print("\n=== Résultats d'évaluation ===")
print(evaluation_results)



=== Modèle final (Stepwise) ===
                           Logit Regression Results                           
Dep. Variable:            DDefaut_NDB   No. Observations:                14718
Model:                          Logit   Df Residuals:                    14710
Method:                           MLE   Df Model:                            7
Date:                Mon, 06 Jan 2025   Pseudo R-squ.:                  0.3904
Time:                        23:15:47   Log-Likelihood:                -2026.1
converged:                       True   LL-Null:                       -3323.8
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const                          -0.1864      0.211     -0.882      0.378      -0.601       0.228
CRTOC_IND_0164                 -0.9784      0.1