### 1. Exploration des données

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.preprocessing import StandardScaler
import pickle

In [5]:
# Apply seaborn's "whitegrid" theme to matplotlib/seaborn figures drawn later.
# NOTE(review): the figures visible in this part of the notebook are built
# with plotly, which does not read this setting — confirm it is still needed.
sns.set(style="whitegrid")

Chargement du dataset

In [7]:
# Read the loan dataset, report its dimensions and preview the first rows.
data = pd.read_csv(filepath_or_buffer="data/Loan_Data.csv")
print("Shape du dataset :", data.shape)
data.head(5)

Shape du dataset : (10000, 8)


Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


### 2. Statistiques descriptives

In [8]:
# Count the missing values in every column.
print("Valeurs manquantes par colonne :")
missing_per_column = data.isna().sum()
display(missing_per_column)

Valeurs manquantes par colonne :


customer_id                 0
credit_lines_outstanding    0
loan_amt_outstanding        0
total_debt_outstanding      0
income                      0
years_employed              0
fico_score                  0
default                     0
dtype: int64

In [9]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               10000 non-null  int64  
 1   credit_lines_outstanding  10000 non-null  int64  
 2   loan_amt_outstanding      10000 non-null  float64
 3   total_debt_outstanding    10000 non-null  float64
 4   income                    10000 non-null  float64
 5   years_employed            10000 non-null  int64  
 6   fico_score                10000 non-null  int64  
 7   default                   10000 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 625.1 KB


On voit que pandas considère `customer_id` (identifiant) et `default` (variable cible binaire) comme des colonnes numériques (int64), qu'il inclurait donc dans `describe()`. On les convertit ci-dessous pour les en exclure.

In [10]:
# Re-type the identifier and the binary target in a single pass so that
# numeric summaries such as describe() leave them out.
data = data.astype({'customer_id': 'object', 'default': 'category'})

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
print("Statistiques descriptives :")
summary_stats = data.describe()
summary_stats

Statistiques descriptives :


Unnamed: 0,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,1.4612,4159.677034,8718.916797,70039.901401,4.5528,637.5577
std,1.743846,1421.399078,6627.164762,20072.214143,1.566862,60.657906
min,0.0,46.783973,31.652732,1000.0,0.0,408.0
25%,0.0,3154.235371,4199.83602,56539.867903,3.0,597.0
50%,1.0,4052.377228,6732.407217,70085.82633,5.0,638.0
75%,2.0,5052.898103,11272.26374,83429.166133,6.0,679.0
max,5.0,10750.67781,43688.7841,148412.1805,10.0,850.0


In [12]:
# Write the descriptive statistics table to a CSV file.
descriptive_stats = data.describe()
descriptive_stats.to_csv("data/descriptive_Data.csv")

### 3. Visualisation des variables

Histogramme du fico_score

In [13]:
# Distribution of FICO scores: 20-bin histogram with a box plot in the margin.
# update_layout() returns the figure, so the layout is applied in the same chain.
fig = px.histogram(
    data_frame=data,
    x='fico_score',
    nbins=20,
    marginal="box",
    color_discrete_sequence=["#1f77b4"],
    title="Distribution du FICO Score",
).update_layout(
    template="plotly_white",
    xaxis_title="FICO Score",
    yaxis_title="Fréquence",
)

fig.show()

Boxplot income vs default

In [14]:
# Income distribution split by the default flag, one box per class.
fig = px.box(
    data,
    x='default',
    y='income',
    color='default',
    points="all",  # draw every individual observation alongside its box
    color_discrete_sequence=px.colors.qualitative.Set2,
    title="income vs default",
)

axis_titles = {"xaxis_title": "default", "yaxis_title": "income"}
fig.update_layout(**axis_titles)
fig.show()

Matrice de corrélation interactive

In [15]:
# Columns for the correlation matrix: every numeric column except the
# identifier, plus the target.
numeric_cols = data.select_dtypes(include=np.number).columns.tolist()
exclude_cols = ['customer_id']
numeric_cols = [col for col in numeric_cols if col not in exclude_cols]

# 'default' is cast to category in an earlier cell, so select_dtypes() no
# longer returns it; add it back explicitly.
if 'default' not in numeric_cols:
    numeric_cols.append('default')

# Cast the target back to integers before correlating. DataFrame.corr()
# handles non-numeric columns differently across pandas versions (older
# releases silently drop them), so passing the categorical column as-is would
# make the presence of 'default' in the matrix version-dependent.
corr_input = data[numeric_cols].assign(default=data['default'].astype(int))
corr = corr_input.corr()

# Coefficients rounded to two decimals, used as the cell annotations.
z_text = np.round(corr.values, 2)

fig = ff.create_annotated_heatmap(
    z=corr.values,
    x=list(corr.columns),
    y=list(corr.columns),
    annotation_text=z_text,
    colorscale="Cividis",
    zmin=-1, zmax=1,  # pin the colour scale to the full correlation range
    showscale=True
)

fig.update_layout(
    title={
        'text': "Matrice de corrélation ",
        'y':0.95,
        'x':0.5,
        'xanchor':'center',
        'yanchor':'top'
    },
    width=800,
    height=600
)
fig.update_xaxes(tickangle=45, tickfont=dict(size=10))
fig.update_yaxes(tickangle=0, tickfont=dict(size=10))
fig.show()

### 4. Prétraitement des données

In [16]:
# Feature matrix: the six numeric predictors (the customer identifier is left out).
feature_columns = [
    'credit_lines_outstanding',
    'loan_amt_outstanding',
    'total_debt_outstanding',
    'income',
    'years_employed',
    'fico_score',
]
X = data[feature_columns]

# Target vector: the default flag.
y = data['default']

In [17]:
# StandardScaler is already imported in the notebook's import cell, so it is
# not re-imported here.
# Standardise every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full dataset; if a train/test split
# is done downstream, fit it on the training part only to avoid leakage — confirm.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [18]:
# Rebuild a labelled frame from the scaled array. Reusing X's index keeps the
# rows aligned with y when the target is attached below: column assignment
# aligns on index labels, so a fresh RangeIndex would misalign (or NaN-fill)
# the target if `data` were ever filtered without resetting its index.
processed = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
processed['default'] = y

# Write the scaled features and the target to CSV, without the index column.
processed.to_csv("data/loan_data_preprocessed.csv", index=False)