In [1]:
import pandas as pd
import numpy as np

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas dtype / deprecation
# warnings raised by later cells will not be shown either.
warnings.filterwarnings(action="ignore")

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None


In [2]:
# Read the raw CSV extracts in one pass: the main application table first,
# followed by the three secondary tables that are aggregated further down.
application_train, bureau, bureau_balance, previous_application = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/application_train.csv",
        "../data/raw/bureau.csv",
        "../data/raw/bureau_balance.csv",
        "../data/raw/previous_application.csv",
    )
)


In [3]:
# Number of distinct client identifiers in the main table and in bureau.
# NOTE(review): the bureau figure counts every SK_ID_CURR found in bureau,
# whether or not that client also appears in application_train.
for label, table in (
    ("Clients uniques :", application_train),
    ("Clients dans bureau :", bureau),
):
    print(label, table['SK_ID_CURR'].nunique())


Clients uniques : 307511
Clients dans bureau : 305811


In [4]:
# Summarise the monthly balance history of each bureau credit
# (one output row per SK_ID_BUREAU): smallest and largest MONTHS_BALANCE,
# and the number of non-null monthly records.
# Named aggregation yields the final column names directly, so no
# positional renaming of a MultiIndex is needed afterwards.
bureau_balance_agg = bureau_balance.groupby('SK_ID_BUREAU').agg(
    BB_MIN_MONTH=('MONTHS_BALANCE', 'min'),
    BB_MAX_MONTH=('MONTHS_BALANCE', 'max'),
    BB_COUNT=('MONTHS_BALANCE', 'count'),
)

bureau_balance_agg.head()


Unnamed: 0_level_0,BB_MIN_MONTH,BB_MAX_MONTH,BB_COUNT
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5001709,-96,0,97
5001710,-82,0,83
5001711,-3,0,4
5001712,-18,0,19
5001713,-21,0,22


In [5]:
# Attach the per-credit balance-history features (BB_*) to each bureau record.
#
# The result is assigned back to `bureau`, so a plain merge is not idempotent:
# running this cell a second time would merge onto a frame that already holds
# the BB_* columns, pandas would rename them to BB_*_x / BB_*_y, and the later
# aggregation on 'BB_COUNT' would fail with a KeyError. Dropping any BB_*
# columns left over from a previous run first makes the cell safe to re-run;
# on a first run the drop is a no-op.
bureau = (
    bureau
    .drop(columns=bureau_balance_agg.columns, errors='ignore')
    .merge(
        bureau_balance_agg,
        how='left',      # keep bureau credits that have no balance history
        on='SK_ID_BUREAU'
    )
)


In [6]:
# Collapse the bureau records to one row per client (SK_ID_CURR).
# Each keyword is the output column name; the tuple gives the source column
# and the statistic applied to it.
bureau_agg = bureau.groupby('SK_ID_CURR').agg(
    BURO_DAYS_CREDIT_MEAN=('DAYS_CREDIT', 'mean'),
    BURO_DAYS_CREDIT_MIN=('DAYS_CREDIT', 'min'),
    BURO_CREDIT_SUM_MEAN=('AMT_CREDIT_SUM', 'mean'),
    BURO_CREDIT_SUM_TOTAL=('AMT_CREDIT_SUM', 'sum'),
    BURO_OVERDUE_MEAN=('CREDIT_DAY_OVERDUE', 'mean'),
    BURO_OVERDUE_MAX=('CREDIT_DAY_OVERDUE', 'max'),
    # NaN for clients whose credits have no balance history at all.
    BURO_BB_COUNT_MEAN=('BB_COUNT', 'mean'),
)

bureau_agg.head()


Unnamed: 0_level_0,BURO_DAYS_CREDIT_MEAN,BURO_DAYS_CREDIT_MIN,BURO_CREDIT_SUM_MEAN,BURO_CREDIT_SUM_TOTAL,BURO_OVERDUE_MEAN,BURO_OVERDUE_MAX,BURO_BB_COUNT_MEAN
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100001,-735.0,-1572,207623.571429,1453365.0,0.0,0,24.571429
100002,-874.0,-1437,108131.945625,865055.565,0.0,0,13.75
100003,-1400.75,-2586,254350.125,1017400.5,0.0,0,
100004,-867.0,-1326,94518.9,189037.8,0.0,0,
100005,-190.666667,-373,219042.0,657126.0,0.0,0,7.0


In [7]:
# Average the previous applications of each client (one row per SK_ID_CURR).
# Each keyword is the output column name; the tuple gives the source column
# and the statistic applied to it.
previous_application_agg = previous_application.groupby('SK_ID_CURR').agg(
    PREV_AMT_APPLICATION_MEAN=('AMT_APPLICATION', 'mean'),
    PREV_AMT_CREDIT_MEAN=('AMT_CREDIT', 'mean'),
    PREV_DAYS_DECISION_MEAN=('DAYS_DECISION', 'mean'),
    PREV_CNT_PAYMENT_MEAN=('CNT_PAYMENT', 'mean'),
)

previous_application_agg.head()


Unnamed: 0_level_0,PREV_AMT_APPLICATION_MEAN,PREV_AMT_CREDIT_MEAN,PREV_DAYS_DECISION_MEAN,PREV_CNT_PAYMENT_MEAN
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100001,24835.5,23787.0,-1740.0,8.0
100002,179055.0,179055.0,-606.0,24.0
100003,435436.5,484191.0,-1305.0,10.0
100004,24282.0,20106.0,-815.0,4.0
100005,22308.75,20076.75,-536.0,12.0


In [8]:
# Left-join the two client-level aggregate tables onto the main application
# table: bureau features first, then previous-application features.
# Clients without a match keep NaN in the added columns.
application_train_fe = (
    application_train
    .merge(bureau_agg, how='left', on='SK_ID_CURR')
    .merge(previous_application_agg, how='left', on='SK_ID_CURR')
)

# Both aggregate tables hold one row per SK_ID_CURR, so the joins must not
# add or remove rows. This check replaces a bare `.shape` expression whose
# value was never displayed (only a cell's last expression is echoed).
assert len(application_train_fe) == len(application_train), (
    "the merges changed the number of rows: "
    f"{len(application_train)} -> {len(application_train_fe)}"
)

application_train_fe.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 133 entries, SK_ID_CURR to PREV_CNT_PAYMENT_MEAN
dtypes: float64(76), int64(41), object(16)
memory usage: 312.0+ MB


**Gestion des valeurs manquantes (simple & propre)**

In [9]:
# Split the merged table into the target and the candidate features;
# the client identifier is not used as a feature.
non_feature_columns = ["TARGET", "SK_ID_CURR"]
target = application_train_fe["TARGET"]
features = application_train_fe.drop(columns=non_feature_columns)

# Keep only the int64 / float64 columns.
numeric_features = features.select_dtypes(include=["int64", "float64"])

# Per-column medians of the training table.
# NOTE(review): they are computed on all rows, before any train/validation
# split -- confirm this is intended for the modelling phase.
medians = numeric_features.median()

# Fill each column's missing values with that column's median, then display
# the number of NaN left (expected: 0).
numeric_features = numeric_features.fillna(value=medians)
numeric_features.isna().sum().sum()



np.int64(0)

**Encodage des variables catégorielles**


Encodage simple pour commencer (One-Hot Encoding)

In [10]:
# One-hot encode the object-dtype columns. drop_first=True drops the first
# level of each variable, so k categories produce k-1 indicator columns.
categorical_features = pd.get_dummies(
    features.select_dtypes(include=['object']),
    drop_first=True,
)


In [11]:
# Final design matrix: imputed numeric columns followed by the one-hot
# encoded categorical columns, placed side by side on the shared row index.
y = target
X = pd.concat((numeric_features, categorical_features), axis="columns")

for label, obj in (("Dimensions X :", X), ("Dimensions y :", y)):
    print(label, obj.shape)


Dimensions X : (307511, 239)
Dimensions y : (307511,)


In [12]:
# Assemble and save the final modelling table.
#
# This cell used to re-read the raw application_train.csv before encoding and
# saving it, which overwrote the merged table: the file written to disk then
# contained none of the BURO_* / PREV_* aggregates and none of the median
# imputation. The table is now built from the objects prepared earlier in the
# notebook:
#   - SK_ID_CURR : client identifier, kept so rows can be traced back
#   - y          : the TARGET column
#   - X          : imputed numeric features + one-hot encoded categoricals
# All three share the same 0..n-1 row index, so a column-wise concat lines
# them up row by row.
application_train_fe = pd.concat(
    [application_train["SK_ID_CURR"], y, X],
    axis=1,
)

# A mismatched index would make concat add rows filled with NaN.
assert len(application_train_fe) == len(X), (
    "index mismatch while assembling the final dataset: "
    f"{len(X)} feature rows -> {len(application_train_fe)} rows"
)

# Save
application_train_fe.to_csv("../data/data_fe.csv", index=False)


In [13]:
# NOTE(review): this writes the same frame to the same path as the save at
# the end of the previous cell; one of the two writes looks redundant.
output_csv = "../data/data_fe.csv"
application_train_fe.to_csv(output_csv, index=False)



## Conclusion – Feature Engineering

Cette étape a permis :
- de fusionner plusieurs sources de données hétérogènes
- de créer des variables agrégées au niveau client
- de préparer un dataset final compatible avec les algorithmes de machine learning

Le dataset est désormais prêt pour la phase de modélisation.
