# Libraries

In [None]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# shap and lime for feature importance (global and local)
import shap
import lime

# sklearn 
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.base import clone
from sklearn.impute import SimpleImputer

import gc
import lightgbm as lgb

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Functions

### Missing Values Table

In [1]:
# Function to calculate missing values by column
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints how many columns the frame has and how many of them show up in
    the summary, then returns that summary.

    Returns:
        A DataFrame indexed by column name with two columns,
        'Missing Values' (null count) and '% of Total Values' (null count as
        a percentage of the row count, rounded to one decimal). Only columns
        with a non-zero percentage are kept, sorted by percentage in
        descending order.
    """
    # Null count per column, and the same figure as a share of all rows.
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)

    # Two unnamed Series placed side by side get the positional labels 0 and 1.
    summary = pd.concat([null_counts, null_share], axis=1)
    summary = summary.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'}
    )

    # Drop the columns whose percentage is zero, then order and round.
    has_missing = summary.iloc[:, 1] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    print(
        f"Your selected dataframe has {df.shape[1]} columns.\n"
        f"There are {summary.shape[0]} columns that have missing values."
    )

    return summary

### Alignement des données

In [None]:
def align_train_test(app_train, app_test, target_col):
    """Restrict train and test frames to the columns they have in common.

    The target column is taken out of the training frame before aligning and
    put back afterwards, so it is kept on the training side even though the
    alignment itself only keeps shared columns.

    Returns:
        A tuple ``(aligned_train, aligned_test)``; ``aligned_train`` carries
        ``target_col`` in addition to the shared columns.
    """
    # Set the target aside so the inner join below cannot discard it.
    target_values = app_train[target_col]
    train_features = app_train.drop(columns=[target_col])

    # Inner join on the column axis keeps only columns present in both frames.
    aligned_train, aligned_test = train_features.align(
        app_test,
        join='inner',
        axis=1,
    )

    # Put the target back on the training side.
    aligned_train[target_col] = target_values
    return aligned_train, aligned_test

### Création du Train Test Split

In [None]:
def create_split(df, target, test_size=0.2, random_state=42):
    """Split ``df`` into train/test features and labels.

    ``target`` names the label column; every other column is a feature.
    Returns whatever ``train_test_split`` returns for ``(X, y)``, i.e.
    ``X_train, X_test, y_train, y_test``.
    """
    labels = df[target]
    features = df.drop(columns=[target])
    return train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )

### Sélection des colonnes à encoder

In [None]:
def def_cols(df):
    """Decide which categorical columns get label vs. one-hot encoding.

    Object-dtype columns with at most two distinct values (NaN counts as a
    value) are assigned to label encoding. Every other object/category
    column is assigned to one-hot encoding. The two lists are disjoint, so a
    column is never handed to both encoders.

    Prints how many columns fall into each group.

    Returns:
        A dict with the keys 'label_cols' and 'onehot_cols', each mapping to
        a list of column names in the order they appear in ``df``.
    """
    # Binary (or constant) object columns: a single ordinal code is enough.
    label_cols = [
        col
        for col in df.columns
        if df[col].dtype == 'object' and df[col].nunique(dropna=False) <= 2
    ]

    # Remaining categorical columns. Columns already chosen for label encoding
    # are excluded, otherwise they would be encoded twice downstream.
    already_labelled = set(label_cols)
    categorical_cols = df.select_dtypes(
        include=['object', 'category']
    ).columns.tolist()
    onehot_cols = [
        col for col in categorical_cols if col not in already_labelled
    ]

    print('%d columns were label encoded.' % len(label_cols))
    print('%d columns were onehot encoded.' % len(onehot_cols))

    return {
        'label_cols': label_cols,
        'onehot_cols': onehot_cols,
    }

### Encodage Label et OneHot 

In [None]:
def encoder(label_cols, onehot_cols):
    """Build the categorical-encoding ``ColumnTransformer``.

    ``label_cols`` are routed to an ``OrdinalEncoder`` and ``onehot_cols`` to
    a dense ``OneHotEncoder`` that ignores categories unseen during fit.
    Columns in neither list are passed through.
    """
    ordinal_step = ('label', OrdinalEncoder(), label_cols)
    onehot_step = (
        'onehot',
        OneHotEncoder(handle_unknown='ignore', sparse_output=False),
        onehot_cols,
    )
    return ColumnTransformer(
        transformers=[ordinal_step, onehot_step],
        remainder='passthrough',
    )

### Polynomial Features

In [None]:
def polynomial(poly_cols, degree=2):
    """Build the polynomial-feature ``ColumnTransformer``.

    ``poly_cols`` go through a three-step sub-pipeline: median imputation,
    polynomial expansion up to ``degree`` (without the bias column), then
    standard scaling. Columns not listed in ``poly_cols`` are passed through,
    and output feature names are not prefixed with the transformer name.

    Args:
        poly_cols: Columns to expand into polynomial features.
        degree: Maximum polynomial degree (default 2).
    """
    poly_pipeline = Pipeline([
        # Impute before expanding: the expansion cannot work on missing values.
        ('imputer', SimpleImputer(strategy='median')),
        # PolynomialFeatures accepts no ``sparse_output`` argument; passing
        # one raises TypeError, so only the supported options are set here.
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
        ('scaler', StandardScaler()),
    ])
    return ColumnTransformer(
        transformers=[('poly_scaled', poly_pipeline, poly_cols)],
        remainder='passthrough',
        verbose_feature_names_out=False,
    )

### Scaler

In [None]:
def scaler(minmax_cols,standard_cols):
    """Build the scaling ``ColumnTransformer``.

    ``minmax_cols`` are scaled with ``MinMaxScaler`` and ``standard_cols``
    with ``StandardScaler``. Columns in neither list are passed through.
    """
    minmax_step = ('MinMax', MinMaxScaler(), minmax_cols)
    standard_step = ('Standard', StandardScaler(), standard_cols)
    return ColumnTransformer(
        transformers=[minmax_step, standard_step],
        remainder='passthrough',
    )


### Création de la Pipeline

In [None]:
def build_pipeline(model, label_cols, onehot_cols, poly_cols, minmax_cols, standard_cols):
    """Assemble the full preprocessing + model ``Pipeline``.

    Steps, in order: 'Encoder' (categorical encoding), 'Polynomial'
    (polynomial features on ``poly_cols``), 'Scaler' (min-max / standard
    scaling) and 'Model' (the estimator passed in as ``model``).

    NOTE(review): the later steps select columns from the output of the
    earlier ones. This presumably relies on pandas output being enabled
    elsewhere and on the column lists matching the intermediate feature
    names. Confirm against the caller.
    """
    encoding_step = encoder(label_cols=label_cols, onehot_cols=onehot_cols)
    polynomial_step = polynomial(poly_cols=poly_cols)
    scaling_step = scaler(minmax_cols=minmax_cols, standard_cols=standard_cols)

    steps = [
        ('Encoder', encoding_step),
        ('Polynomial', polynomial_step),
        ('Scaler', scaling_step),
        ('Model', model),
    ]
    return Pipeline(steps=steps)
