In [5]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
#import matplotlib.pyplot as plt
#import seaborn as sns
import sys

In [6]:
df_train = pd.read_csv('../../data/X_train.csv')
df_drift = pd.read_csv('../../data/X_drift.csv')

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22481 entries, 0 to 22480
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   PolicyId           22481 non-null  object 
 1   AgeConducteur      22481 non-null  float64
 2   SexeConducteur     22481 non-null  object 
 3   StatutMatrimonial  7372 non-null   object 
 4   BonusMalus         22481 non-null  float64
 5   FrequencePaiement  22481 non-null  object 
 6   CodeProfession     7372 non-null   object 
 7   AgeVehicule        22481 non-null  float64
 8   ClasseVehicule     22481 non-null  object 
 9   PuissanceVehicule  22481 non-null  object 
 10  CarburantVehicule  22481 non-null  object 
 11  UsageVehicule      22481 non-null  object 
 12  Garage             22481 non-null  object 
 13  Region             22481 non-null  object 
 14  PrimeCommerciale   22481 non-null  float64
dtypes: float64(4), object(11)
memory usage: 2.6+ MB


In [6]:
df_drift.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22481 entries, 0 to 22480
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   PolicyId           22481 non-null  object 
 1   AgeConducteur      22481 non-null  float64
 2   SexeConducteur     22481 non-null  object 
 3   StatutMatrimonial  7372 non-null   object 
 4   BonusMalus         22481 non-null  float64
 5   FrequencePaiement  22481 non-null  object 
 6   CodeProfession     7372 non-null   object 
 7   AgeVehicule        22481 non-null  float64
 8   ClasseVehicule     22481 non-null  object 
 9   PuissanceVehicule  22481 non-null  object 
 10  CarburantVehicule  22481 non-null  object 
 11  UsageVehicule      22481 non-null  object 
 12  Garage             22481 non-null  object 
 13  Region             22481 non-null  object 
 14  PrimeCommerciale   22481 non-null  float64
dtypes: float64(4), object(11)
memory usage: 2.6+ MB


In [None]:
# drfit analysis 

" add a binary target column to indicate source dataset in train and drift datasets"
df_train['source'] = 0
df_drift['source'] = 1
df_combined = pd.concat([df_train, df_drift], ignore_index=True)


In [None]:
df_combined.info()

In [None]:
df_combined['source'].value_counts()

In [None]:
# train Xgboost classifier to detect drift tanks to AUC-ROC score
X = df_combined.drop('source', axis=1)
y = df_combined['source']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

In [None]:
y_pred_proba = xgb.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC-ROC score for drift detection: {auc_roc:.4f}")

In [None]:
frouros

In [12]:
import pandas as pd
from frouros.detectors.data_drift.batch import PSI

# 1. SÃ©lection des colonnes numÃ©riques
columns = df_train.select_dtypes(include=['number']).columns
psi_scores = {}

for col in columns:
    detector = PSI()
    detector.fit(X=df_train[col].values)
    result = detector.compare(X=df_drift[col].values)
    
    # --- EXTRACTION ULTRA-ROBUSTE ---
    # On vÃ©rifie si l'objet a un attribut 'distance', sinon on prend le premier Ã©lÃ©ment
    if hasattr(result, 'distance'):
        val = result.distance
    elif isinstance(result, (list, tuple)):
        val = result[0]
    else:
        # Au cas oÃ¹ l'objet est dÃ©jÃ  un nombre ou autre chose
        val = result
        
    # On s'assure que c'est bien un float pur
    psi_scores[col] = float(val)

# 2. Affichage propre
print(f"{'Colonne':<20} | {'PSI':<10} | {'Statut'}")
print("-" * 50)

for col, score in psi_scores.items():
    if score >= 0.25:
        status = "ðŸ”´ Drift Majeur"
    elif score >= 0.1:
        status = "ðŸŸ¡ Warning (DÃ©rive)"
    else:
        status = "ðŸŸ¢ Stable"
        
    print(f"{col:<20} | {score:.4f}     | {status}")

TypeError: float() argument must be a string or a real number, not 'DistanceResult'

In [None]:
df_train['Age']

In [7]:
import numpy as np
import pandas as pd
from typing import Union, List, Optional

def calculate_psi(dataset1: pd.DataFrame, 
                  dataset2: pd.DataFrame, 
                  columns: Union[str, List[str]], 
                  bins: int = 10,
                  epsilon: float = 1e-10) -> pd.DataFrame:
    """
    Calcule le Population Stability Index (PSI) entre deux datasets.
    
    ParamÃ¨tres:
    -----------
    dataset1 : pd.DataFrame
        Dataset de rÃ©fÃ©rence (baseline/training)
    dataset2 : pd.DataFrame
        Dataset Ã  comparer (test/production)
    columns : str ou List[str]
        Nom(s) de(s) colonne(s) Ã  analyser
    bins : int
        Nombre de bins pour les variables numÃ©riques (dÃ©faut: 10)
    epsilon : float
        Petite valeur pour Ã©viter division par zÃ©ro (dÃ©faut: 1e-10)
    
    Retourne:
    ---------
    pd.DataFrame : RÃ©sultats PSI pour chaque colonne
    """
    
    # Convertir columns en liste si c'est une seule colonne
    if isinstance(columns, str):
        columns = [columns]
    
    # VÃ©rifier que les colonnes existent
    missing_cols = set(columns) - set(dataset1.columns) - set(dataset2.columns)
    if missing_cols:
        raise ValueError(f"Colonnes manquantes: {missing_cols}")
    
    results = []
    
    for col in columns:
        # Extraire les colonnes
        ref = dataset1[col].copy()
        curr = dataset2[col].copy()
        
        # Identifier le type de colonne
        is_numeric = pd.api.types.is_numeric_dtype(ref) and pd.api.types.is_numeric_dtype(curr)
        
        # SÃ©parer valeurs manquantes et non-manquantes
        ref_missing_count = ref.isna().sum()
        curr_missing_count = curr.isna().sum()
        
        ref_valid = ref.dropna()
        curr_valid = curr.dropna()
        
        # Si tout est manquant, PSI = 0
        if len(ref_valid) == 0 and len(curr_valid) == 0:
            results.append({
                'colonne': col,
                'type': 'vide',
                'psi': 0.0,
                'missing_ref_%': 100.0,
                'missing_curr_%': 100.0,
                'interpretation': 'Pas de drift (tout manquant)'
            })
            continue
        
        # Calculer PSI pour les valeurs non-manquantes
        if is_numeric:
            psi_value = _calculate_psi_numeric(ref_valid, curr_valid, bins, epsilon)
            col_type = 'numÃ©rique'
        else:
            psi_value = _calculate_psi_categorical(ref_valid, curr_valid, epsilon)
            col_type = 'catÃ©gorielle'
        
        # Calculer PSI pour les missing values
        ref_total = len(ref)
        curr_total = len(curr)
        
        ref_missing_pct = ref_missing_count / ref_total if ref_total > 0 else 0
        curr_missing_pct = curr_missing_count / curr_total if curr_total > 0 else 0
        
        # Ajouter contribution des missing au PSI total
        if ref_missing_pct > 0 or curr_missing_pct > 0:
            ref_missing_pct = max(ref_missing_pct, epsilon)
            curr_missing_pct = max(curr_missing_pct, epsilon)
            psi_missing = (curr_missing_pct - ref_missing_pct) * np.log(curr_missing_pct / ref_missing_pct)
            psi_value += psi_missing
        
        # InterprÃ©tation du PSI
        interpretation = _interpret_psi(psi_value)
        
        results.append({
            'colonne': col,
            'type': col_type,
            'psi': round(psi_value, 4),
            'missing_ref_%': round(ref_missing_pct * 100, 2),
            'missing_curr_%': round(curr_missing_pct * 100, 2),
            'interpretation': interpretation
        })
    
    return pd.DataFrame(results)


def _calculate_psi_numeric(ref: pd.Series, curr: pd.Series, bins: int, epsilon: float) -> float:
    """Calcule PSI pour variables numÃ©riques."""
    
    # CrÃ©er les bins sur la distribution de rÃ©fÃ©rence
    try:
        _, bin_edges = np.histogram(ref, bins=bins)
    except:
        # Si problÃ¨me avec histogram, utiliser quantiles
        bin_edges = np.percentile(ref, np.linspace(0, 100, bins + 1))
    
    # S'assurer que les bins couvrent les valeurs actuelles
    bin_edges[0] = min(bin_edges[0], curr.min()) - epsilon
    bin_edges[-1] = max(bin_edges[-1], curr.max()) + epsilon
    
    # Calculer distributions
    ref_counts, _ = np.histogram(ref, bins=bin_edges)
    curr_counts, _ = np.histogram(curr, bins=bin_edges)
    
    # Convertir en proportions
    ref_props = ref_counts / len(ref)
    curr_props = curr_counts / len(curr)
    
    # Ajouter epsilon pour Ã©viter log(0)
    ref_props = np.where(ref_props == 0, epsilon, ref_props)
    curr_props = np.where(curr_props == 0, epsilon, curr_props)
    
    # Calculer PSI
    psi = np.sum((curr_props - ref_props) * np.log(curr_props / ref_props))
    
    return psi


def _calculate_psi_categorical(ref: pd.Series, curr: pd.Series, epsilon: float) -> float:
    """Calcule PSI pour variables catÃ©gorielles."""
    
    # Obtenir toutes les catÃ©gories uniques
    all_categories = set(ref.unique()) | set(curr.unique())
    
    # Calculer distributions
    ref_counts = ref.value_counts()
    curr_counts = curr.value_counts()
    
    psi = 0.0
    
    for cat in all_categories:
        ref_prop = ref_counts.get(cat, 0) / len(ref)
        curr_prop = curr_counts.get(cat, 0) / len(curr)
        
        # Ajouter epsilon pour Ã©viter log(0)
        ref_prop = max(ref_prop, epsilon)
        curr_prop = max(curr_prop, epsilon)
        
        psi += (curr_prop - ref_prop) * np.log(curr_prop / ref_prop)
    
    return psi


def _interpret_psi(psi: float) -> str:
    """InterprÃ¨te la valeur du PSI."""
    if psi < 0.1:
        return "Pas de drift significatif"
    elif psi < 0.2:
        return "Drift modÃ©rÃ© - surveillance recommandÃ©e"
    else:
        return "Drift important - action requise"

if __name__ == "__main__":
    # Calculer PSI
    psi_results = calculate_psi(
        dataset1=df_train,
        dataset2=df_drift,
        columns=df_train.columns.tolist(),
        bins=10
    )
    
    print("RÃ©sultats PSI:")
    print(psi_results.to_string(index=False))
    print("\nSeuils d'interprÃ©tation:")
    print("- PSI < 0.1  : Pas de drift")
    print("- PSI < 0.2  : Drift modÃ©rÃ©")
    print("- PSI >= 0.2 : Drift important")

RÃ©sultats PSI:
          colonne         type    psi  missing_ref_%  missing_curr_%                   interpretation
         PolicyId catÃ©gorielle 0.0000           0.00            0.00        Pas de drift significatif
    AgeConducteur    numÃ©rique 0.6408           0.00            0.00 Drift important - action requise
   SexeConducteur catÃ©gorielle 0.0000           0.00            0.00        Pas de drift significatif
StatutMatrimonial catÃ©gorielle 0.0000          67.21           67.21        Pas de drift significatif
       BonusMalus    numÃ©rique 2.0399           0.00            0.00 Drift important - action requise
FrequencePaiement catÃ©gorielle 1.6564           0.00            0.00 Drift important - action requise
   CodeProfession catÃ©gorielle 0.0000          67.21           67.21        Pas de drift significatif
      AgeVehicule    numÃ©rique 2.7682           0.00            0.00 Drift important - action requise
   ClasseVehicule catÃ©gorielle 7.3463           0.00     

In [8]:
import numpy as np

def calculate_psi(expected, actual, buckettype='bins', buckets=10, axis=0):
    '''Calculate the PSI (population stability index) across all variables

    Args:
       expected: numpy matrix of original values
       actual: numpy matrix of new values
       buckettype: type of strategy for creating buckets, bins splits into even splits, quantiles splits into quantile buckets
       buckets: number of quantiles to use in bucketing variables
       axis: axis by which variables are defined, 0 for vertical, 1 for horizontal

    Returns:
       psi_values: ndarray of psi values for each variable

    Author:
       Matthew Burke
       github.com/mwburke
       mwburke.github.io.com
    '''

    def psi(expected_array, actual_array, buckets):
        '''Calculate the PSI for a single variable

        Args:
           expected_array: numpy array of original values
           actual_array: numpy array of new values, same size as expected
           buckets: number of percentile ranges to bucket the values into

        Returns:
           psi_value: calculated PSI value
        '''

        def scale_range (input, min, max):
            input += -(np.min(input))
            input /= np.max(input) / (max - min)
            input += min
            return input

        breakpoints = np.arange(0, buckets + 1) / (buckets) * 100

        if buckettype == 'bins':
            breakpoints = scale_range(breakpoints, np.min(expected_array), np.max(expected_array))
        elif buckettype == 'quantiles':
            breakpoints = np.stack([np.percentile(expected_array, b) for b in breakpoints])

        expected_fractions = np.histogram(expected_array, breakpoints)[0] / len(expected_array)
        actual_fractions = np.histogram(actual_array, breakpoints)[0] / len(actual_array)

        def sub_psi(e_perc, a_perc):
            '''Calculate the actual PSI value from comparing the values.
               Update the actual value to a very small number if equal to zero
            '''
            if a_perc == 0:
                a_perc = 0.0001
            if e_perc == 0:
                e_perc = 0.0001

            value = (e_perc - a_perc) * np.log(e_perc / a_perc)
            return(value)

        psi_value = sum(sub_psi(expected_fractions[i], actual_fractions[i]) for i in range(0, len(expected_fractions)))

        return(psi_value)

    if len(expected.shape) == 1:
        psi_values = np.empty(len(expected.shape))
    else:
        psi_values = np.empty(expected.shape[1 - axis])

    for i in range(0, len(psi_values)):
        if len(psi_values) == 1:
            psi_values = psi(expected, actual, buckets)
        elif axis == 0:
            psi_values[i] = psi(expected[:,i], actual[:,i], buckets)
        elif axis == 1:
            psi_values[i] = psi(expected[i,:], actual[i,:], buckets)

    return(psi_values)

In [16]:
columns=df_train.columns.tolist()
columns

['PolicyId',
 'AgeConducteur',
 'SexeConducteur',
 'StatutMatrimonial',
 'BonusMalus',
 'FrequencePaiement',
 'CodeProfession',
 'AgeVehicule',
 'ClasseVehicule',
 'PuissanceVehicule',
 'CarburantVehicule',
 'UsageVehicule',
 'Garage',
 'Region',
 'PrimeCommerciale']

In [19]:
# Exemple d'utilisation
psi_results = calculate_psi(df_train, df_drift)

InvalidIndexError: (slice(None, None, None), 0)

In [22]:
from skorecard.reporting import psi
from skorecard.bucketers import DecisionTreeBucketer

# AprÃ¨s bucketing des donnÃ©es
psi_dict = psi(df_train, df_drift)
psi_dict 

{'PolicyId': 0.0,
 'AgeConducteur': 1.0563529912497733,
 'SexeConducteur': 0.0,
 'StatutMatrimonial': 0.0,
 'BonusMalus': 8.71912037523559,
 'FrequencePaiement': 0.6060633207404427,
 'CodeProfession': 0.0,
 'AgeVehicule': 3.4394590487522683,
 'ClasseVehicule': 2.4441539238701915,
 'PuissanceVehicule': 0.0,
 'CarburantVehicule': 0.0,
 'UsageVehicule': 0.0,
 'Garage': 0.0,
 'Region': 0.0,
 'PrimeCommerciale': 1.4379520858825219}