In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Data prep
from sklearn.model_selection import train_test_split
# Regression
from sklearn.linear_model import ElasticNet
# Classification
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
def psi(score_initial, score_new, num_bins = 10, mode = 'fixed'):
    
    eps = 1e-4
    
    # Sort the data
    score_initial.sort()
    score_new.sort()
    
    # Prepare the bins
    min_val = min(min(score_initial), min(score_new))
    max_val = max(max(score_initial), max(score_new))
    if mode == 'fixed':
        bins = [min_val + (max_val - min_val)*(i)/num_bins for i in range(num_bins+1)]
    elif mode == 'quantile':
        bins = pd.qcut(score_initial, q = num_bins, retbins = True)[1] # Create the quantiles based on the initial population
    else:
        raise ValueError(f"Mode \'{mode}\' not recognized. Your options are \'fixed\' and \'quantile\'")
    bins[0] = min_val - eps # Correct the lower boundary
    bins[-1] = max_val + eps # Correct the higher boundary
        
        
    # Bucketize the initial population and count the sample inside each bucket
    bins_initial = pd.cut(score_initial, bins = bins, labels = range(1,num_bins+1))
    df_initial = pd.DataFrame({'initial': score_initial, 'bin': bins_initial})
    grp_initial = df_initial.groupby('bin').count()
    grp_initial['percent_initial'] = grp_initial['initial'] / sum(grp_initial['initial'])
    
    # Bucketize the new population and count the sample inside each bucket
    bins_new = pd.cut(score_new, bins = bins, labels = range(1,num_bins+1))
    df_new = pd.DataFrame({'new': score_new, 'bin': bins_new})
    grp_new = df_new.groupby('bin').count()
    grp_new['percent_new'] = grp_new['new'] / sum(grp_new['new'])
    
    # Compare the bins to calculate PSI
    psi_df = grp_initial.join(grp_new, on = "bin", how = "inner")
    
    # Add a small value for when the percent is zero
    psi_df['percent_initial'] = psi_df['percent_initial'].apply(lambda x: eps if x == 0 else x)
    psi_df['percent_new'] = psi_df['percent_new'].apply(lambda x: eps if x == 0 else x)
    
    # Calculate the psi
    psi_df['psi'] = (psi_df['percent_initial'] - psi_df['percent_new']) * np.log(psi_df['percent_initial'] / psi_df['percent_new'])
    
    # Return the psi values
    return psi_df['psi'].values

Now that we know that the population has shifted, we now need to know which features contributed to this shift.

CSI is basically applying PSI to each feature to see how much it has shifted

### Load Synthetic Dataset

In [3]:
df_test = pd.read_csv("data_test_regression.csv", sep = ";")

In [4]:
df_test.head()

Unnamed: 0,x1,x2,x3,y,pred
0,1.292155,1.238676,4.440749,14.996278,7.02904
1,1.843959,3.286081,7.546199,28.766137,7.235099
2,-3.006904,2.972508,6.445045,34.489508,8.006933
3,0.884088,1.212694,6.73448,21.965838,8.189244
4,1.009362,3.980044,2.753569,19.587612,9.24767


In [5]:
Z = pd.read_csv("Z_regression.csv", sep = ";")

In [6]:
Z.head()

Unnamed: 0,x1,x2,x3,pred
0,-1.505888,3.164592,3.976233,3.456376
1,0.621893,3.750874,6.396892,3.573458
2,3.437573,4.605822,9.917134,7.503598
3,-2.361256,3.839147,5.48119,8.344179
4,-0.084735,3.285693,6.135092,8.425915


### Calculates CSI for each feature

In [7]:
sample_initial = df_test[['x1', 'x2', 'x3']]
sample_new = Z[['x1', 'x2', 'x3']]

In [8]:
# Fixed
print("CSI - Fixed size bins")
for col in sample_initial.columns:
    csi_values = psi(sample_initial[col].values, sample_new[col].values, mode = 'fixed')
    csi = np.mean(csi_values)
    print(f'{col} -> {csi=:.4f}')

# Quantile
print("\nCSI - Quantile bins")
for col in sample_initial.columns:
    csi_values = psi(sample_initial[col].values, sample_new[col].values, mode = 'quantile')
    csi = np.mean(csi_values)
    print(f'{col} -> {csi=:.4f}')

CSI - Fixed size bins
x1 -> csi=0.0008
x2 -> csi=0.0444
x3 -> csi=0.1663

CSI - Quantile bins
x1 -> csi=0.0014
x2 -> csi=0.0474
x3 -> csi=0.1296


  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()


Now that we know that the population has shifted, we now need to know which features contributed to this shift.

CSI is basically applying PSI to each feature to see how much it has shifted

### Load Synthetic Dataset

In [9]:
df_test = pd.read_csv("data_test_classification.csv", sep = ";")

In [10]:
df_test.head()

Unnamed: 0,x1,x2,x3,y,pred,proba
0,-1.99972,3.719734,4.617562,1,1,0.001153
1,0.619975,2.673335,3.582327,0,0,0.001866
2,2.422368,3.925465,6.461362,1,1,0.001866
3,-0.032314,0.599963,7.703891,1,1,0.003722
4,-0.426713,3.410179,5.007102,1,1,0.004268


In [11]:
Z = pd.read_csv("Z_classification.csv", sep = ";")

In [12]:
Z.head()

Unnamed: 0,x1,x2,x3,pred,proba
0,-1.505888,3.164592,3.976233,1,0.000472
1,0.621893,3.750874,6.396892,1,0.000577
2,3.437573,4.605822,9.917134,1,0.002097
3,-2.361256,3.839147,5.48119,1,0.002693
4,-0.084735,3.285693,6.135092,1,0.002833


### Calculates CSI for each feature

In [13]:
sample_initial = df_test[['x1', 'x2', 'x3']]
sample_new = Z[['x1', 'x2', 'x3']]

In [14]:
# Fixed
for col in sample_initial.columns:
    csi_values = psi(sample_initial[col].values, sample_new[col].values, num_bins = 10, mode = 'fixed')
    csi = np.mean(csi_values)
    print(f'{col} -> {csi=:.4f}')

x1 -> csi=0.0010
x2 -> csi=0.0407
x3 -> csi=0.1641


  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()


In [15]:
# Quantile
for col in sample_initial.columns:
    csi_values = psi(sample_initial[col].values, sample_new[col].values, num_bins = 10, mode = 'quantile')
    csi = np.mean(csi_values)
    print(f'{col} -> {csi=:.4f}')

x1 -> csi=0.0016
x2 -> csi=0.0408
x3 -> csi=0.1302


  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
  grp_initial = df_initial.groupby('bin').count()
  grp_new = df_new.groupby('bin').count()
