# CSI - Characteristic Stability Index
Vinícius Trevisan 2022

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Data prep
from sklearn.model_selection import train_test_split

In [3]:
# Regression
from sklearn.linear_model import ElasticNet

In [4]:
# Classification
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

## Functions

In [5]:
def psi(score_initial, score_new, num_bins = 10, mode = 'fixed'):
    """Compute the per-bin Population Stability Index terms between two samples.

    Both samples are bucketized with a common set of bin edges and, for each
    bin, the term ``(p_initial - p_new) * ln(p_initial / p_new)`` is computed,
    where ``p_*`` is the share of each sample that falls in the bin. The
    per-bin terms are returned as-is; aggregating them is left to the caller.

    Parameters
    ----------
    score_initial : array-like of numbers
        Reference sample. It is never modified.
    score_new : array-like of numbers
        Sample to compare against the reference. It is never modified.
    num_bins : int, default 10
        Number of bins.
    mode : {'fixed', 'quantile'}, default 'fixed'
        'fixed' uses equal-width bins spanning the combined range of both
        samples; 'quantile' uses the quantiles of ``score_initial`` as edges.

    Returns
    -------
    numpy.ndarray
        One PSI term per bin, in bin order (lowest bin first).

    Raises
    ------
    ValueError
        If ``mode`` is not recognized, or if either sample has no non-NaN
        value. pandas also raises ValueError when the resulting bin edges are
        not unique (e.g. heavily tied data in 'quantile' mode).
    """
    eps = 1e-4

    if mode not in ('fixed', 'quantile'):
        raise ValueError(f"Mode \'{mode}\' not recognized. Your options are \'fixed\' and \'quantile\'")

    # Work on our own float arrays: the caller's data must not be reordered,
    # and any array-like (list, Series, read-only ndarray) is accepted.
    initial = np.asarray(score_initial, dtype = float).ravel()
    new = np.asarray(score_new, dtype = float).ravel()

    # Missing values cannot be assigned to a bin, so they are left out of the counts
    initial = initial[~np.isnan(initial)]
    new = new[~np.isnan(new)]
    if initial.size == 0 or new.size == 0:
        raise ValueError("score_initial and score_new must each contain at least one non-NaN value")

    # Prepare the bin edges
    min_val = min(initial.min(), new.min())
    max_val = max(initial.max(), new.max())
    if mode == 'fixed':
        bins = min_val + (max_val - min_val) * np.arange(num_bins + 1) / num_bins
    else:
        # Quantile edges are based on the initial population only
        bins = pd.qcut(initial, q = num_bins, retbins = True)[1]
    bins[0] = min_val - eps # Widen the lower boundary so the global minimum is inside the first bin
    bins[-1] = max_val + eps # Widen the upper boundary so the global maximum is inside the last bin

    # Share of each population inside each bin (empty bins are kept, with share 0)
    percent_initial = _bin_fractions(initial, bins)
    percent_new = _bin_fractions(new, bins)

    # Replace empty-bin shares by a small value to keep the ratio and the log finite
    percent_initial = np.where(percent_initial == 0, eps, percent_initial)
    percent_new = np.where(percent_new == 0, eps, percent_new)

    # Per-bin PSI terms
    return (percent_initial - percent_new) * np.log(percent_initial / percent_new)


def _bin_fractions(values, bins):
    """Return the fraction of `values` that falls in each interval defined by `bins`."""
    # labels=False yields integer bin positions, so counting does not depend on
    # how groupby treats unobserved categories. include_lowest only matters if a
    # value coincides with the first edge (eps lost to rounding on huge values).
    codes = pd.cut(values, bins = bins, labels = False, include_lowest = True)
    counts = np.bincount(np.asarray(codes).astype(np.intp), minlength = len(bins) - 1)
    return counts / counts.sum()

## Regressor

Once PSI has shown that the population has shifted, the next step is to find out which features contributed to that shift.

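CSI (Characteristic Stability Index) applies the PSI calculation to each feature individually, to measure how much that feature's distribution has shifted between the two samples.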

### Load Synthetic Dataset

In [6]:
# Semicolon-separated file; the preview below shows columns x1, x2, x3, y and pred
df_test = pd.read_csv("data_test_regression.csv", sep = ";")

In [7]:
df_test.head()

Unnamed: 0,x1,x2,x3,y,pred
0,-2.123824,2.817144,4.147786,25.990632,2.712129
1,0.613401,3.245827,4.001453,21.465114,5.059672
2,-0.39878,4.433262,7.096461,35.632961,5.346833
3,4.612874,3.029813,5.038264,15.605921,5.483961
4,-0.699526,3.271073,1.807677,18.206936,6.133364


In [8]:
# Second sample, compared against df_test feature by feature; semicolon-separated,
# and the preview below shows x1, x2, x3 and pred (no y column)
Z = pd.read_csv("Z_regression.csv", sep = ";")

In [9]:
Z.head()

Unnamed: 0,x1,x2,x3,pred
0,-0.320133,4.304297,6.107282,1.247635
1,-1.191167,2.880619,8.274893,4.183234
2,-0.745133,4.14729,13.012441,6.449415
3,2.135551,3.800376,8.923503,7.366811
4,-0.313125,3.649272,9.209772,7.490958


### Calculate CSI for each feature

In [10]:
# Keep only the feature columns of both samples
feature_cols = ['x1', 'x2', 'x3']
sample_initial = df_test[feature_cols]
sample_new = Z[feature_cols]

In [11]:
# Print the per-feature CSI once for each binning strategy (fixed first, then quantile)
for heading, binning in (("CSI - Fixed size bins", 'fixed'), ("\nCSI - Quantile bins", 'quantile')):
    print(heading)
    for col in sample_initial.columns:
        csi_values = psi(sample_initial[col].values, sample_new[col].values, mode = binning)
        csi = np.mean(csi_values)  # average of the per-bin terms
        print(f'{col} -> {csi=:.4f}')

CSI - Fixed size bins
x1 -> csi=0.0031
x2 -> csi=0.0461
x3 -> csi=0.1711

CSI - Quantile bins
x1 -> csi=0.0032
x2 -> csi=0.0423
x3 -> csi=0.1272


## Classifier

Once PSI has shown that the population has shifted, the next step is to find out which features contributed to that shift.

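CSI (Characteristic Stability Index) applies the PSI calculation to each feature individually, to measure how much that feature's distribution has shifted between the two samples.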

### Load Synthetic Dataset

In [12]:
# Semicolon-separated file; the preview below shows columns x1, x2, x3, y, pred and proba
df_test = pd.read_csv("data_test_classification.csv", sep = ";")

In [13]:
df_test.head()

Unnamed: 0,x1,x2,x3,y,pred,proba
0,0.699931,2.6382,7.410266,1,1,0.000777
1,0.255457,4.150482,6.79769,1,1,0.001957
2,3.847591,2.904744,3.158348,0,0,0.003488
3,3.347573,2.771801,7.484235,0,1,0.003622
4,-0.941602,1.85026,6.179757,1,1,0.003903


In [14]:
# Second sample, compared against df_test feature by feature; semicolon-separated,
# and the preview below shows x1, x2, x3, pred and proba (no y column)
Z = pd.read_csv("Z_classification.csv", sep = ";")

In [15]:
Z.head()

Unnamed: 0,x1,x2,x3,pred,proba
0,-0.320133,4.304297,6.107282,1,9.6e-05
1,-1.191167,2.880619,8.274893,1,0.00028
2,-0.745133,4.14729,13.012441,1,0.001201
3,2.135551,3.800376,8.923503,1,0.001324
4,-0.313125,3.649272,9.209772,1,0.001439


### Calculate CSI for each feature

In [16]:
# Keep only the feature columns of both samples
feature_cols = ['x1', 'x2', 'x3']
sample_initial = df_test[feature_cols]
sample_new = Z[feature_cols]

In [17]:
# CSI per feature using 10 equal-width bins
for col in sample_initial.columns:
    reference_values = sample_initial[col].values
    compared_values = sample_new[col].values
    csi_values = psi(reference_values, compared_values, num_bins = 10, mode = 'fixed')
    csi = np.mean(csi_values)  # average of the per-bin terms
    print(f'{col} -> {csi=:.4f}')

x1 -> csi=0.0016
x2 -> csi=0.0452
x3 -> csi=0.1764


In [18]:
# CSI per feature using 10 bins placed at the quantiles of the initial sample
for col in sample_initial.columns:
    reference_values = sample_initial[col].values
    compared_values = sample_new[col].values
    csi_values = psi(reference_values, compared_values, num_bins = 10, mode = 'quantile')
    csi = np.mean(csi_values)  # average of the per-bin terms
    print(f'{col} -> {csi=:.4f}')

x1 -> csi=0.0014
x2 -> csi=0.0448
x3 -> csi=0.1320
