# CSI - Characteristic Stability Index
Vinícius Trevisan 2022

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Data prep
from sklearn.model_selection import train_test_split

In [3]:
# Regression
from sklearn.linear_model import ElasticNet

In [4]:
# Classification
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

## Functions

In [5]:
def _bin_fractions(values, bins, eps):
    """Return the share of `values` falling in each (left, right] bin of `bins`.

    Bins that receive no samples get `eps` instead of 0 so that the caller can
    safely take logarithms and ratios of the result.
    """
    num_bins = len(bins) - 1
    # side='left' yields i such that bins[i-1] < v <= bins[i] (right-closed bins).
    # Clipping keeps a value that touches an outer edge inside the first/last bin
    # (this can only happen when `eps` is below the floating point resolution).
    idx = np.clip(np.searchsorted(bins, values, side = 'left'), 1, num_bins)
    counts = np.bincount(idx - 1, minlength = num_bins)
    fractions = counts / counts.sum()
    return np.where(fractions == 0, eps, fractions)


def psi(score_initial, score_new, num_bins = 10, mode = 'fixed'):
    """Compute the per-bin Population Stability Index (PSI) terms of two samples.

    Both samples are bucketized with the same bin edges and, for every bin,
    ``(p_initial - p_new) * ln(p_initial / p_new)`` is computed, where ``p_*``
    is the fraction of each sample that falls in the bin.

    Parameters
    ----------
    score_initial : array-like of numbers
        Reference sample. It is read only; the caller's data is never modified.
    score_new : array-like of numbers
        Sample to compare against the reference. Also read only.
    num_bins : int, default 10
        Number of bins.
    mode : {'fixed', 'quantile'}, default 'fixed'
        'fixed' uses equal-width bins spanning the range of both samples;
        'quantile' uses the quantiles of ``score_initial`` as bin edges.

    Returns
    -------
    numpy.ndarray
        Array of length ``num_bins`` with the PSI term of each bin, in bin
        order. Empty bins are kept (their fraction is replaced by a small
        constant), so the length does not depend on the data.

    Raises
    ------
    ValueError
        If ``mode`` is not recognized, ``num_bins`` is smaller than 1, or a
        sample has no (non-NaN) values.

    Notes
    -----
    NaN values are ignored. Repeated quantile edges (e.g. heavily tied data)
    and constant samples produce zero-width bins instead of an error.
    """
    eps = 1e-4

    if num_bins < 1:
        raise ValueError(f"num_bins must be at least 1, got {num_bins}")

    # Work on private float copies: boolean indexing always allocates a new
    # array, so the inputs (possibly views into a DataFrame) stay untouched
    initial = np.asarray(score_initial, dtype = float)
    new = np.asarray(score_new, dtype = float)
    initial = initial[~np.isnan(initial)]
    new = new[~np.isnan(new)]
    if initial.size == 0 or new.size == 0:
        raise ValueError("Both score_initial and score_new must contain at least one non-NaN value")

    # Prepare the bins
    min_val = min(initial.min(), new.min())
    max_val = max(initial.max(), new.max())
    if mode == 'fixed':
        bins = min_val + (max_val - min_val) * np.arange(num_bins + 1) / num_bins
    elif mode == 'quantile':
        # Quantiles are based on the initial population only
        bins = np.quantile(initial, np.linspace(0, 1, num_bins + 1))
    else:
        raise ValueError(f"Mode \'{mode}\' not recognized. Your options are \'fixed\' and \'quantile\'")
    bins[0] = min_val - eps # Corrects the lower boundary
    bins[-1] = max_val + eps # Corrects the higher boundary

    # Fraction of each population inside each bucket (zeros replaced by eps)
    percent_initial = _bin_fractions(initial, bins, eps)
    percent_new = _bin_fractions(new, bins, eps)

    # Per-bin PSI terms
    return (percent_initial - percent_new) * np.log(percent_initial / percent_new)

## Regressor

Once PSI has shown that the population has shifted, the next step is to find out which features contributed to that shift.

CSI applies the PSI calculation to each feature individually to measure how much that feature's distribution has shifted.

### Load Synthetic Dataset

In [6]:
# Regression case: load the sample used as the initial (reference) population; the file is semicolon-separated
df_test = pd.read_csv("data_test_regression.csv", sep = ";")

In [7]:
# Preview the first rows to check which columns were loaded
df_test.head()

Unnamed: 0,x1,x2,x3,y,pred
0,-0.999027,4.483249,6.718647,35.963477,7.379817
1,0.062492,3.681372,2.476577,19.790126,8.525168
2,-1.190431,1.755692,8.585721,32.852933,8.925085
3,1.396876,3.125114,4.685093,21.377283,9.35933
4,-2.581283,3.012773,3.254657,25.092326,9.383212


In [8]:
# Regression case: load the sample used as the new population; the file is semicolon-separated
Z = pd.read_csv("Z_regression.csv", sep = ";")

In [9]:
# Preview the first rows to check which columns were loaded
Z.head()

Unnamed: 0,x1,x2,x3,pred
0,-0.593373,3.766731,7.724218,3.372005
1,-0.056501,3.037664,5.484879,3.393188
2,-2.49231,3.232186,6.536631,8.109921
3,-0.857154,4.159045,7.311428,8.441733
4,1.239678,3.459524,4.456351,8.710473


### Calculate CSI for each feature

In [10]:
# Keep only the feature columns x1, x2 and x3 of each population; CSI is computed per feature
sample_initial = df_test[['x1', 'x2', 'x3']]
sample_new = Z[['x1', 'x2', 'x3']]

In [11]:
# Fixed (equal-width bins)
# Pass copies rather than the `.values` views so that nothing done inside
# psi() (e.g. in-place sorting) can touch the DataFrames' underlying data.
# NOTE(review): PSI/CSI is usually reported as the sum of the per-bin terms;
# the mean is used here - confirm this is intended.
for col in sample_initial.columns:
    csi_values = psi(
        sample_initial[col].to_numpy(copy = True),
        sample_new[col].to_numpy(copy = True),
        mode = 'fixed',
    )
    csi = np.mean(csi_values)
    print(f'{col} -> {csi=:.4f}')

x1 -> csi=0.0018
x2 -> csi=0.0462
x3 -> csi=0.1990


In [12]:
# Quantile (bin edges taken from the quantiles of the initial sample)
# Pass copies rather than the `.values` views so that nothing done inside
# psi() (e.g. in-place sorting) can touch the DataFrames' underlying data.
# NOTE(review): PSI/CSI is usually reported as the sum of the per-bin terms;
# the mean is used here - confirm this is intended.
for col in sample_initial.columns:
    csi_values = psi(
        sample_initial[col].to_numpy(copy = True),
        sample_new[col].to_numpy(copy = True),
        mode = 'quantile',
    )
    csi = np.mean(csi_values)
    print(f'{col} -> {csi=:.4f}')

x1 -> csi=0.0018
x2 -> csi=0.0466
x3 -> csi=0.1247


## Classifier

Once PSI has shown that the population has shifted, the next step is to find out which features contributed to that shift.

CSI applies the PSI calculation to each feature individually to measure how much that feature's distribution has shifted.

### Load Synthetic Dataset

In [13]:
# Classification case: load the sample used as the initial (reference) population; the file is semicolon-separated
df_test = pd.read_csv("data_test_classification.csv", sep = ";")

In [14]:
# Preview the first rows to check which columns were loaded
df_test.head()

Unnamed: 0,x1,x2,x3,y,pred,proba
0,-1.025191,2.370938,3.957025,0,0,0.000276
1,-3.97046,2.804296,7.962128,1,1,0.002381
2,-0.320773,3.38005,3.367016,0,0,0.002411
3,4.546295,4.407173,-0.474749,0,0,0.003654
4,-4.506757,2.865014,4.888718,1,1,0.004353


In [15]:
# Classification case: load the sample used as the new population; the file is semicolon-separated
Z = pd.read_csv("Z_classification.csv", sep = ";")

In [16]:
# Preview the first rows to check which columns were loaded
Z.head()

Unnamed: 0,x1,x2,x3,pred,proba
0,-0.593373,3.766731,7.724218,1,0.000573
1,-0.056501,3.037664,5.484879,1,0.000724
2,-2.49231,3.232186,6.536631,1,0.003062
3,-0.857154,4.159045,7.311428,1,0.003305
4,1.239678,3.459524,4.456351,0,0.003656


### Calculate CSI for each feature

In [17]:
# Keep only the feature columns x1, x2 and x3 of each population; CSI is computed per feature
sample_initial = df_test[['x1', 'x2', 'x3']]
sample_new = Z[['x1', 'x2', 'x3']]

In [18]:
# Fixed (equal-width bins)
# Pass copies rather than the `.values` views so that nothing done inside
# psi() (e.g. in-place sorting) can touch the DataFrames' underlying data.
# NOTE(review): PSI/CSI is usually reported as the sum of the per-bin terms;
# the mean is used here - confirm this is intended.
for col in sample_initial.columns:
    csi_values = psi(
        sample_initial[col].to_numpy(copy = True),
        sample_new[col].to_numpy(copy = True),
        num_bins = 10,
        mode = 'fixed',
    )
    csi = np.mean(csi_values)
    print(f'{col} -> {csi=:.4f}')

x1 -> csi=0.0013
x2 -> csi=0.0481
x3 -> csi=0.1485


In [19]:
# Quantile (bin edges taken from the quantiles of the initial sample)
# Pass copies rather than the `.values` views so that nothing done inside
# psi() (e.g. in-place sorting) can touch the DataFrames' underlying data.
# NOTE(review): PSI/CSI is usually reported as the sum of the per-bin terms;
# the mean is used here - confirm this is intended.
for col in sample_initial.columns:
    csi_values = psi(
        sample_initial[col].to_numpy(copy = True),
        sample_new[col].to_numpy(copy = True),
        num_bins = 10,
        mode = 'quantile',
    )
    csi = np.mean(csi_values)
    print(f'{col} -> {csi=:.4f}')

x1 -> csi=0.0016
x2 -> csi=0.0462
x3 -> csi=0.1183
