In [1]:
import pandas as pd
import numpy as np

In [3]:
def cal_psi(actual, predict, bins=10):
    """
    param:
        actual: 实际值、较早的特征值（如：9月）
        predict: 预测值、较晚的特征值（如：10月）
        bin: bin数
    return:
        psi: psi
        psi_df: 每个bin的psi
    """
    actual_min = actual.min()  
    actual_max = actual.max()  
    binlen = (actual_max - actual_min) / bins   
    cuts = [actual_min + i * binlen for i in range(1, bins)]
    cuts.insert(0, -float("inf"))
    cuts.append(float("inf"))
    actual_cuts = np.histogram(actual, bins=cuts)
    predict_cuts = np.histogram(predict, bins=cuts)
    
    actual_df = pd.DataFrame(actual_cuts[0],columns=['actual'])
    predict_df = pd.DataFrame(predict_cuts[0], columns=['predict'])
    psi_df = pd.merge(actual_df,predict_df,right_index=True,left_index=True) 
    psi_df['actual_rate'] = (psi_df['actual'] + 1) / psi_df['actual'].sum()
    psi_df['predict_rate'] = (psi_df['predict'] + 1) / psi_df['predict'].sum()
    psi_df['psi'] = (psi_df['actual_rate'] - psi_df['predict_rate']) * np.log(psi_df['actual_rate'] / psi_df['predict_rate'])
    psi = psi_df['psi'].sum()
    return psi, psi_df

In [4]:
df = pd.read_csv('data/data.csv')

In [36]:
psi,psi_df = cal_psi(df.iloc[0:10000]['c21'],df.iloc[10000:20000]['c21']) #假设前1w行是2019_08的数据，1w~到2w行是2019_09的数据

In [37]:
psi

0.0007356118558368861

In [38]:
psi_df

Unnamed: 0,actual,predict,actual_rate,predict_rate,psi
0,8786,8821,0.8787,0.8822,1.4e-05
1,842,820,0.0843,0.0821,5.8e-05
2,241,225,0.0242,0.0226,0.000109
3,79,77,0.008,0.0078,5e-06
4,25,28,0.0026,0.0029,3.3e-05
5,13,18,0.0014,0.0019,0.000153
6,3,2,0.0004,0.0003,2.9e-05
7,7,3,0.0008,0.0004,0.000277
8,2,3,0.0003,0.0004,2.9e-05
9,2,3,0.0003,0.0004,2.9e-05
