In [1]:
# Imports, then fix the global NumPy seed used by later stochastic steps
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

np.random.seed(0)

# Load the iris dataset and build the frame with the short column names directly
iris = load_iris()
df = pd.DataFrame(
    iris.data,
    columns=["sepal_length", "sepal_width", "petal_length", "petal_width"],
)

# Target codes from the dataset, stored under the name "species"
df["species"] = iris.target

# Bucket sepal length into four labelled intervals (pd.cut closes bins on the right
# by default, so a value of exactly 5.0 lands in "0-5")
sepal_bin_edges = [0, 5, 6, 7, 10]
sepal_bin_labels = ["0-5", "5-6", "6-7", "7-10"]
df["sepal_length_bin"] = pd.cut(
    df["sepal_length"], bins=sepal_bin_edges, labels=sepal_bin_labels
)

# Display the dataframe
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_length_bin
0,5.1,3.5,1.4,0.2,0,5-6
1,4.9,3.0,1.4,0.2,0,0-5
2,4.7,3.2,1.3,0.2,0,0-5
3,4.6,3.1,1.5,0.2,0,0-5
4,5.0,3.6,1.4,0.2,0,0-5
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,6-7
146,6.3,2.5,5.0,1.9,2,6-7
147,6.5,3.0,5.2,2.0,2,6-7
148,6.2,3.4,5.4,2.3,2,6-7


In [2]:
# Number of rows that fall into each sepal-length bin
df["sepal_length_bin"].value_counts()

sepal_length_bin
5-6     57
6-7     49
0-5     32
7-10    12
Name: count, dtype: int64

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the "test" population.
# random_state is pinned so the partition does not depend on how much of the
# global NumPy random stream has already been consumed (e.g. when this cell is
# re-executed on its own without re-running the seeding cell).
train, test = train_test_split(df, test_size=0.2, random_state=0)
print(train.shape)
print(test.shape)

(120, 6)
(30, 6)


In [11]:
import numpy as np

adf = pd.DataFrame()
adf.index = df["sepal_length_bin"].unique()
adf["train"] = train["sepal_length_bin"].value_counts()
adf["test"] = test["sepal_length_bin"].value_counts()

adf["train_norm"] = adf["train"] / adf["train"].sum()
adf["test_norm"] = adf["test"] / adf["test"].sum()

adf["diff"] = adf["test_norm"] - adf["train_norm"]
adf["ln_ratio"] = (adf["test_norm"].div(adf["train_norm"])).apply(lambda x: np.log(x))

adf["psi"] = adf["diff"] * adf["ln_ratio"]

print(f"{adf["psi"].sum() = }")
adf

adf["psi"].sum() = 0.14364444574310387


Unnamed: 0,train,test,train_norm,test_norm,diff,ln_ratio,psi
5-6,48,9,0.4,0.3,-0.1,-0.287682,0.028768
0-5,23,9,0.191667,0.3,0.108333,0.448025,0.048536
6-7,38,11,0.316667,0.366667,0.05,0.146603,0.00733
7-10,11,1,0.091667,0.033333,-0.058333,-1.011601,0.05901


### Test: PSI is the same whichever split is treated as "expected"

In [5]:
def sub_psi(e_perc, a_perc):
    """Return the PSI contribution of a single bucket.

    e_perc is the expected proportion and a_perc the actual proportion.
    A proportion that is exactly zero is replaced by 0.0001 so that the
    logarithm and the division stay defined.
    """
    # Swapping the two arguments flips the sign of both factors below, so the
    # product is the same whichever one is treated as "expected".
    floor = 0.0001
    a_safe = floor if a_perc == 0 else a_perc
    e_safe = floor if e_perc == 0 else e_perc

    return (e_safe - a_safe) * np.log(e_safe / a_safe)


# Here the train proportions play the "actual" role and the test proportions
# the "expected" role.
actual_percents = adf["train_norm"]
expected_percents = adf["test_norm"]

# Walk the buckets pairwise, printing each term and accumulating the total
tot = 0
for expected_share, actual_share in zip(
    expected_percents.to_numpy(), actual_percents.to_numpy()
):
    res = sub_psi(expected_share, actual_share)
    print(f"{res = }")
    tot += res

print(f"{tot = }")

res = 0.028768207245178118
res = 0.04853601160708735
res = 0.007330173709593771
res = 0.05901005318124465
tot = 0.14364444574310387


In [6]:
def sub_psi(e_perc, a_perc):
    """Compute one bucket's PSI term from its expected and actual proportions.

    Any proportion equal to zero is swapped for 0.0001 before the calculation
    so that neither the ratio nor its logarithm is undefined.
    """
    # The expression is unchanged when the two arguments are exchanged:
    # both the difference and the log-ratio change sign together.
    epsilon = 0.0001
    actual, expected = (
        epsilon if share == 0 else share for share in (a_perc, e_perc)
    )

    log_ratio = np.log(expected / actual)
    return (expected - actual) * log_ratio


# Roles reversed relative to the previous run: test proportions are "actual",
# train proportions are "expected".
actual_percents = adf["test_norm"]
expected_percents = adf["train_norm"]

# Print each bucket's term and keep a running total
tot = 0
for expected_share, actual_share in zip(
    expected_percents.to_numpy(), actual_percents.to_numpy()
):
    res = sub_psi(expected_share, actual_share)
    print(f"{res = }")
    tot += res

print(f"{tot = }")

res = 0.02876820724517811
res = 0.04853601160708735
res = 0.007330173709593768
res = 0.05901005318124465
tot = 0.14364444574310387


### Detailed testing: PSI helper functions for DataFrame use

In [94]:
import numpy as np

In [95]:
from pathlib import Path

# Build the relative path from its components so the separator is chosen by the
# OS; the previous backslash-separated literal only resolved on Windows.
taxi_csv = Path("..", "..", "..", "data", "toydata", "taxis.csv")

# NOTE: this rebinds `df`, which held the iris frame earlier in the notebook.
df = pd.read_csv(taxi_csv)
df["pickup"] = pd.to_datetime(df["pickup"])
# Calendar date of the pickup (datetime.date objects), used as the grouping key
df["day"] = df["pickup"].dt.date
df["day"].value_counts().head()

day
2019-03-14    260
2019-03-06    257
2019-03-13    244
2019-03-01    241
2019-03-08    235
Name: count, dtype: int64

In [96]:
# Peek at the first rows of the taxi frame, including the derived `day` column
df.head()

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough,day
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,0.0,12.95,yellow,credit card,Lenox Hill West,UN/Turtle Bay South,Manhattan,Manhattan,2019-03-23
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,0.0,9.3,yellow,cash,Upper West Side South,Upper West Side South,Manhattan,Manhattan,2019-03-04
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,0.0,14.16,yellow,credit card,Alphabet City,West Village,Manhattan,Manhattan,2019-03-27
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,0.0,36.95,yellow,credit card,Hudson Sq,Yorkville West,Manhattan,Manhattan,2019-03-10
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,0.0,13.4,yellow,credit card,Midtown East,Yorkville West,Manhattan,Manhattan,2019-03-30


In [97]:
# Frequency of each payment type
df["payment"].value_counts()

payment
credit card    4577
cash           1812
Name: count, dtype: int64

In [99]:
# Keep a separate copy under a dedicated name, since `df` is rebound to
# different data at several points in this notebook
taxi = df.copy()

In [None]:
# funcs

In [52]:
import datetime

In [116]:
def calculate_psi_from_2_srs(srs_train, srs_test, fill_value=0.01):
    """Build a per-category Population Stability Index (PSI) breakdown.

    Parameters
    ----------
    srs_train, srs_test : pd.Series
        Category counts (or weights) indexed by category label. Each one is
        divided by its own sum to obtain proportions.
    fill_value : float, default 0.01
        Proportion substituted for a category that is missing from one of the
        series, or that is present with a proportion of exactly zero. It is
        applied after normalisation, so the filled proportions need not sum to 1.

    Returns
    -------
    pd.DataFrame
        One row per category in the union of both indexes, with columns
        ``train``, ``test`` (the inputs as given), ``train_norm``,
        ``test_norm``, ``diff``, ``ln_ratio`` and ``psi``. The overall PSI is
        the sum of the ``psi`` column.
    """
    # normalise
    srs_train_norm = srs_train / srs_train.sum()
    srs_test_norm = srs_test / srs_test.sum()

    # ensure common index, and fill missing values
    common_index = srs_train_norm.index.union(srs_test_norm.index)
    srs_train_norm = srs_train_norm.reindex(common_index, fill_value=fill_value)
    srs_test_norm = srs_test_norm.reindex(common_index, fill_value=fill_value)

    # A category listed with a zero count is not touched by reindex; without
    # this it would produce log(0) or a division by zero (inf / NaN PSI terms).
    srs_train_norm = srs_train_norm.mask(srs_train_norm == 0, fill_value)
    srs_test_norm = srs_test_norm.mask(srs_test_norm == 0, fill_value)

    # calc: (test - train) * ln(test / train) per category
    srs_diff = srs_test_norm - srs_train_norm
    srs_log_ratio = np.log(srs_test_norm / srs_train_norm)

    srs_psi = srs_diff * srs_log_ratio

    df_of_srs = pd.DataFrame(
        {
            "train": srs_train,
            "test": srs_test,
            "train_norm": srs_train_norm,
            "test_norm": srs_test_norm,
            "diff": srs_diff,
            "ln_ratio": srs_log_ratio,
            "psi": srs_psi,
        }
    )
    return df_of_srs

In [101]:
# Column to group by, and column whose value distribution is counted per group
index = "day"
col = "passengers"
# Plain assignment: `df` now refers to the same object as `taxi` (not a copy)
df = taxi
# Count of each `col` value within each `index` group; the result is a Series
# with a two-level (day, passengers) index
srs = df.groupby(index)[col].value_counts()
srs

day         passengers
2019-02-28  1               1
2019-03-01  1             174
            2              29
            3              11
            5               9
                         ... 
2019-03-31  3               9
            5               9
            4               5
            0               4
            6               3
Name: count, Length: 214, dtype: int64

In [102]:
# Passenger-count distribution for one day: the 1-tuple key selects on the
# first index level, leaving a Series indexed by `passengers`
s1 = srs.loc[(datetime.date(2019, 3, 12),)]
s1

passengers
1    168
2     20
3     12
5     10
0      3
6      3
4      2
Name: count, dtype: int64

In [106]:
# Passenger-count distribution for a second day to compare against `s1`
s2 = srs.loc[(datetime.date(2019, 3, 21),)]
s2

passengers
1    175
2     25
5      8
3      5
4      4
6      4
Name: count, dtype: int64

In [131]:
# Compare the two single-day distributions using the function's default fill_value.
# NOTE(review): the stored output for this cell shows 0.001 for the category
# missing from the test series, which does not match the default of 0.01 in the
# definition above — presumably it was produced by an earlier version of the
# function; re-run to refresh.
srs_train = s1
srs_test = s2
calculate_psi_from_2_srs(srs_train, srs_test)

Unnamed: 0_level_0,train,test,train_norm,test_norm,diff,ln_ratio,psi
passengers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,3,,0.013761,0.001,-0.012761,-2.621873,0.033459
1,168,175.0,0.770642,0.791855,0.021213,0.027154,0.000576
2,20,25.0,0.091743,0.113122,0.021379,0.209476,0.004478
3,12,5.0,0.055046,0.022624,-0.032421,-0.889136,0.028827
4,2,4.0,0.009174,0.0181,0.008925,0.67948,0.006065
5,10,8.0,0.045872,0.036199,-0.009672,-0.236811,0.002291
6,3,4.0,0.013761,0.0181,0.004338,0.274014,0.001189


In [132]:
# Same comparison with a much smaller fill value for the missing category
calculate_psi_from_2_srs(srs_train, srs_test, fill_value=0.000001)

Unnamed: 0_level_0,train,test,train_norm,test_norm,diff,ln_ratio,psi
passengers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,3,,0.013761,1e-06,-0.01376,-9.529628,0.131132
1,168,175.0,0.770642,0.791855,0.021213,0.027154,0.000576
2,20,25.0,0.091743,0.113122,0.021379,0.209476,0.004478
3,12,5.0,0.055046,0.022624,-0.032421,-0.889136,0.028827
4,2,4.0,0.009174,0.0181,0.008925,0.67948,0.006065
5,10,8.0,0.045872,0.036199,-0.009672,-0.236811,0.002291
6,3,4.0,0.013761,0.0181,0.004338,0.274014,0.001189


In [133]:
# Same comparison with fill_value passed explicitly as 0.01
calculate_psi_from_2_srs(srs_train, srs_test, fill_value=0.01)

Unnamed: 0_level_0,train,test,train_norm,test_norm,diff,ln_ratio,psi
passengers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,3,,0.013761,0.01,-0.003761,-0.319287,0.001201
1,168,175.0,0.770642,0.791855,0.021213,0.027154,0.000576
2,20,25.0,0.091743,0.113122,0.021379,0.209476,0.004478
3,12,5.0,0.055046,0.022624,-0.032421,-0.889136,0.028827
4,2,4.0,0.009174,0.0181,0.008925,0.67948,0.006065
5,10,8.0,0.045872,0.036199,-0.009672,-0.236811,0.002291
6,3,4.0,0.013761,0.0181,0.004338,0.274014,0.001189


In [134]:
# Same comparison with an even smaller fill value; only the row for the
# category missing from the test series is affected by this parameter
calculate_psi_from_2_srs(srs_train, srs_test, fill_value=0.000000001)

Unnamed: 0_level_0,train,test,train_norm,test_norm,diff,ln_ratio,psi
passengers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,3,,0.013761,1e-09,-0.013761,-16.437383,0.226203
1,168,175.0,0.770642,0.7918552,0.021213,0.027154,0.000576
2,20,25.0,0.091743,0.1131222,0.021379,0.209476,0.004478
3,12,5.0,0.055046,0.02262443,-0.032421,-0.889136,0.028827
4,2,4.0,0.009174,0.01809955,0.008925,0.67948,0.006065
5,10,8.0,0.045872,0.0361991,-0.009672,-0.236811,0.002291
6,3,4.0,0.013761,0.01809955,0.004338,0.274014,0.001189


### Test on a DataFrame: per-day PSI against the overall distribution

In [163]:
def calculate_psi_from_2_num_srs(srs_train, srs_test, unmatched_cat_fill_val=0.01):
    """Return the total Population Stability Index (PSI) between two distributions.

    Parameters
    ----------
    srs_train, srs_test : pd.Series
        Category counts or proportions indexed by category label. Each one is
        divided by its own sum, so either form is accepted.
    unmatched_cat_fill_val : float, default 0.01
        Proportion substituted for a category that is missing from one of the
        series, or that is present with a proportion of exactly zero. It is
        applied after normalisation, so the filled proportions need not sum to 1.

    Returns
    -------
    float
        Sum over all categories of ``(test - train) * ln(test / train)``.
    """
    # normalise
    srs_train_norm = srs_train / srs_train.sum()
    srs_test_norm = srs_test / srs_test.sum()

    # ensure common index, and fill missing values
    common_index = srs_train_norm.index.union(srs_test_norm.index)
    srs_train_norm = srs_train_norm.reindex(
        common_index, fill_value=unmatched_cat_fill_val
    )
    srs_test_norm = srs_test_norm.reindex(
        common_index, fill_value=unmatched_cat_fill_val
    )

    # A category listed with a zero count is not touched by reindex; without
    # this it would produce log(0) or a division by zero (inf / NaN PSI terms).
    srs_train_norm = srs_train_norm.mask(srs_train_norm == 0, unmatched_cat_fill_val)
    srs_test_norm = srs_test_norm.mask(srs_test_norm == 0, unmatched_cat_fill_val)

    # calc
    srs_diff = srs_test_norm - srs_train_norm
    srs_log_ratio = np.log(srs_test_norm / srs_train_norm)

    srs_psi = srs_diff * srs_log_ratio

    return srs_psi.sum()

In [140]:
# List the columns available in the taxi frame
taxi.columns

Index(['pickup', 'dropoff', 'passengers', 'distance', 'fare', 'tip', 'tolls',
       'total', 'color', 'payment', 'pickup_zone', 'dropoff_zone',
       'pickup_borough', 'dropoff_borough', 'day'],
      dtype='object')

In [158]:
# Passenger-count proportions over the whole taxi frame (normalize=True returns
# shares rather than counts); used below as the reference distribution
expected_portions = taxi["passengers"].value_counts(normalize=True)
expected_portions

passengers
1    0.727188
2    0.136173
5    0.043059
3    0.037774
6    0.023784
4    0.017099
0    0.014923
Name: proportion, dtype: float64

In [164]:
def _psi_against_overall(day_passengers):
    """PSI of one day's passenger counts versus the overall proportions."""
    return calculate_psi_from_2_num_srs(
        day_passengers.value_counts(), expected_portions
    )


# One PSI value per day (the result is a Series indexed by day)
passengers_by_day = taxi.groupby("day")["passengers"]
group_df = passengers_by_day.apply(_psi_against_overall)

# Ten days that deviate most from the overall distribution
group_df.sort_values(ascending=False).head(10)

day
2019-02-28    0.519290
2019-03-03    0.061690
2019-03-17    0.051447
2019-03-15    0.044249
2019-03-26    0.042994
2019-03-10    0.040596
2019-03-12    0.037262
2019-03-30    0.036881
2019-03-24    0.036812
2019-03-18    0.033585
Name: passengers, dtype: float64

In [161]:
# Inspect the raw passenger counts for the day that tops the PSI ranking
taxi.query("day == datetime.date(2019,2,28)")["passengers"].value_counts()

passengers
1    1
Name: count, dtype: int64

In [162]:
# Passenger-count proportions for the second-ranked day, for comparison with
# the overall proportions
taxi.query("day == datetime.date(2019,3,3)")["passengers"].value_counts(normalize=True)

passengers
1    0.698225
2    0.201183
6    0.035503
5    0.029586
3    0.017751
4    0.011834
0    0.005917
Name: proportion, dtype: float64