### Computing PAI with resampling methods:

#### In Linear Regression:

In [None]:
import numpy as np
import pandas as pd

Old Data 

In [None]:
# Seed NumPy's global generator so the simulated "old" sample is reproducible.
np.random.seed(123)

n_old = 500  # number of simulated observations

# Quantitative response: Normal(mean=50, sd=10)
Y_old = np.random.normal(50, 10, n_old)

# Quantitative predictor: Normal(mean=30, sd=25)
X1_old = np.random.normal(30, 25, n_old)

# Binary predictor: a U(0, 1) draw rounded to 0.0 or 1.0
X2_old = np.random.uniform(0.0, 1.0, n_old).round()

# Multiclass categorical predictor: a U(0, 4) draw rounded to 0, 1, 2, 3 or 4
X3_old = np.random.uniform(0, 4, n_old).round()
 

New Data (with a big change in X1 distribution)

In [None]:
# Seed NumPy's global generator so the simulated "new" sample is reproducible.
np.random.seed(666)

n_new = 500  # number of simulated observations

# Quantitative response: Normal(mean=50, sd=10)
Y_new = np.random.normal(50, 10, n_new)

# Quantitative predictor: Normal(mean=15, sd=60)
X1_new = np.random.normal(15, 60, n_new)

# Binary predictor: a U(0, 1) draw rounded to 0.0 or 1.0
X2_new = np.random.uniform(0.0, 1.0, n_new).round()

# Multiclass categorical predictor: a U(0, 4) draw rounded to 0, 1, 2, 3 or 4
X3_new = np.random.uniform(0, 4, n_new).round()

In [None]:
# Collect the simulated arrays into one frame per data set (columns: Y, X1, X2, X3).
df_Old = pd.DataFrame(dict(Y=Y_old, X1=X1_old, X2=X2_old, X3=X3_old))

df_New = pd.DataFrame(dict(Y=Y_new, X1=X1_new, X2=X2_new, X3=X3_new))

In [None]:
from plotnine import ggplot, aes, geom_line, geom_point, geom_histogram, geom_bar, geom_boxplot, scale_y_continuous, scale_x_continuous, labs, after_stat,  geom_vline, scale_color_manual, theme_gray, theme_xkcd, scale_color_identity, geom_hline, facet_wrap, scale_fill_discrete, scale_fill_manual,  scale_fill_hue, guides, guide_legend, ggtitle
from mizani.formatters import percent_format 

In [None]:
import array as arr

# Stack both data sets (old rows first, then new rows) for the faceted plot.
df_Old_New = pd.concat([df_Old , df_New])

# One label per row, in the same order as the stacked rows. The lengths come
# from the frames that were actually concatenated, so the labels always match.
repeat_Old = ['Old Data'] * len(df_Old)

repeat_New = ['New Data'] * len(df_New)

# Attach the labels positionally: the stacked frame keeps each source frame's
# own row labels, so its index is not unique and index alignment is avoided.
df_Old_New_groups = df_Old_New.copy()

df_Old_New_groups["group"] = repeat_Old + repeat_New

df_Old_New_groups

In [None]:
# Histogram of X1 with 'width*density' on the y axis, one panel per 'group' value.
x1_column = df_Old_New_groups['X1']

# x ticks every 50 units from the truncated minimum up to the truncated maximum
x_axis_breaks = range(int(x1_column.min()), int(x1_column.max()), 50)

# y ticks every 0.02 from 0 up to (but excluding) 0.5
y_axis_breaks = np.arange(0, 0.5, 0.02)

x1_histogram = (
    ggplot(df_Old_New_groups)
    + aes(x='X1', y=after_stat('width*density'))
    + geom_histogram(fill="plum", color="black", bins=25)
    + labs(x="X1", y="Frecuencia Relativa")
    + scale_x_continuous(breaks=x_axis_breaks)
    + scale_y_continuous(breaks=y_axis_breaks)
    + facet_wrap('group')
)

x1_histogram

We are going to use the following definition of PAI, which is based on the variance of the predictions (a definition based on the standard deviation instead could be computed with a very similar process).

Where:

The numerator is computed using the model trained with the Old Data (for the response and the predictors), but predicting the response variable using the New Data for the predictors.

The denominator is computed using the model trained with the Old Data (for the response and the predictors), and also predicting the response variable using the Old Data for the predictors.

$$ 

PAI = \dfrac{\dfrac{1}{N} \sum_{i \in NewData} \widehat{Var}(\hat{y}_i)}{\dfrac{1}{n} \sum_{i \in OldData} \widehat{Var}(\hat{y}_i)}  

$$

In [None]:
def varcharProcessing(X, varchar_process = "dummy_dropfirst"):
    """Encode or drop the non-numeric columns of ``X`` and prepend an intercept.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns. The input frame is not modified; a new frame is
        returned.
    varchar_process : str, default "dummy_dropfirst"
        How non-numeric columns are handled:

        - ``"drop"``: remove every column whose dtype is ``object``.
        - ``"dummy"``: ``pd.get_dummies(X, drop_first=False)``.
        - ``"dummy_dropfirst"`` or any other value:
          ``pd.get_dummies(X, drop_first=True)``.

    Returns
    -------
    pandas.DataFrame
        The processed predictors with a constant column named ``"intercept"``
        (all ones) as the first column.
    """
    if varchar_process == "drop":
        # Compare against the builtin `object`: the `np.object` alias was
        # removed in NumPy 1.24 and raises AttributeError there.
        X = X.loc[:, X.dtypes != object]

    elif varchar_process == "dummy":
        X = pd.get_dummies(X, drop_first=False)

    else:
        # "dummy_dropfirst" and any unrecognised value share this encoding.
        X = pd.get_dummies(X, drop_first=True)

    # `drop` always returns a new frame, so the insert below never touches the
    # caller's data. Removing a pre-existing "intercept" column first guarantees
    # the constant column ends up first exactly once.
    X = X.drop(columns=["intercept"], errors="ignore")
    X.insert(0, "intercept", 1)

    return X

In [None]:
# Store X2 and X3 of the Old Data as pandas categoricals.
for categorical_col in ('X2', 'X3'):
    df_Old[categorical_col] = df_Old[categorical_col].astype('category')

In [None]:
# Bootstrap the fitted model on the Old Data and store, for every bootstrap
# replicate, its predictions for the individuals of the original Old Data.
#
# NOTE(review): LinearRegression is not imported in the visible cells;
# presumably sklearn.linear_model.LinearRegression. Confirm the import exists.

B=100   # number of bootstrap samples

predictor_cols = ['X1', 'X2', 'X3']

# Design matrix of the ORIGINAL Old Data, built once. Every bootstrap model
# predicts on these same rows, so column k of the result always refers to the
# same individual and its spread across rows estimates Var(y_hat_k).
X_old_full = varcharProcessing(df_Old[predictor_cols], varchar_process = "dummy_dropfirst")

# Row i holds the predictions of the model trained on the i-th bootstrap sample.
y_predictions_Old_Data = np.zeros((B , len(df_Old)))

for i in range(0, B):

    # i-th bootstrap sample of the Old Data (seeded by i for reproducibility)
    df_Old_BOOT_SAMPLE = df_Old.sample( n=len(df_Old) , random_state=i , replace=True )

    X_old = varcharProcessing(df_Old_BOOT_SAMPLE[predictor_cols], varchar_process = "dummy_dropfirst")

    y_old = df_Old_BOOT_SAMPLE['Y']

    # Train the model with the i-th bootstrap sample of the Old Data
    Model_train_Old_data = LinearRegression().fit(X_old, y_old)

    # Predict for the original Old Data predictors, not for the resampled rows
    y_predictions_Old_Data[i, :] = Model_train_Old_data.predict(X_old_full)

The $(r , k)$ element of the $B \times n$ matrix `y_predictions_Old_Data` is $\widehat{y_k}$ (the $Y$ estimation for the $k$-th individual of the sample) when the model is trained with the $r$-th boot sample of `df_Old`.

Where: 

$n =$ len(df\_Old)

$B=$ nº of boot samples

In [None]:
# Display the B x len(df_Old) array of predictions filled in by the bootstrap loop.
y_predictions_Old_Data 

We compute the variance of each column of the matrix, and we get an estimate of $Var(\hat{y_i})$ for $i=1,...,n$:

So, the i-th value of the following vector is $$\widehat{Var}(\hat{y_i})$$

In [None]:
 # Variance of each column of the array (axis=0), i.e. taken over its B rows;
 # the result has one entry per column.
 
y_predictions_Old_Data.var(axis=0) 

In [None]:
# Number of per-column variances; one per column of the array, i.e. len(df_Old).
len(y_predictions_Old_Data.var(axis=0))


Now, we compute the mean of the previous vector:


$$ \dfrac{1}{n} \cdot \sum_{i=1,...,n} \widehat{Var}(\hat{y_i})$$


In [None]:
# Average of the per-column variances.
y_predictions_Old_Data.var(axis=0).mean()  


In [None]:
# PAI denominator: mean of the per-column variances of the Old Data predictions.
PAI_denominator = y_predictions_Old_Data.var(axis=0).mean()  


We repeat the previous process, but now we get the response predictions for the predictors of the New Data (this is very important, taking into account the PAI definition).

In [None]:
# Store X2 and X3 of the New Data as pandas categoricals.
for categorical_col in ('X2', 'X3'):
    df_New[categorical_col] = df_New[categorical_col].astype('category')

In [None]:
# Bootstrap the model on the Old Data and store, for every bootstrap
# replicate, its predictions for the individuals of the New Data.
#
# NOTE(review): LinearRegression is not imported in the visible cells;
# presumably sklearn.linear_model.LinearRegression. Confirm the import exists.

B=100   # number of bootstrap samples

predictor_cols = ['X1', 'X2', 'X3']

# The New Data design matrix does not depend on the bootstrap sample, so it is
# built once outside the loop.
X_new = varcharProcessing(df_New[predictor_cols], varchar_process = "dummy_dropfirst")

# Row i holds the New Data predictions of the model trained on the i-th
# bootstrap sample of the Old Data.
y_predictions_New_Data = np.zeros((B , len(df_New)))

for i in range(0, B):

    # i-th bootstrap sample of the Old Data (seeded by i for reproducibility)
    df_Old_BOOT_SAMPLE = df_Old.sample( n=len(df_Old) , random_state=i , replace=True )

    X_old = varcharProcessing(df_Old_BOOT_SAMPLE[predictor_cols], varchar_process = "dummy_dropfirst")

    y_old = df_Old_BOOT_SAMPLE['Y']

    Model_Old_Boot_Sample = LinearRegression().fit(X_old, y_old)

    # y predictions for the New Data using the model trained with the Old Data Boot Sample.
    # For this step scikit-learn needs the X_new (test set) columns to have the
    # same names as the X_old (train set) columns.
    y_predictions_New_Data[i, :] = Model_Old_Boot_Sample.predict(X_new)

In [None]:
# PAI numerator: mean of the per-column variances of the New Data predictions.
PAI_numerator = y_predictions_New_Data.var(axis=0).mean()  

In [None]:
# PAI: ratio of the mean prediction variance on the New Data to that on the Old Data.
PAI = PAI_numerator / PAI_denominator
PAI

Then, on average, the variance of the response predictions using the New Data Set is about 1.17 times the variance of the response predictions using the Old Data Set (value obtained in the run reported here).

Following the interpretation given in the paper `The Population Accuracy Index: A New Measure of
Population Stability for Model Monitoring` ("values less than 1.1 indicate no significant deterioration; values from 1.1 to 1.5 indicate a deterioration requiring further investigation, values exceeding 1.5 indicate the predictive
accuracy of the model has deteriorated significantly"), the PAI value that we have obtained indicates a deterioration of the model's predictive
accuracy that requires further investigation, so it could be advisable to retrain the model using the New Data Set instead of the Old one.