In [2]:
#This functions creates two popular plots for quickly visualizing a model and trying to spot patterns in places where the fit is not great. 
#The plots are true responses vs. predictions and residuals vs. predictions. 
#y is assumed to be a DataFrame and predictions a numpy array (common sklearn setup). 

def plot_pred_res(predictions, y, size = .5): 
    pred_tmp = predictions
    if len(predictions.shape) > 1:
        pred_tmp = predictions[:, 0]
    residuals = np.array(y[y.columns[0]]) - pred_tmp
    fig, ax = plt.subplots(1, 2, figsize = (8, 4))
    ax[0].set_xlabel('Response')
    ax[0].set_ylabel('Predictions')
    ax[0].scatter(y, pred_tmp, s = size)
    ax[0].plot([y[y.columns[0]].min(), y[y.columns[0]].max()], [y[y.columns[0]].min(), y[y.columns[0]].max()], color = 'red', linestyle = '--')
    ax[1].set_xlabel('Predictions')
    ax[1].set_ylabel('Residuals')
    ax[1].scatter(pred_tmp, residuals, s = size)
    ax[1].plot([predictions.min(), predictions.max()], residuals.mean().mean()*np.ones(2), color = 'red', linestyle = '--')
    return fig, ax

In [3]:
#This function estimates the variance of an estimator func, using data from a DataFrame df. 
#The function func is assumed to take as input a DataFrame. 
#It applies a bootstrap approach. 

def boot_var(func, df, n=None, B=1000, seed=0):
    rng = np.random.default_rng(seed)
    first_, second_ = 0, 0
    n = n or df.shape[0]
    for _ in range(B):
        idx = rng.choice(df.index, n, replace=True)
        value = func(df.loc[idx])
        first_ += value
        second_ += value**2
    return second_ / B - (first_ / B)**2

In [4]:
#The empty template for creating a custom sklearn transformer. 
#It is nice to use the same signature for the fit and transform function 
#(using default None values if needed), for this enables the fit_transform() method. 

class DistanceToCities(BaseEstimator, TransformerMixin):
    def __init__(self): #If one needs to set attributes when summoning an instance of the transformer, they should be passed here. 
        
    def fit(self, X=None, y=None):
        return self

    def transform(self, X, y=None): 
        return X

IndentationError: expected an indented block after function definition on line 6 (220186999.py, line 8)

In [5]:
def MSE(estimator, X, y): #
    #The estimator is an sklearn estimator. It is assumed to be already fit. 
    #y is assumed to be a DataFrame with a single column: the observed response. 
    #The if blocks are only here to adapt to the many possible formats for estimator.predict(X). 
    yhat = estimator.predict(X)
    if len(yhat.shape) == 1: 
        return ((np.array(y[y.columns[0]]) - yhat)**2).mean(0)
    if yhat.shape[1] != 1:
        return (([value*np.ones(yhat.shape[1]) for value in np.array(y[y.columns[0]])] - yhat)**2).mean(0)
    return ((np.array(y[y.columns[0]]) - yhat[:, 0])**2).mean(0)

In [6]:
def MSE_CV(estimator, X, y): 
    #The estimator is an sklearn estimator. It is assumed to be already fit. 
    #y is assumed to be a DataFrame with a single column: the observed response. 
    K = 8
    kfold_cv = sklms.KFold(K, random_state=0, shuffle=True)
    predictions_cv = sklms.cross_val_predict(estimator, X, y, cv=kfold_cv) 
    MSE_cv = []
    for train_idx, test_idx in kfold_cv.split(y):
        MSE_values = ((np.array(y_df[y.columns[0]])[test_idx, None] - predictions_cv[test_idx])**2).mean(0)
        MSE_cv.append(MSE_values) # column means
    return np.array(MSE_cv)