### Confidence Intervals

In [4]:
def confidence_intervals(df, target_name, predicateur_name):
    """Fit ``target ~ predictor`` by OLS and plot its 95% interval bands.

    Two figures are produced:

    1. A scatter plot of the data overlaid with the interval for the mean
       response (blue dashes) and the interval for an individual prediction
       (red dashes), both taken from statsmodels' ``summary_table``.
    2. A histogram of the standard error of the mean prediction.

    Args:
        df: Data passed to ``smf.ols``; must be indexable by column name.
        target_name: Name of the response column (left side of the formula).
        predicateur_name: Name of the single predictor column (right side).

    Returns:
        None. The function only draws figures.
    """
    # Imports are kept local to the function, as in the original cell.
    import numpy as np
    import statsmodels.formula.api as smf
    from statsmodels.stats.outliers_influence import summary_table
    import matplotlib.pyplot as plt

    target, predicateur = target_name, predicateur_name
    res = smf.ols(f'{target} ~ {predicateur}', data=df).fit()

    # summary_table returns (table, ndarray, column names); only the array is used.
    _, data, _ = summary_table(res, alpha=0.05)

    mean_predicted_se = data[:, 3]
    mean_predicted_ci_low, mean_predicted_ci_upp = data[:, 4:6].T
    predicted_ci_low, predicted_ci_upp = data[:, 6:8].T

    # Line plots connect points in the order given, so the bands must be drawn
    # in increasing predictor order; otherwise unsorted data yields zig-zags.
    x = np.asarray(df[predicateur])
    order = np.argsort(x, kind='stable')
    x_sorted = x[order]

    plt.figure(figsize=(12, 10))

    plt.plot(x_sorted, predicted_ci_low[order], 'r--', lw=1, alpha=0.5,
             label='Intervalle de confiance : prediction')
    plt.plot(x_sorted, predicted_ci_upp[order], 'r--', lw=1, alpha=0.5)
    plt.plot(x_sorted, mean_predicted_ci_low[order], 'b--', lw=1, alpha=0.5,
             label='Intervalle de confiance : mean')
    plt.plot(x_sorted, mean_predicted_ci_upp[order], 'b--', lw=1, alpha=0.5)

    plt.scatter(df[predicateur], df[target])
    plt.xlabel(predicateur)
    plt.ylabel(target)
    plt.legend()
    plt.show()

    plt.figure(figsize=(12, 4))
    plt.hist(mean_predicted_se, bins=20)
    plt.title('Standard Error pour les Intervalles de Confiance')
    # Show explicitly so the histogram also appears outside a notebook.
    plt.show()

In [5]:
# Residual Standard Error (RSE)

def RSE(df, y, y_pred):
    """Return the residual standard error of ``y_pred`` against ``y``.

    The residual sum of squares is scaled by ``1 / (len(df) - 2)`` and the
    square root of the result is returned.

    Args:
        df: Any sized object; only ``len(df)`` is used.
        y: Observed values.
        y_pred: Predicted values, compatible with ``y`` for subtraction.

    Returns:
        The square root of ``RSS / (len(df) - 2)``.
    """
    residuals = y - y_pred
    residual_sum_of_squares = np.sum(residuals ** 2)
    degrees_of_freedom = len(df) - 2
    return np.sqrt((1 / degrees_of_freedom) * residual_sum_of_squares)

In [7]:
# import statsmodels.formula.api as smf
# target, predicateur = target_name, predicateur_name
# res = smf.ols(f'{target} ~ {predicateur}', data=df).fit()

# https://www.real-statistics.com/regression/confidence-and-prediction-intervals/

# create function to compute confidence or prediction interval given an x value

def create_interval(ols_result, interval_type, alpha, x_values, conf_x):
    """Compute a confidence or prediction interval for a simple linear fit.

    The half-width is
    ``t * RSE * sqrt(add_one + 1/n + (conf_x - mean(x))**2 / sum((x - mean(x))**2))``
    where ``add_one`` is 0 for a confidence interval (mean response) and 1 for
    a prediction interval (single new observation), ``t`` is the two-sided
    Student-t quantile with ``n - 2`` degrees of freedom, and ``RSE`` is
    derived from ``ols_result.resid``.

    Args:
        ols_result: Fitted model exposing ``resid`` and ``params``; the first
            two entries of ``params`` are read as intercept and slope.
        interval_type: ``'confidence'`` or ``'prediction'``.
        alpha: Significance level (e.g. 0.05 for a 95% interval).
        x_values: Predictor values used for the fit; must support ``.mean()``
            and element-wise arithmetic.
        conf_x: Predictor value(s) at which the interval is evaluated.

    Returns:
        A ``(lower, upper)`` tuple, or ``None`` (after printing a message)
        when ``interval_type`` is not recognised.
    """
    if interval_type == 'confidence':
        add_one = 0
    elif interval_type == 'prediction':
        add_one = 1
    else:
        print("Choose interval_type as confidence or prediction")
        return
    n = len(x_values)
    x_mean = x_values.mean()
    t_value = stats.t.ppf(1 - alpha / 2, df = n - 2)
    RSE = np.sqrt((ols_result.resid ** 2).sum() / (n - 2))
    numerator = (conf_x - x_mean) ** 2
    denominator = ((x_values - x_mean) ** 2).sum()
    interval = t_value * RSE * np.sqrt(add_one + 1 / n + numerator / denominator)
    # Use the model that was passed in (not a global `res`), and read the
    # coefficients by position so both ndarray and label-indexed params work.
    intercept, slope = np.asarray(ols_result.params)[:2]
    prediction = intercept + slope * conf_x
    return (prediction - interval, prediction + interval)