# Intervalle de prédiction - Random Forest

In [1]:
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
 
boston = load_boston()
X = boston["data"]
y = boston["target"]
n = len(boston["data"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [2]:
np.size(X)

6578

In [3]:
boston['feature_names']

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [10]:
boston['data']

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [5]:
X_train[0]

array([ 73.5341,   0.    ,  18.1   ,   0.    ,   0.679 ,   5.957 ,
       100.    ,   1.8026,  24.    , 666.    ,  20.2   ,  16.45  ,
        20.62  ])

## Random Forest

In [13]:
# TODO : Entraîner un Random Forest
rf = RandomForestRegressor(max_depth=12, n_estimators=10, min_samples_leaf=5)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=5, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [14]:
from sklearn.metrics import r2_score

r2_score(y_test, rf.predict(X_test))

0.8646195018713081

In [15]:
def rf_pred_int(model, X, percentile=95):
    """
    Calcule un intervalle de prédiction empirique à partir des erreurs d'entraînement
    """
    err_down = []
    err_up = []
    # Pour chaque observation
    for i in range(len(X)):
        preds = []
        # TODO : Calculer la prédiction en sortie de chaque arbre du Random Forest
        for estimator in rf.estimators_:
            prediction = estimator.predict([X[i, :]])[0]
            preds.append(prediction)
        # TODO : Calculer les quantiles en fonction du paramètre 'percentile'
        err_down.append(np.percentile(preds, (100 - percentile) / 2))
        err_up.append(np.percentile(preds, 100 - (100 - percentile) / 2))
        
    return err_down, err_up,np.array(preds)

In [16]:
X_test[:2, :]

array([[6.5880e-02, 0.0000e+00, 2.4600e+00, 0.0000e+00, 4.8800e-01,
        7.7650e+00, 8.3300e+01, 2.7410e+00, 3.0000e+00, 1.9300e+02,
        1.7800e+01, 3.9556e+02, 7.5600e+00],
       [6.3200e-03, 1.8000e+01, 2.3100e+00, 0.0000e+00, 5.3800e-01,
        6.5750e+00, 6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02,
        1.5300e+01, 3.9690e+02, 4.9800e+00]])

In [17]:
rf.predict(X_test[:1, :])

array([40.83455221])

In [11]:
rf_pred_int(rf, X_test[:1, :])


([17.352363636363638],
 [23.830454545454543],
 array([19.98      , 20.38      , 19.7       , 17.48      , 19.375     ,
        19.27      , 21.53076923, 18.5875    , 21.26      , 20.61428571,
        20.77777778, 22.56666667, 19.89285714, 24.83333333, 19.3       ,
        20.73478261, 17.04      , 20.67272727, 21.87692308, 20.14666667,
        20.79285714, 19.33076923, 19.45      , 20.69090909, 22.29      ,
        20.87272727, 18.83636364, 24.03      , 19.1       , 19.54545455,
        17.79285714, 23.09      , 20.5125    , 21.41666667, 20.47333333,
        20.87857143, 18.93333333, 19.7625    , 19.35714286, 18.88461538,
        24.7125    , 19.5       , 22.4       , 19.4125    , 19.6       ,
        18.96363636, 21.4375    , 22.4625    , 18.84      , 19.02      ,
        20.46666667, 20.65714286, 19.35      , 16.1       , 18.60909091,
        23.0375    , 22.07692308, 18.66      , 20.23333333, 18.7625    ,
        21.68181818, 18.26153846, 23.        , 18.76666667, 20.4125    ,
     

In [29]:
err_down, err_up = rf_pred_int(rf, X_train, percentile=95)
 
truth = y_train
correct = 0

# TODO : Calculer la proportion d'observation à l'intérieur de l'intervalle de prédiction empirique
for i, val in enumerate(truth):
    if err_down[i] <= val <= err_up[i]:
        correct += 1

print("Pourcentage d'observations dans l'intervalle : {:2.1f}%".format(correct / len(truth) * 100))

ValueError: too many values to unpack (expected 2)