# Imputation de données manquantes avec une forêt aléatoire
*Aude Sportisse*

In [1]:
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor

In [2]:
# Fix the global NumPy seed so that the experiments can be reproduced.
np.random.seed(0)

# Synthetic data simulated from a known model: n draws of a p-dimensional
# Gaussian vector with zero mean.
n = 1000
p = 10
mean = np.zeros(p, dtype=int)
# Covariance matrix: 1.0 on the diagonal, 0.5 everywhere else.
cov = 0.5 * np.ones((p, p)) + 0.5 * np.eye(p)

# Data set without any missing value.
X_complet = np.random.multivariate_normal(mean, cov, size=n)

In [3]:
# Show the first rows of the complete data set as a table.
(
    pd.DataFrame(X_complet)
    .head()
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.651699,-1.577604,-1.010682,-2.534306,-1.942683,-1.044865,-1.116008,-2.467032,-0.625458,-0.112224
1,0.403694,0.36921,-0.868323,-1.207519,-0.175903,-0.560134,0.490021,0.146642,0.176617,0.157438
2,2.473229,1.953623,3.062872,0.523392,0.899866,2.893027,2.665302,1.68379,1.231245,1.547132
3,-0.710457,-0.086647,-0.385689,-0.591911,0.641094,-0.270124,-0.07107,1.318722,-0.996541,0.003502
4,-0.366973,0.394682,0.872166,2.017242,1.227314,1.143759,-0.366388,-0.207854,1.17743,1.884901


In [4]:
# Missing-data mechanism: in every column, a fixed share of the rows is drawn
# at random without replacement and overwritten with NaN.
pourcentage_NA = 0.5  # share of entries removed in each column (50%)
nb_NA = int(np.floor(n * pourcentage_NA))  # number of rows blanked out per column

X_manquant = np.array(X_complet, copy=True)  # copy, so X_complet is left untouched
for j in range(p):
    pattern_NA = np.random.choice(n, nb_NA, replace=False)  # rows to blank out in column j
    X_manquant[pattern_NA, j] = np.nan

# Boolean matrix with the shape of X_manquant: True for finite entries,
# False for the NaN entries created above.
Mask = np.isfinite(X_manquant)

In [5]:
# Show the first rows of the incomplete data, with NaN cells highlighted in orange.
(
    pd.DataFrame(X_manquant)
    .head()
    .style
    .highlight_null(color='orange')
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.651699,,-1.010682,-2.534306,-1.942683,-1.044865,,-2.467032,,-0.112224
1,0.403694,0.36921,,,,,0.490021,,,0.157438
2,2.473229,1.953623,,0.523392,,,,,1.231245,
3,,-0.086647,,,,-0.270124,,1.318722,,
4,,0.394682,,,,1.143759,,,,1.884901


## Imputer en utilisant la moyenne

In [6]:
# Baseline imputation: NaN entries are filled with the SimpleImputer "mean"
# strategy (the library defaults, written out explicitly here).
X_mean = SimpleImputer(missing_values=np.nan, strategy="mean").fit_transform(
    X_manquant
)

In [8]:
def color_imputedvalues_orange(val, x_miss):
    """Return the CSS used to flag a table cell as an imputed value.

    Intended for element-wise use with a pandas ``Styler``. A cell is
    considered imputed when its value equals none of the entries of
    ``x_miss``.

    Parameters
    ----------
    val : scalar
        Value of the cell being styled.
    x_miss : array-like
        Data with missing entries. It is converted with ``np.asarray`` so
        that DataFrames and nested lists are compared by value (the ``in``
        operator on a DataFrame would test the column labels instead).

    Returns
    -------
    str
        ``'background-color: orange'`` when ``val`` is not found in
        ``x_miss``; an empty string (no styling) otherwise.

    Notes
    -----
    The test is value-based, not position-based: an imputed value that
    happens to equal some entry of ``x_miss`` is not highlighted. NaN never
    compares equal to anything, so a NaN ``val`` is always highlighted.
    """
    observed = np.asarray(x_miss)
    if np.any(observed == val):
        # Empty string rather than 'background-color: None', which is not a
        # valid CSS declaration.
        return ''
    return 'background-color: orange'

# Show the first rows after mean imputation; cells whose value is not found
# in X_manquant are coloured by color_imputedvalues_orange.
(
    pd.DataFrame(X_mean)
    .head()
    .style
    .applymap(color_imputedvalues_orange, x_miss=X_manquant)
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.651699,0.020219,-1.010682,-2.534306,-1.942683,-1.044865,0.011121,-2.467032,0.065752,-0.112224
1,0.403694,0.36921,0.089663,-0.002638,0.02546,-0.009188,0.490021,0.046442,0.065752,0.157438
2,2.473229,1.953623,0.089663,0.523392,0.02546,-0.009188,0.011121,0.046442,1.231245,-0.033378
3,-0.004717,-0.086647,0.089663,-0.002638,0.02546,-0.270124,0.011121,1.318722,0.065752,-0.033378
4,-0.004717,0.394682,0.089663,-0.002638,0.02546,1.143759,0.011121,0.046442,0.065752,1.884901


## Imputer avec une forêt aléatoire

In [9]:
# Tree-based imputation: an IterativeImputer driven by an ExtraTreesRegressor.
# Both random states are fixed for reproducibility; max_depth is not set and
# therefore keeps its default.
estimateur_rf = ExtraTreesRegressor(
    n_estimators=10,
    random_state=0,
)
X_rf = IterativeImputer(
    estimator=estimateur_rf,
    max_iter=50,
    random_state=0,
).fit_transform(X_manquant)



In [10]:
# Show the first rows after tree-based imputation; cells whose value is not
# found in X_manquant are coloured by color_imputedvalues_orange.
(
    pd.DataFrame(X_rf)
    .head()
    .style
    .applymap(color_imputedvalues_orange, x_miss=X_manquant)
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.651699,-1.025556,-1.010682,-2.534306,-1.942683,-1.044865,-0.634891,-2.467032,-1.305865,-0.112224
1,0.403694,0.36921,0.080302,0.15879,-0.099317,-0.069162,0.490021,0.405483,0.459631,0.157438
2,2.473229,1.953623,0.802843,0.523392,1.28505,1.437487,2.277692,0.983895,1.231245,0.445855
3,0.627407,-0.086647,0.292449,-0.256355,0.126218,-0.270124,0.465121,1.318722,0.282923,0.458103
4,0.381908,0.394682,1.375845,0.934248,0.935775,1.143759,0.251838,0.861533,0.332347,1.884901


## Comparaison des deux méthodes

In [11]:
def RMSE(X, X_true, mask):
    """Root-mean-square error between ``X`` and ``X_true`` on the masked-out entries.

    Only the positions where ``mask`` is falsy contribute to the error.

    Parameters
    ----------
    X : array-like
        Data to evaluate (e.g. an imputed matrix).
    X_true : array-like
        Reference data, same shape as ``X``.
    mask : array-like of bool
        Same shape as ``X``. Truthy entries are ignored; falsy entries are
        the positions on which the error is computed.

    Returns
    -------
    float
        Square root of the mean squared difference over the selected
        positions, or NaN when ``mask`` selects no position at all.
    """
    # Coerce to ndarrays so that lists and DataFrames are handled like arrays.
    X = np.asarray(X)
    X_true = np.asarray(X_true)
    missing = ~np.asarray(mask, dtype=bool)

    n_missing = missing.sum()
    if n_missing == 0:
        # Nothing to evaluate: return NaN explicitly instead of dividing 0 by 0.
        return np.nan

    residuals = X[missing] - X_true[missing]
    return np.sqrt((residuals ** 2).sum() / n_missing)

In [12]:
# Imputation error of the tree-based method, measured against the complete data.
RMSE(X=X_rf, X_true=X_complet, mask=Mask)

0.8611080489295024

In [13]:
# Imputation error of the mean-imputation baseline, measured against the complete data.
RMSE(X=X_mean, X_true=X_complet, mask=Mask)

0.9893929064810191