# Partie 1 : Imputation de données manquantes        

###### 1. Importation des librairies

In [30]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import pandas as pd

###### 2. Génération de données bi-dimensionnelles

In [31]:
n_base = 100

data1 = np.random.randn(n_base, 2) + [5, 5]
data2 = np.random.randn(n_base, 2) + [3, 2]
data3 = np.random.randn(n_base, 2) + [1, 5]

data = np.concatenate((data1, data2, data3))

print("Data shape is: ", data.shape)

np.random.shuffle(data)

n_samples = data.shape[0]

Data shape is:  (300, 2)


###### 3. Fabrication d'un jeu de données avec données manquantes

In [19]:
# définition du taux de lignes à valeurs manquantes
missing_rate = 0.3
n_missing_samples = int(np.floor(n_samples * missing_rate))

# choix des lignes à valeurs manquantes
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, dtype=np.bool), np.ones(n_missing_samples, dtype=np.bool)))
np.random.shuffle(missing_samples)

# obtenir la matrice avec données manquates: manque indiqué 
# par valeurs NaN dans la seconde colonne pour les lignes 
# True dans missing_samples

data_missing = data.copy()
data_missing[np.where(missing_samples), 1] = np.nan
print("data_missing shape is: ", data_missing.shape)

data_missing shape is:  (300, 2)


In [20]:
# imputation par la moyenne
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
data_imputed = imp.fit_transform(data_missing)
#print(data_imputed)
print("data_imputed shape is: ", data_imputed.shape)

data_imputed shape is:  (300, 2)


###### Calcul de l'erreur d'imputation

In [21]:
# calculer l'erreur d'imputation
err = mean_squared_error(data[missing_samples, 1], data_imputed[missing_samples, 1])
err

2.8267318502127416

###### 4. Affichage des moyennes calculéss

In [22]:
print("Calculated Mean: ")
print(data_imputed.mean(0)[1])

Calculated Mean: 
3.973430364660325


### les données en DataFrame

In [23]:
pd.DataFrame(data_missing).head(15)

Unnamed: 0,0,1
0,3.631781,1.476879
1,4.972459,
2,0.485158,4.28788
3,0.379828,
4,-0.696175,4.238461
5,4.041259,5.259488
6,4.149452,3.098959
7,5.047256,
8,2.150541,2.775878
9,0.447206,5.816301


In [24]:
pd.DataFrame(data_imputed).head(15)

Unnamed: 0,0,1
0,3.631781,1.476879
1,4.972459,3.97343
2,0.485158,4.28788
3,0.379828,3.97343
4,-0.696175,4.238461
5,4.041259,5.259488
6,4.149452,3.098959
7,5.047256,3.97343
8,2.150541,2.775878
9,0.447206,5.816301


On remarque que c'est la valeur calculée qui a remplacé les NaN. La moyennes sur la variables.

###### 5. Imputation par median

In [25]:
# imputation par la median
imp1 = SimpleImputer(missing_values=np.nan, strategy='median')
data_imputed1 = imp1.fit_transform(data_missing)
#print(data_imputed)
print("data_imputed1 shape is: ", data_imputed1.shape)
# calculer l'erreur d'imputation
err1 = mean_squared_error(data[missing_samples, 1], data_imputed1[missing_samples, 1])
print("Median calculé: ")
print(np.median(data_imputed1, 0)[1])
print("Error: ", err1)

data_imputed1 shape is:  (300, 2)
Median calculé: 
4.317039994506166
Error:  3.021455229883478


In [26]:
pd.DataFrame(data_imputed1).head(15)

Unnamed: 0,0,1
0,3.631781,1.476879
1,4.972459,4.31704
2,0.485158,4.28788
3,0.379828,4.31704
4,-0.696175,4.238461
5,4.041259,5.259488
6,4.149452,3.098959
7,5.047256,4.31704
8,2.150541,2.775878
9,0.447206,5.816301


###### 6. Imputation par une valeur constatnte: zéro par exemple

In [27]:
# imputation par la constant zero
imp2 = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
data_imputed2 = imp2.fit_transform(data_missing)
#print(data_imputed)
print("data_imputed2 shape is: ", data_imputed2.shape)
# calculer l'erreur d'imputation
err2 = mean_squared_error(data[missing_samples, 1], data_imputed2[missing_samples, 1])
print("Error: ", err2)

data_imputed2 shape is:  (300, 2)
Error:  17.728448946759915


L'erreur est plus car on prend le carrée de chaque donnée imputé de la variable data sans
retrancher rien, ```data[:, 1] - 0```
![image](images/zero.png)

In [28]:
pd.DataFrame(data_imputed2).head(15)

Unnamed: 0,0,1
0,3.631781,1.476879
1,4.972459,0.0
2,0.485158,4.28788
3,0.379828,0.0
4,-0.696175,4.238461
5,4.041259,5.259488
6,4.149452,3.098959
7,5.047256,0.0
8,2.150541,2.775878
9,0.447206,5.816301


![image](images/ALLDATA.png)

###### 7. Imputation parles k-plus-proches-voisins: k-nearest-neignbors 

La methode KNNImputer nécessaite la version 0.22 de scikit-learn (Calculs réalisé en ligne)

In [1]:
# 7) imputation par kkn 
from sklearn.impute import KNNImputer
knnimp = KNNImputer(n_neighbors=2, weights="uniform")
knn_data_imputed = knnimp.fit_transform(data_missing)
err3 = mean_squared_error(data[missing_samples, 1], knn_data_imputed[missing_samples, 1])
print("KNN imputation: ")
print(np.median(knn_data_imputed, 0))
print("Erreur: ", err3)

ImportError: cannot import name 'KNNImputer' from 'sklearn.impute' (C:\Users\elitebook\Anaconda3\lib\site-packages\sklearn\impute\__init__.py)

![image](images/knn.PNG)


### Conclusion
Lorsqu'on dispose des données manquantes, il vaut mieux remplacer avec des méthodes suivantes:

                mean, median ou KNN