<a href="https://colab.research.google.com/github/BehrangEbrahimi13/Repo_Paper_01/blob/imputation_methods/Imputation_Methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
pip install hyperimpute

Collecting hyperimpute
  Downloading hyperimpute-0.1.17-py3-none-any.whl (92 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/92.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m92.2/92.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.9/92.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting catboost>=1.0.5 (from hyperimpute)
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optuna>=3.1 (from hyperimpute)
  Downloading optuna-3.3.0-py3-none-any.whl (404 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.2/404.2 kB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting loguru==0.6.0 (from hyperimpute)
  Downloading loguru-0.6.0-py3-none-any.wh

## List available imputers

In [5]:
from hyperimpute.plugins.imputers import Imputers

imputers = Imputers()

imputers.list

<bound method PluginLoader.list of <hyperimpute.plugins.imputers.Imputers object at 0x7995f96fdbd0>>

## Impute a dataset using one of the available methods

In [15]:
import pandas as pd
import numpy as np
from hyperimpute.plugins.imputers import Imputers

X = pd.DataFrame([[1, 4, 7, 10], [4, 7, np.nan, np.nan], [3, 6, 9, 12], [8, 11, 14, 17]])

method = "gain"

plugin = Imputers().get(method)
out = plugin.fit_transform(X.copy())

print(method, out)

gain      0     1         2          3
0  1.0   4.0   7.00000  10.000000
1  4.0   7.0  10.30363  13.454131
2  3.0   6.0   9.00000  12.000000
3  8.0  11.0  14.00000  17.000000


## Specify the baseline models for HyperImpute

In [13]:
import pandas as pd
import numpy as np
from hyperimpute.plugins.imputers import Imputers

X = pd.DataFrame([[1, 2, 3, 4], [4, 5, np.nan, np.nan], [3, 4, 5, 6], [8, 9, 10, 11]])

plugin = Imputers().get(
    "hyperimpute",
    optimizer="hyperband",
    classifier_seed=["logistic_regression"],
    regression_seed=["linear_regression"],
)

out = plugin.fit_transform(X.copy())
print(out)

   0  1     2     3
0  1  2   3.0   4.0
1  4  5  10.0  11.0
2  3  4   5.0   6.0
3  8  9  10.0  11.0


## Use an imputer with a SKLearn pipeline

In [9]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

from hyperimpute.plugins.imputers import Imputers

X = pd.DataFrame([[1, 1, 1, 1], [4, 5, np.nan, np.nan], [3, 3, 9, 9], [2, 2, 2, 2]])
y = pd.Series([1, 2, 1, 2])

imputer = Imputers().get("hyperimpute")

estimator = Pipeline(
    [
        ("imputer", imputer),
        ("forest", RandomForestRegressor(random_state=0, n_estimators=100)),
    ]
)

estimator.fit(X, y)

## Write a new imputation plugin

In [10]:
from sklearn.impute import KNNImputer
from hyperimpute.plugins.imputers import Imputers, ImputerPlugin

imputers = Imputers()

knn_imputer = "custom_knn"

class KNN(ImputerPlugin):
    def __init__(self) -> None:
        super().__init__()
        self._model = KNNImputer(n_neighbors=2, weights="uniform")

    @staticmethod
    def name():
        return knn_imputer

    @staticmethod
    def hyperparameter_space():
        return []

    def _fit(self, *args, **kwargs):
        self._model.fit(*args, **kwargs)
        return self

    def _transform(self, *args, **kwargs):
        return self._model.transform(*args, **kwargs)

imputers.add(knn_imputer, KNN)

assert imputers.get(knn_imputer) is not None

## Benchmark imputation models on a dataset

In [11]:
from sklearn.datasets import load_iris
from hyperimpute.plugins.imputers import Imputers
from hyperimpute.utils.benchmarks import compare_models

X, y = load_iris(as_frame=True, return_X_y=True)

imputer = Imputers().get("hyperimpute")

compare_models(
    name="example",
    evaluated_model=imputer,
    X_raw=X,
    ref_methods=["ice", "missforest"],
    scenarios=["MAR"],
    miss_pct=[0.1, 0.3],
    n_iter=2,
)

RMSE score


Unnamed: 0,Scenario,"miss_pct [0, 1]",Evaluated: hyperimpute,ice,missforest
0,MAR,0.1,0.0983 +/- 0.0091,0.106 +/- 0.0015,0.1216 +/- 0.0206
1,MAR,0.3,0.1054 +/- 0.0193,0.1053 +/- 0.0192,0.135 +/- 0.0319




Wasserstein score


Unnamed: 0,Scenario,"miss_pct [0, 1]",Evaluated: hyperimpute,ice,missforest
0,MAR,0.1,0.0127 +/- 0.0006,0.0141 +/- 0.0013,0.0173 +/- 0.0029
1,MAR,0.3,0.029 +/- 0.0056,0.0289 +/- 0.0056,0.0475 +/- 0.0118


{'headers': ['Scenario',
  'miss_pct [0, 1]',
  'Evaluated: hyperimpute',
  'ice',
  'missforest'],
 'rmse': [['MAR',
   0.1,
   (0.09830010959970974, 0.009087863539277726),
   (0.10596553338472367, 0.0015358718132642104),
   (0.121627443606942, 0.020623701163109347)],
  ['MAR',
   0.3,
   (0.10535365666931229, 0.019271062034974487),
   (0.10533746002992339, 0.019248616508364152),
   (0.13502500850731694, 0.03194422221595612)]],
 'wasserstein': [['MAR',
   0.1,
   (0.012733922575043782, 0.0006316297750543978),
   (0.014126519159535287, 0.0012984106221138963),
   (0.017300277905512536, 0.002897924284334148)],
  ['MAR',
   0.3,
   (0.02897734339653076, 0.0056122959603456795),
   (0.02894135919269574, 0.005562419900322493),
   (0.0474600482901158, 0.01179883614662304)]]}