## Install and import library


In [1]:
!pip install git+https://github.com/DangLeUyen/DPEImputation.git

Collecting git+https://github.com/DangLeUyen/DPEImputation.git
  Cloning https://github.com/DangLeUyen/DPEImputation.git to /tmp/pip-req-build-fbym2wuz
  Running command git clone --filter=blob:none --quiet https://github.com/DangLeUyen/DPEImputation.git /tmp/pip-req-build-fbym2wuz
  Resolved https://github.com/DangLeUyen/DPEImputation.git to commit f07645d7e09535aba524c98b58db42708d929cc9
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.23.5 (from DPEImputation==0.1.0)
  Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting scikit-learn==1.4.2 (from DPEImputation==0.1.0)
  Downloading scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloadin

In [1]:
import numpy as np
import time
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from DPEImputation import DPEImputer

In [2]:
def generate_randomly_missing(X , missing_rate):
    """
    Creates a randomly missing mask for the input data.

    Args:
        data (np.ndarray): The input data.
        missing_rate (float): The ratio of missing values to create.

    Returns:
        np.ndarray: An array with the same shape as `data` where missing values are marked as NaN.
    """
    # Create a copy of the input array to avoid modifying the original data
    X_copy=np.copy(X)
    # Calculate the total number of elements in the array
    total_elements = X_copy.size
    # Determine the number of elements to be replaced with NaN
    num_missing = round(missing_rate * total_elements)
    # Generate random indices where NaN will be introduced
    missing_indices = np.random.randint(0, total_elements, num_missing)
    # Flatten the array to apply NaN values
    X_copy_flat = X_copy.flatten()
    # Assign NaN to the selected indices
    X_copy_flat[missing_indices] = np.nan
    # Reshape the array back to its original shape
    X_nan = X_copy_flat.reshape(X_copy.shape)

    return X_nan

def normalize_data(X):
  scaler = StandardScaler()
  scaler.fit(X)
  return scaler.transform(X)

### Generating dataset with label

In [3]:
#create a sample data
X1 = np.random.randint(0, 100, size=(1000, 50)).astype('float64')
# Generate random labels (e.g., binary classification)
y = np.random.randint(0, 2, size=X1.shape[0])

X1 = normalize_data(X1)
#create missingness on data
missing_rate = 0.5
missing_X1 = generate_randomly_missing(X1, missing_rate)


In [5]:
imputer1 = DPEImputer()
start = time.time()

X1_imputed = imputer1.fit(missing_X1, y).transform(missing_X1, y)

rmse1 = mean_squared_error(X1, X1_imputed)
duration1 = time.time() - start
print("Imputation done after: {} (seconds) and have Rmse = {}".format(duration1, rmse1))

Imputation done after: 35.77195453643799 (seconds) and have Rmse = 0.3977018835535527


### Generating dataset without label

In [6]:
#create a sample data
X2 = np.random.randint(0, 100, size=(1000, 50)).astype('float64')

#create missingness on data
X2 = normalize_data(X2)
missing_rate = 0.5
missing_X2 = generate_randomly_missing(X2, missing_rate)


In [7]:
imputer2 = DPEImputer()
start = time.time()
X2_imputed = imputer2.fit_transform(missing_X2)
rmse2 = mean_squared_error(X2, X2_imputed)
duration2 = time.time() - start
print("Imputation done after: {} (seconds) and have Rmse2 = {}".format(duration2, rmse2))

Imputation done after: 22.974262237548828 (seconds) and have Rmse2 = 0.39730586019212516


### Using digits dataset (with label)

In [8]:
from sklearn.datasets import load_digits

# Tải dữ liệu digits
digits = load_digits()

# Lấy dữ liệu đặc trưng và nhãn
X = digits.data
y = digits.target
print(X.shape)
rmid = np.where(sum(X!=0)<10)
X = np.delete(X, rmid,axis = 1)
print(X.shape)
X = normalize_data(X)

(1797, 64)
(1797, 54)


In [9]:
digits_missing_data = generate_randomly_missing(X , missing_rate)

start = time.time()
dpei = DPEImputer()
digits_imputed = dpei.fit(digits_missing_data, y, window_size=5).transform(digits_missing_data, y)
rmse3 = mean_squared_error(X, digits_imputed)
duration3 = time.time() - start
print("Imputation done after: {} (seconds) and have Rmse = {}".format(duration3, rmse3))

Imputation done after: 34.36234378814697 (seconds) and have Rmse = 0.19812364106759478
