## Install and import library


In [2]:
!pip install git+https://github.com/DangLeUyen/DPEImputation.git

Collecting git+https://github.com/DangLeUyen/DPEImputation.git
  Cloning https://github.com/DangLeUyen/DPEImputation.git to /private/var/folders/42/h0csfrkn2fvfq63027xhz7d40000gn/T/pip-req-build-q7ob8ddg
  Running command git clone --filter=blob:none --quiet https://github.com/DangLeUyen/DPEImputation.git /private/var/folders/42/h0csfrkn2fvfq63027xhz7d40000gn/T/pip-req-build-q7ob8ddg
  Resolved https://github.com/DangLeUyen/DPEImputation.git to commit eded69d53263a6e155b32c30d48b2f7329459fc3
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: DPEImputation
  Building wheel for DPEImputation (setup.py) ... [?25ldone
[?25h  Created wheel for DPEImputation: filename=DPEImputation-0.1.0-py3-none-any.whl size=6386 sha256=edeed749abc86d9fde58145f538a0ae866427d0218072737c29f5eb846991a5d
  Stored in directory: /private/var/folders/42/h0csfrkn2fvfq63027xhz7d40000gn/T/pip-ephem-wheel-cache-2208plv4/wheels/ea/b7/79/1faeb14dc67afa9acbaa5079fd4e482ee9c17a596e9ba

In [3]:
import numpy as np
import time
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [5]:
from DPEImputation import DPEImputer


In [6]:
def generate_randomly_missing(X , missing_rate):
    """
    Creates a randomly missing mask for the input data.

    Args:
        data (np.ndarray): The input data.
        missing_rate (float): The ratio of missing values to create.

    Returns:
        np.ndarray: An array with the same shape as `data` where missing values are marked as NaN.
    """
    
    non_missing = [0]
    X_copy=np.copy(X)
    
    X_non_missing_col = X_copy[:, non_missing]
    X1_missing = X_copy[:, [i for i in range(X.shape[1]) if i not in non_missing]]

    X_non_missing_row = X1_missing[non_missing]
    X_missing = X1_missing[len(non_missing):(X.shape[0]+1)]
    XmShape = X_missing.shape
    na_id = np.random.randint(0, X_missing.size, round(missing_rate * X_missing.size))
    X_nan = X_missing.flatten()
    X_nan[na_id] = np.nan
    X_nan = X_nan.reshape(XmShape)

    X1_nan = np.vstack((X_non_missing_row, X_nan))
    X_nan = np.hstack((X_non_missing_col, X1_nan))
    
    return X_nan

def normalize_data(X):
  scaler = StandardScaler()
  scaler.fit(X)
  return scaler.transform(X)

### Generating dataset with label

In [18]:
#create a sample data
X1 = np.random.randint(0, 100, size=(1000, 50)).astype('float64')
# Generate random labels (e.g., binary classification)
y = np.random.randint(0, 2, size=X1.shape[0])

X1 = normalize_data(X1)
#create missingness on data
missing_rate = 0.5
missing_X1 = generate_randomly_missing(X1, missing_rate)


In [19]:
imputer1 = DPEImputer()
start = time.time()

X1_imputed = imputer1.fit(missing_X1, y, window_size=7).transform(missing_X1, y)

rmse1 = mean_squared_error(X1, X1_imputed)
duration1 = time.time() - start
print("Imputation done after: {} (seconds) and have Rmse = {}".format(duration1, rmse1))

Imputation done after: 12.21154522895813 (seconds) and have Rmse = 0.39080723212150814


### Generating dataset without label

In [22]:
#create a sample data
X2 = np.random.randint(0, 100, size=(1000, 50)).astype('float64')

#create missingness on data
X2 = normalize_data(X2)
missing_rate = 0.5
missing_X2 = generate_randomly_missing(X2, missing_rate)


In [23]:
imputer2 = DPEImputer()
start = time.time()
X2_imputed = imputer2.fit(missing_X2, window_size=5).transform(missing_X2)
rmse2 = mean_squared_error(X2, X2_imputed)
duration2 = time.time() - start
print("Imputation done after: {} (seconds) and have Rmse2 = {}".format(duration2, rmse2))

Imputation done after: 8.763720989227295 (seconds) and have Rmse2 = 0.38951171266514684


### Using digits dataset (with label)

In [24]:
from sklearn.datasets import load_digits

# Tải dữ liệu digits
digits = load_digits()

# Lấy dữ liệu đặc trưng và nhãn
X = digits.data
y = digits.target
print(X.shape)
rmid = np.where(sum(X!=0)<10)
X = np.delete(X, rmid,axis = 1)
print(X.shape)
X = normalize_data(X)

(1797, 64)
(1797, 54)


In [26]:
digits_missing_data = generate_randomly_missing(X , missing_rate)

start = time.time()
dpei = DPEImputer()
digits_imputed = dpei.fit(digits_missing_data, y, window_size=5).transform(digits_missing_data, y)
rmse3 = mean_squared_error(X, digits_imputed)
duration3 = time.time() - start
print("Imputation done after: {} (seconds) and have Rmse = {}".format(duration3, rmse3))

Imputation done after: 18.910322189331055 (seconds) and have Rmse = 0.19511689582472674
