<a href="https://colab.research.google.com/github/BehrangEbrahimi13/Repo_Paper_01/blob/imputation_methods/Paper_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Functions

## Random Null Generation for a Dataset

In [1]:
import random
import numpy as np
import pandas as pd

def generate_random_array(shape, low, high, round, seed=None, as_dataframe=False):
    """Draw a uniformly distributed random array and round its values.

    Parameters
    ----------
    shape : int or tuple of int
        Shape of the array to generate.
    low, high : float
        Bounds handed to ``np.random.uniform``.
    round : int
        Number of decimals to keep. The name shadows the builtin ``round``
        inside this function only; it is kept because callers pass it by keyword.
    seed : int or None, optional
        Value passed to ``np.random.seed``. Note that this reseeds NumPy's
        *global* generator, so later unseeded ``np.random`` calls are affected.
    as_dataframe : bool, optional
        When true, wrap the result in a ``pandas.DataFrame``.

    Returns
    -------
    numpy.ndarray or pandas.DataFrame
        The rounded random values.
    """
    np.random.seed(seed)
    samples = np.random.uniform(low, high, size=shape)
    rounded = np.round(samples, decimals=round)
    return pd.DataFrame(rounded) if as_dataframe else rounded

def generate_random_nulls(dataset, percentage, seed=None, as_dataframe=False):
    """Return a copy of ``dataset`` with a random fraction of its cells set to NaN.

    Parameters
    ----------
    dataset : numpy.ndarray or pandas.DataFrame
        Complete data. It is never modified; a copy is masked instead.
    percentage : float
        Fraction of cells to blank out, between 0 and 1 inclusive.
        ``int(dataset.size * percentage)`` cells are selected.
    seed : int or None, optional
        Value passed to ``np.random.seed``. This reseeds NumPy's *global*
        generator, so later unseeded ``np.random`` calls are affected.
    as_dataframe : bool, optional
        When true, return a ``pandas.DataFrame`` (an ndarray input is wrapped);
        otherwise return a ``numpy.ndarray``.

    Returns
    -------
    numpy.ndarray or pandas.DataFrame
        Masked copy of the data. Integer and boolean data are promoted to
        float because NaN cannot be stored in those dtypes.

    Raises
    ------
    ValueError
        If ``percentage`` is outside ``[0, 1]``.
    """
    if not 0 <= percentage <= 1:
        raise ValueError(f"percentage must be between 0 and 1, got {percentage!r}")

    temp = dataset.copy()
    np.random.seed(seed)
    null_mask_indices = np.random.choice(temp.size, size=int(temp.size * percentage), replace=False)

    # Build the mask as a standalone NumPy array. Writing through
    # ``DataFrame.values`` is not reliable: it is read-only under pandas
    # Copy-on-Write and is not guaranteed to be a view of the frame.
    flat_mask = np.zeros(temp.size, dtype=bool)
    flat_mask[null_mask_indices] = True
    null_mask = flat_mask.reshape(temp.shape)  # row-major, like the flat indices

    if as_dataframe:
        frame = temp if isinstance(temp, (pd.DataFrame, pd.Series)) else pd.DataFrame(temp)
        return frame.where(~null_mask)

    # ``temp`` is already a private copy when it is an ndarray; a DataFrame is
    # converted with an explicit copy so the result is writable.
    values = temp if isinstance(temp, np.ndarray) else np.array(temp)
    if values.dtype.kind in "biu":
        # NaN is not representable in bool/int arrays.
        values = values.astype(float)
    values[null_mask] = np.nan
    return values

# Module 1 : PMIC
Feature selection generally cannot be applied directly to data with missing values, so imputation is usually performed first. However, irrelevant features severely degrade imputation quality, so we instead use the Maximal Information Coefficient (MIC) to select features directly on the incomplete data via a "partial sample strategy" (PSS); we call this approach **PMIC**. Under the partial sample strategy, MIC is calculated using only the available (non-missing) values of each feature variable and the class variable.

In [None]:
!pip install minepy

In [None]:
import numpy as np
from minepy import MINE

mine = MINE()
def pmic_feature_selection(F, C, m):
    """Rank features by partial-sample MIC against the class variable.

    For every feature column, the MIC score is computed on the rows where both
    the feature value and the class value are present (not NaN). The scoring
    itself is delegated to the module-level ``mine`` object.

    Parameters
    ----------
    F : array-like of shape (n_samples, n_features)
        Feature matrix; missing entries are NaN.
    C : array-like of length n_samples
        Class variable, one value per row of ``F``.
    m : int
        Number of feature indices to return.

    Returns
    -------
    numpy.ndarray
        Column indices of the ``m`` highest-scoring features, in order of
        decreasing score. Columns with fewer than two usable observations are
        not scored and keep a score of 0.
    """
    # Accept DataFrames / lists as well as ndarrays.
    F = np.asarray(F, dtype=float)
    C = np.asarray(C, dtype=float).ravel()
    class_observed = ~np.isnan(C)

    num_features = F.shape[1]
    pmic_scores = np.zeros(num_features)

    for i in range(num_features):
        # Partial sample: rows where column i AND the class value are available.
        mask = ~np.isnan(F[:, i]) & class_observed

        # A dependence score is not meaningful on fewer than two paired
        # observations; leave the score at 0 instead of handing an empty or
        # single-point sample to the estimator.
        if np.count_nonzero(mask) < 2:
            continue

        # Calculate the MIC (Maximal Information Coefficient) score for the current feature and class
        mine.compute_score(F[mask, i], C[mask])
        pmic_scores[i] = mine.mic()

    # Sort the indices of pmic_scores in descending order and select the top m indices
    top_m_features_idx = np.argsort(pmic_scores)[::-1][:m]
    return top_m_features_idx

## Example Module 1 with Functions

In [None]:
# Demo settings
seed = 48
percentage = 0.5
select_top = 3
row = 5

# Complete 5x4 feature matrix: uniform draws between 0 and 10, rounded to whole numbers
X = generate_random_array(shape=(row, 4), low=0, high=10, round=0, seed=seed, as_dataframe=False)
print("complete X : \n", X)

# Class vector of length 5, generated the same way.
# NOTE: it reuses the seed used for X, so its values replay X's first draws.
Y = generate_random_array(shape=(row,), low=0, high=10, round=0, seed=seed, as_dataframe=False)
print("\ny: \n", Y)

# Blank out a fraction of X, then rank the features on the incomplete data
X_with_null = generate_random_nulls(dataset=X, percentage=percentage, seed=seed, as_dataframe=False)
selected_idx = pmic_feature_selection(F=X_with_null, C=Y, m=select_top)
selected_features = X_with_null[:, selected_idx]

print("\nX_with_null: \n", X_with_null)
print(f"\nSelected {select_top} indices of features ({percentage * 100} percent null):\n", selected_idx)
print(f"\nSelected {select_top} features ({percentage * 100} percent null):\n", selected_features)

# Module 2 : Imputation for the missing data

## Non-negative Latent Factor

Description:
*   R: Incomplete data matrix of shape (n, m).
*   d: Rank of the non-negative latent factors.
*   lambda1, lambda2: Regularization parameters.
*   max_iter: Maximum number of iterations.








In [3]:
import numpy as np

def Imputes_the_missing_values_By_non_negative_latent_factor(R, d, lambda1, lambda2, max_iter, seed=None, decimals=3):
    """Fill the NaN entries of ``R`` with a regularised non-negative factorisation.

    Two non-negative factors ``P`` (n x d) and ``Q`` (d x m) are initialised
    randomly and refined with multiplicative updates that only look at the
    observed entries of ``R``. Missing entries are then replaced by the
    corresponding entries of ``P @ Q``; observed entries are returned as-is.
    Equation numbers in the comments refer to the paper this notebook follows.

    The updates keep ``P`` and ``Q`` non-negative only when the observed values
    of ``R`` are themselves non-negative.

    Parameters
    ----------
    R : array-like of shape (n, m)
        Incomplete data matrix; missing entries are NaN. Not modified.
    d : int
        Rank of the latent factors.
    lambda1, lambda2 : float
        Regularisation weights used in the updates of ``P`` and ``Q``.
    max_iter : int
        Maximum number of update iterations.
    seed : int or None, optional
        When given, the factors are initialised from a private
        ``np.random.RandomState(seed)``. When ``None`` (default) NumPy's global
        generator is used, exactly as before.
    decimals : int, optional
        Number of decimals the imputed values are rounded to (default 3).

    Returns
    -------
    numpy.ndarray
        Completed matrix of shape (n, m).
    """
    n, m = R.shape
    R_copy = np.array(R, dtype=float)  # private float copy; R itself is left untouched

    # Initialize non-negative matrices P and Q randomly (P first, then Q)
    rng = np.random if seed is None else np.random.RandomState(seed)
    P = rng.rand(n, d)
    Q = rng.rand(d, m)

    # Initialize I according to (17): 1 where R is observed, 0 where it is missing
    R_nan_mask = np.isnan(R_copy)
    I = (~R_nan_mask).astype(float)

    # Set zero for Null so missing cells contribute nothing to the products below
    R_copy[R_nan_mask] = 0
    observed = I * R_copy  # loop-invariant, computed once

    # Lower bound for the denominators. Without it, a fully missing row/column
    # combined with a zero regularisation weight yields 0/0 = NaN, which then
    # spreads to every imputed value.
    eps = 1e-12

    iteration = 0
    converge = False

    while not converge and iteration < max_iter:
        # Update P according to (22)
        P_denominator = (I * (P @ Q)) @ Q.T + lambda1 * P
        P_new = P * (observed @ Q.T) / np.maximum(P_denominator, eps)

        # Update Q according to (23)
        Q_denominator = P_new.T @ (I * (P_new @ Q)) + lambda2 * Q
        Q_new = Q * (P_new.T @ observed) / np.maximum(Q_denominator, eps)

        # Check convergence: stop once neither factor changes noticeably
        if np.allclose(P, P_new) and np.allclose(Q, Q_new):
            converge = True

        P = P_new
        Q = Q_new
        iteration += 1

    # Impute R by (11) and obtain R_cpl: keep observed values, fill the rest from P @ Q
    PQ = np.round(P @ Q, decimals=decimals)
    R_cpl = np.where(R_nan_mask, PQ, R)

    return R_cpl


## Hyperimpute

In [None]:
!pip install hyperimpute

In [10]:
import pandas as pd
import numpy as np
from hyperimpute.plugins.imputers import Imputers
# NOTE: `imputers` is not used further down in this cell; the plugin is
# fetched from a fresh Imputers() instance below.
imputers = Imputers()

# Name of the imputation plugin to request
method = "gain"

# Small frame with two missing values in the second row
X = pd.DataFrame(
    [
        [1, 4, 7, 10],
        [4, 7, np.nan, np.nan],
        [3, 6, 9, 12],
        [8, 11, 14, 17],
    ]
)

# Fit the plugin on a copy of X and round the completed frame to two decimals
plugin = Imputers().get(method)
imputed = plugin.fit_transform(X.copy())
out = imputed.round(2)

print(method, out)

gain      0     1     2      3
0  1.0   4.0   7.0  10.00
1  4.0   7.0  10.3  13.45
2  3.0   6.0   9.0  12.00
3  8.0  11.0  14.0  17.00


## Example Module 2 Non-negative Latent Factor and GAIN with Functions

In [11]:
seed = 48
# Named `decimals` rather than `round`: a notebook-level `round = 2` would
# replace the builtin round() for every cell executed afterwards.
decimals = 2
percentage = 0.3
row = 5

# Generate a random 5x4 array of uniform values between 0 and 10, rounded to two decimals
X = generate_random_array(shape=(row, 4), low=0, high=10, round=decimals, seed=seed, as_dataframe=False)
print("complete X : \n", X)

# Blank out a fraction of the cells
X_with_null = generate_random_nulls(dataset=X, percentage=percentage, seed=seed, as_dataframe=False)
print("\nX_with_null : \n", X_with_null)

# Settings for the non-negative latent factor imputation
R = np.copy(X_with_null)
d = 2
lambda1 = 0.1
lambda2 = 0.2
max_iter = 100

# Code execution
R_cpl = Imputes_the_missing_values_By_non_negative_latent_factor(R, d, lambda1, lambda2, max_iter)
print("\nComplete data after imputation by non_negative_latent_factor: \n", R_cpl)


# Impute the same incomplete matrix with the hyperimpute plugin for comparison
method = "gain"
plugin = Imputers().get(method)
out = plugin.fit_transform(R.copy()).round(decimals)

print(f'\nComplete data after imputation by {method}: \n', out)

complete X : 
 [[0.17 8.92 2.85 2.99]
 [7.92 3.24 8.65 4.48]
 [5.48 3.57 1.12 1.42]
 [4.45 7.32 4.6  5.93]
 [3.37 4.54 1.87 4.09]]

X_with_null : 
 [[0.17  nan 2.85 2.99]
 [7.92  nan 8.65  nan]
 [ nan  nan 1.12 1.42]
 [ nan 7.32 4.6  5.93]
 [3.37 4.54 1.87 4.09]]

Complete data after imputation by non_negative_latent_factor: 
 [[ 0.17   0.235  2.85   2.99 ]
 [ 7.92   9.428  8.65  10.738]
 [ 0.906  1.078  1.12   1.42 ]
 [ 5.593  7.32   4.6    5.93 ]
 [ 3.37   4.54   1.87   4.09 ]]

Complete data after imputation by gain: 
       0     1     2     3
0  0.17  4.57  2.85  2.99
1  7.92  5.84  8.65  4.05
2  1.29  4.57  1.12  1.42
3  3.75  7.32  4.60  5.93
4  3.37  4.54  1.87  4.09
