<a href="https://colab.research.google.com/github/BehrangEbrahimi13/Repo_Paper_01/blob/v2.1.0%23Implementation/Paper_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Functions

## Random Null Generation for a Dataset

In [57]:
import random
import numpy as np
import pandas as pd

def generate_random_array(shape, low, high, round, seed=None, as_dataframe=False):
    """Draw a reproducible array of uniformly sampled, rounded values.

    Args:
        shape: Output shape, forwarded as ``size`` to ``np.random.uniform``.
        low: Lower bound forwarded to ``np.random.uniform``.
        high: Upper bound forwarded to ``np.random.uniform``.
        round: Number of decimals kept after rounding. The name shadows the
            builtin inside this function only; it is kept because callers
            pass it by keyword.
        seed: Value handed to ``np.random.seed``. Note that this reseeds
            NumPy's *global* legacy generator.
        as_dataframe: When true, wrap the result in a ``pandas.DataFrame``.

    Returns:
        The rounded samples, either as a NumPy array or as a DataFrame.
    """
    decimals = round
    np.random.seed(seed)
    samples = np.random.uniform(low, high, size=shape)
    values = samples.round(decimals)
    return pd.DataFrame(values) if as_dataframe else values

def generate_random_nulls(dataset, percentage, seed=None, as_dataframe=False):
    """Return a copy of ``dataset`` with a random fraction of its cells set to NaN.

    Args:
        dataset: A NumPy array (``as_dataframe=False``) or a pandas DataFrame
            (``as_dataframe=True``). The input itself is never modified.
        percentage: Fraction of cells to blank out, between 0 and 1 inclusive.
            The number of blanked cells is ``int(dataset.size * percentage)``.
        seed: Value handed to ``np.random.seed``. Note that this reseeds
            NumPy's *global* legacy generator.
        as_dataframe: Selects the DataFrame code path; must match the type of
            ``dataset``.

    Returns:
        A new array or DataFrame with the selected cells replaced by NaN.
        Integer input is returned as float because integers cannot hold NaN.

    Raises:
        ValueError: If ``percentage`` lies outside ``[0, 1]``.
    """
    if not 0 <= percentage <= 1:
        raise ValueError(f"percentage must be between 0 and 1, got {percentage!r}")

    temp = dataset.copy()
    total_cells = temp.size
    np.random.seed(seed)
    # Flat (row-major) positions of the cells that become NaN.
    null_mask_indices = np.random.choice(
        total_cells, size=int(total_cells * percentage), replace=False
    )

    if as_dataframe:
        # Build the mask as a plain ndarray: writing through `DataFrame.values`
        # is not reliable (it is read-only under pandas Copy-on-Write).
        null_mask = np.zeros(temp.shape, dtype=bool)
        null_mask.flat[null_mask_indices] = True
        return temp.mask(null_mask)

    if np.issubdtype(temp.dtype, np.integer):
        # NaN cannot be stored in an integer array; promote to float first.
        temp = temp.astype(float)
    # `.flat` assigns in place regardless of the array's memory layout.
    temp.flat[null_mask_indices] = np.nan
    return temp

# Module 1 :
Generally, feature selection does not work on missing data, so imputation is needed beforehand. However, because irrelevant features severely affect imputation, we use the Maximal Information Coefficient (MIC) to select features directly on the missing data by means of a "partial sample strategy" (PSS); the resulting method is called **PMIC**. The "partial sample strategy" means using only the available (non-missing) values of the feature variables and the class variable to calculate MIC.

In [None]:
!pip install minepy

In [58]:
import numpy as np
from minepy import MINE

mine = MINE()
def pmic_feature_selection(F, C, m):
    """Rank features by MIC against the class and return the top ``m`` indices.

    Each feature is scored using only the rows where that feature is not NaN
    (the "partial sample strategy"); the class values are filtered with the
    same row mask.

    Args:
        F: 2-D feature array indexed as ``F[:, i]``; may contain NaN.
        C: Class values, indexed with the per-feature boolean row mask
            (presumably a 1-D array with one entry per row of ``F`` - verify
            against the caller).
        m: Number of feature indices to return.

    Returns:
        Array of column indices of ``F``, ordered from highest to lowest MIC
        score and truncated to ``m`` entries.
    """
    scores = []
    for col in range(F.shape[1]):
        column = F[:, col]
        # Rows where this feature is actually observed.
        observed = ~np.isnan(column)

        # Score the feature against the class on the observed rows only,
        # using the module-level `mine` estimator.
        mine.compute_score(column[observed], C[observed])
        scores.append(mine.mic())

    # Ascending argsort reversed -> indices in descending score order.
    ranking = np.argsort(np.asarray(scores, dtype=float))[::-1]
    return ranking[:m]

In [None]:
# Demo: build a small random dataset, blank out part of it, and run PMIC
# feature selection on the incomplete data.
seed = 48
percentage = 0.5
select_top = 3
row = 5
n_features = 4

# Generate a random (row x n_features) array of whole-number values.
# Rounding uniform samples from [0, 10) to 0 decimals yields values 0..10.
X = generate_random_array(shape=(row, n_features), low=0, high=10, round=0, seed=seed, as_dataframe=False)
print("complete X : \n", X)

# Generate a random class vector with one whole-number value (0..10) per row.
# NOTE: the same seed is reused on purpose to keep the demo output unchanged;
# as a consequence Y repeats the first `row` values drawn for X (row-major).
Y = generate_random_array(shape=(row,), low=0, high=10, round=0, seed=seed, as_dataframe=False)
print("\ny: \n", Y)

# Replace a `percentage` fraction of the cells of X with NaN, then select the
# `select_top` best features directly on the incomplete data.
X_with_null = generate_random_nulls(dataset=X, percentage=percentage, seed=seed, as_dataframe=False)
selected_idx = pmic_feature_selection(X_with_null, Y, select_top)
selected_features = X_with_null[:, selected_idx]

print("\nX_with_null: \n", X_with_null)
print(f"\nSelected {select_top} indices of features ({percentage * 100} percent null):\n", selected_idx)
print(f"\nSelected {select_top} features ({percentage * 100} percent null):\n", selected_features)