<a href="https://colab.research.google.com/github/BehrangEbrahimi13/Repo_Paper_01/blob/%2303-Implementation-Feature-Screening/Paper_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Functions

## Generate synthetic data

In [None]:
import numpy as np

def generate_synthetic_data(num_samples, num_class_dependent_features, num_class_independent_features, noise, low, high, round, seed):
  np.random.seed(seed)

  # Generate random class values according to a desired distribution
  class_data = np.random.uniform(low, high, size=num_samples).round(round)

  # Initialize an empty array to hold the feature data
  class_dependent_feature_data = np.zeros((num_samples, num_class_dependent_features))

  # Generate random values for each feature independently
  for i in range(num_class_dependent_features):
      class_dependent_feature_data[:, i] = np.random.uniform(low, high, size=num_samples).round(round)

  # Modify the feature values based on their relationship with the class
  for i in range(num_class_dependent_features):
      class_dependent_feature_data[:, i] += class_data * (i + 1) # You can multiply the class_data by a scaling factor to control the relationship strength

  # Create a linear combination of the last two features and add to the rest of the features
  dependent_column = np.zeros((num_class_dependent_features, 1))
  dependent_column[num_class_dependent_features-2:, 0] = 1
  new_dependent_feature = np.dot(class_dependent_feature_data, dependent_column).round(round)
  feature_data = np.column_stack((class_dependent_feature_data, new_dependent_feature))

  # Generate random independent features
  class_independent_features = np.random.rand(num_samples, num_class_independent_features)

  # Optional: Add some noise to the features to make them more diverse
  class_independent_features = (class_independent_features + np.random.normal(0, noise, class_independent_features.shape)).round(round)

  # Merge the independent features from class and feature data into a single feature_data
  feature_data = np.column_stack((feature_data, class_independent_features))

  # Merge the class and feature data into a single dataset
  # dataset = np.column_stack((class_data, feature_data))

  return class_data, feature_data


## Random Null Generation for a Dataset

In [None]:
import random
import numpy as np
import pandas as pd

def generate_random_array(shape, low, high, round, seed=None, as_dataframe=False):
    np.random.seed(seed)
    random_array = np.random.uniform(low, high, size=(shape)).round(round)
    if as_dataframe:
        return pd.DataFrame(random_array)
    return random_array

def generate_random_nulls(dataset, percentage, seed=None, as_dataframe=False):
    temp = dataset.copy()
    np.random.seed(seed)
    null_mask_indices = np.random.choice(range(temp.size), size=int(temp.size * percentage), replace=False)
    # missing_mask = np.random.rand(n_samples, n_features) < 0.2
    if as_dataframe:
        df_null_mask = pd.DataFrame(False, index=temp.index, columns=temp.columns)
        df_null_mask.values.flat[null_mask_indices] = True
        df_masked = temp.where(~df_null_mask)
        return df_masked
    temp.ravel()[null_mask_indices] = np.nan
    return temp

# Module 1 : PMIC
Generally, feature selection does not work on missing data, so imputation is needed beforehand. However, considering irrelevant features severely affect imputation, we use MIC to select features on missing data by ‘‘partial sample strategy’’ (PSS), which is called **PMIC**. ‘‘Partial sample strategy’’ means using the available values of all feature variables and class variable to calculate MIC

In [None]:
!pip install minepy

Collecting minepy
  Downloading minepy-1.2.6.tar.gz (496 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/497.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/497.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m497.0/497.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: minepy
  Building wheel for minepy (setup.py) ... [?25l[?25hdone
  Created wheel for minepy: filename=minepy-1.2.6-cp310-cp310-linux_x86_64.whl size=187016 sha256=0f67c43bb0870bd7b0801801e1ee37d5384cc0d26f83dd37078ef3720bfdb4b7
  Stored in directory: /root/.cache/pip/wheels/69/38/a6/825bb9b9ed81e6af43a0ef80c7cfe4cafcfdbc2f5cde2959d9
Successfully built minepy
Installing collected packages: minepy
Successfully installed minepy-1.2.6


In [None]:
import numpy as np
from minepy import MINE

mine = MINE()
def pmic_feature_selection(F, C, m):
    num_features = F.shape[1]
    pmic_scores = np.zeros(num_features)

    for i in range(num_features):
        musk = ~np.isnan(F[:, i])
        # Select column i without null values
        feature_without_null = F[musk, i]

        # Filter y based on non-null values in column i
        class_without_null = C[musk]

        # Calculate the MIC (Maximal Information Coefficient) score for the current feature and class
        mine.compute_score(feature_without_null, class_without_null)
        pmic_scores[i] = mine.mic()

    # Sort the indices of pmic_scores in descending order and select the top m indices
    top_m_features_idx = np.argsort(pmic_scores)[::-1][:m]
    return top_m_features_idx

### Example Module 1 with synthetic data

In [None]:
num_samples = 1000
num_class_dependent_features = 6
num_class_independent_features = 3
low = 0
high = 1
round = 2
noise = 0.1
seed = 42

class_data, feature_data = generate_synthetic_data(num_samples, num_class_dependent_features, num_class_independent_features, noise, low, high, round, seed)

selected_idx = pmic_feature_selection(feature_data, class_data, num_class_dependent_features)
selected_features = feature_data[:, selected_idx]

# print("\nfeature_data: \n", feature_data)
print(f"\nSelected {num_class_dependent_features} indices of features :\n", selected_idx)
print(f"\nSelected {num_class_dependent_features} features :\n", selected_features)


Selected 6 indices of features :
 [6 5 4 3 2 1]

Selected 6 features :
 [[ 5.11  2.87  2.24  2.05  1.78  1.  ]
 [11.09  5.87  5.22  4.61  3.65  2.15]
 [ 9.75  5.25  4.5   3.68  2.44  2.37]
 ...
 [ 3.22  1.75  1.47  0.89  0.81  0.59]
 [11.29  6.05  5.24  4.62  3.38  2.19]
 [ 5.72  3.32  2.4   2.4   1.51  1.77]]


## Example Module 1 with Functions

In [None]:
seed = 48
percentage = 0.5
select_top = 3
row = 5

# Generate a random 5x4 array with one-digit random values (0-9)
X = generate_random_array(shape=(row, 4), low=0, high=10, round=0, seed=seed, as_dataframe=False)
print("complete X : \n", X)

# Generate a random 5x1 array with one-digit random values (0-9)
Y = generate_random_array((row,), 0, 10, 0, seed=48, as_dataframe=False)
print("\ny: \n", Y)

X_with_null = generate_random_nulls(dataset=X, percentage=percentage, seed=seed, as_dataframe=False)
selected_idx = pmic_feature_selection(X_with_null, Y, select_top)
selected_features = X_with_null[:, selected_idx]

print("\nX_with_null: \n", X_with_null)
print(f"\nSelected {select_top} indices of features ({percentage * 100} percent null):\n", selected_idx)
print(f"\nSelected {select_top} features ({percentage * 100} percent null):\n", selected_features)

# Module 2 : Imputation for the missing data

## Non-negative Latent Factor

Description:
*   R: Incomplete data matrix of shape (n, m).
*   d: Rank of the non-negative latent factors.
*   lambda1, lambda2: Regularization parameters.
*   max_iter: Maximum number of iterations.








In [None]:
import numpy as np

def Imputes_the_missing_values_By_non_negative_latent_factor(R, d, lambda1, lambda2, max_iter):
    # Initialize non-negative matrix P randomly
    n, m = R.shape
    R_copy = np.copy(R)
    P = np.random.rand(n, d)

    # Initialize non-negative matrix Q randomly
    Q = np.random.rand(d, m)

    # Initialize I according to (17)
    I = np.ones((n, m))
    R_nan_mask = np.isnan(R_copy)
    I[R_nan_mask] = 0

    # Set zero for Null
    R_copy[R_nan_mask] = 0

    # Initialize iteration counter
    iter = 0

    # Convergence criterion
    converge = False

    while not converge and iter < max_iter:
        # Update P according to (22)
        P_new = P * ((I * R_copy) @ Q.T) / ((I * (P @ Q)) @ Q.T + lambda1 * P)

        # Update Q according to (23)
        Q_new = Q * (P_new.T @ (I * R_copy)) / (P_new.T @ (I * (P_new @ Q)) + lambda2 * Q)

        # Check convergence
        if np.allclose(P, P_new) and np.allclose(Q, Q_new):
            converge = True

        # Update P and Q
        P = P_new
        Q = Q_new

        # Increment iteration counter
        iter += 1

    # Impute R by (11) and obtain R_cpl
    PQ = np.round( P @ Q, decimals=3)
    R_cpl = np.where(R_nan_mask, PQ, R)

    return R_cpl


## Hyperimpute

In [None]:
!pip install hyperimpute

In [None]:
import pandas as pd
import numpy as np
from hyperimpute.plugins.imputers import Imputers
imputers = Imputers()

X = pd.DataFrame([[1, 4, 7, 10], [4, 7, np.nan, np.nan], [3, 6, 9, 12], [8, 11, 14, 17]])

method = "gain"

plugin = Imputers().get(method)
out = plugin.fit_transform(X.copy()).round(2)

print(method, out)

gain      0     1     2      3
0  1.0   4.0   7.0  10.00
1  4.0   7.0  10.3  13.45
2  3.0   6.0   9.0  12.00
3  8.0  11.0  14.0  17.00


## Example Module 2 Non-negative Latent Factor and GAIN with Functions

In [None]:
from hyperimpute.plugins.imputers import Imputers

seed = 48
round = 2
percentage = 0.3
row = 5

# Generate a random 5x4 array with one-digit random values (0-9)
X = generate_random_array(shape=(row, 4), low=0, high=10, round=round, seed=seed, as_dataframe=False)
print("complete X : \n", X)

X_with_null = generate_random_nulls(dataset=X, percentage=percentage, seed=seed, as_dataframe=False)
print("\nX_with_null : \n", X_with_null)

R = np.copy(X_with_null)
d = 2
lambda1 = 0.1
lambda2 = 0.2
max_iter = 100

# Code execution
R_cpl = Imputes_the_missing_values_By_non_negative_latent_factor(R, d, lambda1, lambda2, max_iter)
print("\nComplete data after imputation by non_negative_latent_factor: \n", R_cpl)


method = "gain"
plugin = Imputers().get(method)
out = plugin.fit_transform(R.copy()).round(round)

print(f'\nComplete data after imputation by {method}: \n', out)

complete X : 
 [[0.17 8.92 2.85 2.99]
 [7.92 3.24 8.65 4.48]
 [5.48 3.57 1.12 1.42]
 [4.45 7.32 4.6  5.93]
 [3.37 4.54 1.87 4.09]]

X_with_null : 
 [[0.17  nan 2.85 2.99]
 [7.92  nan 8.65  nan]
 [ nan  nan 1.12 1.42]
 [ nan 7.32 4.6  5.93]
 [3.37 4.54 1.87 4.09]]

Complete data after imputation by non_negative_latent_factor: 
 [[ 0.17   0.235  2.85   2.99 ]
 [ 7.92   9.428  8.65  10.738]
 [ 0.906  1.078  1.12   1.42 ]
 [ 5.593  7.32   4.6    5.93 ]
 [ 3.37   4.54   1.87   4.09 ]]

Complete data after imputation by gain: 
       0     1     2     3
0  0.17  4.57  2.85  2.99
1  7.92  5.84  8.65  4.05
2  1.29  4.57  1.12  1.42
3  3.75  7.32  4.60  5.93
4  3.37  4.54  1.87  4.09


## Example Module 2 with synthetic data

In [None]:
!pip install fancyimpute



In [None]:
from hyperimpute.plugins.imputers import Imputers
imputers = Imputers()

imputers.list()

num_samples = 500
num_class_dependent_features = 100
num_class_independent_features = 50
low = 0
high = 1
round = 2
noise = 0.1
seed = 42
percentage = 0.3

class_data, feature_data = generate_synthetic_data(num_samples, num_class_dependent_features, num_class_independent_features, noise, low, high, round, seed)
print ("max : ", max(map(max, feature_data)))

# Normalize feature_data data between 0 and 1 for each column
# normalized_data = np.zeros_like(feature_data)

# for i in range(feature_data.shape[1]):
#     column = feature_data[:, i]
#     min_val = np.min(column)
#     max_val = np.max(column)
#     if max_val - min_val == 0:
#         normalized_column = np.zeros_like(column)
#     else:
#         normalized_column = (column - min_val) / (max_val - min_val)
#     normalized_data[:, i] = normalized_column



# print("\nFeature_data : \n", feature_data)

feature_data_with_null = generate_random_nulls(dataset=feature_data, percentage=percentage, seed=seed, as_dataframe=False)
# print("\nfeature_data_with_null : \n", feature_data_with_null)

# selected_idx = pmic_feature_selection(feature_data, class_data, num_class_dependent_features)
# selected_features = feature_data[:, selected_idx]
R = np.copy(feature_data_with_null)
d = 2
lambda1 = 0.1
lambda2 = 0.2
max_iter = 100

# Code execution
R_cpl = Imputes_the_missing_values_By_non_negative_latent_factor(R, d, lambda1, lambda2, max_iter)
# print("\nComplete data after imputation by non_negative_latent_factor: \n", R_cpl)

# Implement "gain" using hyperimpute
method = "gain"
plugin = Imputers().get(method)
gain_out = plugin.fit_transform(R.copy()).round(round)

# method = "miwae"
# plugin = Imputers().get("miwae")
# miwae_out = plugin.fit_transform(R.copy())


# print(f'\nComplete data after imputation by {method}: \n', gain_out)

# Implement MICE using fancyimpute
mice_imputer = IterativeImputer()
X_imputed_mice = mice_imputer.fit_transform(feature_data_with_null)


# Error calculation:

# For all these error metrics, smaller values are generally considered better,
# except for the R-squared metric where higher values indicate a better fit.
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import math
from sklearn.metrics import r2_score

# Mean Absolute Error (MAE): It is the average absolute difference between corresponding elements of the two matrices.
mae_non_negative_latent_factor = mean_absolute_error(feature_data, R_cpl)
print("\n mae error for non_negative_latent_factor method : ",mae_non_negative_latent_factor)
mae_mice = mean_absolute_error(feature_data, X_imputed_mice)
print("\n mae error for mice method : ",mae_mice)
mae_gain = mean_absolute_error(feature_data, gain_out)
print("\n mae error for gain method : ",mae_gain)

# Mean Squared Error (MSE): It is the average of the squared differences between corresponding elements of the two matrices.
mse_non_negative_latent_factor = mean_squared_error(feature_data.flatten(), R_cpl.flatten())
print("\n mse error for non_negative_latent_factor method : ",mse_non_negative_latent_factor)
mse_mice = mean_squared_error(feature_data.flatten(), X_imputed_mice.flatten())
print("\n mse error for mice method : ",mse_mice)
mse_gain = mean_squared_error(feature_data.flatten(), gain_out.to_numpy().flatten())
print("\n mse error for gain method : ",mse_gain)

# Root Mean Squared Error (RMSE): It is the square root of the MSE.
rmse_non_negative_latent_factor = math.sqrt(mse_non_negative_latent_factor)
print("\n rmse error for non_negative_latent_factor method : ",rmse_non_negative_latent_factor)
rmse_mice = math.sqrt(mse_mice)
print("\n rmse error for mice method : ",rmse_mice)
rmse_gain = math.sqrt(mse_gain)
print("\n rmse error for gain method : ",rmse_gain)

# R-squared (Coefficient of Determination): It is a statistical measure that indicates the proportion of the variance in the dependent variable
# that is predictable from the independent variable(s).
r2_non_negative_latent_factor = r2_score(feature_data.flatten(), R_cpl.flatten())
print("\n r2 error for non_negative_latent_factor method : ",r2_non_negative_latent_factor)
r2_mice = r2_score(feature_data.flatten(), X_imputed_mice.flatten())
print("\n r2 error for mice method : ",r2_mice)
r2_gain = r2_score(feature_data.flatten(), gain_out.to_numpy().flatten())
print("\n r2 error for gain method : ",r2_gain)


max :  198.74

 mae error for non_negative_latent_factor method :  0.08830417218543046

 mae error for mice method :  0.07670262870703903

 mae error for gain method :  0.18749721854304674

 mse error for non_negative_latent_factor method :  0.03923348282119206

 mse error for mice method :  0.02713880200306323

 mse error for gain method :  0.254268498013245

 rmse error for non_negative_latent_factor method :  0.19807443757636184

 rmse error for mice method :  0.16473858686738585

 rmse error for gain method :  0.5042504318423982

 r2 error for non_negative_latent_factor method :  0.999928075667905

 r2 error for mice method :  0.9999502481027028

 r2 error for gain method :  0.9995338651942841


# Module 3 : Select Featurs

## Step 1: Dimension Reduction and Feature Extraction


In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

# Encoder Layer Class
class EncoderLayer(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderLayer, self).__init__()
        self.fc = nn.Linear(input_size, hidden_size)
        self.activation = nn.ReLU()

    def forward(self, x):
        out = self.fc(x)
        out = self.activation(out)
        return out

# Autoencoder Class
class Autoencoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes=None):
        super(Autoencoder, self).__init__()
        self.encoder = EncoderLayer(input_size, hidden_size)
        self.decoder = nn.Sequential(
            nn.Linear(hidden_size, input_size),
            nn.Sigmoid(),
        )
        self.num_classes = num_classes
        if num_classes:
            self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        if self.num_classes:
            logits = self.classifier(encoded)
            return decoded, logits
        else:
            return decoded

# Function to generate random array data
def generate_random_array(shape, low, high, round_val, seed=None, as_dataframe=False):
    np.random.seed(seed)
    random_array = np.random.uniform(low, high, size=shape).round(round_val)
    if as_dataframe:
        return pd.DataFrame(random_array)
    return random_array

# Function to generate random null values in the dataset
def generate_random_nulls(dataset, percentage, seed=None, as_dataframe=False):
    temp = dataset.copy()
    np.random.seed(seed)
    null_mask_indices = np.random.choice(range(temp.size), size=int(temp.size * percentage), replace=False)
    if as_dataframe:
        df_null_mask = pd.DataFrame(False, index=temp.index, columns=temp.columns)
        df_null_mask.values.flat[null_mask_indices] = True
        df_masked = temp.where(~df_null_mask)
        return df_masked
    temp.ravel()[null_mask_indices] = np.nan
    return temp


### Scenario 1: Unsupervised Learning

In [None]:
# Scenario 1: Unsupervised Learning

# Generate sample data
data = generate_random_array((100, 10), 0, 100, 2, seed=1)

# Create an instance of the Autoencoder model for Scenario 1
autoencoder_unsupervised = Autoencoder(input_size=10, hidden_size=5)

# Train the Autoencoder model
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(autoencoder_unsupervised.parameters(), lr=0.001, weight_decay=1e-5)

for epoch in range(100):
    inputs = torch.Tensor(data)
    outputs = autoencoder_unsupervised(inputs)
    loss = criterion(outputs, inputs)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Encode the entire dataset
encoded_data = autoencoder_unsupervised.encoder(torch.Tensor(data)).detach().numpy()

print("Encoded Data:")
print(encoded_data)

# Normalize the encoded data between 0 and 1 for each column
normalized_data = np.zeros_like(encoded_data)

for i in range(encoded_data.shape[1]):
    column = encoded_data[:, i]
    min_val = np.min(column)
    max_val = np.max(column)
    if max_val - min_val == 0:
        normalized_column = np.zeros_like(column)
    else:
        normalized_column = (column - min_val) / (max_val - min_val)
    normalized_data[:, i] = normalized_column

print("Normalized Encoded Data:")
print(normalized_data)

### Scenario 2: Supervised Learning with 1 Class

In [None]:
# Scenario 2: Supervised Learning with 1 Class

# Generate sample data
data = generate_random_array((100, 10), 0, 100, 2, seed=2)
labels = np.random.randint(0, 2, size=(100,))

# Create an instance of the Autoencoder with Classification model for Scenario 2
autoencoder_supervised_1class = Autoencoder(input_size=10, hidden_size=5, num_classes=1)

# Train the Autoencoder with Classification model
criterion = nn.MSELoss()
classification_criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(autoencoder_supervised_1class.parameters(), lr=0.001, weight_decay=1e-5)

for epoch in range(100):
    inputs = torch.Tensor(data)
    labels_tensor = torch.Tensor(labels)

    # Forward pass
    outputs, logits = autoencoder_supervised_1class(inputs)
    reconstruction_loss = criterion(outputs, inputs)
    classification_loss = classification_criterion(logits.squeeze(), labels_tensor)

    # Total loss
    loss = reconstruction_loss + classification_loss

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Encode the entire dataset
encoded_data = autoencoder_supervised_1class.encoder(torch.Tensor(data)).detach().numpy()

print("Encoded Data:")
print(encoded_data)

# Normalize the encoded data between 0 and 1 for each column
normalized_data = np.zeros_like(encoded_data)

for i in range(encoded_data.shape[1]):
    column = encoded_data[:, i]
    min_val = np.min(column)
    max_val = np.max(column)
    if max_val - min_val == 0:
        normalized_column = np.zeros_like(column)
    else:
        normalized_column = (column - min_val) / (max_val - min_val)
    normalized_column = np.round(normalized_column, decimals=4)  # Round to 4 decimal places
    normalized_data[:, i] = normalized_column

print("Normalized Encoded Data:")
print(normalized_data)

### Scenario 3: Supervised Learning with Multiple Classes (3)

In [None]:
# Scenario 3: Supervised Learning with Multiple Classes (3)

# Generate sample data
data = generate_random_array((100, 10), 0, 100, 2, seed=3)
labels = np.random.randint(0, 3, size=(100,))

# Create an instance of the Autoencoder with Classification model for Scenario 3
autoencoder_supervised_multiclass = Autoencoder(input_size=10, hidden_size=5, num_classes=3)

# Train the Autoencoder with Classification model
criterion = nn.MSELoss()
classification_criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(autoencoder_supervised_multiclass.parameters(), lr=0.001, weight_decay=1e-5)

for epoch in range(100):
    inputs = torch.Tensor(data)
    labels_tensor = torch.Tensor(labels).long()

    # Forward pass
    outputs, logits = autoencoder_supervised_multiclass(inputs)
    reconstruction_loss = criterion(outputs, inputs)
    classification_loss = classification_criterion(logits.squeeze(), labels_tensor)

    # Total loss
    loss = reconstruction_loss + classification_loss

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Encode the entire dataset
encoded_data = autoencoder_supervised_multiclass.encoder(torch.Tensor(data)).detach().numpy()

print("Encoded Data:")
print(encoded_data)

# Normalize the encoded data between 0 and 1 for each column
normalized_data = np.zeros_like(encoded_data)

for i in range(encoded_data.shape[1]):
    column = encoded_data[:, i]
    min_val = np.min(column)
    max_val = np.max(column)
    if max_val - min_val == 0:
        normalized_column = np.zeros_like(column)
    else:
        normalized_column = (column - min_val) / (max_val - min_val)
    normalized_column = np.round(normalized_column, decimals=4)  # Round to 4 decimal places
    normalized_data[:, i] = normalized_column

print("Normalized Encoded Data:")
print(normalized_data)

## Step 2: Feature Screening

Screen all the features via multivariate rank distance correlation learning to select the relevant ones.

The reason for using feature screening instead of neural network is the small amount of training samples.

When training samples are limited, neural network can easily result in overfitting; however, if we only consider one feature at a time, the dependence between the feature and xencoded can still be well estimated even when the sample size is small.

In [None]:
import numpy as np
from scipy.stats import rankdata

def multivariate_rank_distance_correlation(X, Y):
    n = X.size
    ranks_X = np.apply_along_axis(rankdata, 0, X)
    ranks_Y = np.apply_along_axis(rankdata, 0, Y)
    d = np.sum((ranks_X - ranks_Y) ** 2)
    rdc = 1 - (6 * d) / (n * (n ** 2 - 1))
    return rdc

X = np.random.rand(100, 10)  # Input data matrix (100 samples, 10 features)
x_encode = np.random.rand(100, 5)  # Low-dimensional representation obtained from feature extraction (100 samples, 5 dimensions)

correlation_values = []
for feature in range(X.shape[1]):
    repeated_X = np.repeat(X[:, feature][:, np.newaxis], x_encode.shape[1], axis=1)
    rdc = multivariate_rank_distance_correlation(repeated_X, x_encode)
    correlation_values.append((feature, rdc))

sorted_correlation_values = sorted(correlation_values, key=lambda x: x[1], reverse=True)

for feature, rdc in sorted_correlation_values:
    print("Feature", feature + 1, "has Multivariate Rank Distance Correlation:", rdc)

Feature 7 has Multivariate Rank Distance Correlation: 0.96413099252397
Feature 2 has Multivariate Rank Distance Correlation: 0.9617408709634838
Feature 4 has Multivariate Rank Distance Correlation: 0.9617403909615638
Feature 8 has Multivariate Rank Distance Correlation: 0.9605682262729051
Feature 3 has Multivariate Rank Distance Correlation: 0.9597258229032917
Feature 1 has Multivariate Rank Distance Correlation: 0.9591032604130416
Feature 5 has Multivariate Rank Distance Correlation: 0.958874203496814
Feature 10 has Multivariate Rank Distance Correlation: 0.9573249492997972
Feature 9 has Multivariate Rank Distance Correlation: 0.9571961167844671
Feature 6 has Multivariate Rank Distance Correlation: 0.9550538682154729


# Final Implementation

In [None]:
# Installe libraries

!pip install minepy
!pip install hyperimpute
!pip install fancyimpute

In [21]:
import random
import numpy as np
import pandas as pd
from minepy import MINE
from hyperimpute.plugins.imputers import Imputers
from scipy.stats import rankdata

import torch
import torch.nn as nn

# Error calculation:
# For all these error metrics, smaller values are generally considered better,
# except for the R-squared metric where higher values indicate a better fit.
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import math
from sklearn.metrics import r2_score

def generate_synthetic_data(num_samples, num_class_dependent_features, num_class_independent_features, noise, low, high, round, seed):
  np.random.seed(seed)

  # Generate random class values according to a desired distribution
  class_data = np.random.uniform(low, high, size=num_samples).round(round)

  # Initialize an empty array to hold the feature data
  class_dependent_feature_data = np.zeros((num_samples, num_class_dependent_features))

  # Generate random values for each feature independently
  for i in range(num_class_dependent_features):
      class_dependent_feature_data[:, i] = np.random.uniform(low, high, size=num_samples).round(round)

  # Modify the feature values based on their relationship with the class
  for i in range(num_class_dependent_features):
      class_dependent_feature_data[:, i] += class_data * (i + 1) # You can multiply the class_data by a scaling factor to control the relationship strength

  # Create a linear combination of the last two features and add to the rest of the features
  dependent_column = np.zeros((num_class_dependent_features, 1))
  dependent_column[num_class_dependent_features-2:, 0] = 1
  new_dependent_feature = np.dot(class_dependent_feature_data, dependent_column).round(round)
  feature_data = np.column_stack((class_dependent_feature_data, new_dependent_feature))

  # Generate random independent features
  class_independent_features = np.random.rand(num_samples, num_class_independent_features)

  # Optional: Add some noise to the features to make them more diverse
  class_independent_features = (class_independent_features + np.random.normal(0, noise, class_independent_features.shape)).round(round)

  # Merge the independent features from class and feature data into a single feature_data
  feature_data = np.column_stack((feature_data, class_independent_features))

  return class_data, feature_data

def generate_random_array(shape, low, high, round, seed=None, as_dataframe=False):
    np.random.seed(seed)
    random_array = np.random.uniform(low, high, size=(shape)).round(round)
    if as_dataframe:
        return pd.DataFrame(random_array)
    return random_array

def generate_random_nulls(dataset, percentage, seed=None, as_dataframe=False):
    temp = dataset.copy()
    np.random.seed(seed)
    null_mask_indices = np.random.choice(range(temp.size), size=int(temp.size * percentage), replace=False)
    # missing_mask = np.random.rand(n_samples, n_features) < 0.2
    if as_dataframe:
        df_null_mask = pd.DataFrame(False, index=temp.index, columns=temp.columns)
        df_null_mask.values.flat[null_mask_indices] = True
        df_masked = temp.where(~df_null_mask)
        return df_masked
    temp.ravel()[null_mask_indices] = np.nan
    return temp

def pmic_feature_selection(feature_data, class_data, top=None):
    mine = MINE()
    num_features = feature_data.shape[1]
    pmic_scores = np.zeros(num_features)

    for i in range(num_features):
        musk = ~np.isnan(feature_data[:, i])
        # Select column i without null values
        feature_without_null = feature_data[musk, i]

        # Filter y based on non-null values in column i
        class_without_null = class_data[musk]

        # Calculate the MIC (Maximal Information Coefficient) score for the current feature and class
        mine.compute_score(feature_without_null, class_without_null)
        pmic_scores[i] = mine.mic()

    top_m_features_idx = np.argsort(pmic_scores)[::-1]
    return top_m_features_idx if top is None else top_m_features_idx[:top]

# R: Incomplete data matrix of shape (n, m).
# d: Rank of the non-negative latent factors.
# lambda1, lambda2: Regularization parameters.
# max_iter: Maximum number of iterations.
def Imputes_the_missing_values_By_non_negative_latent_factor(R, d, lambda1, lambda2, max_iter):
    # Initialize non-negative matrix P randomly
    n, m = R.shape
    R_copy = np.copy(R)
    P = np.random.rand(n, d)

    # Initialize non-negative matrix Q randomly
    Q = np.random.rand(d, m)

    # Initialize I according to (17)
    I = np.ones((n, m))
    R_nan_mask = np.isnan(R_copy)
    I[R_nan_mask] = 0

    # Set zero for Null
    R_copy[R_nan_mask] = 0

    # Initialize iteration counter
    iter = 0

    # Convergence criterion
    converge = False

    while not converge and iter < max_iter:
        # Update P according to (22)
        P_new = P * ((I * R_copy) @ Q.T) / ((I * (P @ Q)) @ Q.T + lambda1 * P)

        # Update Q according to (23)
        Q_new = Q * (P_new.T @ (I * R_copy)) / (P_new.T @ (I * (P_new @ Q)) + lambda2 * Q)

        # Check convergence
        if np.allclose(P, P_new) and np.allclose(Q, Q_new):
            converge = True

        # Update P and Q
        P = P_new
        Q = Q_new

        # Increment iteration counter
        iter += 1

    # Impute R by (11) and obtain R_cpl
    PQ = np.round( P @ Q, decimals=3)
    R_cpl = np.where(R_nan_mask, PQ, R)

    return R_cpl

# Encoder Layer Class
class EncoderLayer(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderLayer, self).__init__()
        self.fc = nn.Linear(input_size, hidden_size)
        self.activation = nn.ReLU()

    def forward(self, x):
        out = self.fc(x)
        out = self.activation(out)
        return out

# Autoencoder Class
class Autoencoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes=None):
        super(Autoencoder, self).__init__()
        self.encoder = EncoderLayer(input_size, hidden_size)
        self.decoder = nn.Sequential(
            nn.Linear(hidden_size, input_size),
            nn.Sigmoid(),
        )
        self.num_classes = num_classes
        if num_classes:
            self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        if self.num_classes:
            logits = self.classifier(encoded)
            return decoded, logits
        else:
            return decoded




# Implementing 3 Scenario Functions with Switch Case:
# Scenario 1: Unsupervised Learning
# Scenario 2: Supervised Learning with 1 Class
# Scenario 3: Supervised Learning with Multiple Classes (3)


# Implement: remove redundant features method for feature screening
# correlation matrix
# https://poe.com/s/UIk3K6D0l8RHdzvkTHNi
# https://poe.com/s/BePgttFGioya3D2Yuwua



def multivariate_rank_distance_correlation(X, Y):
    n = X.size
    ranks_X = np.apply_along_axis(rankdata, 0, X)
    ranks_Y = np.apply_along_axis(rankdata, 0, Y)
    d = np.sum((ranks_X - ranks_Y) ** 2)
    rdc = 1 - (6 * d) / (n * (n ** 2 - 1))
    return rdc

def select_top_sorted_rank(orginal_data, encode_data, top=None):
    correlation_values = []
    for feature in range(X.shape[1]):
        repeated_X = np.repeat(X[:, feature][:, np.newaxis], x_encode.shape[1], axis=1)
        rdc = multivariate_rank_distance_correlation(repeated_X, x_encode)
        correlation_values.append((feature, rdc))

    sorted_correlation_values = sorted(correlation_values, key=lambda x: x[1], reverse=True)
    for feature, rdc in sorted_correlation_values:
        print("Feature", feature + 1, "has Multivariate Rank Distance Correlation:", rdc)
    # return sorted_correlation_values if top is None else sorted_correlation_values[:top]

# Implement final evaluation method

In [None]:
# Use Methods and evaluation and stor result in csv

# ارزیابی

In [19]:
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
import numpy as np
from sklearn.datasets import load_iris, load_diabetes, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer

results_df = pd.DataFrame(columns=['Dataset', 'Model', 'Missing Rate', 'MSE', 'MAE', 'RMSE', 'Accuracy', 'Precision', 'Recall', 'F1'])

# تابعی برای ایجاد m ویژگی مصنوعی
def create_synthetic_features(n_samples, m_features, random_state=None):
    np.random.seed(random_state)
    X_synthetic = np.random.rand(n_samples, m_features)
    return X_synthetic


# تابعی برای افزودن ویژگی‌های مصنوعی به داده‌های موجود
def add_synthetic_features(X_original, X_synthetic, shuffle_columns=True):
    # افزودن ویژگی‌های مصنوعی
    combined_data = np.hstack((X_original, X_synthetic))
    if  shuffle_columns :
      # تولید اندیس‌های شافل شده برای ستون‌ها
      shuffled_indices = np.random.permutation(combined_data.shape[1])
      # اعمال شافل شده بر روی ستون‌ها
      shuffled_data = combined_data[:, shuffled_indices]
    return shuffled_data

# تابعی برای ارزیابی مدل دسته‌بندی
def evaluate_classification_model(dataset_name, missing_rate, X, y, feature_selection=False, k=0):
    # تقسیم داده‌ها
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # انتخاب برترین ویژگی‌ها در صورت نیاز
    if feature_selection:
        selector = SelectKBest(f_classif, k=k)
        X_train_selected = selector.fit_transform(X_train, y_train)
        X_test_selected = selector.transform(X_test)
    else:
        X_train_selected = X_train
        X_test_selected = X_test

    # آموزش مدل
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_selected, y_train)

    # ارزیابی عملکرد
    y_pred = model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
    results_df.loc[len(results_df)] = [dataset_name, 'RandomForestClassifier', missing_rate, None, None, None, accuracy, precision, recall, f1]

# تابعی برای ارزیابی دیتاست‌های رگرسیون با قابلیت انتخاب ویژگی
def evaluate_regression_model(dataset_name, missing_rate, X, y, feature_selection=False, k=0):
    # تقسیم داده‌ها
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # انتخاب برترین ویژگی‌ها در صورت نیاز
    if feature_selection:
        selector = SelectKBest(f_regression, k=k)
        X_train_selected = selector.fit_transform(X_train, y_train)
        X_test_selected = selector.transform(X_test)
    else:
        X_train_selected = X_train
        X_test_selected = X_test

    # آموزش مدل
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train_selected, y_train)

    # ارزیابی عملکرد
    y_pred = model.predict(X_test_selected)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print(f"MSE: {mse:.2f}, MAE: {mae:.2f}, RMSE: {rmse:.2f}")
    results_df.loc[len(results_df)] = [dataset_name, 'RandomForestRegressor', missing_rate, mse, mae, rmse, None, None, None, None]

def make_data_missing(X, missing_rate, random_state=42):
    np.random.seed(random_state)
    X_missing = X.copy()
    missing_entries = np.random.binomial(1, p=missing_rate, size=X.shape).astype(bool)
    X_missing[missing_entries] = np.nan
    return X_missing

datasets = {
    'iris': (load_iris(return_X_y=True), 'classification', 4),
    'diabetes': (load_diabetes(return_X_y=True), 'regression', 10),
    'breast_cancer': (load_breast_cancer(return_X_y=True), 'classification', 30)
}

# m ویژگی مصنوعی که مایلید اضافه کنید
m_features = 5

# نرخ‌های داده گمشده برای آزمایش
missing_rates = [0.3, 0.4, 0.5, 0.6]

for name, ((X, y), task_type, dimensionality) in datasets.items():
    k = dimensionality
    n_samples = X.shape[0]
    # ایجاد ویژگی‌های مصنوعی و اضافه کردن آن‌ها به داده‌های موجود
    X_synthetic = create_synthetic_features(n_samples, m_features, random_state=42)
    X_extended = add_synthetic_features(X, X_synthetic)
    for missing_rate in missing_rates:
        print(f'\n{name.capitalize()} dataset with {missing_rate * 100}% missing data:')
        X_missing = make_data_missing(X_extended, missing_rate)
        imputer = SimpleImputer(strategy='mean')
        X_imputed = imputer.fit_transform(X_missing)

        if task_type == 'classification':
            evaluate_classification_model(f'{name.capitalize()}',f'{missing_rate * 100}%', X_imputed, y, feature_selection=True, k=k)
        else:
            evaluate_regression_model(f'{name.capitalize()}',f'{missing_rate * 100}%', X_imputed, y, feature_selection=True, k=k)

print(results_df)
results_df.to_csv('evaluation_results.csv', index=False)  # ذخیره در یک فایل CSV


Iris dataset with 30.0% missing data:
Accuracy: 0.90, Precision: 0.93, Recall: 0.89, F1 Score: 0.90

Iris dataset with 40.0% missing data:
Accuracy: 0.90, Precision: 0.93, Recall: 0.90, F1 Score: 0.90

Iris dataset with 50.0% missing data:
Accuracy: 0.80, Precision: 0.81, Recall: 0.80, F1 Score: 0.80

Iris dataset with 60.0% missing data:
Accuracy: 0.83, Precision: 0.88, Recall: 0.84, F1 Score: 0.84

Diabetes dataset with 30.0% missing data:
MSE: 3346.36, MAE: 47.92, RMSE: 57.85

Diabetes dataset with 40.0% missing data:
MSE: 3266.15, MAE: 47.80, RMSE: 57.15

Diabetes dataset with 50.0% missing data:
MSE: 3919.92, MAE: 50.37, RMSE: 62.61

Diabetes dataset with 60.0% missing data:
MSE: 4598.29, MAE: 54.00, RMSE: 67.81

Breast_cancer dataset with 30.0% missing data:
Accuracy: 0.97, Precision: 0.98, Recall: 0.97, F1 Score: 0.97

Breast_cancer dataset with 40.0% missing data:
Accuracy: 0.96, Precision: 0.97, Recall: 0.96, F1 Score: 0.96

Breast_cancer dataset with 50.0% missing data:
Accu