In [6]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import os
from sklearn.preprocessing import StandardScaler

# Step 1: Load the original Physionet heart failure dataset
def load_physionet_data(file_path):
   """Load the original Physionet heart failure dataset with 167 variables"""
   df = pd.read_csv(file_path)
   print(f"Original dataset shape: {df.shape}")
   return df

# Step 2: Preprocess to get 39 numerical features
def preprocess_physionet(df, missingness_threshold=0.3):
   """
   Preprocess the Physionet dataset to:
   1. Keep only numerical variables
   2. Drop columns with > 30% missingness
   3. Keep highly correlated features
   """
   # Select only numerical columns
   num_cols = df.select_dtypes(include=['float64', 'int64']).columns
   df_num = df[num_cols]
   print(f"Numerical features: {len(num_cols)}")
   
   # Calculate missingness percentage for each column
   missingness = df_num.isnull().mean()
   
   # Drop columns with missingness > threshold
   cols_to_keep = missingness[missingness <= missingness_threshold].index
   df_filtered = df_num[cols_to_keep]
   print(f"Features after dropping high missingness: {df_filtered.shape[1]}")
   
   # Filter for highly correlated features 
   # First, fill NaNs temporarily for correlation calculation
   temp_df = df_filtered.copy()
   imputer = SimpleImputer(strategy='mean')
   temp_df = pd.DataFrame(
       imputer.fit_transform(temp_df), 
       columns=temp_df.columns
   )
   
   # Calculate correlation matrix
   corr_matrix = temp_df.corr().abs()
   
   # Find features with at least one strong correlation to another feature
   has_strong_corr = (corr_matrix > 0.6).sum() > 1
   strongly_correlated_features = has_strong_corr[has_strong_corr].index.tolist()
   
   # Keep only strongly correlated features
   df_final = df_filtered[strongly_correlated_features]
   
   # Add target variable if it exists
   if 'readmission_within_6months' in df.columns:
       df_final['readmission_within_6months'] = df['readmission_within_6months']
   
   print(f"Final dataset shape: {df_final.shape}")
   return df_final

# Step 3: Create a complete dataset (no missing values)
def create_complete_dataset(df):
   """
   Create a complete dataset with no missing values
   """
   # Separate target if it exists
   if 'readmission_within_6months' in df.columns:
       y = df['readmission_within_6months']
       X = df.drop(columns=['readmission_within_6months'])
   else:
       X = df
       y = None
   
   # Impute any remaining missing values to create a complete dataset
   imputer = SimpleImputer(strategy='mean')
   X_complete = pd.DataFrame(
       imputer.fit_transform(X),
       columns=X.columns
   )
   
   # Recombine with target
   if y is not None:
       X_complete['readmission_within_6months'] = y
   
   print(f"Complete dataset created with shape: {X_complete.shape}")
   return X_complete

# Step 4: Create datasets with artificial missingness
def create_missing_datasets(df_complete, mechanism='MCAR', proportions=[0.1, 0.2, 0.3, 0.4, 0.5]):
   """
   Create multiple datasets with artificial missingness manually
   Args:
       df_complete: Complete dataset with no missing values
       mechanism: 'MCAR', 'MAR', or 'MNAR'
       proportions: List of missingness proportions to generate
   """
   # Separate target if it exists
   if 'readmission_within_6months' in df_complete.columns:
       y = df_complete['readmission_within_6months']
       X = df_complete.drop(columns=['readmission_within_6months'])
   else:
       X = df_complete
       y = None
   
   # Store datasets with missingness
   missing_datasets = {}
   
   for prop in proportions:
       # Create a copy of the data for this missingness proportion
       X_missing = X.copy()
       
       # Total number of values that should be missing
       n_samples, n_features = X_missing.shape
       n_missing = int(np.round(prop * n_samples * n_features))
       
       if mechanism == 'MCAR':
           # Missing Completely At Random
           # Randomly select cells to be missing
           flat_indices = np.random.choice(n_samples * n_features, size=n_missing, replace=False)
           row_indices = flat_indices // n_features
           col_indices = flat_indices % n_features
           
           for row, col in zip(row_indices, col_indices):
               X_missing.iloc[row, col] = np.nan
               
       elif mechanism == 'MAR':
           # Missing At Random
           # Missingness depends on observed values in other columns
           
           # Randomly select half the features to have missingness
           features_with_missingness = np.random.choice(
               n_features, 
               n_features // 2, 
               replace=False
           )
           
           # For each feature with missingness, make it depend on values in another feature
           for feature_idx in features_with_missingness:
               # Select a different feature this depends on
               predictor_feature = np.random.choice(
                   [i for i in range(n_features) if i != feature_idx]
               )
               
               # Sort data by the predictor feature
               sorted_indices = X_missing.iloc[:, predictor_feature].argsort()
               
               # Number of missing values for this feature
               n_to_miss = int(np.round(prop * n_samples))
               
               # Make values missing based on sorted predictor
               # (e.g., values with highest predictor value are missing)
               indices_to_miss = sorted_indices[-n_to_miss:]
               
               # Set values to missing
               X_missing.iloc[indices_to_miss, feature_idx] = np.nan
               
       elif mechanism == 'MNAR':
           # Missing Not At Random
           # Missingness depends on the values themselves
           
           # Randomly select half the features to have missingness
           features_with_missingness = np.random.choice(
               n_features, 
               n_features // 2, 
               replace=False
           )
           
           # For each feature, make missingness depend on its own value
           for feature_idx in features_with_missingness:
               # Sort data by this feature itself
               sorted_indices = X_missing.iloc[:, feature_idx].argsort()
               
               # Number of missing values for this feature
               n_to_miss = int(np.round(prop * n_samples))
               
               # Make highest values missing (common in real-world data)
               indices_to_miss = sorted_indices[-n_to_miss:]
               
               # Set values to missing
               X_missing.iloc[indices_to_miss, feature_idx] = np.nan
       
       # Add target back if it exists
       if y is not None:
           X_missing['readmission_within_6months'] = y
       
       # Store dataset
       missing_datasets[prop] = X_missing
       print(f"Created {mechanism} dataset with {prop*100}% missingness")
   
   return missing_datasets

# Main process
def main():
   # Paths
   data_dir = "data"
   os.makedirs(data_dir, exist_ok=True)
   
   # File paths
   # Replace with actual path to your Physionet data
   input_path = "./data/original.csv"
   processed_path = os.path.join(data_dir, "physionet_processed.csv")
   complete_path = os.path.join(data_dir, "physionet_complete.csv")
   
   # Load and preprocess the data
   df_original = load_physionet_data(input_path)
   df_processed = preprocess_physionet(df_original)
   df_processed.to_csv(processed_path, index=False)
   print(f"Saved processed dataset to {processed_path}")
   
   # Create complete dataset
   df_complete = create_complete_dataset(df_processed)
   df_complete.to_csv(complete_path, index=False)
   print(f"Saved complete dataset to {complete_path}")
   
   # Create missing datasets for each mechanism
   mechanisms = ['MCAR', 'MAR', 'MNAR']
   proportions = [0.1, 0.2, 0.3, 0.4, 0.5]
   
   for mechanism in mechanisms:
       missing_datasets = create_missing_datasets(
           df_complete, 
           mechanism=mechanism,
           proportions=proportions
       )
       
       # Save each missing dataset
       for prop, df in missing_datasets.items():
           filename = f"physionet_{mechanism}_{int(prop*100)}pct_missing.csv"
           output_path = os.path.join(data_dir, filename)
           df.to_csv(output_path, index=False)
           print(f"Saved {mechanism} {prop*100}% missing dataset to {output_path}")

if __name__ == "__main__":
   main()

Original dataset shape: (2008, 167)
Numerical features: 152
Features after dropping high missingness: 108
Final dataset shape: (2008, 56)
Saved processed dataset to data\physionet_processed.csv
Complete dataset created with shape: (2008, 56)
Saved complete dataset to data\physionet_complete.csv
Created MCAR dataset with 10.0% missingness
Created MCAR dataset with 20.0% missingness
Created MCAR dataset with 30.0% missingness
Created MCAR dataset with 40.0% missingness
Created MCAR dataset with 50.0% missingness
Saved MCAR 10.0% missing dataset to data\physionet_MCAR_10pct_missing.csv
Saved MCAR 20.0% missing dataset to data\physionet_MCAR_20pct_missing.csv
Saved MCAR 30.0% missing dataset to data\physionet_MCAR_30pct_missing.csv
Saved MCAR 40.0% missing dataset to data\physionet_MCAR_40pct_missing.csv
Saved MCAR 50.0% missing dataset to data\physionet_MCAR_50pct_missing.csv
Created MAR dataset with 10.0% missingness
Created MAR dataset with 20.0% missingness
Created MAR dataset with 30.

  sorted_indices = X_missing.iloc[:, predictor_feature].argsort()
  sorted_indices = X_missing.iloc[:, predictor_feature].argsort()
  sorted_indices = X_missing.iloc[:, predictor_feature].argsort()
  sorted_indices = X_missing.iloc[:, predictor_feature].argsort()
  sorted_indices = X_missing.iloc[:, predictor_feature].argsort()
  sorted_indices = X_missing.iloc[:, predictor_feature].argsort()
  sorted_indices = X_missing.iloc[:, predictor_feature].argsort()
  sorted_indices = X_missing.iloc[:, predictor_feature].argsort()
  sorted_indices = X_missing.iloc[:, predictor_feature].argsort()
  sorted_indices = X_missing.iloc[:, predictor_feature].argsort()
  sorted_indices = X_missing.iloc[:, predictor_feature].argsort()
  sorted_indices = X_missing.iloc[:, predictor_feature].argsort()
  sorted_indices = X_missing.iloc[:, predictor_feature].argsort()
  sorted_indices = X_missing.iloc[:, predictor_feature].argsort()
  sorted_indices = X_missing.iloc[:, predictor_feature].argsort()
  sorted_i

Saved MAR 30.0% missing dataset to data\physionet_MAR_30pct_missing.csv
Saved MAR 40.0% missing dataset to data\physionet_MAR_40pct_missing.csv
Saved MAR 50.0% missing dataset to data\physionet_MAR_50pct_missing.csv
Created MNAR dataset with 10.0% missingness
Created MNAR dataset with 20.0% missingness
Created MNAR dataset with 30.0% missingness
Created MNAR dataset with 40.0% missingness
Created MNAR dataset with 50.0% missingness
Saved MNAR 10.0% missing dataset to data\physionet_MNAR_10pct_missing.csv
Saved MNAR 20.0% missing dataset to data\physionet_MNAR_20pct_missing.csv
Saved MNAR 30.0% missing dataset to data\physionet_MNAR_30pct_missing.csv
Saved MNAR 40.0% missing dataset to data\physionet_MNAR_40pct_missing.csv
Saved MNAR 50.0% missing dataset to data\physionet_MNAR_50pct_missing.csv
