## Data manipulation

This notebook prepares the dataset so that the mechanisms described in the following paper can be replicated: <br>
<i>Psychogyios, K. et al. (2023) ‘Missing Value Imputation Methods for Electronic Health Records’, IEEE Access, 11, pp. 21562–21574. Available at: https://doi.org/10.1109/ACCESS.2023.3251919.</i>

In [None]:
import pandas as pd

# Name of the target column; defined once here so later cells can refer to it.
target_column = 're.admission.within.6.months'

# Read the raw CSV from the working directory and report its dimensions.
df = pd.read_csv(filepath_or_buffer='dat.csv', encoding='utf-8')
print(f"Original dataset shape: {df.shape}")

Original dataset shape: (2008, 167)


In [24]:
# Step 1: Keep only numerical features
# The target column is excluded from the candidate features. If it is numeric
# and left in, it takes part in the correlation ranking below and can end up
# among the selected "features": the features-only dataset would then contain
# the label, and adding the target afterwards would overwrite that column
# instead of appending a new one.
numerical_cols = [
    col
    for col in df.select_dtypes(include=['float64', 'int64']).columns
    if col != target_column
]
df_numerical = df[numerical_cols]

In [None]:
# Step 2: Remove columns with >30% missing values
# Note: despite its name, `missing_percentage` holds the per-column *fraction*
# of missing entries (0.0 - 1.0), which is why the cut-off is written as 0.3.
missing_percentage = df_numerical.isna().mean()
columns_to_keep = list(missing_percentage.loc[missing_percentage.le(0.3)].index)
df_filtered = df_numerical.loc[:, columns_to_keep]

In [26]:
# Step 3: Calculate correlation matrix to select highly correlated features
# Pairwise Pearson correlation between all remaining columns ...
correlation_matrix = df_filtered.corr(method='pearson')
# ... with the sign dropped, so strong negative relationships rank as high as
# strong positive ones.
correlation_matrix = correlation_matrix.abs()

In [27]:
# Step 4: Select top 39 features (based on correlation with other features)
N_TOP_FEATURES = 39

# Blank out the diagonal first: every feature correlates perfectly with itself,
# and that constant 1.0 is not a "correlation with other features". Leaving it
# in inflates the average of features that have few valid pairwise
# correlations (NaN entries are skipped by .mean()).
off_diagonal = correlation_matrix.copy()
for col in off_diagonal.columns:
    off_diagonal.loc[col, col] = float('nan')

# Calculate average correlation for each feature (NaN entries are ignored)
avg_correlation = off_diagonal.mean()

# Highest average first; features whose average is NaN sort to the end.
top_features = (
    avg_correlation
    .sort_values(ascending=False)
    .head(N_TOP_FEATURES)
    .index
    .tolist()
)

In [28]:
# Create Dataset 1: 39 feature columns only (for imputation evaluation)
# An explicit copy is taken so dataset_1 is an independent frame.
dataset_1 = df_filtered.loc[:, top_features].copy()
print(f"\nDataset 1 shape (39 features only): {dataset_1.shape}")
print("Top 5 features selected:", top_features[0:5])


Dataset 1 shape (39 features only): (2008, 39)
Top 5 features selected: ['urea', 'direct.bilirubin', 'lactate.dehydrogenase', 'hemoglobin', 'prothrombin.activity']


In [29]:
# Create Dataset 2: 39 features + target column (for prediction evaluation)
# The missing-target case is handled first; dataset_2 is set to None there so
# that later cells can test for it before using the frame.
if target_column not in df.columns:
    print(f"\nTarget column '{target_column}' not found in the dataset.")
    dataset_2 = None
else:
    # assign() returns a new frame, leaving dataset_1 untouched.
    dataset_2 = dataset_1.assign(**{target_column: df[target_column]})
    print(f"\nDataset 2 shape (39 features + target): {dataset_2.shape}")


Dataset 2 shape (39 features + target): (2008, 40)


In [30]:
# Save datasets to CSV
# index=False keeps the pandas row index out of the written file.
dataset_1.to_csv(path_or_buf='dataset_1_39_features.csv', index=False)
print("\nDataset 1 saved as 'dataset_1_39_features.csv'")


Dataset 1 saved as 'dataset_1_39_features.csv'


In [31]:
# Dataset 2 is only written when it was actually built (it is None when the
# target column was not found in the source data).
if dataset_2 is not None:
    dataset_2.to_csv(
        path_or_buf='dataset_2_39_features_plus_target.csv',
        index=False,
    )
    print("Dataset 2 saved as 'dataset_2_39_features_plus_target.csv'")

Dataset 2 saved as 'dataset_2_39_features_plus_target.csv'
