In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer

In [9]:
# Read the source CSV (path is relative to the notebook's working directory)
# and report the (rows, columns) shape of the resulting frame.
df = pd.read_csv(filepath_or_buffer='physionet_39_features.csv')
print("Dataset shape:", df.shape)

Dataset shape: (2008, 39)


In [10]:
# Name of the outcome column, followed by a presence check against the
# columns of the loaded frame (printed as True/False, nothing is raised here).
target_column = 're.admission.within.6.months'
target_present = target_column in df.columns
print(f"Target column '{target_column}' exists in dataset: {target_present}")

Target column 're.admission.within.6.months' exists in dataset: True


In [11]:
# Feature frame: every column of the loaded data except the target.
features_df = df.drop(labels=[target_column], axis="columns")

# Report how far the feature count falls short of the 39 the analysis aims for.
feature_count = features_df.shape[1]
shortfall = 39 - feature_count
if shortfall > 0:
    print(f"Current feature count: {feature_count}, need to add {shortfall} more features")

    # No column is actually added in this cell: the shortfall is only
    # reported and the existing features are carried forward unchanged.
    print("Will proceed with available features")


Current feature count: 38, need to add 1 more features
Will proceed with available features


In [12]:
# Guard against an over-wide feature frame. The cut is purely positional:
# the first 39 columns are kept and everything after them is discarded.
n_feature_cols = features_df.shape[1]
if n_feature_cols > 39:
    excess = n_feature_cols - 39
    print(f"Current feature count: {n_feature_cols}, need to remove {excess}")
    features_df = features_df.iloc[:, :39]

# Target kept as a one-column DataFrame (not a Series) so it can be
# concatenated column-wise with the features later on.
target_df = df.loc[:, [target_column]]

In [13]:
# --- Features only (input for imputation evaluation) -----------------------
features_df.to_csv(path_or_buf='physionet_features_only.csv', index=False)
print("Features-only dataset shape:", features_df.shape)

# --- Features + target (input for post-imputation prediction) --------------
# Column-wise concatenation places the target after the feature columns.
features_and_target_df = pd.concat([features_df, target_df], axis="columns")
features_and_target_df.to_csv(path_or_buf='physionet_features_and_target.csv', index=False)
print("Features and target dataset shape:", features_and_target_df.shape)

# --- Sanity output: column counts first, then the full column lists --------
for label, frame in (("features", features_df), ("features+target", features_and_target_df)):
    print(f"Number of columns in {label} dataset: {frame.shape[1]}")

for heading, frame in (("Features", features_df), ("Features+target", features_and_target_df)):
    print(f"\n{heading} dataset columns:")
    print(frame.columns.tolist())

Features-only dataset shape: (2008, 38)
Features and target dataset shape: (2008, 39)
Number of columns in features dataset: 38
Number of columns in features+target dataset: 39

Features dataset columns:
['verbal.response', 'eye.opening', 'movement', 'GCS', 'lactate.dehydrogenase', 'glutamic.oxaloacetic.transaminase', 'globulin', 'total.bilirubin', 'direct.bilirubin', 'indirect.bilirubin', 'hemoglobin', 'hematocrit', 'red.blood.cell', 'map', 'return.to.emergency.department.within.6.months', 'death.within.6.months', 'death.within.28.days', 're.admission.within.3.months', 'death.within.3.months', 'hydroxybutyrate.dehydrogenase', 'cholesterol', 'low.density.lipoprotein.cholesterol', 'glutamyltranspeptidase', 'nucleotidase', 'white.globulin.ratio', 'glutamic.pyruvic.transaminase', 'total.protein', 'international.normalized.ratio', 'prothrombin.time.ratio', 'mean.corpuscular.volume', 'mean.hemoglobin.volume', 'neutrophil.count', 'white.blood.cell', 'platelet', 'platelet.hematocrit', 'eosino

In [None]:
import pandas as pd
import numpy as np

# Re-read the source CSV so this cell starts from the table as stored on disk,
# then report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer='physionet_39_features.csv')
print("Dataset shape:", df.shape)

# Outcome column name, plus a printed presence check against the loaded columns.
target_column = 're.admission.within.6.months'
target_present = target_column in df.columns
print(f"Target column '{target_column}' exists in dataset: {target_present}")

def _safe_ratio(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
    """Return ``numerator / denominator`` with zero denominators treated as missing.

    Plain Series division yields ``inf``/``-inf`` wherever the denominator is 0.
    Masking zeros to NaN first makes those rows come out as NaN instead, so the
    derived column only ever holds finite values or missing values.

    Args:
        numerator: Values to divide.
        denominator: Values to divide by; entries equal to 0 are masked to NaN.

    Returns:
        Element-wise ratio, NaN where the denominator is 0 or either input is NaN.
    """
    return numerator / denominator.where(denominator != 0)


# Create a new dataset with features (excluding the target variable)
features_df = df.drop(columns=[target_column])

# NOTE(review): the feature list printed earlier in this notebook includes other
# outcome-style columns (e.g. 'death.within.6.months',
# 're.admission.within.3.months'). Confirm they are intended as predictors
# before using this frame for modelling - they look like potential target leakage.

# If we now have fewer than 39 features, derive one more column to reach 39
if features_df.shape[1] < 39:
    print(f"Current feature count: {features_df.shape[1]}, need to add {39 - features_df.shape[1]} more features")

    # Preferred derived feature: red-to-white blood cell ratio
    if 'red.blood.cell' in features_df.columns and 'white.blood.cell' in features_df.columns:
        features_df['red.to.white.cell.ratio'] = _safe_ratio(
            features_df['red.blood.cell'], features_df['white.blood.cell']
        )
        print("Added 'red.to.white.cell.ratio' as the 39th feature")
    # Alternative: hemoglobin-to-hematocrit ratio
    elif 'hemoglobin' in features_df.columns and 'hematocrit' in features_df.columns:
        features_df['hemoglobin.to.hematocrit.ratio'] = _safe_ratio(
            features_df['hemoglobin'], features_df['hematocrit']
        )
        print("Added 'hemoglobin.to.hematocrit.ratio' as the 39th feature")
    # Last resort: ratio of the first two numeric columns. This pair is chosen
    # purely by position, not because the two columns are known to be related.
    else:
        # 'number' covers every numeric dtype, not only float64/int64
        numeric_cols = features_df.select_dtypes(include='number').columns
        if len(numeric_cols) >= 2:
            col1, col2 = numeric_cols[0], numeric_cols[1]
            features_df[f'{col1}_to_{col2}_ratio'] = _safe_ratio(features_df[col1], features_df[col2])
            print(f"Added '{col1}_to_{col2}_ratio' as the 39th feature")
        else:
            # Make the no-op visible instead of silently leaving the count short
            print("Could not derive an additional feature: fewer than two numeric columns available")

# Guard against an over-wide feature frame. The cut is purely positional:
# the first 39 columns are kept and everything after them is discarded.
n_feature_cols = features_df.shape[1]
if n_feature_cols > 39:
    excess = n_feature_cols - 39
    print(f"Current feature count: {n_feature_cols}, need to remove {excess}")
    features_df = features_df.iloc[:, :39]

# Target kept as a one-column DataFrame (not a Series) so it can be
# concatenated column-wise with the features below.
target_df = df.loc[:, [target_column]]

# --- Features only (input for imputation evaluation) -----------------------
features_df.to_csv(path_or_buf='physionet_39_features_only.csv', index=False)
print("Features-only dataset shape:", features_df.shape)

# --- Features + target (input for post-imputation prediction) --------------
# Column-wise concatenation places the target after the feature columns.
features_and_target_df = pd.concat([features_df, target_df], axis="columns")
features_and_target_df.to_csv(path_or_buf='physionet_39_features_and_target.csv', index=False)
print("Features and target dataset shape:", features_and_target_df.shape)

# Column names as plain lists; reused for the counts, previews and checks below.
feature_names = features_df.columns.tolist()
combined_names = features_and_target_df.columns.tolist()

# --- Column counts ----------------------------------------------------------
print(f"Number of columns in features dataset: {len(feature_names)}")
print(f"Number of columns in features+target dataset: {len(combined_names)}")

# --- Head/tail previews of the column names ---------------------------------
print("\nFirst 5 columns of features dataset:")
print(feature_names[:5])

print("\nLast 5 columns of features dataset:")
print(feature_names[-5:])

print("\nLast 5 columns of features+target dataset:")
print(combined_names[-5:])

# --- Structural checks (reported via print, nothing is raised) --------------
# The feature frame is expected to hold exactly 39 columns.
n_features = len(feature_names)
if n_features != 39:
    print(f"\nWarning: Feature dataset has {n_features} features instead of 39")
else:
    print("\nSuccessfully created dataset with exactly 39 features!")

# The combined frame is expected to hold 40 columns with the target in last place.
target_is_last = len(combined_names) == 40 and combined_names[-1] == target_column
print(
    "Successfully created features+target dataset with the target as the 40th column!"
    if target_is_last
    else "Warning: Issue with the features+target dataset structure"
)

Dataset shape: (2008, 39)
Target column 're.admission.within.6.months' exists in dataset: True
Current feature count: 38, need to add 1 more features
Added 'red.to.white.cell.ratio' as the 39th feature
Features-only dataset shape: (2008, 39)
Features and target dataset shape: (2008, 40)
Number of columns in features dataset: 39
Number of columns in features+target dataset: 40

First 5 columns of features dataset:
['verbal.response', 'eye.opening', 'movement', 'GCS', 'lactate.dehydrogenase']

Last 5 columns of features dataset:
['platelet.hematocrit', 'eosinophil.ratio', 'eosinophil.count', 'basophil.ratio', 'red.to.white.cell.ratio']

Last 5 columns of features+target dataset:
['eosinophil.ratio', 'eosinophil.count', 'basophil.ratio', 'red.to.white.cell.ratio', 're.admission.within.6.months']

Successfully created dataset with exactly 39 features!
Successfully created features+target dataset with the target as the 40th column!
