In [3]:
# --- Notebook 02: Data Preprocessing (V2 - Robust) ---
# Goal: Load raw data, clean it, handle missing columns, and save a final dataset.

import pandas as pd
import numpy as np
import os

# --- Step 1: Define Paths and Features ---
data_dir = os.path.join('..', 'data', 'raw', 'physionet.org', 'files', 'challenge-2019', '1.0.0', 'training', 'training_setA')
all_patient_files = os.listdir(data_dir)
processed_dir = os.path.join('..', 'data', 'processed')
os.makedirs(processed_dir, exist_ok=True)

features_to_keep = [
    'HR', 'O2Sat', 'SBP', 'MAP', 'Resp', 'Age', 'Gender', 'ICULOS'
]
all_cols = ['PatientID'] + features_to_keep

# --- Step 2: Load, Clean, and Impute Data (Now with Robust Re-indexing) ---
list_of_cleaned_dfs = []
print(f"Starting robust preprocessing of {len(all_patient_files)} patient files...")

for i, filename in enumerate(all_patient_files):
    full_path = os.path.join(data_dir, filename)
    df = pd.read_csv(full_path, sep='|')
    
    patient_id = filename.split('.')[0]
    df['PatientID'] = patient_id
    
    # --- THIS IS THE FIX ---
    # Use .reindex() to safely select columns. If a column is missing in a file,
    # it will be created and filled with NaN, which we handle next.
    df_subset = df.reindex(columns=all_cols).copy()
    
    # Impute missing values for THIS patient only
    df_filled = df_subset.ffill().bfill()
    df_filled.fillna(0, inplace=True)

    list_of_cleaned_dfs.append(df_filled)

    if (i + 1) % 2000 == 0:
        print(f"  ... preprocessed {i + 1} / {len(all_patient_files)} files")

print("...combining all patient data into final dataframe.")
cleaned_df = pd.concat(list_of_cleaned_dfs, ignore_index=True)

# --- Step 3: Save the Final, Cleaned Dataset ---
output_path = os.path.join(processed_dir, 'cleaned_sepsis_data.parquet')
cleaned_df.to_parquet(output_path)
print(f"\nFinal, cleaned dataset saved successfully to:\n{output_path}")

# --- Step 4: Final Verification ---
print("\n--- Final Cleaned Dataset ---")
print("Shape of the final dataframe:", cleaned_df.shape)
print("\nFinal check for any remaining missing values:")
print(cleaned_df.isnull().sum())
print("\nFirst 5 rows of the final, cleaned dataset:")
display(cleaned_df.head())

Starting robust preprocessing of 20335 patient files...
  ... preprocessed 2000 / 20335 files
  ... preprocessed 4000 / 20335 files
  ... preprocessed 6000 / 20335 files
  ... preprocessed 8000 / 20335 files
  ... preprocessed 10000 / 20335 files
  ... preprocessed 12000 / 20335 files
  ... preprocessed 14000 / 20335 files
  ... preprocessed 16000 / 20335 files
  ... preprocessed 18000 / 20335 files
  ... preprocessed 20000 / 20335 files
...combining all patient data into final dataframe.


ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [None]:
# --- Step 3 (Retry): Save the Final, Cleaned Dataset ---

# The 'cleaned_df' variable still exists in memory from the cell above.
# Now that pyarrow is installed, this command will work.

output_path = os.path.join(processed_dir, 'cleaned_sepsis_data.parquet')
cleaned_df.to_parquet(output_path)

print(f"\nFinal, cleaned dataset saved successfully to:\n{output_path}")

# --- Step 4: Final Verification ---
print("\n--- Final Cleaned Dataset ---")
print("Shape of the final dataframe:", cleaned_df.shape)
print("\nFinal check for any remaining missing values:")
print(cleaned_df.isnull().sum())
print("\nFirst 5 rows of the final, cleaned dataset:")
display(cleaned_df.head())