In [1]:
import pandas as pd
import os

# Define the path to the data. 
# The '..' means go up one directory from 'notebooks' to the main BME folder.
# The files are inside the 'physionet.org/files/...' folder created by wget.
data_dir = os.path.join('..', 'data', 'raw', 'physionet.org', 'files', 'challenge-2019', '1.0.0', 'training', 'training_setA')

# Let's just load the first patient file to start.
first_patient_file = 'p000001.psv'
file_path = os.path.join(data_dir, first_patient_file)

# Load the data. The files are separated by a pipe '|' character.
patient_df = pd.read_csv(file_path, sep='|')

# Display the first few rows to see what it looks like
print(f"Displaying data for patient: {first_patient_file}")
display(patient_df.head())

# Display a summary of the columns, missing values, and data types
print("\nData summary:")
patient_df.info()

Displaying data for patient: p000001.psv


Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,,,,,,,,,,,...,,,,83.14,0,,,-0.03,1,0
1,97.0,95.0,,98.0,75.33,,19.0,,,,...,,,,83.14,0,,,-0.03,2,0
2,89.0,99.0,,122.0,86.0,,22.0,,,,...,,,,83.14,0,,,-0.03,3,0
3,90.0,95.0,,,,,30.0,,24.0,,...,,,,83.14,0,,,-0.03,4,0
4,103.0,88.5,,122.0,91.33,,24.5,,,,...,,,,83.14,0,,,-0.03,5,0



Data summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 41 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   HR                49 non-null     float64
 1   O2Sat             44 non-null     float64
 2   Temp              10 non-null     float64
 3   SBP               42 non-null     float64
 4   MAP               42 non-null     float64
 5   DBP               0 non-null      float64
 6   Resp              50 non-null     float64
 7   EtCO2             0 non-null      float64
 8   BaseExcess        7 non-null      float64
 9   HCO3              2 non-null      float64
 10  FiO2              4 non-null      float64
 11  pH                7 non-null      float64
 12  PaCO2             6 non-null      float64
 13  SaO2              4 non-null      float64
 14  AST               1 non-null      float64
 15  BUN               2 non-null      float64
 16  Alkalinephos      1 non-null   

In [None]:
# --- Part 2 (V3 - Corrected and Memory-Efficient) ---
import pandas as pd
import os

# Define the data directory
data_dir = os.path.join('..', 'data', 'raw', 'physionet.org', 'files', 'challenge-2019', '1.0.0', 'training', 'training_setA')
all_patient_files = os.listdir(data_dir)

# Initialize variables to store our summary counts
list_of_summaries = []
total_rows_processed = 0

print(f"Starting memory-efficient analysis of {len(all_patient_files)} files...")
print("This will take several minutes, but it will not crash. Please be patient.")

# Loop through every file
for i, filename in enumerate(all_patient_files):
    full_path = os.path.join(data_dir, filename)
    
    # Load one file
    df = pd.read_csv(full_path, sep='|')
    
    # Accumulate the counts we need
    total_rows_processed += len(df)
    list_of_summaries.append(df.notnull().sum()) # Store only the small summary

    # After processing, the large 'df' is automatically discarded from memory
    # before the next loop starts. This is why it's memory-safe.

    # Print progress so you know it's working
    if (i + 1) % 2000 == 0:
        print(f"  ... processed {i + 1} / {len(all_patient_files)} files")

print("...analysis complete. Combining summaries...")

# Combine the small summaries (this is memory-safe)
summary_counts_df = pd.DataFrame(list_of_summaries)

# Sum the counts across all files
total_non_missing_counts = summary_counts_df.sum()

# Calculate the final missing percentage
missing_percentage = 100 * (1 - (total_non_missing_counts / total_rows_processed))

# Sort and display the results
sorted_missing = missing_percentage.sort_values(ascending=False)

print("\n--- Analysis of ALL Training Set A Patients ---")
print("\nPercentage of missing data per column (Top 20):")
display(sorted_missing.head(20))

print("\nPercentage of missing data per column (Bottom 10 - most complete):")
display(sorted_missing.tail(10))

In [4]:
# --- Part 3: Preprocessing and Feature Selection (Final Corrected Code) ---

# 1. Define the columns we want to keep
core_features = [
    'PatientID', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 
    'Age', 'Gender', 'ICULOS'
]

# 2. Create a new dataframe with only these columns
preprocessed_df = full_df[core_features].copy()

# 3. Impute missing values using forward-fill then backward-fill for each patient.
# This single line is the most robust way to do this.
print("Imputing missing values...")
preprocessed_df = preprocessed_df.groupby('PatientID').apply(lambda g: g.ffill().bfill())

# The operation above might create a multi-level index. Let's clean it up.
preprocessed_df = preprocessed_df.reset_index(level=0, drop=True)


# 4. Verify the result
print("\nData summary after imputation:")
preprocessed_df.info()

print("\nRemaining missing values after imputation:")
print(preprocessed_df.isnull().sum())

Imputing missing values...

Data summary after imputation:
<class 'pandas.core.frame.DataFrame'>
Index: 59114 entries, 0 to 59113
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   PatientID  59114 non-null  object 
 1   HR         38773 non-null  float64
 2   O2Sat      38698 non-null  float64
 3   Temp       38594 non-null  float64
 4   SBP        38672 non-null  float64
 5   MAP        38758 non-null  float64
 6   DBP        26440 non-null  float64
 7   Resp       38626 non-null  float64
 8   Age        38773 non-null  float64
 9   Gender     38773 non-null  float64
 10  ICULOS     38773 non-null  float64
dtypes: float64(10), object(1)
memory usage: 5.4+ MB

Remaining missing values after imputation:
PatientID        0
HR           20341
O2Sat        20416
Temp         20520
SBP          20442
MAP          20356
DBP          32674
Resp         20488
Age          20341
Gender       20341
ICULOS       20341
dtype: int6

  preprocessed_df = preprocessed_df.groupby('PatientID').apply(lambda g: g.ffill().bfill())


In [6]:
# --- Part 4: Final Cleaning ---

# Let's see how many patients we have before the final drop
num_patients_before = preprocessed_df['PatientID'].nunique()
print(f"Number of unique patients before final cleaning: {num_patients_before}")

# Drop any rows that STILL have missing values in any of our core features
# This effectively removes patients who had no measurements for one or more vitals.
final_df = preprocessed_df.dropna()

# Let's see how many patients remain
num_patients_after = final_df['PatientID'].nunique()
print(f"Number of unique patients after final cleaning: {num_patients_after}")
print(f"Number of patients removed: {num_patients_before - num_patients_after}")


# Verify that there are no more missing values
print("\nData summary of the final, clean dataset:")
final_df.info()

# Let's save this clean dataframe to the 'processed' data folder.
# This is Task 2.4 from your README.
print("\nSaving cleaned data to the 'processed' directory...")
processed_data_path = os.path.join('..', 'data', 'processed', 'cleaned_sepsis_data.csv')
final_df.to_csv(processed_data_path, index=False)
print(f"Data saved to: {processed_data_path}")

Number of unique patients before final cleaning: 1000
Number of unique patients after final cleaning: 637
Number of patients removed: 363

Data summary of the final, clean dataset:
<class 'pandas.core.frame.DataFrame'>
Index: 26099 entries, 20395 to 59113
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   PatientID  26099 non-null  object 
 1   HR         26099 non-null  float64
 2   O2Sat      26099 non-null  float64
 3   Temp       26099 non-null  float64
 4   SBP        26099 non-null  float64
 5   MAP        26099 non-null  float64
 6   DBP        26099 non-null  float64
 7   Resp       26099 non-null  float64
 8   Age        26099 non-null  float64
 9   Gender     26099 non-null  float64
 10  ICULOS     26099 non-null  float64
dtypes: float64(10), object(1)
memory usage: 2.4+ MB

Saving cleaned data to the 'processed' directory...
Data saved to: ..\data\processed\cleaned_sepsis_data.csv
