In [1]:
import pandas as pd
import os
from sklearn.ensemble import IsolationForest

# --- Part 1: Load the Cleaned Data ---
# Read the processed CSV into a single dataframe.
print("Loading the processed data...")
processed_data_path = os.path.join('..', 'data', 'processed', 'cleaned_sepsis_data.csv')
df = pd.read_csv(processed_data_path)

# --- Part 2: Prepare Data for Modeling ---
# PatientID is an identifier, not a measurement: it is set aside in
# `patient_ids` so results can be linked back to patients, and every
# remaining column is passed to the model as a feature.
patient_ids = df['PatientID']
model_features = df.drop(columns='PatientID')

# --- Part 3: Train the Isolation Forest Model ---
print("Training the Isolation Forest model...")

# contamination: assumed share of anomalous rows (1% as a starting point).
# random_state:  fixed seed so repeated runs give the same forest.
# n_jobs=-1:     use every available CPU core.
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.01,
    random_state=42,
    n_jobs=-1,
)
iso_forest.fit(model_features)

# --- Part 4: Get Anomaly Scores ---
# decision_function -> continuous score; lower means more anomalous.
# predict           -> hard label; -1 marks an anomaly, 1 a normal row.
print("Calculating anomaly scores...")
anomaly_scores = iso_forest.decision_function(model_features)
anomaly_labels = iso_forest.predict(model_features)

# Attach both results to the dataframe that still carries PatientID.
df['anomaly_score'] = anomaly_scores
df['anomaly_label'] = anomaly_labels

print("\nTop 10 most anomalous data points found by the model:")
# Ascending sort puts the lowest (most anomalous) scores first.
most_anomalous_rows = df.sort_values('anomaly_score').head(10)
display(most_anomalous_rows)

flagged_count = (df['anomaly_label'] == -1).sum()
print(f"\nTotal anomalies found: {flagged_count} data points")

Loading the processed data...
Training the Isolation Forest model...
Calculating anomaly scores...

Top 10 most anomalous data points found by the model:


Unnamed: 0,PatientID,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,Age,Gender,ICULOS,anomaly_score,anomaly_label
425,p000009,138.0,96.0,39.33,142.0,108.0,87.0,30.5,27.92,1.0,241.0,-0.087578,-1
19414,p000762,132.0,95.0,37.89,188.5,130.0,92.0,25.0,53.2,0.0,137.0,-0.073214,-1
8593,p000324,120.0,90.0,35.17,161.0,126.0,106.0,33.0,58.64,1.0,3.0,-0.071587,-1
423,p000009,133.0,94.0,36.28,141.0,106.0,86.0,33.0,27.92,1.0,239.0,-0.071582,-1
426,p000009,140.0,96.0,39.33,138.0,102.0,83.0,32.0,27.92,1.0,242.0,-0.07132,-1
424,p000009,137.0,94.0,36.28,142.0,106.0,87.0,30.0,27.92,1.0,240.0,-0.069194,-1
15402,p000598,135.0,89.0,39.6,49.0,30.5,27.0,23.5,46.79,1.0,18.0,-0.068781,-1
15403,p000598,135.0,89.0,39.6,49.0,30.5,27.0,23.5,46.79,1.0,19.0,-0.068781,-1
18461,p000733,132.0,95.0,36.83,185.0,130.0,105.0,49.0,57.44,1.0,23.0,-0.068127,-1
8959,p000343,84.0,100.0,37.11,203.0,139.0,135.0,32.0,24.1,0.0,8.0,-0.066161,-1



Total anomalies found: 261 data points


In [5]:
# --- Part 5: Analyze Anomalies at the Patient Level ---

def _count_flagged(labels):
    """Return how many entries of `labels` carry the anomaly flag (-1)."""
    return labels.eq(-1).sum()


# Collapse the row-level results to one row per patient:
#   mean_anomaly_score   - average score over the patient's rows
#   min_anomaly_score    - the patient's single most anomalous row
#   num_anomalous_hours  - number of the patient's rows labelled -1
rows_by_patient = df.groupby('PatientID')
patient_anomaly_summary = rows_by_patient.agg(
    mean_anomaly_score=('anomaly_score', 'mean'),
    min_anomaly_score=('anomaly_score', 'min'),
    num_anomalous_hours=('anomaly_label', _count_flagged),
)
# Lowest mean score first, i.e. most anomalous patients at the top.
patient_anomaly_summary = patient_anomaly_summary.sort_values('mean_anomaly_score')

print("Top 10 most anomalous patients (based on average hourly score):")
display(patient_anomaly_summary.head(10))


# --- Part 6: Validate Against Sepsis Labels ---

# The ground-truth SepsisLabel column is read back from the raw per-patient
# files, one file per patient, with the PatientID taken from the filename.
print("\nLoading raw data to get ground truth sepsis labels...")
data_dir = os.path.join('..', 'data', 'raw', 'physionet.org', 'files', 'challenge-2019', '1.0.0', 'training', 'training_setA')

# Pick raw files by PatientID instead of by position in os.listdir():
# listdir order is arbitrary and the directory may hold non-data entries,
# so slicing the first N names is not guaranteed to select the same
# patients that were scored. Sorting keeps the load order deterministic.
scored_patient_ids = set(patient_anomaly_summary.index)
sample_files = sorted(
    filename for filename in os.listdir(data_dir)
    if filename.split('.')[0] in scored_patient_ids
)
if not sample_files:
    raise FileNotFoundError(
        f"No raw files in {data_dir!r} match the PatientIDs in patient_anomaly_summary."
    )

# Read each pipe-separated file and tag its rows with the patient's ID.
list_of_dfs = [
    pd.read_csv(os.path.join(data_dir, filename), sep='|')
      .assign(PatientID=filename.split('.')[0])
    for filename in sample_files
]
full_df = pd.concat(list_of_dfs, ignore_index=True)

# The per-patient max of SepsisLabel is the largest label value seen for that
# patient (presumably 1 if sepsis was ever flagged - confirm the label coding).
# Patients whose label is missing on every row are dropped here and reported below.
sepsis_labels_per_patient = full_df.groupby('PatientID')['SepsisLabel'].max().dropna()

# Attach the ground truth to the patient summary. The inner join keeps only
# patients with a label; the stable re-sort makes the "most anomalous first"
# ordering explicit instead of relying on the merge to preserve it.
patient_summary_with_truth = patient_anomaly_summary.merge(
    sepsis_labels_per_patient,
    left_index=True,
    right_index=True,
    how='inner',
).sort_values('mean_anomaly_score', kind='stable')

# Surface any scored patients that silently fell out of the join.
num_unlabelled = len(patient_anomaly_summary) - len(patient_summary_with_truth)
if num_unlabelled:
    print(f"Warning: {num_unlabelled} scored patient(s) have no ground-truth label and were excluded.")
if patient_summary_with_truth.empty:
    raise ValueError("No scored patient could be matched to a ground-truth SepsisLabel.")

# Display the summary for the top 20 most anomalous patients
print("\nSepsis status for the Top 20 most anomalous patients:")
display(patient_summary_with_truth.head(20))

# Share of the (up to) 50 most anomalous patients that developed sepsis.
# Divide by the actual row count: head(50) returns fewer rows when fewer
# than 50 labelled patients exist, and a fixed 50 would understate the rate.
top_50_anomalous = patient_summary_with_truth.head(50)
sepsis_in_top_50 = top_50_anomalous['SepsisLabel'].sum()
percentage_with_sepsis = (sepsis_in_top_50 / len(top_50_anomalous)) * 100

print(f"\nAmong the top {len(top_50_anomalous)} most anomalous patients, {percentage_with_sepsis:.2f}% actually developed sepsis.")

# Baseline for interpreting the number above: the rate across every scored patient.
overall_percentage = patient_summary_with_truth['SepsisLabel'].mean() * 100
print(f"For comparison, {overall_percentage:.2f}% of all {len(patient_summary_with_truth)} scored patients developed sepsis.")

Top 10 most anomalous patients (based on average hourly score):


Unnamed: 0_level_0,mean_anomaly_score,min_anomaly_score,num_anomalous_hours
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
p000120,-0.006251,-0.055737,23
p000686,0.010315,-0.0512,22
p000343,0.012845,-0.066161,15
p000114,0.02834,-0.045243,7
p000324,0.04229,-0.071587,12
p000852,0.047458,-0.048041,5
p000219,0.052072,-0.014801,1
p000598,0.052738,-0.068781,3
p000105,0.053664,-0.025256,12
p000181,0.055959,-0.027315,3



Loading raw data to get ground truth sepsis labels...

Sepsis status for the Top 20 most anomalous patients:


Unnamed: 0_level_0,mean_anomaly_score,min_anomaly_score,num_anomalous_hours,SepsisLabel
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
p000120,-0.006251,-0.055737,23,0.0
p000686,0.010315,-0.0512,22,0.0
p000343,0.012845,-0.066161,15,0.0
p000114,0.02834,-0.045243,7,0.0
p000324,0.04229,-0.071587,12,1.0
p000852,0.047458,-0.048041,5,0.0
p000219,0.052072,-0.014801,1,0.0
p000598,0.052738,-0.068781,3,0.0
p000105,0.053664,-0.025256,12,0.0
p000181,0.055959,-0.027315,3,0.0



Among the top 50 most anomalous patients, 22.00% actually developed sepsis.
