In [None]:
%pip install tableone



In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tableone
from google.colab import auth
from google.colab import files
# Authenticate the current Colab user; presumably required for the BigQuery
# access used further down — TODO confirm.
auth.authenticate_user()
print('Authenticated')

%load_ext google.colab.data_table

Authenticated


In [None]:
from google.cloud import bigquery
# NOTE(review): this client targets project 'datathon', while the %%bigquery cell
# in this notebook passes --project datathon-455912, and `client` is not referenced
# by any visible cell — confirm which project ID is intended.
client = bigquery.Client(project='datathon')

In [None]:
%%bigquery df_vital --project datathon-455912

-- Pull every column and row of the derived eICU vital-sign pivot table, ordered by
-- stay and chart offset; restriction to the study cohort is done later in pandas.
SELECT *
FROM `physionet-data.eicu_crd_derived.pivoted_vital`
ORDER BY patientunitstayid, chartoffset


Query is running:   0%|          |

Downloading:   0%|          |

In [None]:
# Load the per-stay offset table (patientunitstayid, intakeoutputoffset); the
# offset is later renamed to tf_offset and used as the transfusion time.
# NOTE: the CSV also carries a saved index, which is read in as an 'Unnamed: 0' column.
df_patients_offset = pd.read_csv('/content/patients_offset.csv')

df_patients_offset.head()


Unnamed: 0,patientunitstayid,intakeoutputoffset
0,588607,249
1,854709,416
2,489119,365
3,166017,840
4,1590543,9164


In [None]:
# Load the cleaned oxygen-therapy table: one row per charted O2 device / FiO2 value
# (patientunitstayid, chartoffset, entryoffset, o2_device_group, final_fio2).
# NOTE: the CSV also carries a saved index, which is read in as an 'Unnamed: 0' column.
df_fio2 = pd.read_csv('/content/final_o2_cleaned.csv')

df_fio2.head()

Unnamed: 0,patientunitstayid,chartoffset,entryoffset,o2_device_group,final_fio2
0,141168,506,513,nasal cannula,28.0
1,142723,1137,1124,ventilator,60.0
2,143325,605,632,ventilator,30.0
3,143325,1685,1713,ventilator,60.0
4,144297,10421,10426,ventilator,60.0


In [None]:
# Cohort definition: every distinct stay ID listed in the offsets table
patient_list = df_patients_offset['patientunitstayid'].unique()
print(f"✅ Number of patients in df_patients_offset: {len(patient_list)}")

# Restrict the vital-sign and FiO2 tables to that cohort. Each result is an
# independent copy so later column assignments do not touch the source frames.
vital_in_cohort = df_vital['patientunitstayid'].isin(patient_list)
fio2_in_cohort = df_fio2['patientunitstayid'].isin(patient_list)
df_vital_filtered = df_vital.loc[vital_in_cohort].copy()
df_fio2_filtered = df_fio2.loc[fio2_in_cohort].copy()

# Sanity check on the resulting sizes
print(f"✅ df_vital_filtered shape: {df_vital_filtered.shape}")
print(f"✅ df_fio2_filtered shape: {df_fio2_filtered.shape}")


✅ Number of patients in df_patients_offset: 8207
✅ df_vital_filtered shape: (1858992, 14)
✅ df_fio2_filtered shape: (293439, 5)


In [None]:
### Fix dtypes
# Cast the two join keys of both tables to plain int64, column by column,
# so the later time-based merge sees identical key dtypes on each side.
for frame in (df_vital_filtered, df_fio2_filtered):
    for key_column in ('patientunitstayid', 'chartoffset'):
        frame[key_column] = frame[key_column].astype('int64')

In [None]:
# Attach to every vital-sign row the most recent FiO2 / O2-device record of the
# same stay (as-of join, looking backward in time).

key_cols = ['patientunitstayid', 'chartoffset']

# Step 0: Drop rows with a missing key BEFORE casting. Casting first would raise
# on missing values, which made the later dropna unreachable in practice.
df_vital_filtered = df_vital_filtered.dropna(subset=key_cols)
df_fio2_filtered = df_fio2_filtered.dropna(subset=key_cols)

# Step 1: Force int64 keys - merge_asof requires matching key dtypes on both sides
df_vital_filtered = df_vital_filtered.astype({col: 'int64' for col in key_cols})
df_fio2_filtered = df_fio2_filtered.astype({col: 'int64' for col in key_cols})

# Step 2: Unique patient IDs present in the vital-sign table
patient_ids = df_vital_filtered['patientunitstayid'].unique()

# Step 3: One grouped as-of merge instead of a Python loop over patients.
# merge_asof needs both frames sorted by the `on` key; `by` keeps matches within
# the same stay. A stable sort keeps the original order of tied chart offsets.
vital_sorted = df_vital_filtered.sort_values('chartoffset', kind='stable')
fio2_sorted = (
    df_fio2_filtered[['patientunitstayid', 'chartoffset', 'o2_device_group', 'final_fio2']]
    .sort_values('chartoffset', kind='stable')
)

# Any failure here now surfaces as an exception instead of silently dropping
# the affected patients from the result.
df_matched = pd.merge_asof(
    vital_sorted,
    fio2_sorted,
    on='chartoffset',
    by='patientunitstayid',
    direction='backward',
    allow_exact_matches=True
)

# Restore a per-patient, chronological row order (ascending stay ID)
df_matched = (
    df_matched
    .sort_values(['patientunitstayid', 'chartoffset'], kind='stable')
    .reset_index(drop=True)
)

if len(df_matched) > 0:
    # Check if the columns exist before printing
    columns_to_print = []
    for col in ['patientunitstayid', 'chartoffset', 'spo2', 'o2_device_group', 'final_fio2']:
        if col in df_matched.columns:
            columns_to_print.append(col)
        else:
            print(f"Warning: Column '{col}' not found in the merged dataframe")

    print(f"✅ Merge successful! Final shape: {df_matched.shape}")
    print(f"Available columns: {df_matched.columns.tolist()}")

    if columns_to_print:
        print(df_matched[columns_to_print].head(10))
    else:
        print("None of the requested columns are available in the merged dataframe")
else:
    print("❌ No data was merged successfully.")


✅ Merge successful! Final shape: (1858992, 16)
Available columns: ['patientunitstayid', 'chartoffset', 'entryoffset', 'heartrate', 'respiratoryrate', 'spo2', 'nibp_systolic', 'nibp_diastolic', 'nibp_mean', 'temperature', 'temperaturelocation', 'ibp_systolic', 'ibp_diastolic', 'ibp_mean', 'o2_device_group', 'final_fio2']
   patientunitstayid  chartoffset   spo2 o2_device_group  final_fio2
0             141233         -423   98.0             NaN         NaN
1             141233            2  100.0             NaN         NaN
2             141233            4  100.0             NaN         NaN
3             141233           17  100.0             NaN         NaN
4             141233           29  100.0             NaN         NaN
5             141233           32  100.0             NaN         NaN
6             141233           38    NaN             NaN         NaN
7             141233           47   99.0             NaN         NaN
8             141233           62   99.0             NaN 

In [None]:
# NOTE(review): `df_merged` is not assigned in any visible cell (the merge cell
# produces `df_matched`), so this raises NameError on a fresh kernel — presumably
# left over from an earlier version of the notebook; confirm or remove.
df_merged



Unnamed: 0,patientunitstayid,chartoffset_vital,entryoffset,heartrate,respiratoryrate,spo2,nibp_systolic,nibp_diastolic,nibp_mean,temperature,temperaturelocation,ibp_systolic,ibp_diastolic,ibp_mean,chartoffset_fio2,o2_device_group,final_fio2
0,141233,-423,-423,89.0,16.0,98.0,92.0,55.0,,36.1,TEMPORAL ARTERY,,,,,,
1,141233,2,2,100.0,12.0,100.0,,,,35.1,PA CATHETER,136.0,44.0,68.0,,,
2,141233,4,4,,,100.0,,,,,,,,,,,
3,141233,17,17,96.0,12.0,100.0,,,,34.8,PA CATHETER,122.0,60.0,88.0,,,
4,141233,29,29,98.0,11.0,100.0,,,,,,130.0,62.0,90.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1944738,3352827,30270,30270,73.0,,99.0,146.0,98.0,117.0,,,,,,,,
1944739,3352827,30330,30330,83.0,,99.0,179.0,76.0,109.0,,,,,,,,
1944740,3352827,30390,30390,,,100.0,,,,,,,,,,,
1944741,3352827,30450,30450,,,100.0,,,,,,,,,,,


In [None]:
# How much of the merged table lacks oxygen-therapy information?
total_rows = len(df_matched)

# FiO2 value
missing_fio2 = df_matched['final_fio2'].isnull().sum()
missing_percentage = missing_fio2 / total_rows * 100
print(f"Missing FiO2 values: {missing_fio2} out of {total_rows} rows")
print(f"Percentage missing: {missing_percentage:.2f}%")

# O2 delivery device
missing_o2_device = df_matched['o2_device_group'].isnull().sum()
missing_o2_percentage = missing_o2_device / total_rows * 100
print(f"Missing O2 device values: {missing_o2_device} out of {total_rows} rows")
print(f"Percentage missing: {missing_o2_percentage:.2f}%")

# Frequency of each device group among the rows that have one
if 'o2_device_group' in df_matched:
    print("\nDistribution of O2 device groups (non-missing values):")
    print(df_matched['o2_device_group'].value_counts())

Missing FiO2 values: 981458 out of 1858992 rows
Percentage missing: 52.80%
Missing O2 device values: 981458 out of 1858992 rows
Percentage missing: 52.80%

Distribution of O2 device groups (non-missing values):
o2_device_group
nasal cannula    693554
ventilator       132704
room air          19509
mask oxygen       18732
bipap/cpap        10618
hfnc               1517
other               900
Name: count, dtype: int64


In [None]:
# Row count before filtering, kept for the report below
original_rows = len(df_matched)

# Keep only measurements that actually have an SpO2 value
has_spo2 = df_matched['spo2'].notna()
df_matched_clean = df_matched[has_spo2]

# Bookkeeping: how much was discarded
remaining_rows = len(df_matched_clean)
removed_rows = original_rows - remaining_rows
removed_percentage = removed_rows / original_rows * 100

print(f"Original number of rows: {original_rows}")
print(f"Rows with missing SpO2 removed: {removed_rows}")
print(f"Remaining rows: {remaining_rows}")
print(f"Percentage of rows removed: {removed_percentage:.2f}%")

# Continue with the cleaned table under the working name, with a fresh 0..n-1 index.
# Note this rebinds df_matched, so re-running the cell reports 0 rows removed.
df_matched = df_matched_clean.reset_index(drop=True)

print("\nNew dataframe shape:", df_matched.shape)



Original number of rows: 1858992
Rows with missing SpO2 removed: 411290
Remaining rows: 1447702
Percentage of rows removed: 22.12%

New dataframe shape: (1447702, 16)


In [None]:
original_rows = len(df_matched)

# Attach each stay's intake/output offset to its measurements.
# Only the join key and the offset are taken from df_patients_offset: the CSV's
# saved-index column ('Unnamed: 0') would otherwise be copied onto every row of
# df_final and into every file exported from it.
df_final = pd.merge(
    df_matched,
    df_patients_offset[['patientunitstayid', 'intakeoutputoffset']],
    on='patientunitstayid',
    how='left'  # Use 'left' to keep all rows from df_matched
)

Original rows in df_matched: 1447702
Rows after merging with patient offset data: 1447702
Rows missing intakeoutputoffset data: 0 (0.00%)

Sample of merged data:
   patientunitstayid  chartoffset   spo2  final_fio2  intakeoutputoffset
0             141233         -423   98.0         NaN                 305
1             141233            2  100.0         NaN                 305
2             141233            4  100.0         NaN                 305
3             141233           17  100.0         NaN                 305
4             141233           29  100.0         NaN                 305


In [None]:
### select 12 hr window pre / post tf

# The intake/output offset is used as the transfusion time from here on
df_final = df_final.rename(columns={'intakeoutputoffset': 'tf_offset'})

# Signed distance of each measurement from the transfusion, minutes -> hours
minutes_from_tf = df_final['chartoffset'] - df_final['tf_offset']
df_final['time_diff'] = minutes_from_tf / 60  # Converting minutes to hours

# 1 when the measurement lies within [-12 h, +12 h] of the transfusion (both ends inclusive), else 0
df_final['near_transfusion'] = df_final['time_diff'].between(-12, 12).astype(int)

# Share of measurements that fall inside the window
transfusion_count = df_final['near_transfusion'].sum()
total_count = len(df_final)
percentage = transfusion_count / total_count * 100


# First few in-window rows, restricted to the columns relevant for a visual check
print("\nSample of measurements near transfusions:")
sample_columns = ['patientunitstayid', 'chartoffset', 'tf_offset', 'time_diff', 'near_transfusion', 'spo2', 'final_fio2']
near_transfusion_sample = df_final.loc[df_final['near_transfusion'] == 1, sample_columns].head(10)

Total measurements: 1447702
Measurements within ±12 hours of transfusion: 199516 (13.78%)

Sample of measurements near transfusions:
    patientunitstayid  chartoffset  tf_offset  time_diff  near_transfusion  \
1              141233            2        305  -5.050000                 1   
2              141233            4        305  -5.016667                 1   
3              141233           17        305  -4.800000                 1   
4              141233           29        305  -4.600000                 1   
5              141233           32        305  -4.550000                 1   
6              141233           47        305  -4.300000                 1   
7              141233           62        305  -4.050000                 1   
8              141233           77        305  -3.800000                 1   
9              141233           92        305  -3.550000                 1   
10             141233          107        305  -3.300000                 1   

     spo

In [None]:
### make pre/post transfusion

# Subset of measurements flagged as lying inside the transfusion window
in_window = df_final['near_transfusion'] == 1
df_transfusion_window = df_final[in_window]
total_near_transfusion = len(df_transfusion_window)

# FiO2 missingness inside the window
missing_fio2 = df_transfusion_window['final_fio2'].isnull().sum()
missing_percentage = missing_fio2 / total_near_transfusion * 100

print(f"Total measurements within ±12 hours of transfusion: {total_near_transfusion}")
print(f"Measurements missing FiO2 data: {missing_fio2}")
print(f"Percentage of measurements missing FiO2: {missing_percentage:.2f}%")

# Summary statistics of SpO2 for the same subset
print("\nSpO2 statistics in transfusion window:")
print(df_transfusion_window['spo2'].describe())

# Device-group frequencies and missing count, when the column is present
if 'o2_device_group' in df_transfusion_window:
    device_column = df_transfusion_window['o2_device_group']
    print("\nO2 device distribution in transfusion window:")
    print(device_column.value_counts())
    print(f"Missing O2 device values: {device_column.isna().sum()}")

Total measurements within ±12 hours of transfusion: 199516
Measurements missing FiO2 data: 110198
Percentage of measurements missing FiO2: 55.23%

SpO2 statistics in transfusion window:
count    199516.000000
mean         97.184040
std           3.874865
min           0.000000
25%          96.000000
50%          98.000000
75%         100.000000
max         100.000000
Name: spo2, dtype: float64

O2 device distribution in transfusion window:
o2_device_group
nasal cannula    71868
ventilator       11617
room air          3096
mask oxygen       1328
bipap/cpap        1151
hfnc               145
other              113
Name: count, dtype: int64
Missing O2 device values: 110198


In [None]:
# time_diff (hours from transfusion) is expected to exist already; it is computed
# in the ±12 h window cell as (chartoffset - tf_offset) / 60.

# Classify each measurement relative to the transfusion:
#   [-12 h, 0)  -> 'pre_tf'
#   [0, +12 h]  -> 'post_tf'
#   otherwise   -> 'not_near_tf'
hours = df_final['time_diff']
is_pre = hours.between(-12, 0, inclusive='left')
is_post = hours.between(0, 12)

prepost_labels = pd.Series('not_near_tf', index=df_final.index)
prepost_labels = prepost_labels.mask(is_pre, 'pre_tf').mask(is_post, 'post_tf')
df_final['prepost_tf'] = prepost_labels


Pre-transfusion measurements (-12 to 0 hours): 96814 (6.69%)
Post-transfusion measurements (0 to +12 hours): 102702 (7.09%)
Measurements outside transfusion window: 1248186 (86.22%)

Sample of data with prepost_tf labels:
         patientunitstayid  chartoffset  tf_offset   time_diff   prepost_tf  \
1053108            2489930         2043        804   20.650000  not_near_tf   
1048384            2484275         1353      12659 -188.433333  not_near_tf   
111179              184660         1163       4254  -51.516667  not_near_tf   
179754              213557         2820       2880   -1.000000       pre_tf   
1412576            3243725         5191       2791   40.000000  not_near_tf   
776208             1661327         -217       3808  -67.083333  not_near_tf   
516476              998693         7880       5060   47.000000  not_near_tf   
962091             2346930         3054      14495 -190.683333  not_near_tf   
20123               150092         4166       9813  -94.116667  not

In [None]:
# Re-take the in-window subset as an independent copy of the current df_final
df_transfusion_window = df_final.loc[df_final['near_transfusion'].eq(1)].copy()

In [None]:
# Export the in-window measurements (before any per-patient filtering) without the index column.
df_transfusion_window.to_csv('sao2_added_raw.csv', index=False)

In [None]:
# Per-patient tally of records by period: rows = stays, columns = prepost_tf labels
patient_prepost_counts = (
    df_transfusion_window
    .groupby(['patientunitstayid', 'prepost_tf'])
    .size()
    .unstack(fill_value=0)
)

# Guarantee both period columns exist even if a label never occurs
for period_label in ('pre_tf', 'post_tf'):
    if period_label not in patient_prepost_counts.columns:
        patient_prepost_counts[period_label] = 0

total_patients = len(patient_prepost_counts)

# Criteria under inspection
no_pre_records = patient_prepost_counts['pre_tf'] == 0
few_post_records = patient_prepost_counts['post_tf'] < 3  # 0, 1, or 2 post records

# Patients with no pre-transfusion record at all
zero_pre_patients = no_pre_records.sum()
percentage_zero_pre = zero_pre_patients / total_patients * 100

# Patients with fewer than 3 post-transfusion records
few_post_patients = few_post_records.sum()
percentage_few_post = few_post_patients / total_patients * 100

# Patients meeting both conditions at once
both_criteria_patients = (no_pre_records & few_post_records).sum()
percentage_both = both_criteria_patients / total_patients * 100

Total patients in transfusion window: 7370
Patients with 0 pre-transfusion records: 214 (2.90%)
Patients with <3 post-transfusion records: 1473 (19.99%)
Patients with both 0 pre AND <3 post records: 101 (1.37%)

Breakdown of post-transfusion record counts:
  Patients with 0 post-transfusion records: 321 (4.36%)
  Patients with 1 post-transfusion records: 637 (8.64%)
  Patients with 2 post-transfusion records: 515 (6.99%)


In [None]:
# Rebuild the per-patient tally (rows = stays, columns = prepost_tf labels)
period_sizes = df_transfusion_window.groupby(['patientunitstayid', 'prepost_tf']).size()
patient_prepost_counts = period_sizes.unstack(fill_value=0)

# Both period columns must be present for the rule below
for period_label in ('pre_tf', 'post_tf'):
    if period_label not in patient_prepost_counts.columns:
        patient_prepost_counts[period_label] = 0

# Inclusion rule: at least 1 pre-transfusion record AND at least 3 post-transfusion records
has_pre_record = patient_prepost_counts['pre_tf'] > 0
has_enough_post = patient_prepost_counts['post_tf'] >= 3
patients_to_keep = patient_prepost_counts.loc[has_pre_record & has_enough_post].index

# Patient-level bookkeeping
original_patient_count = len(patient_prepost_counts)
kept_patient_count = len(patients_to_keep)
removed_patient_count = original_patient_count - kept_patient_count

# Keep only records belonging to the selected patients
keep_mask = df_transfusion_window['patientunitstayid'].isin(patients_to_keep)
df_transfusion_filtered = df_transfusion_window[keep_mask]

# Record-level bookkeeping
original_records = len(df_transfusion_window)
filtered_records = len(df_transfusion_filtered)
removed_records = original_records - filtered_records

# Rebind the working name to the filtered data. After this, re-running the cell
# starts from already-filtered data and reports nothing removed.
df_transfusion_window = df_transfusion_filtered

Original patients: 5784
Patients removed: 0 (0.00%)
Patients kept: 5784 (100.00%)

Original records: 191679
Records removed: 0 (0.00%)
Records kept: 191679 (100.00%)


In [None]:
# Export the in-window measurements of the patients that passed the record-count filter (no index column).
df_transfusion_filtered.to_csv('sao2_added.csv', index=False)

In [None]:
# Forward-fill FiO2 and O2 device within each patient (last observation carried forward).

# Work on a copy so the input frame is untouched until the final rebind
df_imputed = df_transfusion_filtered.copy()

# Forward-filling is only meaningful in chronological order within each stay
df_imputed = df_imputed.sort_values(['patientunitstayid', 'chartoffset'])

# Vectorized per-patient forward fill. This replaces groupby(...).apply(func) with
# a function that mutated each group, which is slower and triggers a deprecation
# warning in recent pandas because apply also operates on the grouping column.
imputed_cols = ['final_fio2', 'o2_device_group']
df_imputed[imputed_cols] = df_imputed.groupby('patientunitstayid')[imputed_cols].ffill()

# Check how many values are still missing after imputation
# (leading gaps, before a patient's first recorded value, cannot be forward-filled)
fio2_still_missing = df_imputed['final_fio2'].isna().sum()
o2_device_still_missing = df_imputed['o2_device_group'].isna().sum()
total_records = len(df_imputed)

print(f"Total records: {total_records}")
print(f"Records with FiO2 still missing after imputation: {fio2_still_missing} ({fio2_still_missing/total_records*100:.2f}%)")
print(f"Records with O2 device still missing after imputation: {o2_device_still_missing} ({o2_device_still_missing/total_records*100:.2f}%)")

# Count how many values were imputed (missing before minus missing after)
fio2_imputed = df_transfusion_filtered['final_fio2'].isna().sum() - fio2_still_missing
o2_device_imputed = df_transfusion_filtered['o2_device_group'].isna().sum() - o2_device_still_missing

print(f"\nFiO2 values imputed: {fio2_imputed}")
print(f"O2 device values imputed: {o2_device_imputed}")

# Update our working dataframe. After this rebind, re-running the cell compares
# the imputed data with itself and reports 0 values imputed.
df_transfusion_filtered = df_imputed

Total records: 90081
Records with FiO2 still missing after imputation: 5483 (6.09%)
Records with O2 device still missing after imputation: 5483 (6.09%)

FiO2 values imputed: 0
O2 device values imputed: 0


  df_imputed = df_imputed.groupby('patientunitstayid', group_keys=False).apply(impute_per_patient)


In [None]:
# Export the forward-filled in-window data (no index column).
df_transfusion_filtered.to_csv('sf.csv', index=False)

In [None]:
# NOTE(review): no visible cell writes '/content/sfratio.csv' or computes the
# 'sf_ratio' column that later cells read from df_sf — presumably it was derived
# from 'sf.csv' outside this notebook; confirm and document that step.
df_sf = pd.read_csv('/content/sfratio.csv')

In [None]:
##### Calculation of s/f ratio

# Mean SF ratio for every (stay, period) pair, in long format
sf_averages = (
    df_sf
    .groupby(['patientunitstayid', 'prepost_tf'])['sf_ratio']
    .mean()
    .reset_index()
)

# Wide format: one row per stay with the pre and post means side by side,
# plus the post-minus-pre difference; patientunitstayid ends up as a regular column
sf_pivot = (
    sf_averages
    .pivot(index='patientunitstayid', columns='prepost_tf', values='sf_ratio')
    .rename(columns={'pre_tf': 'avg_sf_pre', 'post_tf': 'avg_sf_post'})
    .assign(sf_change=lambda wide: wide['avg_sf_post'] - wide['avg_sf_pre'])
    .reset_index()
)

# Preview
print(f"Average SF ratios calculated for {len(sf_pivot)} patients")
print(sf_pivot.head())

# Cohort-level summary of the three derived columns
print("\nOverall statistics:")
print(sf_pivot[['avg_sf_pre', 'avg_sf_post', 'sf_change']].describe())

# Direction of change per patient (rows with a missing change count in none of the three)
sf_delta = sf_pivot['sf_change']
improved = sf_delta.gt(0).sum()
worsened = sf_delta.lt(0).sum()
unchanged = sf_delta.eq(0).sum()
total = len(sf_pivot)

print(f"\nPatients with improved SF ratio: {improved} ({improved/total*100:.2f}%)")
print(f"Patients with worsened SF ratio: {worsened} ({worsened/total*100:.2f}%)")
print(f"Patients with unchanged SF ratio: {unchanged} ({unchanged/total*100:.2f}%)")

Average SF ratios calculated for 2938 patients
prepost_tf  patientunitstayid  avg_sf_post  avg_sf_pre  sf_change
0                      144297   164.333333  165.095238  -0.761905
1                      145715   138.104956  138.979592  -0.874636
2                      163555   197.803922  200.000000  -2.196078
3                      168097   297.115385  302.083333  -4.967949
4                      171083   294.940476  303.613281  -8.672805

Overall statistics:
prepost_tf   avg_sf_pre  avg_sf_post    sf_change
count       2938.000000  2938.000000  2938.000000
mean         278.575762   280.497436     1.921674
std           80.469729    80.563803    34.180245
min           80.400000    74.571429  -280.065476
25%          199.847578   199.876838    -4.569072
50%          275.308536   277.222222    -0.082581
75%          344.780220   345.350275     5.428115
max          476.190476   476.190476   261.555556

Patients with improved SF ratio: 1383 (47.07%)
Patients with worsened SF ratio: 1486 

In [None]:
###### Calculation of AUC

# Function to calculate AUC using trapezoidal rule
def calculate_auc(group):
    """Integrate the SF ratio over the 0 to +12 h post-transfusion period of one patient.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single patient with columns 'chartoffset' and 'tf_offset'
        (both in minutes) and 'sf_ratio'. The frame is not modified.

    Returns
    -------
    float or tuple
        ``np.nan`` when fewer than two measurements fall in the post-transfusion
        window; otherwise ``(auc, avg_auc, time_span)`` where ``auc`` is the
        trapezoidal integral of sf_ratio over hours, ``time_span`` is the number
        of hours between the first and last in-window measurement, and
        ``avg_auc`` is ``auc / time_span`` (``np.nan`` if the span is zero).
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; prefer
    # the new name when it exists so no deprecation warning is emitted per call.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz

    # Hours from transfusion, added to a new frame (the caller's group is left
    # untouched), in chronological order
    timeline = group.assign(
        hours_from_tf=(group['chartoffset'] - group['tf_offset']) / 60
    ).sort_values('chartoffset')

    # Keep only the post-transfusion period (0 to 12 hours, both ends inclusive)
    post_tf = timeline[(timeline['hours_from_tf'] >= 0) & (timeline['hours_from_tf'] <= 12)]

    if len(post_tf) < 2:
        # Need at least 2 points for AUC
        return np.nan

    # Calculate AUC using trapezoidal rule
    auc = trapezoid(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)

    # Time-normalised AUC (AUC divided by the observed span) so that patients
    # observed over different durations are comparable
    time_span = post_tf['hours_from_tf'].max() - post_tf['hours_from_tf'].min()
    if time_span > 0:
        avg_auc = auc / time_span
    else:
        avg_auc = np.nan

    return auc, avg_auc, time_span

# Apply the function to each patient
results = []
for patient_id, group in df_sf.groupby('patientunitstayid'):
    try:
        auc_result = calculate_auc(group)
        if isinstance(auc_result, tuple):
            auc, avg_auc, time_span = auc_result
            results.append({
                'patientunitstayid': patient_id,
                'sf_auc': auc,
                'sf_avg_auc': avg_auc,
                'time_span': time_span
            })
        else:
            # Handle case where function returns just NaN
            results.append({
                'patientunitstayid': patient_id,
                'sf_auc': np.nan,
                'sf_avg_auc': np.nan,
                'time_span': np.nan
            })
    except Exception as e:
        print(f"Error calculating AUC for patient {patient_id}: {e}")
        results.append({
            'patientunitstayid': patient_id,
            'sf_auc': np.nan,
            'sf_avg_auc': np.nan,
            'time_span': np.nan
        })

# Create a dataframe with the results
auc_df = pd.DataFrame(results)

# Merge with the previous SF averages
sf_combined = pd.merge(sf_pivot, auc_df, on='patientunitstayid', how='left')

# Display the results
print(f"AUC calculated for {auc_df['sf_auc'].notna().sum()} out of {len(auc_df)} patients")
print(sf_combined.head())

# Statistics on AUC values
print("\nAUC statistics:")
print(sf_combined[['sf_auc', 'sf_avg_auc', 'time_span']].describe())

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.tra

AUC calculated for 2938 out of 2938 patients
   patientunitstayid  avg_sf_post  avg_sf_pre  sf_change       sf_auc  \
0             144297   164.333333  165.095238  -0.761905  1957.486111   
1             145715   138.104956  138.979592  -0.874636  1631.702381   
2             163555   197.803922  200.000000  -2.196078  2373.733333   
3             168097   297.115385  302.083333  -4.967949  3268.515625   
4             171083   294.940476  303.613281  -8.672805  2852.812500   

   sf_avg_auc  time_span  
0  164.725339  11.883333  
1  137.890342  11.833333  
2  197.811111  12.000000  
3  297.137784  11.000000  
4  292.596154   9.750000  

AUC statistics:
            sf_auc   sf_avg_auc    time_span
count  2938.000000  2938.000000  2938.000000
mean   3003.179008   280.741581    10.733974
std    1029.241700    80.850946     2.095018
min      39.564732    80.500000     0.133333
25%    2323.000000   199.901042    10.600000
50%    2934.305556   276.932070    11.508333
75%    3775.937500   3

  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from_tf'].values)
  auc = np.trapz(post_tf['sf_ratio'].values, post_tf['hours_from

In [None]:
import numpy as np

# Function to calculate AUC or average*12 for pre-transfusion period
def calculate_pre_auc(group):
    """Summarise the SF ratio over the -12 h to 0 pre-transfusion period of one patient.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single patient with columns 'chartoffset' and 'tf_offset'
        (both in minutes) and 'sf_ratio'. The frame is not modified.

    Returns
    -------
    tuple
        ``(auc, avg_sf, n_points, method)`` where ``method`` is

        * ``"no_data"``    - no measurement in the window; auc and avg_sf are NaN
        * ``"avg_method"`` - 1 or 2 measurements; auc is the mean SF ratio * 12
        * ``"auc_method"`` - 3+ measurements; auc is the trapezoidal integral
          of sf_ratio over hours

    NOTE(review): "avg_method" scales to a full 12 h window while "auc_method"
    integrates only over the span actually covered by measurements, so the two
    kinds of AUC (and raw post-transfusion AUCs over a different span) are not
    on the same scale - confirm this is intended before comparing them.
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; prefer
    # the new name when it exists so no deprecation warning is emitted per call.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz

    # Hours from transfusion, added to a new frame so the caller's group is not mutated
    timeline = group.assign(
        hours_from_tf=(group['chartoffset'] - group['tf_offset']) / 60
    )

    # Filter for pre-transfusion data (-12 inclusive to 0 exclusive)
    pre_tf = timeline[(timeline['hours_from_tf'] >= -12) & (timeline['hours_from_tf'] < 0)]
    n_points = len(pre_tf)

    if n_points == 0:
        return np.nan, np.nan, 0, "no_data"

    # If we have less than 3 measurements, use average * 12
    if n_points < 3:
        avg_sf = pre_tf['sf_ratio'].mean()
        return avg_sf * 12, avg_sf, n_points, "avg_method"

    # With 3+ measurements, integrate in chronological order
    pre_tf = pre_tf.sort_values('hours_from_tf')

    # abs() keeps the area non-negative regardless of the sign of the time axis
    auc = abs(trapezoid(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))

    # Plain (unweighted) mean SF ratio during the period
    avg_sf = pre_tf['sf_ratio'].mean()

    return auc, avg_sf, n_points, "auc_method"

# Apply the function to each patient
pre_results = []
for patient_id, group in df_sf.groupby('patientunitstayid'):
    try:
        pre_auc, pre_avg, pre_count, pre_method = calculate_pre_auc(group)

        pre_results.append({
            'patientunitstayid': patient_id,
            'pre_sf_auc': pre_auc,
            'pre_sf_avg': pre_avg,
            'pre_count': pre_count,
            'pre_method': pre_method
        })
    except Exception as e:
        print(f"Error processing patient {patient_id}: {e}")
        pre_results.append({
            'patientunitstayid': patient_id,
            'pre_sf_auc': np.nan,
            'pre_sf_avg': np.nan,
            'pre_count': 0,
            'pre_method': "error"
        })

# Create a dataframe with the pre-transfusion results
pre_auc_df = pd.DataFrame(pre_results)

# Merge with your existing combined dataframe that has post-transfusion data
sf_combined_with_pre = pd.merge(sf_combined, pre_auc_df, on='patientunitstayid', how='left')

# Display statistics on the pre-transfusion AUC
print(f"Pre-transfusion AUC calculated for {pre_auc_df['pre_sf_auc'].notna().sum()} out of {len(pre_auc_df)} patients")
print("\nMethod used for pre-transfusion:")
print(pre_auc_df['pre_method'].value_counts())

print("\nPre-transfusion statistics:")
print(pre_auc_df[['pre_sf_auc', 'pre_sf_avg']].describe())

# If you want to calculate the change between pre and post AUC
# First make sure the column names match what's in your existing data
if 'sf_auc' in sf_combined.columns:  # assuming this is your post-transfusion AUC
    sf_combined_with_pre['auc_change'] = sf_combined_with_pre['sf_auc'] - sf_combined_with_pre['pre_sf_auc']

    # Count patients with improved/worsened SF AUC
    valid_both_auc = sf_combined_with_pre[sf_combined_with_pre['auc_change'].notna()]
    improved_auc = (valid_both_auc['auc_change'] > 0).sum()
    worsened_auc = (valid_both_auc['auc_change'] < 0).sum()
    unchanged_auc = (valid_both_auc['auc_change'] == 0).sum()
    total_auc = len(valid_both_auc)

    print(f"\nAUC Change analysis (for patients with both pre and post data):")
    print(f"Patients with improved SF AUC: {improved_auc} ({improved_auc/total_auc*100:.2f}%)")
    print(f"Patients with worsened SF AUC: {worsened_auc} ({worsened_auc/total_auc*100:.2f}%)")
    print(f"Patients with unchanged SF AUC: {unchanged_auc} ({unchanged_auc/total_auc*100:.2f}%)")

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hou

Pre-transfusion AUC calculated for 2938 out of 2938 patients

Method used for pre-transfusion:
pre_method
auc_method    2727
avg_method     211
Name: count, dtype: int64

Pre-transfusion statistics:
        pre_sf_auc   pre_sf_avg
count  2938.000000  2938.000000
mean   2637.672135   278.575762
std    1114.048313    80.469729
min      33.333333    80.400000
25%    2032.172321   199.847578
50%    2583.598214   275.308536
75%    3462.096974   344.780220
max    5714.285714   476.190476

AUC Change analysis (for patients with both pre and post data):
Patients with improved SF AUC: 2008 (68.35%)
Patients with worsened SF AUC: 926 (31.52%)
Patients with unchanged SF AUC: 4 (0.14%)


  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf['sf_ratio'].values, pre_tf['hours_from_tf'].values))
  auc = abs(np.trapz(pre_tf[

In [None]:
# Export the per-patient table (SF averages, post-transfusion AUC, pre-transfusion AUC) without the index column.
sf_combined_with_pre.to_csv('final_avg_auc.csv', index=False)