In [2]:
import pandas as pd

# Clinical-variable preprocessing: load the merged table, keep a fixed list of
# variables, impute missing values, report remaining NaNs and export to CSV.

# 1. Load the data
# low_memory=False makes pandas parse the file in a single pass rather than in
# chunks, so each column gets one consistently inferred dtype.
merged_data = pd.read_csv('/root/DATA/filtered_merged_data.csv', low_memory=False)
print(f"Loaded merged_data shape: {merged_data.shape}")

# 2. Initial filtering of variables (removing irrelevant columns)
columns_to_keep = [
    'icustay_id', 'subject_id', 'hadm_id', 'age_years', 'expire_flag',
    'systolic_bp', 'diastolic_bp', 'mean_bp', 'resp_rate',
    'heart_rate', 'temperature', 'spo2', 'glucose',
    'gcseyes', 'gcsmotor', 'gcsverbal', 'creatinine', 'bloodureanitrogen',
    'lactate', 'whitebloodcell', 'platelets', 'bilirubin', 'albumin',
    'admission_type', 'admission_location',
]

# 3. Select only the columns that exist in the dataset, and say which of the
#    requested ones were not found instead of dropping them silently.
available_columns = [col for col in columns_to_keep if col in merged_data.columns]
missing_columns = [col for col in columns_to_keep if col not in merged_data.columns]
if missing_columns:
    print(f"Requested columns not found in merged_data (skipped): {missing_columns}")

# .copy() detaches the selection from merged_data so the imputation below
# never writes into (or warns about) a view of the original frame.
filtered_data = merged_data[available_columns].copy()
print(f"Shape after initial filtering: {filtered_data.shape}")

# 4. Fill NaN values with reasonable defaults
gcs_cols = ['gcseyes', 'gcsmotor', 'gcsverbal']
numerical_cols = [
    'systolic_bp', 'diastolic_bp', 'mean_bp', 'resp_rate', 'heart_rate',
    'temperature', 'spo2', 'glucose', 'creatinine', 'bloodureanitrogen',
    'lactate', 'whitebloodcell', 'platelets', 'bilirubin', 'albumin',
]
categorical_cols = ['admission_type', 'admission_location']

# Build one {column: fill value} mapping, restricted to columns actually present.
fill_values = {}

# GCS scores: fill NaNs with 0.
# NOTE(review): GCS component scores normally start at 1, so 0 acts as an
# out-of-range "not recorded" marker here - confirm downstream code expects it.
for col in gcs_cols:
    if col in filtered_data.columns:
        fill_values[col] = 0

# Numerical columns: fill NaNs with the column median.
for col in numerical_cols:
    if col in filtered_data.columns:
        median_value = filtered_data[col].median()
        # An all-NaN column has a NaN median; filling with it would be a no-op,
        # so leave the column out (it will show up in the NaN report below).
        if pd.notna(median_value):
            fill_values[col] = median_value

# Categorical columns: fill NaNs with 'Unknown'.
for col in categorical_cols:
    if col in filtered_data.columns:
        fill_values[col] = 'Unknown'

# A dict passed to DataFrame.fillna applies each value to its own column only.
filtered_data = filtered_data.fillna(value=fill_values)

# 5. Check the statistics of NaN values after processing
print("NaN statistics after processing:")
print(filtered_data.isna().sum())

# 6. Export the processed data to a CSV file
output_path = '/root/DATA/final_selected_clinical_variables.csv'
filtered_data.to_csv(output_path, index=False)
print(f"The processed data has been exported to {output_path}")


Loaded merged_data shape: (2938748, 82)
Shape after initial filtering: (2938748, 18)
NaN statistics after processing:
icustay_id            0
subject_id            0
hadm_id               0
age_years             0
expire_flag           0
glucose               0
gcseyes               0
gcsmotor              0
gcsverbal             0
creatinine            0
bloodureanitrogen     0
lactate               0
whitebloodcell        0
platelets             0
bilirubin             0
albumin               0
admission_type        0
admission_location    0
dtype: int64
The processed data has been exported to /root/DATA/final_selected_clinical_variables.csv
