In [1]:
import pandas as pd

# 1. Load the data
# low_memory=False makes pandas read the file in one pass instead of in chunks.
filtered_data = pd.read_csv('/root/DATA/final_selected_clinical_variables.csv', low_memory=False)
print(f"Initial data shape: {filtered_data.shape}")

# 2. Filter for the first ICU admission record per patient
# GroupBy.first() returns the first *non-null* value of every column, so it can
# stitch one output row together from several rows (and several ICU stays) of
# the same patient. Keep one real, unmodified row per patient instead.
# NOTE(review): like the original, this treats the lowest icustay_id as the
# first admission -- confirm the ids are chronological, or sort by an admission
# timestamp if one is available.
filtered_data = (
    filtered_data.dropna(subset=['subject_id'])  # groupby also discarded null ids
    .sort_values('icustay_id', kind='stable')    # stable: ties keep file order
    .drop_duplicates(subset='subject_id', keep='first')
    .sort_values('subject_id')
    .reset_index(drop=True)
)
# Preserve the layout the groupby-based version produced: subject_id comes first.
filtered_data = filtered_data[
    ['subject_id'] + [c for c in filtered_data.columns if c != 'subject_id']
]
print(f"Shape after filtering first ICU admissions: {filtered_data.shape}")

# 3. Remove invalid or abnormal samples
# Assume 'age_years' is valid only within the range 0-120 years
# Rows with a missing age fail both comparisons and are dropped as well.
# .copy() yields an independent frame so later in-place column assignments
# do not trigger pandas' chained-assignment warning.
filtered_data = filtered_data[
    (filtered_data['age_years'] >= 0) & (filtered_data['age_years'] <= 120)
].copy()
print(f"Shape after removing abnormal age records: {filtered_data.shape}")

# 4. Handle extreme values: cap numerical variables at 1st and 99th percentiles
def cap_extreme_values(
    df: pd.DataFrame,
    column: str,
    *,
    lower_q: float = 0.01,
    upper_q: float = 0.99,
) -> None:
    """Clip one column of *df* to its own quantile range, in place.

    Values below the ``lower_q`` quantile are raised to it and values above
    the ``upper_q`` quantile are lowered to it. The column in ``df`` is
    overwritten with the clipped values.

    Args:
        df: Frame to modify; it is mutated, not copied.
        column: Name of the column to cap.
        lower_q: Quantile used as the lower cap (default: 1st percentile).
        upper_q: Quantile used as the upper cap (default: 99th percentile).

    Raises:
        ValueError: If the quantiles are not ordered within ``[0, 1]``.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError(
            f"quantiles must satisfy 0 <= lower_q <= upper_q <= 1, "
            f"got lower_q={lower_q}, upper_q={upper_q}"
        )
    # Both bounds are computed from the unclipped data before anything is written.
    lower_bound = df[column].quantile(lower_q)
    upper_bound = df[column].quantile(upper_q)
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)

# Numeric columns whose extreme values are capped.
numerical_cols = [
    'systolic_bp',
    'diastolic_bp',
    'mean_bp',
    'resp_rate',
    'heart_rate',
    'temperature',
    'spo2',
    'glucose',
    'creatinine',
    'bloodureanitrogen',
    'lactate',
    'whitebloodcell',
    'platelets',
    'bilirubin',
    'albumin',
]

# Listed columns that are absent from the frame are skipped without error.
for col in numerical_cols:
    if col not in filtered_data.columns:
        continue
    cap_extreme_values(filtered_data, col)

# Capping only rewrites values, so the shape is reported as a sanity check.
print(f"Shape after handling extreme values: {filtered_data.shape}")

# 5. Check the distribution of the target variable (expire_flag)
target_counts = filtered_data['expire_flag'].value_counts()
print("\nDistribution of the target variable (expire_flag):")
print(target_counts)

# 6. Export the cleaned dataset to a CSV file
output_path = '/root/DATA/cleaned_final_dataset.csv'
filtered_data.to_csv(output_path, index=False)  # index=False: no row-index column in the file
print(f"The final cleaned dataset has been exported to {output_path}")


Initial data shape: (2938748, 18)
Shape after filtering first ICU admissions: (20044, 18)
Shape after removing abnormal age records: (20044, 18)
Shape after handling extreme values: (20044, 18)

Distribution of the target variable (expire_flag):
expire_flag
0    11985
1     8059
Name: count, dtype: int64
The final cleaned dataset has been exported to /root/DATA/cleaned_final_dataset.csv
