In [4]:
import pandas as pd

# Read the merged extract from disk. low_memory=False has pandas parse the
# file in one pass so each column gets a single inferred dtype.
file_path = '/root/DATA/filtered_merged_data.csv'
data = pd.read_csv(file_path, low_memory=False)

# Normalise column dtypes. errors='coerce' replaces entries that cannot be
# parsed with NaT/NaN instead of raising.
for timestamp_column in ('intime_x', 'outtime_x'):  # admission / discharge time
    data[timestamp_column] = pd.to_datetime(data[timestamp_column], errors='coerce')
for numeric_column in ('age_years', 'expire_flag'):  # age / survival status
    data[numeric_column] = pd.to_numeric(data[numeric_column], errors='coerce')

# Per-column share of missing entries, as a percentage of the total row count
missing_summary = data.isna().sum().div(len(data)).mul(100)
print("Missing values percentage for each column:", missing_summary, sep="\n")

# Summary statistics for age and length of stay.
# Note that "IQR" here is reported as the [25th, 75th] percentile pair,
# not as the width of the interquartile range.
age_values = data['age_years']
los_values = data['los_x']

median_age = age_values.median()
iqr_age = age_values.quantile([0.25, 0.75]).tolist()
median_los = los_values.median()

# Report: demographics first, then length of stay
print(f"\nMedian Age: {median_age}", f"IQR Age: {iqr_age}", sep="\n")
print(f"\nMedian Length of Stay (LOS): {median_los} days")

def _share_by_category(column):
    """Return the percentage of non-missing rows taken by each value of `column`."""
    return data[column].value_counts(normalize=True) * 100


# Admission type
admission_type_distribution = _share_by_category('admission_type')
print("\nAdmission Type Distribution:", admission_type_distribution, sep="\n")

# Admission location
admission_location_distribution = _share_by_category('admission_location')
print("\nAdmission Location Distribution:", admission_location_distribution, sep="\n")

# Diagnosis
diagnosis_distribution = _share_by_category('diagnosis')
print("\nDiagnosis Distribution:", diagnosis_distribution, sep="\n")

# Median and [25th, 75th] percentile pair for each laboratory measurement.
# The outer dict is keyed by column name, so the resulting DataFrame has one
# column per lab value and the rows "Median" and "IQR".
lab_columns = ['lactate', 'bloodureanitrogen', 'hemoglobin']
lab_stats = {
    lab_name: {
        "Median": data[lab_name].median(),
        "IQR": data[lab_name].quantile([0.25, 0.75]).tolist(),
    }
    for lab_name in lab_columns
}
print("\nLaboratory Data Statistics:", pd.DataFrame(lab_stats), sep="\n")

# Weekend admission analysis
# Series.dt.weekday numbers Monday as 0, so values 5 and 6 are Saturday and
# Sunday. The flag column stays a plain bool column.
data['is_weekend_admission'] = data['intime_x'].dt.weekday >= 5  # Mark weekend admissions

# An admission time that could not be parsed is NaT; its weekday is NaN and
# the comparison above yields False. Restrict the distribution to rows with a
# known admission time so those rows are not reported as weekday admissions.
has_admission_time = data['intime_x'].notna()
weekend_distribution = (
    data.loc[has_admission_time, 'is_weekend_admission'].value_counts(normalize=True) * 100
)
print("\nWeekend Admission Distribution:")
print(weekend_distribution)


Missing values percentage for each column:
subject_id_x               0.000000
row_id_x                   0.000000
hadm_id_x                  0.000000
icustay_id                 0.000000
dbsource                   0.000000
                            ...    
icu_expire_flag            0.000000
hospital_expire_flag_y    26.221404
dod                       47.643044
expire_flag                0.000000
ttd_days                  47.643044
Length: 82, dtype: float64

Median Age: 64.0
IQR Age: [51.0, 77.0]

Median Length of Stay (LOS): 4.557 days

Admission Type Distribution:
admission_type
EMERGENCY    89.044450
ELECTIVE      8.023961
URGENT        2.931589
Name: proportion, dtype: float64

Admission Location Distribution:
admission_location
EMERGENCY ROOM ADMIT         45.284046
TRANSFER FROM HOSP/EXTRAM    26.424246
CLINIC REFERRAL/PREMATURE    17.055460
PHYS REFERRAL/NORMAL DELI    10.331917
TRANSFER FROM SKILLED NUR     0.818988
TRANSFER FROM OTHER HEALT     0.075849
** INFO NOT AVAILAB