In [1]:
import pandas as pd

# Read the cleaned Cox dataset from disk
data_path = '/root/DATA/cleaned_cox_dataset.csv'
data = pd.read_csv(data_path)

# Count NaN values per column before imputation; only columns that
# actually contain NaN values are kept in the report
missing_counts = data.isna().sum()
nan_summary = missing_counts[missing_counts > 0]
print("Initial NaN Value Statistics:")
print(nan_summary)

# Replace the NaN values of each listed column with that column's own median
for column in ('gcsmotor', 'lactate', 'hemoglobin', 'albumin', 'gcsverbal'):
    column_median = data[column].median()
    data[column] = data[column].fillna(column_median)

# Recount NaN values to confirm what (if anything) is still missing
missing_counts = data.isna().sum()
nan_summary = missing_counts[missing_counts > 0]
print("\nNaN Value Statistics After Filling:")
print(nan_summary)

# Summarise the resulting frame: dimensions first, then the column list
print("\nFinal dataset shape:", data.shape)
print("\nColumn names:")
print(data.columns.tolist())

# Persist the imputed dataset (row index is not written to the file)
output_path = '/root/DATA/cleaned_final_cox_dataset.csv'
data.to_csv(output_path, index=False)
print(f"\nThe final data has been saved to {output_path}")


Initial NaN Value Statistics:
gcsmotor         75
lactate       15812
hemoglobin     9493
albumin       18060
gcsverbal        67
dtype: int64

NaN Value Statistics After Filling:
Series([], dtype: int64)

Final dataset shape: (20101, 21)

Column names:
['los_x', 'positiveculture', 'gcs', 'gcsmotor', 'lactate', 'bloodureanitrogen', 'hemoglobin', 'intnormalisedratio', 'albumin', 'chloride', 'hematocrit', 'age_years', 'insurance_Medicare', 'gcsverbal', 'admission_location_EMERGENCY ROOM ADMIT', 'admission_location_PHYS REFERRAL/NORMAL DELI', 'admission_location_CLINIC REFERRAL/PREMATURE', 'admission_location_TRANSFER FROM HOSP/EXTRAM', 'admission_type_URGENT', 'is_weekend_admission', 'expire_flag']

The final data has been saved to /root/DATA/cleaned_final_cox_dataset.csv


In [1]:
import pandas as pd
from prettytable import PrettyTable

# Input files: the dataset before preprocessing and the dataset after it
data_before_path = '/root/DATA/filtered_merged_data.csv'
data_after_path = '/root/DATA/cleaned_final_cox_dataset.csv'

# low_memory=False makes pandas read each file in one pass instead of in chunks
data_before = pd.read_csv(data_before_path, low_memory=False)
data_after = pd.read_csv(data_after_path, low_memory=False)

# Empty comparison table; rows are appended by the loops further down
comparison_table = PrettyTable()
comparison_table.field_names = [
    "Original Variable",
    "Modified Variable",
    "N (Before)",
    "% (Before)",
    "N (After)",
    "% (After)",
    "Mean (SD) Before",
    "Mean (SD) After",
]

# Variables summarised with counts and percentages
categorical_vars = [
    'insurance',
    'admission_location',
    'admission_type',
    'is_weekend_admission',
    'expire_flag',
]

# Dummy-column names associated with each multi-level categorical variable
dummy_vars_mapping = {}
dummy_vars_mapping['insurance'] = [
    'insurance_Medicare',
    'insurance_Private',
    'insurance_Medicaid',
    'insurance_Government',
    'insurance_Self Pay',
]
dummy_vars_mapping['admission_location'] = [
    'admission_location_EMERGENCY ROOM ADMIT',
    'admission_location_TRANSFER FROM HOSP/EXTRAM',
    'admission_location_CLINIC REFERRAL/PREMATURE',
    'admission_location_PHYS REFERRAL/NORMAL DELI',
    'admission_location_TRANSFER FROM SKILLED NUR',
]
dummy_vars_mapping['admission_type'] = [
    'admission_type_EMERGENCY',
    'admission_type_ELECTIVE',
    'admission_type_URGENT',
]

# Variables summarised with a non-null count and mean (SD)
continuous_vars = [
    'los_x',
    'positiveculture',
    'gcs',
    'gcsmotor',
    'lactate',
    'bloodureanitrogen',
    'hemoglobin',
    'intnormalisedratio',
    'albumin',
    'chloride',
    'hematocrit',
    'age_years',
    'gcsverbal',
]

# Process categorical variables.
#
# One table row is produced per level of each raw categorical variable,
# provided the level can be located in the preprocessed data, either
#   * as a dummy column named "<variable>_<level>", or
#   * when the variable is still present under its original name (no dummy
#     column for that level), by comparing the level against that column.
# The second case keeps variables such as binary flags in the table; looking
# only for dummy columns would silently drop them.
#
# N (Before) is the non-null count of the whole raw variable and is repeated on
# every level's row; % (Before) is the level's share of those non-null values.
# N (After) / % (After) are per-level, relative to all preprocessed rows.
total_after = len(data_after)  # same denominator for every variable
for col in categorical_vars:
    if col not in data_before.columns:
        continue  # nothing to compare against in the raw data

    total_before = data_before[col].notnull().sum()
    # value_counts() orders levels by descending frequency and ignores NaN
    percentages_before = data_before[col].value_counts(normalize=True) * 100

    for val, percentage_before in percentages_before.items():
        dummy_col = f"{col}_{val}"
        if dummy_col in data_after.columns:
            # Level was one-hot encoded: the dummy column sums to its count
            modified_col = dummy_col
            n_after = data_after[dummy_col].sum()
        elif col in data_after.columns:
            # Variable kept its original name: count rows equal to this level.
            # NOTE(review): assumes the level values are encoded the same way in
            # both files (e.g. 0/1 in both) — confirm against the preprocessing.
            modified_col = f"{col}={val}"
            n_after = (data_after[col] == val).sum()
        else:
            continue  # level has no counterpart in the preprocessed dataset

        # Add only rows where N (After) > 0
        if not (n_after > 0):
            continue

        percent_after = (n_after / total_after) * 100
        comparison_table.add_row([
            col,  # Original variable
            modified_col,  # Modified variable
            total_before,  # N (Before), repeated for each subcategory
            f"{percentage_before:.2f}",  # % (Before)
            int(n_after),  # N (After)
            f"{percent_after:.2f}",  # % (After)
            "",  # Mean and SD not applicable for categorical variables
            ""   # Mean and SD not applicable for categorical variables
        ])

# Process continuous variables


def _describe_continuous(frame, column):
    """Summarise one continuous column of ``frame`` for the comparison table.

    Returns a ``(non_null_count, "mean (sd)")`` pair. When ``column`` is not
    present in ``frame`` the pair is ``(0, "")`` so the summary cell stays blank.
    """
    if column not in frame.columns:
        return 0, ""
    values = frame[column]
    non_null_count = values.notnull().sum()
    return non_null_count, f"{values.mean():.2f} ({values.std():.2f})"


for col in continuous_vars:
    # A variable missing from both datasets gets no row at all
    if col not in data_before.columns and col not in data_after.columns:
        continue

    total_before, summary_before = _describe_continuous(data_before, col)
    total_after, summary_after = _describe_continuous(data_after, col)

    # Percentage cells are left empty: they only apply to categorical rows
    comparison_table.add_row([
        col,             # Original variable
        col,             # Modified variable (name is unchanged)
        total_before,    # N (Before)
        "",
        total_after,     # N (After)
        "",
        summary_before,  # Mean (SD) Before
        summary_after,   # Mean (SD) After
    ])

# Write the rendered table to a text file in the working directory,
# then echo the same table to the cell output
with open("Corrected_EDA_Table.txt", "w") as table_file:
    rendered_table = comparison_table.get_string()
    table_file.write(rendered_table)
print(comparison_table)


+--------------------+----------------------------------------------+------------+------------+-----------+-----------+------------------+-----------------+
| Original Variable  |              Modified Variable               | N (Before) | % (Before) | N (After) | % (After) | Mean (SD) Before | Mean (SD) After |
+--------------------+----------------------------------------------+------------+------------+-----------+-----------+------------------+-----------------+
|     insurance      |              insurance_Medicare              |  2938748   |   54.03    |   10931   |   54.38   |                  |                 |
| admission_location |   admission_location_EMERGENCY ROOM ADMIT    |  2938748   |   45.28    |    8188   |   40.73   |                  |                 |
| admission_location | admission_location_TRANSFER FROM HOSP/EXTRAM |  2938748   |   26.42    |    3970   |   19.75   |                  |                 |
| admission_location | admission_location_CLINIC REFERRAL/