# Data cleaning

In [None]:
import pandas as pd
import os
os.getcwd()

os.chdir(r"c:\dev\Hospital_ml\Predictive analysis of critical incidents in a hospital")

primary = pd.read_csv("data/primary_cohort.csv")
study = pd.read_csv("data/study_cohort.csv")
validation = pd.read_csv("data/validation_cohort.csv")

cohorts = {
    "primary": primary,
    "study": study,
    "validation": validation
}


## Verify data consistency
### Age

In [56]:
primary["age_years"].describe()

count    110204.000000
mean         62.735255
std          24.126806
min           0.000000
25%          51.000000
50%          68.000000
75%          81.000000
max         100.000000
Name: age_years, dtype: float64

In [57]:
(primary["age_years"] < 0).sum()
(primary["age_years"] > 120).sum()

np.int64(0)

### Gender

In [58]:
primary["sex_0male_1female"].value_counts()


sex_0male_1female
0    57973
1    52231
Name: count, dtype: int64

### Hopital Outcome

In [59]:
primary["hospital_outcome_1alive_0dead"].value_counts()

hospital_outcome_1alive_0dead
1    102099
0      8105
Name: count, dtype: int64

## Verify the structure

In [60]:
primary.columns.equals(study.columns)
primary.columns.equals(validation.columns)


True

## Intra-cohort cleaning
### Minimal diagnosis

In [61]:
for name, df in cohorts.items():
    print(f"\n--- {name.upper()} COHORT ---")
    print(df.shape)
    print(df.isna().sum())
    print("Duplicated rows:", df.duplicated().sum())


--- PRIMARY COHORT ---
(110204, 4)
age_years                        0
sex_0male_1female                0
episode_number                   0
hospital_outcome_1alive_0dead    0
dtype: int64
Duplicated rows: 108693

--- STUDY COHORT ---
(19051, 4)
age_years                        0
sex_0male_1female                0
episode_number                   0
hospital_outcome_1alive_0dead    0
dtype: int64
Duplicated rows: 17861

--- VALIDATION COHORT ---
(137, 4)
age_years                        0
sex_0male_1female                0
episode_number                   0
hospital_outcome_1alive_0dead    0
dtype: int64
Duplicated rows: 33


### Use an intra-cohort cleaning function

In [72]:
def clean_cohort(df):
    df = df.copy()
    
    df = df.drop_duplicates()

    df = df[(df["age_years"] >= 0) & (df["age_years"] <= 120)]

    df = df[df["sex_0male_1female"].isin([0, 1])]
    df = df[df["hospital_outcome_1alive_0dead"].isin([0, 1])]

    return df

for name in cohorts:
    cohorts[name] = clean_cohort(cohorts[name])


### post-cleaning verification

In [73]:
for name, df in cohorts.items():
    print(f"\n--- {name.upper()} AFTER CLEANING ---")
    print(df.shape)
    print("Duplicated rows:", df.duplicated().sum())



--- PRIMARY AFTER CLEANING ---
(1511, 4)
Duplicated rows: 0

--- STUDY AFTER CLEANING ---
(1190, 4)
Duplicated rows: 0

--- VALIDATION AFTER CLEANING ---
(104, 4)
Duplicated rows: 0


## Create a clean, final CSV file

In [74]:
cohorts["primary"].to_csv("data/primary_cohort_clean.csv", index=False)
cohorts["study"].to_csv("data/study_cohort_clean.csv", index=False)
cohorts["validation"].to_csv("data/validation_cohort_clean.csv", index=False)