## Preliminary processing

### Import packages and read in data

In [1]:
import os

import pandas as pd

# Destination directory / file name for the formatted table, and the source CSV path
dst_dir, dst_file = "data", "formatted_data.csv"
src_file = "data/standard_data.csv"

# Load the source table into a DataFrame
df = pd.read_csv(filepath_or_buffer=src_file)

### Format data values

In [2]:
# Recode Gender so that 1 means male and 0 means female (the value 2 becomes 0)
df["Gender"] = df["Gender"].replace(to_replace=2, value=0)

# Truncate the three timestamp columns to day precision, stored as
# "YYYY-MM-DD" strings
for time_col in ("RecordTime", "DischargeTime", "AdmissionTime"):
    df[time_col] = pd.to_datetime(df[time_col]).dt.strftime("%Y-%m-%d")

### Clean data

In [3]:
# Rows must have a patient ID, a record date and a discharge date; a row
# missing any one of them is removed.
required_cols = ["PatientID", "RecordTime", "DischargeTime"]

df = (
    df
    .dropna(subset=required_cols)
    # '2019-nCoV nucleic acid detection' is noted as being entirely NaN in the
    # source data, so the column is removed outright.
    .drop(columns="2019-nCoV nucleic acid detection")
)

### Merge data

In [4]:
# Collapse rows that share the same patient, record date, admission date and
# discharge date into a single row holding the mean of every remaining column.
# `dropna=True` discards rows with a missing value in any of the grouping keys;
# `as_index=False` keeps the keys as ordinary columns in the result.
merge_keys = ["PatientID", "RecordTime", "AdmissionTime", "DischargeTime"]
df = df.groupby(by=merge_keys, as_index=False, dropna=True).mean()

### Calculate the Length-of-Stay (LOS) label

In [5]:
# Length of stay (LOS): whole days from the record date to the discharge date,
# stored as float.
los_days = (
    pd.to_datetime(df["DischargeTime"]) - pd.to_datetime(df["RecordTime"])
).dt.days.astype(float)

# Notice: a record dated after the discharge date yields a negative LOS; those
# values are floored at 0. `clip` is vectorised and always keeps the float
# dtype, unlike an element-wise lambda that returns the Python int 0.
los_days = los_days.clip(lower=0)

# Position 5 is intended to place LOS right after the `Outcome` column.
# NOTE(review): this relies on the column order of the merged frame - confirm.
df.insert(5, "LOS", los_days)

### Export formatted table

In [6]:
# Make sure the destination directory exists, then write the formatted table
# as CSV without the DataFrame index.
os.makedirs(dst_dir, exist_ok=True)
dst_path = os.path.join(dst_dir, dst_file)
df.to_csv(dst_path, index=False)