In [None]:
# Standard-library and third-party imports used throughout the notebook.
import sys
from pathlib import Path
import pandas as pd
import os
# NOTE(review): `config` is imported before project_root is put on sys.path
# below, so it has to be importable from the default search path — confirm.
from config import Config as paths

# Put the parent of the current working directory at the front of sys.path
# before the `data_cleaning` imports below are executed.
project_root = Path("..").resolve()
sys.path.insert(0, str(project_root))

from data_cleaning.cleaners.baseCleaner import BaseCleaner
from data_cleaning.renaming import generate_and_save_rename_columns_json, rename_columns

## Load Raw Data

In [None]:
# Read the three input tables from the parquet locations configured on `paths`.
# The order of `raw_sources` matches the order of the names being unpacked.
raw_sources = (
    paths.MELIOR_OV_PATH,
    paths.MELIOR_SV_PATH,
    paths.MICROBIOLOGY_DEDUB_PATH,
)
melior_ov_raw, melior_sv_raw, microbiology = map(pd.read_parquet, raw_sources)

## Generate Renaming files

In [None]:
# Pair each raw Melior frame with the JSON file its rename mapping is saved to,
# then generate the files in the same order as before (OV first, then SV).
rename_targets = [
    (melior_ov_raw, f"{paths.RENAME_FILES_PATH_HOSPITALISATION}/melior_ov_rename_columns.json"),
    (melior_sv_raw, f"{paths.RENAME_FILES_PATH_HOSPITALISATION}/melior_sv_rename_columns.json"),
]
for raw_frame, rename_json_path in rename_targets:
    generate_and_save_rename_columns_json(raw_frame, rename_json_path)

## Rename columns

In [None]:
# Cleaner instance reused by the later cells of the notebook.
base_cleaner = BaseCleaner()

# Locations of the rename mappings for the SV and OV extracts.
sv_rename_json = f"{paths.RENAME_FILES_PATH_HOSPITALISATION}/melior_sv_rename_columns.json"
ov_rename_json = f"{paths.RENAME_FILES_PATH_HOSPITALISATION}/melior_ov_rename_columns.json"

# Apply each mapping to its raw frame (SV first, then OV).
melior_sv_renamed = rename_columns(melior_sv_raw, sv_rename_json)
melior_ov_renamed = rename_columns(melior_ov_raw, ov_rename_json)

## Combine and clean data

In [None]:
# Concatenate the renamed SV and OV frames and pass the result straight
# through the cleaner, so only the cleaned frame is kept under this name.
combined_patient_reference = base_cleaner.clean_data(
    base_cleaner.concat_data(melior_sv_renamed, melior_ov_renamed)
)

## Map to Episode

In [None]:
# Run the microbiology table through the shared cleaner.
microbiology_cleaned = base_cleaner.clean_data(microbiology)

# Keep only the episode-level columns, then attach the combined Melior rows
# that share a sample_id. The left join keeps every microbiology row.
episode_columns = ["episode_id", "sample_id", "sample_date","year","age","sex"]
reference = microbiology_cleaned[episode_columns].merge(
    combined_patient_reference,
    how="left",
    on=["sample_id"],
)

# Strip timezone information from both hospitalisation timestamps.
for stamp_column in ("hosp_start", "hosp_stop"):
    reference[stamp_column] = reference[stamp_column].dt.tz_localize(None)


## Make sure that all episodes have a patient id

In [None]:
# Right-join onto the distinct episode ids of the cleaned microbiology table,
# so the result has at least one row for every episode id found there.
distinct_episode_ids = microbiology_cleaned[["episode_id"]].drop_duplicates()
reference = reference.merge(
    distinct_episode_ids,
    how="right",
    on=["episode_id"],
)

In [None]:
# patient_id is the part of episode_id before the first "_", cast to int.
# NOTE(review): the previous comment asked for a seven-character id, but an
# integer cast drops any leading zeros — confirm which is intended.
episode_prefix = reference["episode_id"].str.split("_").str[0]
reference["patient_id"] = episode_prefix.astype(int)

## Add Age Group

In [None]:
# Bin edges 0, 10, ..., 90 with a final edge of 140, giving ten age bands.
bins = [*range(0, 100, 10), 140]

# One label per band: "1-9", "10-19", ..., "80-89", "90+".
# NOTE(review): with right=False the first band is [0, 10), so age 0 is also
# labelled "1-9" — confirm that this is intended.
bins_labels = ["1-9", *(f"{lower}-{lower + 9}" for lower in range(10, 90, 10)), "90+"]

# Left-closed bands [lower, upper); ages outside [0, 140) become NaN.
age_bands = pd.cut(
    reference["age"],
    bins=bins,
    labels=bins_labels,
    right=False,
    include_lowest=True,
)
reference.loc[:, "age_group"] = age_bands

## Save Data


In [None]:
# Create the output directory (and any missing parents) when it is absent.
# exist_ok=True replaces the separate existence check, so there is no window
# between checking and creating in which another process could create it.
# NOTE(review): presumably STORE_REFERENCE_DATA_PATH is the directory that
# contains REFERENCE_DATA_PATH — confirm against the Config definition.
os.makedirs(paths.STORE_REFERENCE_DATA_PATH, exist_ok=True)

# Persist the reference table as parquet without writing the index.
reference.to_parquet(paths.REFERENCE_DATA_PATH, index=False)