In [None]:
import sys
from pathlib import Path
import pandas as pd
import sys
import os
from config import Config as paths

# Put the parent directory (resolved against the kernel's current working
# directory) at the front of sys.path before the `data_cleaning` imports below.
# Presumably the parent directory is the repository root — verify when moving
# this notebook.
project_root = Path("..").resolve()
sys.path.insert(0, str(project_root))

from data_cleaning.cleaners.episode.clean_data_labs_vital import LabVitalsCleaner
from data_cleaning.renaming import (
    generate_and_save_rename_columns_json,
    generate_and_save_rename_values_json,
)
from data_cleaning.renaming import rename_columns, rename_values
from data_cleaning.renaming import generate_and_save_rename_values_json, rename_values


# One cleaner instance is reused for both the laboratory and the vitals data
# in every cell below.
cleaner = LabVitalsCleaner()

## Import Data


In [None]:
# Load the three parquet inputs configured on `paths`: laboratory results,
# vital signs, and the reference table that later supplies the patient and
# episode keys.
laboratory_data, vital_data, reference_data = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        paths.LABORATORY_PATH,
        paths.VITALS_PATH,
        paths.REFERENCE_DATA_PATH,
    )
)

## Generate Renaming Files for Columns


In [None]:
# Write one column-rename JSON per dataset. Both files are placed in the
# RENAME_FILES_LABORATORY directory, the vitals file included.
for frame, file_name in (
    (laboratory_data, "lab_rename_columns.json"),
    (vital_data, "vital_rename_columns.json"),
):
    generate_and_save_rename_columns_json(
        df=frame,
        file_path=f"{paths.RENAME_FILES_LABORATORY}/{file_name}",
    )

## Rename Columns


In [None]:
# Apply the column-rename JSON files to the loaded frames.
rename_dir = paths.RENAME_FILES_LABORATORY
lab_columns_json = f"{rename_dir}/lab_rename_columns.json"
vital_columns_json = f"{rename_dir}/vital_rename_columns.json"

laboratory_renamed = rename_columns(laboratory_data, lab_columns_json)
vital_renamed = rename_columns(vital_data, vital_columns_json)

## Generate Renaming Files for Values


In [None]:
# Write one value-rename JSON per column-renamed dataset.
# limit=200 is forwarded unchanged; presumably it caps how many distinct values
# per column are listed — confirm in generate_and_save_rename_values_json.
for frame, file_name in (
    (laboratory_renamed, "lab_rename_values.json"),
    (vital_renamed, "vital_rename_values.json"),
):
    generate_and_save_rename_values_json(
        df=frame,
        file_path=f"{paths.RENAME_FILES_LABORATORY}/{file_name}",
        limit=200,
    )

## Rename Values


In [None]:
# Apply the value-rename JSON files to the column-renamed frames.
rename_dir = paths.RENAME_FILES_LABORATORY
lab_values_json = f"{rename_dir}/lab_rename_values.json"
vital_values_json = f"{rename_dir}/vital_rename_values.json"

laboratory_values_renamed = rename_values(laboratory_renamed, lab_values_json)
vital_values_renamed = rename_values(vital_renamed, vital_values_json)

## Clean Data


In [None]:
# Run the cleaner's generic clean_data pass over both renamed frames
# (laboratory first, then vitals).
laboratory_cleaned, vital_cleaned = map(
    cleaner.clean_data,
    (laboratory_values_renamed, vital_values_renamed),
)

## Add patient_id


In [None]:
# Unique (patient_id, hosp_id) pairs from the reference table; the vitals cell
# below reuses this frame.
patient_id_hosp_id = reference_data[["patient_id", "hosp_id"]].drop_duplicates()

# Attach patient_id to the laboratory rows through hosp_id. The left join also
# keeps reference pairs that have no laboratory rows; the lab_result_date mask
# drops them again because their laboratory columns are empty.
lab_joined = patient_id_hosp_id.merge(laboratory_cleaned, on="hosp_id", how="left")
lab_has_hosp_id = lab_joined["hosp_id"].notna()
lab_has_result_date = lab_joined["lab_result_date"].notna()

laboratory_with_patient_id = cleaner.clean_data(
    lab_joined[lab_has_hosp_id & lab_has_result_date]
)

In [None]:
# Attach patient_id to the vitals rows through hosp_id, using the
# (patient_id, hosp_id) pairs built in the laboratory cell above.
vital_joined = patient_id_hosp_id.merge(vital_cleaned, on="hosp_id", how="left")

# NOTE(review): this keeps rows with a non-null vital_result, whereas the
# laboratory cell filters on its date column (lab_result_date). The later
# interval mapping reads vital_date — confirm the asymmetry is intended.
vital_keep = vital_joined["hosp_id"].notna() & vital_joined["vital_result"].notna()

vital_with_patient_id = cleaner.clean_data(vital_joined[vital_keep])

## Clean Measurements


In [None]:

# Clean the raw result columns, then re-run the generic clean_data pass.
# Later cells read `lab_result_cleaned` / `vital_result_cleaned`; presumably
# clean_measurement is what adds those columns — confirm in LabVitalsCleaner.
lab_measured = cleaner.clean_measurement(
    df=laboratory_with_patient_id, col_name="lab_result"
)
laboratory_with_patient_id_cleaned = cleaner.clean_data(lab_measured)

vital_measured = cleaner.clean_measurement(
    df=vital_with_patient_id, col_name="vital_result"
)
vital_with_patient_id_cleaned = cleaner.clean_data(vital_measured)

In [None]:
# Run the laboratory reasonability check. An empty mapping is passed as
# `ranges`, i.e. this notebook supplies no explicit bounds for any lab_name.
# Presumably this call adds the `reasonable` column used for filtering later —
# confirm in calculate_reasonability_lab.
laboratory_with_patient_id_cleaned = cleaner.calculate_reasonability_lab(
    df=laboratory_with_patient_id_cleaned,
    result_column="lab_result_cleaned",
    lab_name="lab_name",
    ranges={},
)


# Per-vital bounds handed to calculate_reasonability_vitals, keyed by the
# values expected in the `vital_name` column.
# NOTE(review): each tuple is presumably (lower, upper); confirm in
# calculate_reasonability_vitals whether the bounds are inclusive.
ranges = {
    "news_score": (0, 25),
    "diastolic_blood_pressure": (20, 200),
    "temperature": (30, 45),
    "systolic_blood_pressure": (30, 300),
    "pulse": (10, 300),
    "respiratory_rate": (5, 80),
    "oxygen_saturation": (40, 100),
    # NOTE(review): (0, 100) is far wider than the (0, 25) used for the other
    # score entries — confirm this is intended.
    "news_score_low_oxygen_saturation": (0, 100),
    "onews_score": (0, 25),
    # NOTE(review): lower bound is 0 here but 40 for "oxygen_saturation" —
    # confirm the two are meant to differ.
    "oxygen_saturation_percentage": (0, 100),
}


# Run the vitals reasonability check against the bounds above. Presumably this
# adds the `reasonable` column used for filtering later — confirm in
# calculate_reasonability_vitals.
vital_with_patient_id_cleaned = cleaner.calculate_reasonability_vitals(
    df=vital_with_patient_id_cleaned,
    result_column="vital_result_cleaned",
    vital_name="vital_name",
    ranges=ranges,
)

In [None]:
# Keep only the rows whose `reasonable` flag is True, for vitals and
# laboratory data alike.
vital_with_patient_id_cleaned = vital_with_patient_id_cleaned.loc[
    vital_with_patient_id_cleaned["reasonable"]
]
laboratory_with_patient_id_cleaned = laboratory_with_patient_id_cleaned.loc[
    laboratory_with_patient_id_cleaned["reasonable"]
]

## Map Data to Episodes


In [None]:

# Window around each episode's sample_date. Despite the DAYS_ prefix, both
# constants are 24-hour Timedeltas.
DAYS_BEFORE_BASELINE = pd.Timedelta(24, unit="hours")
DAYS_AFTER_BASELINE = pd.Timedelta(24, unit="hours")

# Columns of the reference table that identify an episode and its baseline.
episode_columns = ["episode_id", "patient_id", "sample_date"]

# Arguments shared by the laboratory and the vitals mapping.
window_kwargs = {
    "patient_id_col_name": "patient_id",
    "baseline_col_name": "sample_date",
    "time_before_baseline": DAYS_BEFORE_BASELINE,
    "time_after_baseline": DAYS_AFTER_BASELINE,
}

laboratory_mapped_24 = cleaner.map_data_to_interval(
    reference_df=reference_data[episode_columns].drop_duplicates(),
    df=laboratory_with_patient_id_cleaned,
    date_col_name="lab_result_date",
    **window_kwargs,
)

vital_mapped_24 = cleaner.map_data_to_interval(
    reference_df=reference_data[episode_columns].drop_duplicates(),
    df=vital_with_patient_id_cleaned,
    date_col_name="vital_date",
    **window_kwargs,
)

## Remove Missing lab and vital results

In [None]:
# Drop mapped rows that have no cleaned result value.
lab_has_value = laboratory_mapped_24["lab_result_cleaned"].notna()
laboratory_mapped_24 = laboratory_mapped_24.loc[lab_has_value]

vital_has_value = vital_mapped_24["vital_result_cleaned"].notna()
vital_mapped_24 = vital_mapped_24.loc[vital_has_value]

## Pivot data

In [None]:
# For every (episode_id, lab_name) pair keep the row with the smallest `diff`
# (ascending sort, first duplicate wins).
# NOTE(review): `diff` comes from map_data_to_interval; whether it is signed or
# absolute decides if this is the earliest or the closest-to-baseline value.
lab_first_rows = laboratory_mapped_24.sort_values("diff").drop_duplicates(
    ["episode_id", "lab_name"]
)

# One row per episode, one column per lab_name. A 'Remove' column, if present,
# is discarded (presumably a placeholder label from the value-rename JSON).
laboratory_first_measurements_24_hours = (
    lab_first_rows.pivot(
        index=["episode_id"],
        columns="lab_name",
        values="lab_result_cleaned",
    )
    .reset_index()
    .drop(columns=["Remove"], errors="ignore")
)

# For every (episode_id, vital_name) pair keep the row with the smallest `diff`
# (ascending sort, first duplicate wins).
vital_first_rows = vital_mapped_24.sort_values("diff").drop_duplicates(
    ["episode_id", "vital_name"]
)

# One row per episode, one column per vital_name. A 'Remove' column, if
# present, is discarded (presumably a placeholder label from the value-rename
# JSON).
vital_first_measurements_24_hours = (
    vital_first_rows.pivot(
        index=["episode_id"],
        columns="vital_name",
        values="vital_result_cleaned",
    )
    .reset_index()
    .drop(columns=["Remove"], errors="ignore")
)



## Combine

In [None]:
# Combine the laboratory and vitals pivots on episode_id. The outer join keeps
# episodes that appear in only one of the two tables.
laboratory_vitals_first_measurements_24_hours = (
    laboratory_first_measurements_24_hours.merge(
        vital_first_measurements_24_hours,
        how="outer",
        on="episode_id",
    )
)


## Save Processed Data


In [None]:
# Directory that receives the combined laboratory/vitals table.
output_dir = paths.STORE_PATH + "/laboratory_vitals"

# exist_ok=True creates the directory (and any missing parents) only when
# needed, without the separate check-then-create step that can race with
# another process.
os.makedirs(output_dir, exist_ok=True)

laboratory_vitals_first_measurements_24_hours.to_parquet(
    output_dir + "/laboratory_vitals_first_measurements_24_hours.parquet"
)