In [None]:
import sys
from pathlib import Path
from config import Config as paths
import pandas as pd
import sys
import os

# Prepend the parent of the current working directory to the import path,
# presumably so the `data_cleaning` package imported below can be found.
# NOTE(review): `config` is imported earlier in this cell, before this path
# change takes effect, so it must already be importable without it — confirm.
project_root = Path("..").resolve()
sys.path.insert(0, str(project_root))

from data_cleaning.cleaners.episode.episodeCleaner import EpisodeCleaner

# Single cleaner instance reused by the cleaning, mapping, indicator and
# summary cells further down.
cleaner = EpisodeCleaner()

## Import Data


In [None]:
# Load the two input tables from the parquet locations held on `Config`
# (imported as `paths`). `reference_data` later supplies the
# episode_id / patient_id / sample_date columns used for episode mapping.
medicine_data = pd.read_parquet(paths.MEDICINE_PRESCRIPTION)
reference_data = pd.read_parquet(paths.REFERENCE_DATA_PATH)

## Generate Renaming Files


In [None]:
from data_cleaning.renaming import generate_and_save_rename_columns_json


# Produce the column-rename JSON for the raw medicine frame; the rename cell
# reads the same file path.
# NOTE(review): this runs on every execution — if the JSON is meant to be
# edited by hand, confirm the helper does not overwrite those edits.
generate_and_save_rename_columns_json(
    df=medicine_data, file_path=f"{paths.RENAME_FILES_MEDICINE}/medicine_rename_columns.json"
)

## Rename Columns


In [None]:
from data_cleaning.renaming import rename_columns


# Apply the rename mapping stored in the JSON file to the raw medicine frame.
# Later cells rely on a `patient_id` column being present in the result.
# NOTE(review): whether `rename_columns` returns a copy or the same object as
# `medicine_data` is not visible here — verify before mutating the result.
medicine_renamed = rename_columns(
    medicine_data, f"{paths.RENAME_FILES_MEDICINE}/medicine_rename_columns.json"
)

In [None]:
# Cast patient_id to int in place, presumably so it matches the dtype of
# `patient_id` in the reference data used for episode mapping — TODO confirm.
# NOTE(review): this cast fails if the column contains missing values.
medicine_renamed['patient_id'] = medicine_renamed['patient_id'].astype(int)

## Clean Data


In [None]:
# Run the shared EpisodeCleaner cleaning step on the renamed frame.
# The mapping and deduplication cells read `patient_id`, `medicine_date` and
# `atc_code` from the result.
medicine_cleaned = cleaner.clean_data(medicine_renamed)

## Map Data to Episodes


In [None]:
# Offsets around each episode's `sample_date` handed to the interval mapper:
# 30 days on the "before" side and minus one second on the "after" side.
# NOTE(review): despite its name, DAYS_AFTER_BASELINE is a negative
# one-second offset — presumably to exclude prescriptions dated exactly at
# the baseline; confirm against `map_data_to_interval`.
DAYS_BEFORE_BASELINE = pd.Timedelta(days=30)
DAYS_AFTER_BASELINE = pd.Timedelta(seconds=-1)

# One row per distinct (episode, patient, sample date) combination.
episode_reference = reference_data[["episode_id", "patient_id", "sample_date"]]
episode_reference = episode_reference.drop_duplicates()

medicine_mapped = cleaner.map_data_to_interval(
    reference_df=episode_reference,
    df=medicine_cleaned,
    patient_id_col_name="patient_id",
    date_col_name="medicine_date",
    baseline_col_name="sample_date",
    time_before_baseline=DAYS_BEFORE_BASELINE,
    time_after_baseline=DAYS_AFTER_BASELINE,
)

## Deduplicate Medicine


In [None]:
# Keep a single row per (episode, ATC code): order by `diff` ascending, then
# retain the first occurrence of each pair.
# NOTE(review): `diff` is presumably the offset between prescription date and
# baseline added by the mapping step; if it is signed, "smallest" means most
# negative — confirm that is the row intended to survive.
medicine_mapped = medicine_mapped.sort_values(by="diff")
medicine_mapped = medicine_mapped.drop_duplicates(
    subset=["episode_id", "atc_code"],
    keep="first",
)

## Add Indicators


In [None]:
# Indicator column -> ATC code prefixes that switch it on (stored as 0/1).
# The column name "immunpsupp" is spelled exactly as the summary function
# reads it and as it is written to the mapped parquet file — do not "fix" the
# spelling here without updating those consumers.
# NOTE(review): in the WHO ATC classification A02BC is the proton pump
# inhibitor subgroup, whereas immunosuppressants are L04 — confirm that
# "A02BC" is the intended prefix for this flag.
indicator_prefixes = {
    "immunpsupp": ["A02BC"],
    "antibiotics": ["J01"],
}

for indicator_col, atc_prefixes in indicator_prefixes.items():
    prefix_mask = cleaner.get_prefix_match_mask(
        df=medicine_mapped,
        target_cols=["atc_code"],
        prefixes=atc_prefixes,
    )
    medicine_mapped[indicator_col] = prefix_mask.astype(int)

## Summarize Episodes


In [None]:
def summary(df):
    """Collapse the medicine rows of one episode into a single record.

    Args:
        df: Rows belonging to one episode. Must contain the columns
            ``episode_id``, ``antibiotics`` and ``immunpsupp``.

    Returns:
        dict with three keys, in this order:
        ``episode_id`` (taken from the first row),
        ``antibiotics_30_days_before`` (maximum of ``antibiotics``) and
        ``immunsupp_medicine_30_days_before`` (maximum of ``immunpsupp``).
    """
    # Output key -> source indicator column; the maximum of each column is
    # reported for the episode.
    flag_sources = (
        ("antibiotics_30_days_before", "antibiotics"),
        ("immunsupp_medicine_30_days_before", "immunpsupp"),
    )

    record = {"episode_id": df["episode_id"].iloc[0]}
    for output_key, source_col in flag_sources:
        record[output_key] = df[source_col].max()
    return record


# Build the per-episode summary by handing `summary` to the cleaner together
# with the episode id column of the mapped medicine frame.
# NOTE(review): presumably `summary` is invoked once per `episode_id` group
# and the returned dicts become rows — confirm in EpisodeCleaner.
medicine_summary = cleaner.summarize_data_by_episode(
    df=medicine_mapped, episode_id_col="episode_id", summary_function=summary
)

## Save Processed Data


In [None]:
# Create the output directory if needed. `exist_ok=True` replaces the earlier
# "check, then create" pair, which could raise if the directory appeared
# between the check and the creation.
os.makedirs(paths.STORE_MEDICINE_DATA_PATH, exist_ok=True)

# Persist both the row-level mapped prescriptions and the per-episode summary.
medicine_mapped.to_parquet(f"{paths.STORE_MEDICINE_DATA_PATH}/medicine_episode_mapped.parquet")
medicine_summary.to_parquet(f"{paths.STORE_MEDICINE_DATA_PATH}/medicine_episode_summary.parquet")