# Hospitalisation Data Processing Pipeline

In [None]:
# Standard-library / third-party imports used throughout the notebook.
import sys
from pathlib import Path
import pandas as pd
import sys  # NOTE(review): duplicate of the `import sys` above; harmless but redundant
import os
# NOTE(review): `config` is imported before the parent directory is added to
# sys.path below, so it has to be importable from the notebook's own location
# already -- confirm.
from config import Config as paths
# Put the (absolute) parent of the current working directory first on sys.path;
# presumably this is what makes the `data_cleaning` package importable -- verify.
project_root = Path("..").resolve()
sys.path.insert(0, str(project_root))

# Project-local helpers. Only `rename_columns` is called in the visible cells.
from data_cleaning.renaming import (
    generate_and_save_rename_columns_json,
    rename_columns,
    generate_and_save_rename_values_json,
    rename_values,
)
from data_cleaning.cleaners.episode.episodeCleaner import EpisodeCleaner

# NOTE(review): `cleaner` is not referenced again in the visible cells.
cleaner = EpisodeCleaner()

## Load Data

In [None]:
# Read the two input tables from the parquet locations configured in `paths`.
# `melior_sva` holds the hospitalisation records that are renamed and cleaned below.
melior_sva = pd.read_parquet(paths.MELIOR_SV_PATH)
# `reference_data` later supplies episode_id / sample_date / patient_id for the merges.
reference_data = pd.read_parquet(paths.REFERENCE_DATA_PATH)

## Rename columns

In [None]:
# Rename the raw columns via the project helper and the JSON mapping file
# (presumably a column-name mapping -- see `rename_columns`), then drop rows
# that are exact duplicates.
melior_sva_renamed = rename_columns(
    df=melior_sva,
    path='../rename_files/hospitalisation/melior_sv_rename_columns.json',
)
melior_sva_renamed = melior_sva_renamed.drop_duplicates()

# Remove the timezone from both stay timestamps so they are tz-naive.
melior_sva_renamed = melior_sva_renamed.assign(
    hosp_start=melior_sva_renamed['hosp_start'].dt.tz_localize(None),
    hosp_stop=melior_sva_renamed['hosp_stop'].dt.tz_localize(None),
)

# A stay without both endpoints cannot be placed in time, so discard it.
melior_sva_renamed = melior_sva_renamed.dropna(subset=['hosp_start', 'hosp_stop'])

# Use an integer patient key in both tables; they are merged on it further down.
melior_sva_renamed = melior_sva_renamed.astype({'patient_id': int})
reference_data['patient_id'] = reference_data['patient_id'].astype(int)

In [None]:
# Drop stays whose site name contains "Aku"/"aku" (presumably the emergency
# department sites, going by the `_without_ER` name -- confirm).
# `na=False` makes a missing site count as "no match": such rows are kept and
# the mask stays strictly boolean. Without it a missing `hosp_site` puts NaN
# into the mask, and the `~` inversion / boolean indexing fails.
melior_sva_without_ER = melior_sva_renamed[
    ~melior_sva_renamed['hosp_site'].str.contains('Aku|aku', na=False)
].copy()

## Combine hospitalisations if overlapping

In [None]:
def assign_block_id(df, patient_id, start, stop, time=0):
    """
    Assign a per-patient block id to overlapping time periods.

    Periods are compared on calendar dates (the time of day is dropped).
    Within each patient, a period opens a new block only when its start date
    is more than `time` days after the latest stop date seen so far for that
    patient; otherwise it joins the current block. This is useful for
    combining overlapping hospitalisations.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing the periods; `start` and `stop` must be
        datetime columns.
    patient_id : str
        Column name of the column containing patient ids.
    start : str
        Column name of the column containing the start dates.
    stop : str
        Column name of the column containing the stop dates.
    time : int, optional
        Largest gap, in days, that is still bridged when joining periods
        into one block, by default 0.

    Returns
    -------
    pandas.DataFrame
        A copy of `df`, sorted by patient, start date and stop date with a
        fresh 0..n-1 index, with the helper columns ``<start>_date`` and
        ``<stop>_date`` and the column ``block_id``. Block ids start at 1
        for every patient.
    """
    df = df.copy()
    start_date = start + '_date'
    stop_date = stop + '_date'

    df[start_date] = df[start].dt.normalize()
    df[stop_date] = df[stop].dt.normalize()
    df = df.sort_values([patient_id, start_date, stop_date]).reset_index(drop=True)

    # Latest stop date seen so far within each patient ...
    running_max_end = df.groupby(patient_id)[stop_date].cummax()
    # ... shifted one row *within the patient*. A plain .shift() would hand the
    # previous patient's last stop date to the next patient's first row, so that
    # row's block id would start at 0 or 1 depending on unrelated data.
    prev_max_end = running_max_end.groupby(df[patient_id]).shift()

    # A patient's first row has no previous stop date and always opens a block.
    new_block = (df[start_date] > (prev_max_end + pd.Timedelta(days=time))) | prev_max_end.isna()
    df["block_id"] = new_block.groupby(df[patient_id]).cumsum()
    return df

In [None]:
# Collapse overlapping stays: one row per (patient_id, block_id) spanning the
# earliest start to the latest stop, with the distinct non-missing site names
# joined by " | ". The group keys are dropped from the index afterwards, so
# `patient_id` is carried along as an ordinary aggregated column.
melior_sva_combined = (
    assign_block_id(melior_sva_renamed, 'patient_id', 'hosp_start', 'hosp_stop', time=0)
    .groupby(['patient_id', 'block_id'])
    .agg({
        'patient_id': 'first',
        'hosp_start': 'min',
        'hosp_stop': 'max',
        'hosp_site': lambda sites: ' | '.join(sites.dropna().unique()),
    })
    .reset_index(drop=True)
)

# Same collapse, applied to the table from which the "Aku" sites were removed.
melior_sva_without_ER_combined = (
    assign_block_id(melior_sva_without_ER, 'patient_id', 'hosp_start', 'hosp_stop', time=0)
    .groupby(['patient_id', 'block_id'])
    .agg({
        'patient_id': 'first',
        'hosp_start': 'min',
        'hosp_stop': 'max',
        'hosp_site': lambda sites: ' | '.join(sites.dropna().unique()),
    })
    .reset_index(drop=True)
)


## Find hospitalisation with culture

In [None]:
# Pair every episode (complete, de-duplicated episode_id / sample_date /
# patient_id rows) with all combined stays of the same patient. The right join
# keeps episodes whose patient has no stay at all.
hosp_with_episode = pd.merge(
    melior_sva_combined,
    reference_data[['episode_id', 'sample_date', 'patient_id']].dropna().drop_duplicates(),
    how='right',
    on='patient_id',
)

# A stay "has the culture" when the sample date lies between 3 days before its
# start and 3 days after its stop (bounds included). If several stays qualify
# for an episode, the one with the latest stop is kept.
hosp_with_episode_with_culture = (
    hosp_with_episode.loc[
        hosp_with_episode['sample_date'].between(
            hosp_with_episode['hosp_start'] - pd.Timedelta(days=3),
            hosp_with_episode['hosp_stop'] + pd.Timedelta(days=3),
        )
    ]
    .sort_values(['episode_id', 'hosp_stop'], ascending=[True, False])
    .drop_duplicates('episode_id')
    .rename(columns={
        'hosp_start': 'hosp_start_with_culture',
        'hosp_stop': 'hosp_stop_with_culture',
        'hosp_site': 'hosp_site_with_culture',
    })
    .loc[:, ['episode_id', 'hosp_start_with_culture', 'hosp_stop_with_culture', 'hosp_site_with_culture']]
    .drop_duplicates()
)


In [None]:
# Pair every episode with all combined non-"Aku" stays of the same patient.
# The right join keeps episodes whose patient has no such stay.
hosp_without_ER_with_episodes = pd.merge(
    melior_sva_without_ER_combined,
    reference_data[['episode_id', 'sample_date', 'patient_id']].dropna().drop_duplicates(),
    how='right',
    on='patient_id',
)

# Keep stays whose span, widened by 3 days on both sides (bounds included),
# contains the sample date; per episode retain the stay with the latest stop.
hosp_without_ER_with_culture = (
    hosp_without_ER_with_episodes.loc[
        hosp_without_ER_with_episodes['sample_date'].between(
            hosp_without_ER_with_episodes['hosp_start'] - pd.Timedelta(days=3),
            hosp_without_ER_with_episodes['hosp_stop'] + pd.Timedelta(days=3),
        )
    ]
    .sort_values(['episode_id', 'hosp_stop'], ascending=[True, False])
    .drop_duplicates('episode_id')
    .rename(columns={
        'hosp_start': 'hosp_start_with_culture',
        'hosp_stop': 'hosp_stop_with_culture',
        'hosp_site': 'hosp_site_with_culture',
    })
    .loc[:, ['episode_id', 'hosp_start_with_culture', 'hosp_stop_with_culture', 'hosp_site_with_culture']]
    .drop_duplicates()
)

In [None]:
# Display the episode/stay pairing (built from all combined stays) for a visual check.
hosp_with_episode

## Calculate hospitalisation times

In [None]:
def calculate_hospitalisation_times(df, unique_id, all_unique_ids, baseline, start, stop, time, window_direction="after"):
    """
    Sum, per id, the hospitalised days that fall inside a window around a baseline.

    The window is ``[baseline + 1 day, baseline + time days]`` for ``"after"``
    and ``[baseline - time days, baseline - 1 day]`` for ``"before"``. Each
    row's period is clipped to its window, rows without any overlap are
    dropped, and the length of what remains (the whole-day part of
    ``stop - start``, plus one) is summed per `unique_id`.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (id, period) pair; it is not modified.
    unique_id : str
        Column to group the result by.
    all_unique_ids : pandas.DataFrame
        Currently unused and kept for interface compatibility. Ids without
        any overlapping period are absent from the result; they are NOT
        added back with a value of 0.
    baseline : str
        Column holding the baseline timestamp.
    start : str
        Column holding the period start.
    stop : str
        Column holding the period stop.
    time : int
        Window length in days.
    window_direction : {"after", "before"}, optional
        Side of the baseline the window lies on (case-insensitive),
        by default "after".

    Returns
    -------
    pandas.DataFrame
        Columns `unique_id` and ``hosp_time_after_<time>_days`` (or
        ``hosp_time_prior_<time>_days`` for ``"before"``).

    Raises
    ------
    ValueError
        If `window_direction` is neither "after" nor "before".
    """
    direction = window_direction.lower()
    # Fail early with a clear message; otherwise the window bounds below would
    # never be assigned and the function would die with an UnboundLocalError.
    if direction not in ("after", "before"):
        raise ValueError(
            f"window_direction must be 'after' or 'before', got {window_direction!r}"
        )

    df = df.copy()
    for col in [baseline, start, stop]:
        df[col] = pd.to_datetime(df[col])

    one_day = pd.Timedelta(days=1)
    window_length = pd.Timedelta(days=time)
    if direction == "after":
        window_start = df[baseline] + one_day
        window_end = df[baseline] + window_length
    else:
        window_start = df[baseline] - window_length
        window_end = df[baseline] - one_day

    # Cut each period down to the part that lies inside its row's window.
    df["start_date_trunc"] = df[start].clip(lower=window_start)
    df["stop_date_trunc"] = df[stop].clip(upper=window_end)

    # Both endpoints count, hence the +1.
    df["diff"] = (df["stop_date_trunc"] - df["start_date_trunc"]).dt.days + 1

    # After clipping, start > stop means the period misses the window entirely.
    # Rows with a missing timestamp also fail this comparison and are dropped.
    df = df[df["start_date_trunc"] <= df["stop_date_trunc"]]

    direction_suffix = "prior" if direction == "before" else "after"
    column_name = f"hosp_time_{direction_suffix}_{time}_days"

    df_hosp_times = (
        df.groupby(unique_id)
        .agg({"diff": "sum"})
        .reset_index()
        .rename(columns={"diff": column_name})
    )
    return df_hosp_times



# Hospitalised days per episode in the 30-day and the 365-day window after the
# sample date. Both calls are identical apart from the window length.
hosp_time_30_after, hosp_time_365_after = (
    calculate_hospitalisation_times(
        df=hosp_with_episode,
        unique_id="episode_id",
        all_unique_ids=reference_data[['episode_id']].drop_duplicates(),
        baseline="sample_date",
        start="hosp_start",
        stop="hosp_stop",
        time=window_days,
        window_direction='after',
    )
    for window_days in (30, 365)
)


In [None]:
# Median hospitalised days in the 30-day window after sampling. Episodes with
# no stay overlapping the window are not in this table, so they do not enter
# the median as zeros.
hosp_time_30_after.hosp_time_after_30_days.median()

## Next hospitalisation

In [None]:
# Keep only stays that start strictly after the sample date.
next_hosp = hosp_without_ER_with_episodes[
    hosp_without_ER_with_episodes['hosp_start'] > hosp_without_ER_with_episodes['sample_date']
].copy()

# Number of days from the sample date to the start of that stay.
next_hosp['days_to_readmission'] = (next_hosp['hosp_start'] - next_hosp['sample_date']).dt.days

# Pick the earliest such stay per episode. After sorting, drop_duplicates keeps
# the complete first row of every episode. GroupBy.first() was avoided here: it
# returns the first non-null value *per column*, so a missing value in the
# earliest row would be silently filled in from a later stay.
next_hosp = next_hosp.sort_values(['episode_id', 'hosp_start'])
next_readmission = next_hosp.drop_duplicates('episode_id').reset_index(drop=True)

next_readmission = next_readmission[['episode_id', 'hosp_start', 'hosp_stop', 'hosp_site', 'days_to_readmission']].rename(columns={
    'hosp_start': 'readmission_start',
    'hosp_stop': 'readmission_stop',
    'hosp_site': 'readmission_site'
})

next_readmission

## Combine

In [None]:
# `hosp_times` was used here without being defined in any cell, so a fresh run
# stopped with a NameError. It is assembled from the 30- and 365-day window
# tables computed earlier in the notebook.
# NOTE(review): assumed to be the intended content of `hosp_times` -- confirm.
hosp_times = pd.merge(hosp_time_30_after, hosp_time_365_after, on='episode_id', how='outer')

# Gather all per-episode hospitalisation features in one table. Outer joins
# keep episodes that appear in only some of the inputs.
hosp_combined = hosp_without_ER_with_culture.copy()
hosp_combined = pd.merge(hosp_times, hosp_combined, on='episode_id', how='outer')
hosp_combined = pd.merge(hosp_combined, next_readmission, on='episode_id', how='outer')
hosp_combined

## Save

In [None]:
# Make sure the output directory exists. exist_ok=True replaces the separate
# existence check, which could race with another process creating the folder.
os.makedirs(paths.STORE_PATH + "/hospitalisation", exist_ok=True)
# NOTE(review): nothing is written to disk yet. The line below refers to
# `sva_cleaned`, which is not defined in the visible cells.
#sva_cleaned.to_parquet(f"{paths.STORE_PATH}/hospitalisation/hosp_sva_cleaned.parquet")