# Hospitalisation Data Processing Pipeline

In [None]:
# Notebook setup: imports, import-path configuration and the shared cleaner.
import sys
from pathlib import Path
import pandas as pd
# NOTE(review): `sys` is already imported above; this second import is redundant.
import sys
import os
# NOTE(review): `config` is imported before the sys.path change below, so it
# must already be importable from the default search path — confirm.
from config import Config as paths
# Put the resolved parent directory first on sys.path before the
# `data_cleaning` imports below (presumably that is where the package lives).
project_root = Path("..").resolve()
sys.path.insert(0, str(project_root))

from data_cleaning.renaming import (
    generate_and_save_rename_columns_json,
    rename_columns,
    generate_and_save_rename_values_json,
    rename_values,
)
from data_cleaning.cleaners.episode.episodeCleaner import EpisodeCleaner

# Single EpisodeCleaner instance reused by the cells further down.
cleaner = EpisodeCleaner()

## Load Data

In [None]:
# Read both parquet inputs, in order, from the locations configured on
# `paths`: first the Melior SV extract, then the reference data.
melior_sva, reference_data = (
    pd.read_parquet(parquet_path)
    for parquet_path in (paths.MELIOR_SV_PATH, paths.REFERENCE_DATA_PATH)
)

## Rename columns

In [None]:
# Rename the raw columns (presumably using the mapping stored in the JSON
# file), drop rows that are exact duplicates, and make both hospitalisation
# timestamps timezone-naive. `tz_localize(None)` removes the timezone while
# keeping the local wall-clock time.
melior_sva_renamed = (
    rename_columns(
        df=melior_sva,
        path='../rename_files/hospitalisation/melior_sv_rename_columns.json',
    )
    .drop_duplicates()
    .assign(
        hosp_start=lambda frame: frame['hosp_start'].dt.tz_localize(None),
        hosp_stop=lambda frame: frame['hosp_stop'].dt.tz_localize(None),
    )
)

In [None]:
# Drop stays whose site name contains 'Aku' or 'aku' (presumably emergency
# "akut" sites, going by the `without_ER` name — confirm).
# `na=False` makes rows with a missing site count as "no match", so they are
# kept. Without it the mask contains NaN and cannot be safely negated or used
# for boolean indexing.
is_er_site = melior_sva_renamed['hosp_site'].str.contains('Aku|aku', na=False)
melior_sva_without_ER = melior_sva_renamed[~is_er_site].copy()

## Combine hospitalisations if overlapping

In [None]:
def _join_unique_sites(sites):
    """Return the distinct non-null site names joined with ' | '."""
    return ' | '.join(sites.dropna().unique())


def _combine_overlapping_stays(stays):
    """Collapse the stays in *stays* to one row per (patient_id, block_id).

    `cleaner.assign_block_id` supplies the `block_id` column (presumably a
    shared id for overlapping stays of the same patient, with `time=0` as the
    allowed gap — confirm against EpisodeCleaner). Each block keeps its first
    `hosp_id`, earliest `hosp_start`, latest `hosp_stop` and the joined site
    names. The grouping keys are dropped by `reset_index(drop=True)`, so
    `patient_id` and `block_id` are not in the result.
    """
    with_blocks = cleaner.assign_block_id(
        stays, 'patient_id', 'hosp_start', 'hosp_stop', time=0
    )
    per_block = with_blocks.groupby(['patient_id', 'block_id']).agg({
        'hosp_id': 'first',
        'hosp_start': 'min',
        'hosp_stop': 'max',
        'hosp_site': _join_unique_sites,
    })
    return per_block.reset_index(drop=True)


# Same aggregation for the full data set and for the ER-free subset.
melior_sva_combined = _combine_overlapping_stays(melior_sva_renamed)
melior_sva_without_ER_combined = _combine_overlapping_stays(melior_sva_without_ER)






## Find hospitalisation with culture

In [None]:
# Distinct (episode, sample date, hospitalisation id) rows from the reference
# data; the same frame is the right-hand side of both merges below.
episode_samples = reference_data[['episode_id', 'sample_date', 'hosp_id']].drop_duplicates()

# Right joins keep every reference row; hospitalisation columns are left
# empty where no combined stay carries that hosp_id.
hosp_with_episode = pd.merge(
    melior_sva_combined, episode_samples, on='hosp_id', how='right'
)
hosp_without_ER_with_episodes = pd.merge(
    melior_sva_without_ER_combined, episode_samples, on='hosp_id', how='right'
)


### Sample date taken within ±3 days of a hospitalisation

In [None]:
def _hospitalisation_with_culture(hosp_episodes, window_days=3):
    """Return, per episode, the hospitalisation during which the sample was taken.

    A row qualifies when `sample_date` lies between `hosp_start - window_days`
    and `hosp_stop + window_days` (both bounds inclusive). When several stays
    qualify for one episode, the one with the latest `hosp_stop` is kept, so
    the result has at most one row per `episode_id`.

    Parameters:
        hosp_episodes: frame with `episode_id`, `sample_date`, `hosp_start`,
            `hosp_stop` and `hosp_site` columns.
        window_days: tolerance in days on each side of the stay.

    Returns:
        Frame with `episode_id` and the stay's start, stop and site, renamed
        with a `_with_culture` suffix.
    """
    window = pd.Timedelta(days=window_days)
    sample_in_window = hosp_episodes['sample_date'].between(
        hosp_episodes['hosp_start'] - window,
        hosp_episodes['hosp_stop'] + window,
    )
    latest_per_episode = (
        hosp_episodes[sample_in_window]
        .sort_values(['episode_id', 'hosp_stop'], ascending=[True, False])
        .drop_duplicates('episode_id')
    )
    # Pass only `columns=`: pandas raises a TypeError when `columns` and
    # `axis` are given together.
    renamed = latest_per_episode.rename(columns={
        'hosp_start': 'hosp_start_with_culture',
        'hosp_stop': 'hosp_stop_with_culture',
        'hosp_site': 'hosp_site_with_culture',
    })
    return renamed[[
        'episode_id',
        'hosp_start_with_culture',
        'hosp_stop_with_culture',
        'hosp_site_with_culture',
    ]]


hosp_with_episode_with_culture = _hospitalisation_with_culture(hosp_with_episode)
hosp_without_ER_with_culture = _hospitalisation_with_culture(hosp_without_ER_with_episodes)

## Calculate hospitalisation times

In [None]:
# Run the cleaner's hospitalisation-time calculation on the ER-free data
# twice, once with 30 and once with 365 as the last argument (presumably the
# window length in days — confirm against EpisodeCleaner).
hosp_times_30, hosp_times_365 = (
    cleaner.calculate_hospitalisation_times(
        hosp_without_ER_with_episodes,
        ['episode_id'],
        'sample_date',
        'hosp_start',
        'hosp_stop',
        window_length,
    )
    for window_length in (30, 365)
)

In [None]:
# Start from every distinct episode in the reference data, attach the 30 and
# 365 results (outer-joined to each other on episode_id), and replace every
# missing value with 0.
hosp_times = (
    reference_data[['episode_id']]
    .drop_duplicates()
    .merge(
        pd.merge(hosp_times_30, hosp_times_365, on='episode_id', how='outer'),
        on='episode_id',
        how='left',
    )
    .fillna(0)
)

## Next hospitalisation

In [None]:
# Keep only hospitalisations that start strictly after the sample date.
starts_after_sample = (
    hosp_without_ER_with_episodes['hosp_start']
    > hosp_without_ER_with_episodes['sample_date']
)
next_hosp = hosp_without_ER_with_episodes[starts_after_sample].copy()

# Whole days from the sample date to the start of that hospitalisation.
next_hosp['days_to_readmission'] = (next_hosp['hosp_start'] - next_hosp['sample_date']).dt.days

# Pick the earliest such hospitalisation per episode.
# The first *row* of each episode is taken with drop_duplicates rather than
# groupby().first(): the latter returns the first non-null value of each
# column separately, so a missing hosp_stop or hosp_site on the earliest stay
# would be silently filled from a later one.
# Rows without an episode_id are dropped, as groupby would have excluded them.
next_hosp = next_hosp.sort_values(['episode_id', 'hosp_start'])
next_readmission = (
    next_hosp.dropna(subset=['episode_id'])
    .drop_duplicates('episode_id', keep='first')
    .reset_index(drop=True)
)

next_readmission = next_readmission[['episode_id', 'hosp_start', 'hosp_stop', 'hosp_site', 'days_to_readmission']].rename(columns={
    'hosp_start': 'readmission_start',
    'hosp_stop': 'readmission_stop',
    'hosp_site': 'readmission_site'
})

next_readmission

## Combine

In [None]:
# Build the final per-episode table with outer joins on episode_id:
# hospitalisation times, then the stay in which the culture was taken
# (ER-free variant), then the next readmission. Episodes missing from one of
# the inputs keep empty values in that input's columns.
hosp_combined = (
    hosp_times
    .merge(hosp_without_ER_with_culture, on='episode_id', how='outer')
    .merge(next_readmission, on='episode_id', how='outer')
)
hosp_combined

## Save

In [None]:
# Write the combined per-episode table to <STORE_PATH>/hospitalisation/.
# `exist_ok=True` creates the directory when missing without the separate
# exists() check, which could race with another process.
output_dir = os.path.join(paths.STORE_PATH, "hospitalisation")
os.makedirs(output_dir, exist_ok=True)
# Save `hosp_combined`, the frame built in this notebook; the `sva_cleaned`
# name used here before is never defined and raised a NameError.
hosp_combined.to_parquet(os.path.join(output_dir, "hosp_sva_cleaned.parquet"))