# Outcome Data Processing Pipeline

In [None]:
import pandas as pd
import os
import sys
from config import Config as paths

# Put the parent directory on the import path (once), presumably so that the
# project-local package imported right after this resolves.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from data_cleaning.cleaners.episode.clean_data_outcomes import OutcomesCleaner

## Read and Preprocess Data

In [None]:
# Load the microbiology table from the parquet file configured as MICROBIOLOGY_DEDUB_PATH
# (presumably the de-duplicated microbiology data - confirm against the config module).
microbiology_data = pd.read_parquet(paths.MICROBIOLOGY_DEDUB_PATH)

In [None]:
# The cleaner instance created here is reused by the later cells of the notebook.
cleaner = OutcomesCleaner()

# Source column name -> name used in the rest of the pipeline.
mortality_column_names = {
    'RS_PAT_Alias': 'patient_id',
    'Avliden': 'deceased',
    'AvlidenDatum': 'deceased_date',
}

# Load the mortality table, drop the two columns that are not used, rename,
# order by patient and finally run the shared cleaning step.
deceased = pd.read_parquet(paths.MORTALITY_PATH)
deceased = deceased.drop(columns=['AvlidenDatumSomStrang', 'Kon'])
deceased = deceased.rename(columns=mortality_column_names)
deceased = deceased.sort_values('patient_id').reset_index(drop=True)
deceased = cleaner.clean_data(deceased)

In [None]:
# Load the reference table and cast the patient identifier column to int.
reference = pd.read_parquet(paths.REFERENCE_DATA_PATH).astype({'patient_id': int})

In [None]:
# Source column name -> name used in the rest of the pipeline.
hospitalisation_column_names = {
    'Mikrobiologi_Prov_Alias': 'sample_id',
    'RS_PAT_Alias': 'patient_id',
    'Indatum': 'in_date',
    'Utdatum': 'out_date',
}
rsvd_sva = pd.read_parquet(paths.RSVD_SVA_HOSPITALISATION_PATH).rename(columns=hospitalisation_column_names)

# After cleaning, keep one row per distinct (patient, admission, discharge) combination,
# ordered by those same three columns.
stay_columns = ['patient_id', 'in_date', 'out_date']
rsvd_sva_cleaned = cleaner.clean_data(rsvd_sva)
rsvd_sva_cleaned = rsvd_sva_cleaned.sort_values(stay_columns)
rsvd_sva_cleaned = rsvd_sva_cleaned[stay_columns].drop_duplicates(ignore_index=True)

## Mortality

In [None]:

# One add_mortality call per follow-up window, in the order 30 days, 90 days,
# 365 days and 3 years (expressed as 3 * 365 days). Each call receives the
# window as a Timedelta and the name of the mortality column for that window.
mortality_30_day, mortality_90_day, mortality_365_day, mortality_3_year = [
    cleaner.add_mortality(reference, microbiology_data, deceased, pd.Timedelta(days=window_days), column_name)
    for window_days, column_name in (
        (30, '30_day_mortality'),
        (90, '90_day_mortality'),
        (365, '365_day_mortality'),
        (365 * 3, '3_year_mortality'),
    )
]

In [None]:
# Left-join the longer follow-up windows onto the 30-day table one after the other.
# No join keys are given, so pandas joins on every column the two frames share.
mortality = mortality_30_day
for longer_window in (mortality_90_day, mortality_365_day, mortality_3_year):
    mortality = mortality.merge(longer_window, how='left')

# Rows without an episode identifier are discarded.
mortality = mortality.dropna(subset='episode_id')

## Readmitted

In [None]:
# Extend the mortality table with readmission information derived from the cleaned
# hospitalisation periods. 'sample_date' is passed as a column name
# (presumably the reference date for readmission - confirm in OutcomesCleaner.add_readmitted).
mortality_and_readmitted = cleaner.add_readmitted(mortality, rsvd_sva_cleaned, 'sample_date')

## Days of Care a Year After First Hospitalisation

In [None]:
# Days of care in the 365 days after baseline, computed from the cleaned hospitalisation
# periods and the microbiology data, then attached to each episode.
days_of_care = cleaner.get_days_of_care_after_baseline(rsvd_sva_cleaned, microbiology_data, 365)
mortality_readmitted_days_of_care = mortality_and_readmitted.merge(days_of_care, how='left', on='episode_id')
# Episodes without a match in the left join get 0 days of care instead of a missing value.
# (Plain string literal: the previous f-string had no placeholders.)
mortality_readmitted_days_of_care['days_of_care_365_days_after_baseline'] = mortality_readmitted_days_of_care['days_of_care_365_days_after_baseline'].fillna(0)

## Days of Care a Year Before Sample Date

In [None]:
# Days of care in the 365 days before baseline, attached to each episode.
before_baseline_column = 'days_of_care_365_days_before_baseline'
days_of_care_before_baseline = cleaner.get_days_of_care_before_baseline(rsvd_sva_cleaned, microbiology_data, 365)
mortality_readmitted_days_of_care = mortality_readmitted_days_of_care.merge(
    days_of_care_before_baseline,
    on='episode_id',
    how='left',
)
# Episodes without a match in the left join get 0 instead of a missing value.
filled_days_before = mortality_readmitted_days_of_care[before_baseline_column].fillna(0)
mortality_readmitted_days_of_care[before_baseline_column] = filled_days_before

## Deceased in Hospital

In [None]:
# Filter for inpatient care ("Slutenvård") and rename the hospitalisation bounds so they
# line up with the in_date/out_date columns of rsvd_sva_cleaned.
reference_sv = reference.query('hosp_type == "Slutenvård"')[
    ['patient_id', 'sample_date', 'hosp_start', 'hosp_stop']
].rename(columns={'hosp_start': 'in_date', 'hosp_stop': 'out_date'})

# Reduce the hospitalisation bounds to calendar dates. Series.map is applied per column
# because DataFrame.applymap is deprecated since pandas 2.1.
for date_column in ('in_date', 'out_date'):
    reference_sv[date_column] = reference_sv[date_column].map(lambda x: x.date())

# Stack both hospitalisation sources and discard periods with a missing bound.
reference_and_rsvd_sva = pd.concat([reference_sv, rsvd_sva_cleaned]).dropna(subset=['in_date', 'out_date'])

# Keep only the hospitalisations that contain the sample date (both bounds inclusive).
# NOTE(review): rsvd_sva_cleaned is built in this notebook with only patient_id, in_date and
# out_date, so its rows have a missing sample_date after the concat and never pass this
# filter - confirm whether they were meant to be matched to sample dates first.
reference_and_rsvd_sva = reference_and_rsvd_sva[
    reference_and_rsvd_sva['sample_date'].between(
        reference_and_rsvd_sva['in_date'], reference_and_rsvd_sva['out_date']
    )
]

In [None]:
# Pair every retained hospitalisation with the patient's mortality record.
hosps_deceased = reference_and_rsvd_sva.merge(deceased, on='patient_id', how='left')

# A hospitalisation is flagged when the date of death lies within it (bounds inclusive).
died_during_stay = hosps_deceased['deceased_date'].between(
    hosps_deceased['in_date'],
    hosps_deceased['out_date'],
)
hosps_deceased['deceased_in_hospital'] = died_during_stay

# A patient is flagged when at least one of their hospitalisations is flagged.
flag_per_patient = hosps_deceased.groupby('patient_id')['deceased_in_hospital'].any()
deceased_in_hospital = flag_per_patient.reset_index()

In [None]:
# Attach the per-patient in-hospital death flag to every episode of that patient,
# then run the shared cleaning step on the combined table.
outcomes_with_hospital_flag = mortality_readmitted_days_of_care.merge(
    deceased_in_hospital,
    on='patient_id',
    how='left',
)
outcome_data = cleaner.clean_data(outcomes_with_hospital_flag)

In [None]:
# We know that death in hospital can only happen once for each patient, and it is going to be in the last episode.
# The "last" episode is taken to be the row with the highest episode_id per patient
# (assumes episode_id increases over time - TODO confirm).
# NOTE(review): idxmax returns index labels, so the .loc lookups below rely on outcome_data
# having a unique index - confirm that clean_data leaves it that way.
last_episode_idx = outcome_data.groupby('patient_id')['episode_id'].idxmax()

# Start from False on every row; only the last episode of a patient may carry the flag.
outcome_data['deceased_in_hospital_temp'] = False  # temp value
# Update the last episode for each patient
# (.values strips the index, so the flags are written positionally in last_episode_idx order)
outcome_data.loc[last_episode_idx, 'deceased_in_hospital_temp'] = outcome_data.loc[last_episode_idx, 'deceased_in_hospital'].values
# Replace the per-patient flag with the per-episode one and remove the helper column.
outcome_data['deceased_in_hospital'] = outcome_data['deceased_in_hospital_temp']
outcome_data = outcome_data.drop(columns=['deceased_in_hospital_temp'])


In [None]:
# Handles special cases
# The comparisons with True/False are element-wise on purpose: a missing value
# matches neither of them.
deceased_flag = outcome_data['deceased']
recorded_not_deceased = deceased_flag == False  # noqa: E712
status_missing = deceased_flag.isna()
deceased_without_date = (deceased_flag == True) & outcome_data['deceased_date'].isna()  # noqa: E712

# Rows recorded as not deceased are never flagged as deceased in hospital.
outcome_data.loc[recorded_not_deceased, 'deceased_in_hospital'] = False
# Missing status, or deceased without a date: the flag is set to missing.
outcome_data.loc[status_missing | deceased_without_date, 'deceased_in_hospital'] = None

# Remove patient_id and sample_date, then drop rows that are exact duplicates.
outcome_data = outcome_data.drop(columns=['patient_id', 'sample_date'])
outcome_data = outcome_data.drop_duplicates()

## Save Processed Data

In [None]:
# Create the output directory if it is missing. exist_ok=True replaces the separate
# os.path.exists check, which could race with another process creating the directory.
os.makedirs(paths.STORE_OUTCOME_DATA_PATH, exist_ok=True)

# Write the final outcome table; os.path.join builds the file path without relying on
# a hard-coded separator.
outcome_data.to_parquet(os.path.join(paths.STORE_OUTCOME_DATA_PATH, 'outcome_data.parquet'))