# Diagnosis Data Processing Pipeline

In [None]:
import sys
from pathlib import Path
from config import Config as paths
import pandas as pd
import sys
import os

# Resolve the parent of the current working directory and put it first on sys.path
# (presumably so the `data_cleaning` package imported below is found — confirm
# that the notebook is always started from its own folder).
project_root = Path("..").resolve()
sys.path.insert(0, str(project_root))

from data_cleaning.renaming import (
    generate_and_save_rename_columns_json,
    rename_columns,
    generate_and_save_rename_values_json,
    rename_values,
)
from data_cleaning.utils import save_json, load_json
from data_cleaning.cleaners.episode.episodeCleaner import EpisodeCleaner

## Read Data

In [None]:
# Load the four raw parquet extracts: RSVD diagnosis and Melior epikris, OVA and SVA variants.
RSVD_OVA, RSVD_SVA, MELIOR_OVA, MELIOR_SVA = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        paths.RSVD_OVA_DIAGNOSIS_PATH,
        paths.RSVD_SVA_DIAGNOSIS_PATH,
        paths.MELIOR_OVA_EPIKRIS_PATH,
        paths.MELIOR_SVA_EPIKRIS_PATH,
    )
)

# Values of the 'ICD-kod ID' column of the Excel sheet, as a plain list.
INFECTION_CODES = pd.read_excel(paths.INFECTION_CODES_PATH)["ICD-kod ID"].tolist()

# Reference table with patient_id cast to int.
reference_data = pd.read_parquet(paths.REFERENCE_DATA_PATH).astype({"patient_id": int})

## Generate Rename Columns Files

In [None]:
# Generate and save one column-rename JSON per raw extract.
for raw_frame, mapping_file in (
    (RSVD_OVA, "RSVD/RSVD_OVA_rename_columns.json"),
    (RSVD_SVA, "RSVD/RSVD_SVA_rename_columns.json"),
    (MELIOR_OVA, "Melior/Melior_OVA_rename_columns.json"),
    (MELIOR_SVA, "Melior/Melior_SVA_rename_columns.json"),
):
    generate_and_save_rename_columns_json(
        raw_frame, f"{paths.RENAME_FILES_PATH_DIAGNOSIS}/{mapping_file}"
    )

## Rename Columns

In [None]:
# Apply each extract's column-rename JSON; the four results are unpacked in the
# same order as the (frame, mapping file) pairs below.
RSVD_OVA_renamed, RSVD_SVA_renamed, MELIOR_OVA_renamed, MELIOR_SVA_renamed = (
    rename_columns(raw_frame, f"{paths.RENAME_FILES_PATH_DIAGNOSIS}/{mapping_file}")
    for raw_frame, mapping_file in (
        (RSVD_OVA, "RSVD/RSVD_OVA_rename_columns.json"),
        (RSVD_SVA, "RSVD/RSVD_SVA_rename_columns.json"),
        (MELIOR_OVA, "Melior/Melior_OVA_rename_columns.json"),
        (MELIOR_SVA, "Melior/Melior_SVA_rename_columns.json"),
    )
)

## Add Origin

In [None]:
# Tag every row with the extract it came from.
for renamed_frame, origin_label in zip(
    (RSVD_OVA_renamed, RSVD_SVA_renamed, MELIOR_OVA_renamed, MELIOR_SVA_renamed),
    ("RSVD OVA", "RSVD SVA", "Melior OVA", "Melior SVA"),
):
    renamed_frame["origin"] = origin_label

## Concatenate Data

In [None]:
# The same cleaner instance is reused by the later cells of the pipeline.
diagnosis_cleaner = EpisodeCleaner()

# Combine the OVA and SVA frames per source system.
RSVD_combined, MELIOR_combined = (
    diagnosis_cleaner.concat_data(df1=ova_frame, df2=sva_frame)
    for ova_frame, sva_frame in (
        (RSVD_OVA_renamed, RSVD_SVA_renamed),
        (MELIOR_OVA_renamed, MELIOR_SVA_renamed),
    )
)

## Generate Rename Values Files

In [None]:
# Generate and save one value-rename JSON per combined source, both with limit=100.
for combined_frame, mapping_file in (
    (RSVD_combined, "RSVD/RSVD_combined_rename_values.json"),
    (MELIOR_combined, "Melior/MELIOR_combined_rename_values.json"),
):
    generate_and_save_rename_values_json(
        df=combined_frame,
        file_path=f"{paths.RENAME_FILES_PATH_DIAGNOSIS}/{mapping_file}",
        limit=100,
    )

## Rename Values

In [None]:
# Apply the value-rename JSON of each combined source.
RSVD_combined_renamed_values, MELIOR_combined_renamed_values = (
    rename_values(combined_frame, f"{paths.RENAME_FILES_PATH_DIAGNOSIS}/{mapping_file}")
    for combined_frame, mapping_file in (
        (RSVD_combined, "RSVD/RSVD_combined_rename_values.json"),
        (MELIOR_combined, "Melior/MELIOR_combined_rename_values.json"),
    )
)

## Clean Data

In [None]:
# Clean the combined RSVD rows, then set hosp_id to the constant -1 on every RSVD row
# (presumably a placeholder because RSVD carries no hospital-stay id — TODO confirm).
RSVD_cleaned = diagnosis_cleaner.clean_data(RSVD_combined_renamed_values)
RSVD_cleaned["hosp_id"] = -1
# Remove literal hyphens from the codes (regex=False: plain substring replacement).
RSVD_cleaned['diagnosis_code'] = RSVD_cleaned['diagnosis_code'].str.replace('-', '', regex=False)

# Same cleaning and hyphen removal for Melior; its hosp_id column is left as delivered.
MELIOR_cleaned = diagnosis_cleaner.clean_data(MELIOR_combined_renamed_values)
MELIOR_cleaned['diagnosis_code'] = MELIOR_cleaned['diagnosis_code'].str.replace('-', '', regex=False)

## Add Patient ID to MELIOR Data

In [None]:
# Reference rows that have a hosp_id.
temp_ref = reference_data.loc[reference_data["hosp_id"].notna()].copy()

# Attach the Melior rows to each distinct (patient_id, hosp_id) pair. It is a left
# join from the reference side, so pairs without any Melior row are kept.
MELIOR_cleaned_with_patient_id = (
    temp_ref[["patient_id", "hosp_id"]]
    .drop_duplicates()
    .merge(MELIOR_cleaned, on="hosp_id", how="left")
)

## Combine RSVD and MELIOR Diagnosis Data

In [None]:
# Combine the Melior rows (now carrying patient_id) with the RSVD rows into one frame.
diagnosis = diagnosis_cleaner.concat_data(MELIOR_cleaned_with_patient_id, RSVD_cleaned)

In [None]:
# Keep only rows that actually have a diagnosis code.
diagnosis = diagnosis.loc[diagnosis["diagnosis_code"].notna()]

## Pivot Data

In [None]:
# Pivot on diagnosis_type with diagnosis_code as the value, indexed by patient, origin,
# date and hospital stay. Later cells read `main_diagnosis` and `secondary_diagnosis`
# columns from the result (presumably the diagnosis_type values — confirm).
diagnosis_pivoted = diagnosis_cleaner.pivot_data(
    df=diagnosis,
    index_cols=["patient_id", "origin", "diagnosis_date", "hosp_id"],
    pivot_cols="diagnosis_type",
    value_col="diagnosis_code",
)

In [None]:
# Select hospital stays that have a sample taken during the stay, widened by a
# 3-day margin on both sides so that more ids are included.
reference_data_filtered = reference_data.loc[
    reference_data["sample_date"].between(
        reference_data["hosp_start"] - pd.Timedelta(days=3),
        reference_data["hosp_stop"] + pd.Timedelta(days=3),
    )
].copy()

## Map Data to Episodes

### Diagnoses recorded within 365 days of sampling

In [None]:


# Interval limits handed to map_data_to_interval, relative to sample_date:
# 365 days on the "before" side and minus one day on the "after" side.
DAYS_BEFORE_BASELINE = pd.Timedelta(days=365)
DAYS_AFTER_BASELINE = pd.Timedelta(days=-1)

diagnosis_mapped = diagnosis_cleaner.map_data_to_interval(
    reference_df=reference_data.loc[
        :, ["episode_id", "patient_id", "sample_date"]
    ].drop_duplicates(),
    df=diagnosis_pivoted,
    patient_id_col_name="patient_id",
    date_col_name="diagnosis_date",
    baseline_col_name="sample_date",
    time_before_baseline=DAYS_BEFORE_BASELINE,
    time_after_baseline=DAYS_AFTER_BASELINE,
)



### Diagnoses recorded during the current hospital stay

In [None]:
# Attach the pivoted diagnoses to the filtered stays via (hosp_id, patient_id),
# dropping rows that have no hosp_id.
diagnosis_infection_mapped = pd.merge(
    reference_data_filtered[["episode_id", "patient_id", "hosp_id", "sample_id", "sample_date"]],
    diagnosis_pivoted,
    how="left",
    on=["hosp_id", "patient_id"],
).dropna(subset=["hosp_id"])

# TODO: Need to look further into this regarding infections
# Re-expand to every (episode_id, patient_id) pair of the reference data, so that
# episodes without a match are still present.
diagnosis_infection_mapped = pd.merge(
    reference_data[["episode_id", "patient_id"]].drop_duplicates(),
    diagnosis_infection_mapped,
    how="left",
    on=["episode_id", "patient_id"],
)

## Add indicators

In [None]:
# Flag rows whose main or secondary diagnosis matches one of the infection codes.
diagnosis_infection_mapped["infection_diagnosis"] = diagnosis_cleaner.get_prefix_match_mask(
    diagnosis_infection_mapped,
    ["main_diagnosis", "secondary_diagnosis"],
    INFECTION_CODES,
)

# Same prefix matching on the 365-day mapping, one indicator column per code group.
for indicator_column, icd_prefixes in (
    ("cancer_diagnosis", ["C"]),
    ("diabetes_diagnosis", ["E10", "E11", "E12", "E13", "E14"]),
):
    diagnosis_mapped[indicator_column] = diagnosis_cleaner.get_prefix_match_mask(
        diagnosis_mapped,
        ["main_diagnosis", "secondary_diagnosis"],
        icd_prefixes,
    )

## Create Summary

In [None]:
def summary_diagnosis_infection(df):
    """Summarise the infection diagnoses found in the rows of one episode.

    Args:
        df: Rows of a single episode with the columns ``episode_id``,
            ``infection_diagnosis``, ``main_diagnosis`` and
            ``secondary_diagnosis``.

    Returns:
        dict with ``episode_id`` (taken from the first row),
        ``infection_during_episode`` (bool) and, when an infection is flagged,
        ``infection_codes_main`` / ``infection_codes_secondary``: the distinct
        non-null codes of the flagged rows, sorted and joined with ``' | '``.
        Both code fields are ``None`` when no row is flagged.
    """
    episode_id = df["episode_id"].iloc[0]

    # Compare against True instead of using the column directly: a missing flag
    # then counts as "no infection" rather than being truthy or breaking the mask.
    infection_mask = df["infection_diagnosis"].eq(True)
    infection = bool(infection_mask.any())

    if not infection:
        return {
            "episode_id": episode_id,
            "infection_during_episode": False,
            'infection_codes_main': None,
            'infection_codes_secondary': None,
        }

    infection_rows = df[infection_mask]
    # sorted() makes the joined string reproducible; plain set order depends on
    # string hashing and can differ from run to run.
    infection_codes_main = ' | '.join(sorted(set(infection_rows['main_diagnosis'].dropna())))
    infection_codes_secondary = ' | '.join(
        sorted(set(infection_rows['secondary_diagnosis'].dropna()))
    )

    return {
        "episode_id": episode_id,
        "infection_during_episode": infection,
        'infection_codes_main': infection_codes_main,
        'infection_codes_secondary': infection_codes_secondary,
    }


# Summarise the infection mapping per episode_id using summary_diagnosis_infection
# (presumably called once per episode group — confirm in EpisodeCleaner).
diagnosis_infection_during_hosp_summary = diagnosis_cleaner.summarize_data_by_episode(
    df=diagnosis_infection_mapped, episode_id_col="episode_id", summary_function=summary_diagnosis_infection
)

In [None]:
def summary_diagnosis(df):
    """Summarise the diagnoses mapped to one episode's 365-day window.

    Args:
        df: Rows of a single episode with the columns ``episode_id``,
            ``cancer_diagnosis``, ``diabetes_diagnosis``, ``main_diagnosis``
            and ``secondary_diagnosis``.

    Returns:
        dict with ``episode_id`` (taken from the first row), the column-wise
        maximum of the cancer and diabetes indicators, and the distinct
        non-null main / secondary codes, sorted and joined with ``' | '``.
    """
    episode_id = df["episode_id"].iloc[0]

    # The maximum of an indicator column is True as soon as any row is flagged.
    cancer = df["cancer_diagnosis"].max()
    diabetes = df["diabetes_diagnosis"].max()

    # sorted() makes the joined strings reproducible; plain set order depends on
    # string hashing and can differ from run to run.
    diagnosis_codes_main = ' | '.join(sorted(set(df['main_diagnosis'].dropna())))
    diagnosis_codes_secondary = ' | '.join(sorted(set(df['secondary_diagnosis'].dropna())))

    return {
        "episode_id": episode_id,
        "cancer_within_365_days": cancer,
        "diabetes_within_365_days": diabetes,
        "diagnosis_codes_main_365_days_prior": diagnosis_codes_main,
        "diagnosis_codes_secondary_365_days_prior": diagnosis_codes_secondary,
    }


# Summarise the 365-day mapping per episode_id using summary_diagnosis
# (presumably called once per episode group — confirm in EpisodeCleaner).
diagnosis_365_prior_summary = diagnosis_cleaner.summarize_data_by_episode(
    df=diagnosis_mapped, episode_id_col="episode_id", summary_function=summary_diagnosis
)

## Combine Summaries

In [None]:
# Left-join the infection summary onto the 365-day summary, so every episode of
# the 365-day summary is kept.
diagnosis_combined = diagnosis_365_prior_summary.copy().merge(
    diagnosis_infection_during_hosp_summary, how="left", on=["episode_id"]
)

## Save Data

In [None]:
# Make sure the base directory and its `mapped/` subdirectory both exist.
# exist_ok=True keeps this safe to re-run. Checking only whether the base directory
# exists would skip creating `mapped/` when the base is already there, and the
# writes below would then fail.
os.makedirs(os.path.join(paths.STORE_DIAGNOSIS_DATA_PATH, "mapped"), exist_ok=True)

# Episode-level mapping for the 365-day window.
diagnosis_mapped.to_parquet(f"{paths.STORE_DIAGNOSIS_DATA_PATH}/mapped/diagnosis_365_days_episode_mapped.parquet")
# NOTE(review): the file name says "30_days", but this frame holds the diagnoses
# matched to the hospital stay. The name is left unchanged in case other code
# reads this path; confirm and rename if appropriate.
diagnosis_infection_mapped.to_parquet(f"{paths.STORE_DIAGNOSIS_DATA_PATH}/mapped/diagnosis_30_days_episode_mapped.parquet")

# Combined summary: 365-day summary merged with the infection summary.
diagnosis_combined.to_parquet(f"{paths.STORE_DIAGNOSIS_DATA_PATH}/diagnosis_combined_summary.parquet")
