# Comorbidities Data Processing Pipeline

In [None]:
import sys
from pathlib import Path
from config import Config as paths
import pandas as pd
import sys
import os

# Put the parent of the current working directory first on the import path.
# Presumably this is the project root holding the data_cleaning package that
# is imported right below — confirm if the notebook is ever moved.
project_root = Path("..").resolve()
sys.path.insert(0, str(project_root))

from data_cleaning.renaming import (
    generate_and_save_rename_columns_json,
    rename_columns,
    generate_and_save_rename_values_json,
    rename_values,
)
from data_cleaning.utils import save_json, load_json
from data_cleaning.cleaners.episode.episodeCleaner import EpisodeCleaner

## Read Data

In [None]:
# Load the three RSVD diagnosis extracts from their configured parquet files.
RSVD_OVA, RSVD_SVA, RSVD_CANCER = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        paths.RSVD_OVA_DIAGNOSIS_PATH,
        paths.RSVD_SVA_DIAGNOSIS_PATH,
        paths.RSVD_CANCER_DIAGNOSIS_PATH,
    )
)

# Values of the 'ICD-kod ID' column of the infection-codes workbook, as a list.
INFECTION_CODES = (
    pd.read_excel(paths.INFECTION_CODES_PATH)['ICD-kod ID']
    .tolist()
)

# Reference data with patient_id cast to int.
reference_data = pd.read_parquet(paths.REFERENCE_DATA_PATH).astype({'patient_id': int})

## Generate Rename Columns Files

In [None]:
# Produce one column-rename JSON file per RSVD source table.
for _source_name, _source_frame in (
    ("RSVD_OVA", RSVD_OVA),
    ("RSVD_SVA", RSVD_SVA),
    ("RSVD_CANCER", RSVD_CANCER),
):
    generate_and_save_rename_columns_json(
        _source_frame,
        f"{paths.RENAME_FILES_PATH_DIAGNOSIS}/RSVD/{_source_name}_rename_columns.json",
    )

## Rename Columns

In [None]:
# All RSVD column-rename files live in the same folder.
_rsvd_rename_dir = f"{paths.RENAME_FILES_PATH_DIAGNOSIS}/RSVD"

RSVD_OVA_renamed = rename_columns(
    RSVD_OVA, f"{_rsvd_rename_dir}/RSVD_OVA_rename_columns.json"
)
RSVD_SVA_renamed = rename_columns(
    RSVD_SVA, f"{_rsvd_rename_dir}/RSVD_SVA_rename_columns.json"
)
RSVD_CANCER_renamed = rename_columns(
    RSVD_CANCER, f"{_rsvd_rename_dir}/RSVD_CANCER_rename_columns.json"
)

## Add Origin

In [None]:
# Tag every row with the name of the extract it came from.
for _frame, _origin_label in (
    (RSVD_OVA_renamed, "RSVD OVA"),
    (RSVD_SVA_renamed, "RSVD SVA"),
    (RSVD_CANCER_renamed, "RSVD Cancer"),
):
    _frame["origin"] = _origin_label

## Concatenate Data

In [None]:
# The same cleaner instance is reused below for cleaning and interval mapping.
diagnosis_cleaner = EpisodeCleaner()
# Only the OVA and SVA extracts are combined here; the cancer extract is
# carried separately through the rename/clean steps and joined in later.
RSVD_combined = diagnosis_cleaner.concat_data(
    df1=RSVD_OVA_renamed, df2=RSVD_SVA_renamed
)

## Generate Rename Values Files

In [None]:
# Produce one value-rename JSON file for the combined OVA/SVA table and one
# for the cancer table, both with limit=100.
for _table_name, _table in (
    ("RSVD_combined", RSVD_combined),
    ("RSVD_CANCER", RSVD_CANCER_renamed),
):
    generate_and_save_rename_values_json(
        df=_table,
        file_path=f"{paths.RENAME_FILES_PATH_DIAGNOSIS}/RSVD/{_table_name}_rename_values.json",
        limit=100,
    )

## Rename Values

In [None]:
# Locations of the value-rename mappings for the two tables.
_combined_values_json = (
    f"{paths.RENAME_FILES_PATH_DIAGNOSIS}/RSVD/RSVD_combined_rename_values.json"
)
_cancer_values_json = (
    f"{paths.RENAME_FILES_PATH_DIAGNOSIS}/RSVD/RSVD_CANCER_rename_values.json"
)

RSVD_combined_renamed_values = rename_values(RSVD_combined, _combined_values_json)
RSVD_CANCER_renamed_values = rename_values(RSVD_CANCER_renamed, _cancer_values_json)

## Clean Data

In [None]:
# Clean the combined OVA/SVA table, then remove every literal '-' from the
# diagnosis codes (regex=False: plain substring replacement).
# NOTE(review): presumably this normalises codes so they line up with the
# hyphen-free prefixes used for grouping further down — confirm.
RSVD_cleaned = diagnosis_cleaner.clean_data(RSVD_combined_renamed_values)
RSVD_cleaned['diagnosis_code'] = RSVD_cleaned['diagnosis_code'].str.replace('-', '', regex=False)


# Same two steps for the cancer table.
RSVD_CANCER_cleaned = diagnosis_cleaner.clean_data(RSVD_CANCER_renamed_values)
RSVD_CANCER_cleaned['diagnosis_code'] = RSVD_CANCER_cleaned['diagnosis_code'].str.replace('-', '', regex=False)

## Combine RSVD Cancer and RSVD OVA/SVA Diagnosis Data

In [None]:
# Stack the cancer rows on top of the OVA/SVA rows; original row labels are
# kept as they are (ignore_index is not set).
diagnosis = pd.concat(
    objs=[RSVD_CANCER_cleaned, RSVD_cleaned],
    axis=0,
)

In [None]:
# Keep only rows that actually carry a diagnosis code.
_has_diagnosis_code = diagnosis["diagnosis_code"].notna()
diagnosis = diagnosis.loc[_has_diagnosis_code]

## Map Data to Episodes

### Diagnoses made within 365 days of sampling

In [None]:


# Window bounds handed to map_data_to_interval, relative to sample_date:
# 365 days before, and minus one day after.
# NOTE(review): presumably this keeps diagnoses dated from 365 days before up
# to the day before sampling — confirm against EpisodeCleaner.map_data_to_interval.
DAYS_BEFORE_BASELINE = pd.Timedelta(days=365)
DAYS_AFTER_BASELINE = pd.Timedelta(days=-1)

# One row per (episode, patient, sample date) combination.
_episode_baselines = reference_data[
    ["episode_id", "patient_id", "sample_date"]
].drop_duplicates()

# Map the diagnoses onto the episodes and discard rows left without a code.
diagnosis_mapped_365_days = diagnosis_cleaner.map_data_to_interval(
    reference_df=_episode_baselines,
    df=diagnosis,
    patient_id_col_name="patient_id",
    date_col_name="diagnosis_date",
    baseline_col_name="sample_date",
    time_before_baseline=DAYS_BEFORE_BASELINE,
    time_after_baseline=DAYS_AFTER_BASELINE,
).dropna(subset=['diagnosis_code'])


In [None]:
def _icd_range(letter, first, last):
    """Return consecutive codes ``letter`` + two-digit number, ``first``..``last`` inclusive.

    Example: ``_icd_range("I", 5, 7)`` -> ``["I05", "I06", "I07"]``.
    """
    return [f"{letter}{number:02d}" for number in range(first, last + 1)]


# Comorbidity groups: category -> (include prefixes, exclude prefixes).
# assign_group_icd flags a diagnosis for a category when its code starts with
# any include prefix and with none of the exclude prefixes.
icd_codes = {
    "Malignancy": (["C", "Z511"], []),
    "Breast_cancer": (["C50"], []),
    "Skin_cancer": (["C43", "C44"], []),
    "Prostate_cancer": (["C61"], []),
    "Colorectal_cancer": (_icd_range("C", 18, 21), []),
    "Lung_cancer": (["C34"], []),
    "Urinary_tract_cancer": (_icd_range("C", 64, 68), []),
    "Blood_cancer": (_icd_range("C", 81, 96), []),
    # Every "C" code except those listed for the site-specific groups above.
    "Other_cancer": (
        ["C"],
        ["C50", "C43", "C44", "C61"]
        + _icd_range("C", 18, 21)
        + ["C34"]
        + _icd_range("C", 64, 68)
        + _icd_range("C", 81, 96),
    ),
    "Anemia": (["D5"] + _icd_range("D", 60, 64), []),
    "Immunosuppression": (
        _icd_range("B", 20, 24) + ["D70"] + _icd_range("D", 80, 84) + ["Z94"],
        [],
    ),
    "Diabetes": (_icd_range("E", 10, 14), []),
    "Psychiatric_disorder": (["F"], []),
    "Neurologic_disease": (["G", "I6"], ["G0", "G47"]),
    "Hypertension": (_icd_range("I", 10, 15), []),
    "Cardiac_disease": (
        _icd_range("I", 5, 9)
        + _icd_range("I", 20, 25)
        + _icd_range("I", 34, 37)
        + _icd_range("I", 42, 45)
        + _icd_range("I", 47, 50)
        + ["Z95"],
        [],
    ),
    "Peripheral_vascular_disease": (["I7", "I8"], []),
    "Pulmonary_disease": (["I26", "I27", "I28", "J4", "J84"], []),
    "Hepatic_disease": (["B1", "K7"], []),
    "Skin_disease": (["L"], ["L0"]),
    "Musculoskeletal_disease": (
        ["M"],
        ["M01", "M02", "M03", "M549", "M545", "M791", "M796", "M81"],
    ),
    "Genitourinary_disease": (
        ["N0", "N1", "N2", "N3", "N4", "Z49"],
        ["N10", "N17", "N30", "N39", "N136"],
    ),
    "Medical": (
        _icd_range("K", 50, 54)
        + ["K58", "K59"]
        + _icd_range("K", 20, 23)
        + _icd_range("K", 29, 31)
        + _icd_range("K", 90, 95)
        + _icd_range("K", 25, 28),
        [],
    ),
    "Surgical": (
        _icd_range("K", 60, 63)
        + _icd_range("K", 55, 57)
        + ["K44"]
        + _icd_range("K", 35, 37)
        + _icd_range("K", 40, 43)
        + ["K45", "K46"]
        + _icd_range("K", 80, 87)
        + ["K222", "K316"],
        [],
    ),
    "Thrombosis": (["T828", "T858", "I829"], []),
    "Coagulation_disorders": (["D65", "D68", "D69"], []),
    "Previous_vascular_complications": (["T82", "T85"], []),
}

In [None]:
# Preview the first rows of the mapped diagnoses (notebook display only).
diagnosis_mapped_365_days.head()

In [None]:
def assign_group_icd(index_cols, icd_dict, diagnosis_df, patients_df, code_col):
    """Flag, per key in ``patients_df``, which ICD code groups have a diagnosis.

    Args:
        index_cols: Column name(s) identifying a row of ``patients_df``; the
            same column(s) must exist in ``diagnosis_df``.
        icd_dict: Mapping ``category -> (include_prefixes, exclude_prefixes)``.
            A diagnosis counts for a category when its code starts with any
            include prefix and with none of the exclude prefixes.
        diagnosis_df: One row per diagnosis, holding ``index_cols`` and
            ``code_col``. It is not modified.
        patients_df: Every key that should appear in the result.
        code_col: Name of the column in ``diagnosis_df`` holding the ICD code.
            Codes are compared upper-cased and stripped of surrounding
            whitespace.

    Returns:
        ``patients_df`` left-joined with one boolean column per category.
        Keys without any matching diagnosis get ``False``.
    """
    key_cols = [index_cols] if isinstance(index_cols, str) else list(index_cols)
    categories = list(icd_dict)

    raw_codes = diagnosis_df[code_col]
    # A missing code must never match: once stringified it reads "NAN"/"NONE"
    # and would otherwise be caught by any include prefix starting with "N".
    has_code = raw_codes.notna()
    codes = raw_codes.astype(str).str.upper().str.strip()

    # Work on the key columns only, so the caller's frame is never touched and
    # no temporary helper columns are needed.
    flags = diagnosis_df[key_cols].copy()
    for category, (include_code, exclude_code) in icd_dict.items():
        if include_code:
            hit = has_code & codes.str.startswith(tuple(include_code), na=False)
        else:
            hit = pd.Series(False, index=diagnosis_df.index)

        if exclude_code:
            # An exclude prefix overrides an include prefix.
            hit = hit & ~codes.str.startswith(tuple(exclude_code), na=False)

        flags[category] = hit

    # Collapse to one row per key: True when any diagnosis hit the category.
    diagnosis_wide = flags.groupby(key_cols).any().reset_index()
    # Emit the category columns in sorted order, which is the column layout
    # pivot_table-based aggregation yields, so saved outputs keep their shape.
    diagnosis_wide = diagnosis_wide[key_cols + sorted(categories)]

    # Merge onto the full key list; keys without diagnoses come back missing.
    out = patients_df.merge(diagnosis_wide, on=key_cols, how="left")
    if categories:
        # Fill through the nullable boolean dtype instead of relying on
        # fillna's deprecated downcasting of object columns.
        out[categories] = (
            out[categories].astype("boolean").fillna(False).astype(bool)
        )

    return out



# One row per episode, so every episode receives a row of flags even when it
# has no mapped diagnosis.
patients = reference_data[['episode_id']].drop_duplicates().reset_index(drop=True)

cci_365_days_before = assign_group_icd(
    index_cols=["episode_id"],
    icd_dict=icd_codes,
    diagnosis_df=diagnosis_mapped_365_days,
    patients_df=patients,
    code_col="diagnosis_code"
)

# Prefix every flag column with 'c_' and leave the key as is. Each name is
# mapped individually, so the relabelling does not depend on 'episode_id'
# being the first column.
cci_365_days_before.columns = [
    col if col == 'episode_id' else 'c_' + col
    for col in cci_365_days_before.columns
]

In [None]:
# Create the output directory together with its "mapped" subfolder.
# exist_ok=True makes the cell safe to re-run and also creates "mapped" when
# the parent directory already exists; guarding both calls with a single
# existence check on the parent would skip the subfolder in that case.
os.makedirs(os.path.join(paths.STORE_DIAGNOSIS_DATA_PATH, "mapped"), exist_ok=True)

cci_365_days_before.to_parquet(
    os.path.join(paths.STORE_DIAGNOSIS_DATA_PATH, 'cci.parquet')
)
