# Orbit Data Processing Pipeline

In [None]:
import sys
from pathlib import Path
from config import Config as paths
import pandas as pd
import os

# Put the parent directory (resolved to an absolute path) at the front of the
# import search path so the project packages imported below can be found.
project_root = Path(os.pardir).resolve()
sys.path.insert(0, os.fspath(project_root))

from data_cleaning.renaming import generate_and_save_rename_columns_json, rename_columns
from data_cleaning.cleaners.episode.clean_data_Orbit import OrbitCleaner

## Load Data

In [None]:
# Read the four raw ORBIT parquet extracts, in the order listed; the file
# locations are attributes of the project Config (imported as `paths`).
(
    orbit4_surgery,
    orbit5_surgery,
    orbit4_foreign_object_surgery,
    orbit5_foreign_object_surgery,
) = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        paths.ORBIT4_PATH,
        paths.ORBIT5_PATH,
        paths.ORBIT4_FOREIGN_OBJECT_PATH,
        paths.ORBIT5_FOREIGN_OBJECT_PATH,
    )
)

## Generate Renaming Files

In [None]:
# One rename-mapping JSON per raw table, stored under RENAME_FILES_ORBIT/<orbit version>/.
# NOTE(review): if generate_and_save_rename_columns_json overwrites an existing
# file, hand-edited mappings would be lost on every run - confirm.
rename_targets = [
    (orbit4_surgery, f"{paths.RENAME_FILES_ORBIT}/orbit4/orbit4_surgery_rename_columns.json"),
    (orbit5_surgery, f"{paths.RENAME_FILES_ORBIT}/orbit5/orbit5_surgery_rename_columns.json"),
    (orbit4_foreign_object_surgery, f"{paths.RENAME_FILES_ORBIT}/orbit4/orbit4_foreign_object_surgery_rename_columns.json"),
    (orbit5_foreign_object_surgery, f"{paths.RENAME_FILES_ORBIT}/orbit5/orbit5_foreign_object_surgery_rename_columns.json"),
]
for raw_frame, json_path in rename_targets:
    generate_and_save_rename_columns_json(raw_frame, json_path)


## Rename Columns

In [None]:
# Locations of the rename-mapping JSON files, one per raw table.
orbit4_surgery_json = f"{paths.RENAME_FILES_ORBIT}/orbit4/orbit4_surgery_rename_columns.json"
orbit5_surgery_json = f"{paths.RENAME_FILES_ORBIT}/orbit5/orbit5_surgery_rename_columns.json"
orbit4_foreign_object_json = f"{paths.RENAME_FILES_ORBIT}/orbit4/orbit4_foreign_object_surgery_rename_columns.json"
orbit5_foreign_object_json = f"{paths.RENAME_FILES_ORBIT}/orbit5/orbit5_foreign_object_surgery_rename_columns.json"

# Apply each table's own mapping file to it.
orbit4_surgery_renamed = rename_columns(orbit4_surgery, orbit4_surgery_json)
orbit5_surgery_renamed = rename_columns(orbit5_surgery, orbit5_surgery_json)
orbit4_foreign_object_surgery_renamed = rename_columns(
    orbit4_foreign_object_surgery, orbit4_foreign_object_json
)
orbit5_foreign_object_surgery_renamed = rename_columns(
    orbit5_foreign_object_surgery, orbit5_foreign_object_json
)

## Add Origin

In [None]:
# Tag every row with the extract it came from, so the source stays visible
# once the ORBIT4 and ORBIT5 tables are concatenated.
for renamed_frame, origin_label in (
    (orbit4_surgery_renamed, 'orbit4'),
    (orbit5_surgery_renamed, 'orbit5'),
    (orbit4_foreign_object_surgery_renamed, 'orbit4 foreign object'),
    (orbit5_foreign_object_surgery_renamed, 'orbit5 foreign object'),
):
    renamed_frame['origin'] = origin_label

## Combine Data

In [None]:
# The cleaner instance created here is reused by the later cells.
orbit_cleaner = OrbitCleaner()

# Concatenate the ORBIT4 and ORBIT5 extracts of each table (ORBIT4 first).
surgery_parts = (orbit4_surgery_renamed, orbit5_surgery_renamed)
foreign_object_parts = (
    orbit4_foreign_object_surgery_renamed,
    orbit5_foreign_object_surgery_renamed,
)
orbit_surgery_combined = orbit_cleaner.concat_data(*surgery_parts)
orbit_foreign_object_surgery_combined = orbit_cleaner.concat_data(*foreign_object_parts)

## Clean Data

In [None]:
# Run the shared OrbitCleaner.clean_data step on both combined tables,
# surgery table first, foreign-object table second.
orbit_surgery_combined_cleaned, orbit_foreign_object_surgery_combined_cleaned = map(
    orbit_cleaner.clean_data,
    (orbit_surgery_combined, orbit_foreign_object_surgery_combined),
)

## Map Data to Episodes

In [None]:
# Episode reference table; only episode_id, patient_id and sample_date are used here.
reference_data = pd.read_parquet(paths.REFERENCE_DATA_PATH)

# Window bounds handed to map_data_to_interval, relative to each episode's
# sample_date. Each window has its own constant instead of rebinding one name,
# so the values do not depend on how far the cell has executed.
# NOTE(review): the negative "after" bound presumably ends the window one day
# before the baseline - confirm against map_data_to_interval.
DAYS_AFTER_BASELINE = -pd.Timedelta(1, unit="days")
SURGERY_DAYS_BEFORE_BASELINE = pd.Timedelta(180, unit="days")
# 99999 days is roughly 273 years; presumably an "all prior history" sentinel.
FOREIGN_OBJECT_DAYS_BEFORE_BASELINE = pd.Timedelta(99999, unit="days")


def _map_to_episodes(df, days_before_baseline):
    """Map the rows of ``df`` onto episodes via ``orbit_cleaner.map_data_to_interval``.

    Args:
        df: Cleaned ORBIT table with ``patient_id`` and ``surgery_date`` columns.
        days_before_baseline: ``pd.Timedelta`` passed as ``time_before_baseline``.

    Returns:
        Whatever ``map_data_to_interval`` returns for the given window.
    """
    return orbit_cleaner.map_data_to_interval(
        # A fresh de-duplicated reference subset is built for every call,
        # exactly as the two original call sites did.
        reference_df=reference_data[
            ["episode_id", "patient_id", "sample_date"]
        ].drop_duplicates(),
        df=df,
        patient_id_col_name="patient_id",
        date_col_name="surgery_date",
        baseline_col_name="sample_date",
        time_before_baseline=days_before_baseline,
        time_after_baseline=DAYS_AFTER_BASELINE,
    )


orbit_surgery_mapped = _map_to_episodes(
    orbit_surgery_combined_cleaned, SURGERY_DAYS_BEFORE_BASELINE
)
orbit_foreign_object_surgery_mapped = _map_to_episodes(
    orbit_foreign_object_surgery_combined_cleaned, FOREIGN_OBJECT_DAYS_BEFORE_BASELINE
)

## Create Summary

In [None]:
# Add a "surgery" column to both mapped tables from the prefix-match mask on
# surgery_code.
# NOTE(review): with the empty prefix every string matches, so this presumably
# flags each row that carries any surgery code - confirm in get_prefix_match_mask.
for mapped_frame in (orbit_surgery_mapped, orbit_foreign_object_surgery_mapped):
    mapped_frame["surgery"] = orbit_cleaner.get_prefix_match_mask(
        df=mapped_frame,
        target_cols=["surgery_code"],
        prefixes=[""],
    )

In [None]:
def summary_surgery(df):
    """Collapse the rows of one episode into a single surgery summary.

    Args:
        df: Non-empty DataFrame holding the rows of one episode, with the
            columns ``episode_id``, ``surgery`` and ``surgery_code``.

    Returns:
        A dict with three keys:
            ``episode_id``: the value from the first row.
            ``surgery``: the maximum of the ``surgery`` column, i.e. truthy
                when at least one row is flagged.
            ``surgery_code``: the distinct non-null codes, sorted and joined
                with ``" | "``, or ``None`` when no row is flagged.

    Raises:
        IndexError: If ``df`` has no rows.
    """
    episode_id = df["episode_id"].iloc[0]
    had_surgery = df["surgery"].max()

    if not had_surgery:
        return {
            "episode_id": episode_id,
            "surgery": had_surgery,
            "surgery_code": None,
        }

    # Sort the distinct codes: iteration order of a set of strings changes
    # between interpreter runs (hash randomisation), which would make the
    # stored string differ from run to run. Casting to str keeps join() from
    # failing on non-string codes.
    distinct_codes = sorted(set(df["surgery_code"].dropna().astype(str)))

    return {
        "episode_id": episode_id,
        "surgery": had_surgery,
        "surgery_code": " | ".join(distinct_codes),
    }


# Summarise each mapped table per episode with summary_surgery
# (surgery table first, foreign-object table second).
orbit_surgery_summary, orbit_foreign_object_summary = (
    orbit_cleaner.summarize_data_by_episode(
        df=mapped_frame,
        episode_id_col="episode_id",
        summary_function=summary_surgery,
    )
    for mapped_frame in (orbit_surgery_mapped, orbit_foreign_object_surgery_mapped)
)

In [None]:
# Join the two per-episode summaries side by side on episode_id. Columns that
# exist in both inputs get the suffixes below.
# NOTE(review): this is an inner join (the pandas default, spelled out here),
# so an episode missing from either summary is dropped - confirm that is intended.
# NOTE: this rebinds orbit_surgery_combined, the name the "Combine Data" cell
# uses for the concatenated raw rows; re-running the "Clean Data" cell after
# this one would therefore operate on the merged summary instead.
orbit_surgery_combined = pd.merge(
    orbit_surgery_summary,
    orbit_foreign_object_summary,
    how="inner",
    on=["episode_id"],
    suffixes=("_180_days_prior", "_with_foreign_object_prior"),
)

## Save Data

In [None]:
# Create the output directory (and any missing parents). exist_ok=True replaces
# the separate exists() check, which was racy and was also skipped when the
# path existed as a regular file.
os.makedirs(paths.STORE_ORBIT_DATA_PATH, exist_ok=True)

# Only the merged table is written; the two per-table exports are left disabled.
#orbit_surgery_summary.to_parquet(f"{paths.STORE_ORBIT_DATA_PATH}/orbit_surgery_summary.parquet")
#orbit_foreign_object_summary.to_parquet(f"{paths.STORE_ORBIT_DATA_PATH}/orbit_foreign_object_summary.parquet")
orbit_surgery_combined.to_parquet(f"{paths.STORE_ORBIT_DATA_PATH}/orbit_surgery_combined.parquet")