# Microbiology Data Processing Pipeline


In [None]:
import sys
import os
from config import Config as paths
from pathlib import Path

project_root = Path("..").resolve()
sys.path.insert(0, str(project_root))

import pandas as pd
import matplotlib.pyplot as plt
from data_cleaning.cleaners.microbiology.clean_data_LIMS import LIMSCleaner
from data_cleaning.cleaners.microbiology.clean_data_wwBakt import WWBaktCleaner
from data_cleaning.cleaners.baseCleaner import BaseCleaner
from data_cleaning.renaming import (
    generate_and_save_rename_columns_json,
    rename_columns,
    generate_and_save_rename_values_json,
    rename_values,
)

# Show every column when a DataFrame is displayed in the notebook.
pd.set_option("display.max_columns", None)
# Shared cleaner instance, used further down for concat_data / clean_data on the combined frames.
base_cleaner = BaseCleaner()

## Global Variables for Paths


In [None]:
# Load the two raw microbiology extracts from the parquet paths configured in Config.
wwbakt = pd.read_parquet(paths.WWBAKT_PATH)
lims = pd.read_parquet(paths.LIMS_PATH)

In [None]:
# Recode the Avd value 'OB' to 'OB0' (presumably a ward/department code — TODO confirm why).
wwbakt.loc[wwbakt.Avd == 'OB','Avd'] = 'OB0'

## Remove rows with only negative results

In [None]:
# Remove rows with only negative results, i.e. rows where both TTD result columns
# start with "neg" (case-insensitive).
both_ttd_negative = (
    wwbakt['TTD Result 1'].str.lower().str.startswith('neg',na=False)
    & wwbakt['TTD Result'].str.lower().str.startswith('neg',na=False)
)
# Also remove rows whose BaText is a microscopy note, and rows without any BaText.
microscopy_note = (
    wwbakt.BaText.str.contains('mikroskopi:',case=False,na=False)
    | wwbakt.BaText.str.contains('mikroskopi visar',case=False,na=False)
)
ba_text_missing = wwbakt.BaText.isna()

wwbakt = wwbakt[~(both_ttd_negative | microscopy_note | ba_text_missing)]

## Add species


In [None]:
# Temporary: this will be fixed once the rename files are finished.
# Until then the species lookups are read from the temp_files directory.
lims_species = pd.read_parquet("../rename_files/temp_files/LIMS_rename_bacteria.parquet")
wwbakt_species = pd.read_parquet("../rename_files/temp_files/wwBakt_rename_bacteria.parquet")

In [None]:
# Attach the species lookup to each source with a left join so every original row
# is kept: LIMS rows are matched on "Mikroorganism", wwBakt rows on "BaText".
lims = lims.merge(lims_species, how="left", on="Mikroorganism")
wwbakt = wwbakt.merge(wwbakt_species, how="left", on="BaText")

## Separate SIR Data


In [None]:
# NOTE(review): microbiology_cleaner is not referenced again in the visible part of
# this notebook; base_cleaner (another BaseCleaner) is what the later cells use.
microbiology_cleaner = BaseCleaner()
wwbakt_cleaner = WWBaktCleaner()
lims_cleaner = LIMSCleaner()

# Each source-specific cleaner returns two frames: the casefinding data and the
# SIR (susceptibility) data in long format.
wwbakt_data_casefinding, wwbakt_sir_data_long_format = wwbakt_cleaner.clean_wwBakt_data(
    wwbakt
)
lims_data_casefinding, lims_sir_data_long_format = lims_cleaner.clean_LIMS_data(lims)

## Generate Renaming Files For Columns


In [None]:
# Write one column-rename JSON per frame (casefinding and SIR, for wwBakt and LIMS)
# under paths.RENAME_DIR. The same files are read back by rename_columns below.
generate_and_save_rename_columns_json(
    wwbakt_data_casefinding, f"{paths.RENAME_DIR}/wwBakt/rename_columns.json"
)
generate_and_save_rename_columns_json(
    lims_data_casefinding, f"{paths.RENAME_DIR}/LIMS/rename_columns.json"
)
generate_and_save_rename_columns_json(
    wwbakt_sir_data_long_format, f"{paths.RENAME_DIR}/wwBakt/sir_rename_columns.json"
)
generate_and_save_rename_columns_json(
    lims_sir_data_long_format, f"{paths.RENAME_DIR}/LIMS/sir_rename_columns.json"
)

## Rename Columns


In [None]:
# Apply the column-rename JSON files to each of the four frames. The renamed frames
# are concatenated per kind (casefinding / SIR) in the next step, so the two sources
# presumably end up with a shared column naming — TODO confirm against the JSON files.
wwbakt_data_casefinding_renamed = rename_columns(
    wwbakt_data_casefinding, f"{paths.RENAME_DIR}/wwBakt/rename_columns.json"
)
lims_data_casefinding_renamed = rename_columns(
    lims_data_casefinding, f"{paths.RENAME_DIR}/LIMS/rename_columns.json"
)
wwbakt_sir_data_long_format_renamed = rename_columns(
    wwbakt_sir_data_long_format, f"{paths.RENAME_DIR}/wwBakt/sir_rename_columns.json"
)
lims_sir_data_long_format_renamed = rename_columns(
    lims_sir_data_long_format, f"{paths.RENAME_DIR}/LIMS/sir_rename_columns.json"
)

## Combine and Clean Data


In [None]:
# Stack the wwBakt and LIMS frames, separately for casefinding and SIR data.
casefinding_combined = base_cleaner.concat_data(
    wwbakt_data_casefinding_renamed, lims_data_casefinding_renamed
)
sir_data_combined = base_cleaner.concat_data(
    wwbakt_sir_data_long_format_renamed, lims_sir_data_long_format_renamed
)

# Run the shared BaseCleaner.clean_data step on both combined frames.
casefinding_cleaned = base_cleaner.clean_data(casefinding_combined)
# there is one lab number that exists in both wwBakt and LIMS
sir_data_cleaned = base_cleaner.clean_data(sir_data_combined)

## Keep only blood cultures

In [None]:
# Restrict to blood cultures in two passes: first by the keyword "blododling" in the
# examination column, then by the keyword "blod" in sample_material. The exact matching
# rule is defined in extract_blood_samples, which is not visible here.
casefinding_only_blood = wwbakt_cleaner.extract_blood_samples(
    df=casefinding_cleaned, variable_name="examination", keyword="blododling"
)
casefinding_only_blood = wwbakt_cleaner.extract_blood_samples(
    df=casefinding_only_blood, variable_name="sample_material", keyword="blod"
)

## Add Sample times

In [None]:
# Some labnr occur on several rows in the sample-time extract. Sort by sample_datetime
# and keep the first row per labnr so the left join cannot multiply casefinding rows.
raw_times = pd.read_parquet(paths.NEW_TIMES_Path)
cleaned_times = lims_cleaner.clean_data(raw_times)
new_times = cleaned_times.sort_values("sample_datetime").drop_duplicates("labnr")

# From here on casefinding_cleaned holds the blood-culture rows plus the sample-time columns.
casefinding_cleaned = casefinding_only_blood.merge(new_times, how='left', on='labnr')

In [None]:
# Sanity check after adding sample times: distinct labnr, distinct patients, row count.
print('labnr',casefinding_cleaned.labnr.nunique())
print('patienter',casefinding_cleaned.patient_id.nunique())
print('rader',casefinding_cleaned.shape[0])

## Remove negative results

In [None]:
# Drop rows whose bottle_outcome contains "neg" (case-insensitive); rows with a missing
# bottle_outcome are kept because na=False makes the match False for them.
casefinding_cleaned = casefinding_cleaned[~casefinding_cleaned.bottle_outcome.str.contains('neg',case=False,na=False)]

In [None]:
# Sanity check after removing negative bottles: distinct labnr, distinct patients, row count.
print('labnr',casefinding_cleaned.labnr.nunique())
print('patienter',casefinding_cleaned.patient_id.nunique())
print('rader',casefinding_cleaned.shape[0])

## Add potential contaminants


In [None]:
# Load the species reference list and keep only the annotation columns that are merged
# onto the findings. copy() makes the subset an independent frame so the column
# assignments below do not write into a slice of the Excel frame.
contaminants = pd.read_excel("../rename_files/temp_files/species_list.xlsx")
contaminants = contaminants[
    ["species", "potential_contaminant", "genus", "category_1", "category_2","gram","anaerobe"]
].copy()

# Lower-case the text columns so they can be matched case-insensitively.
for text_column in ("species", "genus", "category_1", "category_2", "gram"):
    contaminants[text_column] = contaminants[text_column].str.lower()

# A blank Excel cell is read as NaN, and NaN is truthy, so a plain astype(bool) would
# mark species without an annotation as potential contaminants. Treat blanks as False,
# which is also how species missing from this list are handled after the merge.
contaminant_flag = contaminants["potential_contaminant"]
contaminants["potential_contaminant"] = contaminant_flag.notna() & contaminant_flag.astype(bool)

# De-duplicate after normalisation so rows that differed only by letter case collapse.
contaminants = contaminants.drop_duplicates()



In [None]:
# Lower-case species so it matches the lower-cased reference list. assign() builds a new
# frame instead of writing into casefinding_cleaned in place: that frame is a
# boolean-filtered slice, and an in-place column write on it raises pandas'
# SettingWithCopyWarning.
casefinding_cleaned = casefinding_cleaned.assign(
    species=casefinding_cleaned['species'].str.lower()
)
casefinding_cleaned_with_bacterial_class = pd.merge(
    casefinding_cleaned, contaminants.drop_duplicates(), on="species", how="left"
)

# Species that are absent from the reference list come out of the left join as NaN;
# they are treated as "not a potential contaminant". notna() & astype(bool) yields a
# plain bool column without relying on fillna's deprecated object-dtype downcasting.
contaminant_flag = casefinding_cleaned_with_bacterial_class["potential_contaminant"]
casefinding_cleaned_with_bacterial_class["potential_contaminant"] = (
    contaminant_flag.notna() & contaminant_flag.astype(bool)
)


In [None]:
# Sanity check after the contaminant merge: the row count should not have grown, since
# a duplicated species in the reference list would multiply rows in the left join.
print('labnr',casefinding_cleaned_with_bacterial_class.labnr.nunique())
print('patienter',casefinding_cleaned_with_bacterial_class.patient_id.nunique())
print('rader',casefinding_cleaned_with_bacterial_class.shape[0])

## Generate Rename File for Values


In [None]:
# Write the value-rename JSON for the casefinding frame.
generate_and_save_rename_values_json(
    df=casefinding_cleaned_with_bacterial_class,
    file_path=f"{paths.RENAME_DIR}/microbiology/rename_values.json",
    limit=410,
)
# Write the value-rename JSON for the SIR frame. It must be generated from
# sir_data_cleaned, because that is the frame sir_rename_values.json is applied to in
# the "Rename Values" step; generating it from the casefinding frame (as before) yields
# a mapping for the wrong columns and values.
generate_and_save_rename_values_json(
    df=sir_data_cleaned,
    file_path=f"{paths.RENAME_DIR}/microbiology/sir_rename_values.json",
    limit=600,
)

## Rename Values


In [None]:
# Apply the value-rename JSON files. The SIR frame additionally has exact duplicate
# rows removed after renaming.
casefinding_cleaned_renamed = rename_values(
    casefinding_cleaned_with_bacterial_class, f"{paths.RENAME_DIR}/microbiology/rename_values.json"
)
sir_data_cleaned_renamed = rename_values(
    sir_data_cleaned, f"{paths.RENAME_DIR}/microbiology/sir_rename_values.json"
).drop_duplicates()

## Only keep data sampled on or after 31 May 2013 (filter currently disabled)

In [None]:

""" casefinding_cleaned_renamed = casefinding_cleaned_renamed[
    casefinding_cleaned_renamed["sample_date"] >= pd.to_datetime("2013-05-31")
] """

## Classify contaminants


In [None]:
# Rows without a bottle_outcome are set to 'pos'. Rows whose outcome contains "neg" were
# already dropped earlier in the notebook.
casefinding_cleaned_renamed.loc[casefinding_cleaned_renamed.bottle_outcome.isna(),'bottle_outcome'] = 'pos'

In [None]:
# Classify the findings with the cleaner's classify_microbiological_findings, using
# method="labnr". Later cells read a mono_poly_contamination column with the values
# "cont" and "poly"; presumably it is produced here — TODO confirm in the cleaner.
casefinding_microbiological_classified = wwbakt_cleaner.classify_microbiological_findings(
    casefinding_cleaned_renamed,
    method="labnr",
    outcome_col="bottle_outcome",
    outcome_positive_prefix="pos",
    patient_id_col="patient_id",
    sample_date_col="sample_date",
    species_col="microorganism",
    labnr_col="labnr",
    potential_contaminant_col="potential_contaminant",
)

In [None]:
# Sanity check after classification: distinct labnr, distinct patients, row count.
print('labnr',casefinding_microbiological_classified.labnr.nunique())
print('patienter',casefinding_microbiological_classified.patient_id.nunique())
print('rader',casefinding_microbiological_classified.shape[0])
# Bare expression as the last statement of the cell: the notebook displays the frame.
casefinding_microbiological_classified

## Add TTP Column


In [None]:
# Add the TTP information via the cleaner's add_ttp, passing it the outcome, incubation
# date, TTD and report date column names. The de-duplication step later sorts on a
# TTP_hours column; presumably it is created here — TODO confirm in the cleaner.
casefinding_with_ttp = wwbakt_cleaner.add_ttp(
    df=casefinding_microbiological_classified,
    result_col_name="bottle_outcome",
    incubation_date_col_name="incubation_date",
    ttd_col_name="TTD",
    result_date_col_name="bottle_report_date",
)

In [None]:
# Sanity check after adding TTP: distinct labnr, distinct patients, row count.
print('labnr',casefinding_with_ttp.labnr.nunique())
print('patienter',casefinding_with_ttp.patient_id.nunique())
print('rader',casefinding_with_ttp.shape[0])

## Filter


In [None]:
# Patient ids present before filtering. NOTE(review): test1 is not used again in the
# visible code; presumably kept for ad-hoc comparison with test2 (e.g. test1 - test2).
test1 = set(casefinding_with_ttp.patient_id.unique().tolist())

def filter_microbiology(
    df: pd.DataFrame,
    remove_negative_samples: bool = True,
    remove_contaminants: bool = True,
):
    """Return a filtered copy of the microbiology findings.

    Args:
        df: Findings frame. Must contain ``bottle_outcome`` when
            ``remove_negative_samples`` is set and ``mono_poly_contamination``
            when ``remove_contaminants`` is set.
        remove_negative_samples: Keep only rows whose ``bottle_outcome`` starts
            with "pos" (case-insensitive). Rows with a missing outcome are dropped.
        remove_contaminants: Drop rows whose ``mono_poly_contamination`` equals
            "cont". Rows with a missing value are kept.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    result = df.copy()

    if remove_negative_samples:
        outcome = result["bottle_outcome"].str.lower()
        result = result.loc[outcome.str.startswith("pos", na=False)]

    if remove_contaminants:
        not_contaminant = result["mono_poly_contamination"].ne("cont")
        result = result.loc[not_contaminant]

    return result


# Keep only positive bottles that are not classified as contaminants.
casefinding_filtered = filter_microbiology(
    casefinding_with_ttp, remove_negative_samples=True, remove_contaminants=True
)

# Patient ids remaining after filtering (counterpart of test1 taken before filtering).
test2 = set(casefinding_filtered.patient_id.unique().tolist())

# Sanity check after filtering: distinct labnr, distinct patients, row count.
print('labnr',casefinding_filtered.labnr.nunique())
print('patienter',casefinding_filtered.patient_id.nunique())
print('rader',casefinding_filtered.shape[0])

## Group into Episodes

In [None]:
# Group each patient's samples into episodes via the cleaner's determine_episode.
# time=30 is presumably the episode window in days — TODO confirm in the cleaner.
# The following cells read an episode_id column from the result.
casefinding_with_episodes = wwbakt_cleaner.determine_episode(
    df=casefinding_filtered,
    columns_to_sort_by=["patient_id", "sample_date"],
    patient_id_col_name="patient_id",
    sample_date_col_name="sample_date",
    time=30,
)

In [None]:
# Sanity check after episode grouping. Note that the value printed under the label
# 'sid' is the number of distinct labnr.
print('sid: ', casefinding_with_episodes.labnr.nunique())
print('patient_ids: ', casefinding_with_episodes.patient_id.nunique())
print('rader: ', casefinding_with_episodes.shape[0])
print('episode id',casefinding_with_episodes.episode_id.nunique())




## Add time to sample dates

In [None]:
# Where a sample_datetime is available, overwrite sample_date with it so the column
# carries the time of day; rows without a sample_datetime keep their original sample_date.
mask = casefinding_with_episodes["sample_datetime"].notna()
casefinding_with_episodes.loc[mask, "sample_date"] = casefinding_with_episodes.loc[mask, "sample_datetime"]

## Deduplicate Data


In [None]:
def deduplicate_microbiological(df, keep_only_index_day: bool = True):
    """Reduce the findings to one row per episode.

    With ``keep_only_index_day=True`` the function:

    1. keeps only rows sampled on the first calendar day of their episode,
    2. overwrites the species-level columns with "polymicrobial" on rows whose
       ``mono_poly_contamination`` is "poly",
    3. keeps, per episode, the row with the smallest ``TTP_hours`` (ties keep the
       row that came first in the input; rows with missing TTP sort last).

    With ``keep_only_index_day=False`` an unmodified copy is returned.

    Args:
        df: Findings with at least ``episode_id``, ``patient_id``,
            ``sample_date``, ``mono_poly_contamination`` and ``TTP_hours``.
        keep_only_index_day: Whether to apply the de-duplication described above.

    Returns:
        A new DataFrame; the input frame is not modified. When de-duplication
        runs, the result carries a ``sample_taken_on_index_day`` column.
    """
    df = df.copy()

    if not keep_only_index_day:
        return df

    episode_keys = ["episode_id", "patient_id"]

    # Compare calendar days, not full timestamps: sample_date may carry a time of day
    # for some rows and midnight for others, and an exact timestamp comparison would
    # then discard samples drawn later on the very same index day.
    sample_day = pd.to_datetime(df["sample_date"]).dt.normalize()
    first_day_of_episode = sample_day.groupby(
        [df[key] for key in episode_keys]
    ).transform("min")
    df["sample_taken_on_index_day"] = sample_day == first_day_of_episode

    # copy() so the assignment below writes to an independent frame rather than a
    # slice (avoids SettingWithCopyWarning).
    df = df[df["sample_taken_on_index_day"]].copy()

    # Polymicrobial findings are reported as such instead of by a single species.
    species_level_columns = ["species", "category_1", "category_2", "genus", "bacterial_class"]
    df.loc[df["mono_poly_contamination"] == "poly", species_level_columns] = "polymicrobial"

    # Keep the row with the shortest TTP per episode. A stable sort makes the choice
    # between equal TTP values deterministic. The key matches the grouping above, so
    # rows are never collapsed across patients.
    df = df.sort_values(by=["TTP_hours"], ascending=True, kind="mergesort")
    df = df.drop_duplicates(subset=episode_keys, keep="first")

    return df


# One row per episode (default keep_only_index_day=True).
casefinding_with_episodes_dedub = deduplicate_microbiological(
    df=casefinding_with_episodes
)

In [None]:
# Sanity check after de-duplication: the row count is expected to equal the number of
# episodes. Note that the value printed under the label 'sid' is the number of distinct labnr.
print('sid: ', casefinding_with_episodes_dedub.labnr.nunique())
print('patient_ids: ', casefinding_with_episodes_dedub.patient_id.nunique())
print('episode id',casefinding_with_episodes_dedub.episode_id.nunique())
print('rader: ', casefinding_with_episodes_dedub.shape[0])

## Add SIR data
# TODO
- Tänker att man kan ha tre olika kolumner t.ex. SIR_S, SIR_I och SIR_R
- För varje rad i kolumnerna så skapar vi en sträng med följande format:
    species 1: antibiotika 1 | antibiotika 2 | ... \n
    species 2: antibiotika 1 | antibiotika 2 | ... 
    ...

## Save Processed microbiology


In [None]:
# Create the output directory if it is missing. exist_ok=True replaces the separate
# exists() check, which could race with another process creating the directory.
os.makedirs(paths.STORE_MICROBIOLOGY_PATH, exist_ok=True)

# One row per episode.
casefinding_with_episodes_dedub.to_parquet(
    f"{paths.STORE_MICROBIOLOGY_PATH}/microbiology_dedub.parquet",
)
# All positive, non-contaminant findings with their episode ids (not de-duplicated).
casefinding_with_episodes.to_parquet(
    f"{paths.STORE_MICROBIOLOGY_PATH}/microbiology_without_contaminants.parquet",
)
# Cleaned and value-renamed SIR data.
sir_data_cleaned_renamed.to_parquet(f"{paths.STORE_MICROBIOLOGY_PATH}/sir.parquet")