# Microbiology Data Processing Pipeline


In [1]:
import sys
import os
# Project configuration; provides the data, rename-file and output locations referenced as `paths.*` below.
# NOTE(review): this import runs before sys.path is extended a few lines down, so `config`
# has to be importable from the notebook's working directory or the environment — confirm.
from config import Config as paths
from pathlib import Path

# Put the parent directory first on sys.path; presumably this is what makes the
# `data_cleaning` package importable from this notebook.
project_root = Path("..").resolve()
sys.path.insert(0, str(project_root))

import pandas as pd
import matplotlib.pyplot as plt
from data_cleaning.cleaners.microbiology.clean_data_LIMS import LIMSCleaner
from data_cleaning.cleaners.microbiology.clean_data_wwBakt import WWBaktCleaner
from data_cleaning.cleaners.baseCleaner import BaseCleaner
from data_cleaning.renaming import (
    generate_and_save_rename_columns_json,
    rename_columns,
    generate_and_save_rename_values_json,
    rename_values,
)

# Show all columns when a wide DataFrame is displayed.
pd.set_option("display.max_columns", None)
# Generic cleaner instance, used further down to combine and clean the wwBakt and LIMS frames.
base_cleaner = BaseCleaner()

## Load Raw Data from Configured Paths


In [2]:
# Read the raw wwBakt and LIMS extracts from the parquet files named in the config.
wwbakt, lims = (
    pd.read_parquet(source)
    for source in (paths.WWBAKT_PATH, paths.LIMS_PATH)
)

In [3]:
# Recode the Avd value 'OB' to 'OB0'; every other value (missing ones included) is left untouched.
wwbakt["Avd"] = wwbakt["Avd"].mask(wwbakt["Avd"] == "OB", "OB0")

## Remove rows with only negative results

In [4]:
# Remove rows with only negative results: both TTD result columns start with "neg".
both_ttd_negative = (
    wwbakt["TTD Result 1"].str.lower().str.startswith("neg", na=False)
    & wwbakt["TTD Result"].str.lower().str.startswith("neg", na=False)
)

# Rows whose BaText is a microscopy note rather than a culture finding.
is_microscopy_note = (
    wwbakt["BaText"].str.contains("mikroskopi:", case=False, na=False)
    | wwbakt["BaText"].str.contains("mikroskopi visar", case=False, na=False)
)

# Keep what is left, and additionally require a non-missing BaText.
wwbakt = wwbakt[~both_ttd_negative & ~is_microscopy_note & wwbakt["BaText"].notna()]

## Add species


In [5]:
# Temporary species lookup tables, read from hard-coded paths.
# This will be fixed later, once the rename files are finished.
lims_species = pd.read_parquet("../rename_files/temp_files/LIMS_rename_bacteria.parquet")
wwbakt_species = pd.read_parquet("../rename_files/temp_files/wwBakt_rename_bacteria.parquet")

In [6]:
# Attach the species lookup to each source with a left join, so rows without a
# matching organism text are kept.
lims = lims.merge(lims_species, how="left", on="Mikroorganism")
wwbakt = wwbakt.merge(wwbakt_species, how="left", on="BaText")

## Separate SIR Data


In [7]:
# One cleaner per source system, plus a generic BaseCleaner instance.
microbiology_cleaner, wwbakt_cleaner, lims_cleaner = (
    BaseCleaner(),
    WWBaktCleaner(),
    LIMSCleaner(),
)

# Each cleaning call returns two objects, unpacked here as the case-finding data
# and the SIR data (long format, going by the variable names).
wwbakt_data_casefinding, wwbakt_sir_data_long_format = wwbakt_cleaner.clean_wwBakt_data(wwbakt)
lims_data_casefinding, lims_sir_data_long_format = lims_cleaner.clean_LIMS_data(lims)

## Generate Renaming Files For Columns


In [8]:
# Generate a column-rename JSON for each of the four frames.
# (The recorded output shows that files which already exist are reported as such.)
for frame, target_file in (
    (wwbakt_data_casefinding, f"{paths.RENAME_DIR}/wwBakt/rename_columns.json"),
    (lims_data_casefinding, f"{paths.RENAME_DIR}/LIMS/rename_columns.json"),
    (wwbakt_sir_data_long_format, f"{paths.RENAME_DIR}/wwBakt/sir_rename_columns.json"),
    (lims_sir_data_long_format, f"{paths.RENAME_DIR}/LIMS/sir_rename_columns.json"),
):
    generate_and_save_rename_columns_json(frame, target_file)

File ../rename_files/microbiology/wwBakt/rename_columns.json already exists.
File ../rename_files/microbiology/LIMS/rename_columns.json already exists.
File ../rename_files/microbiology/wwBakt/sir_rename_columns.json already exists.
File ../rename_files/microbiology/LIMS/sir_rename_columns.json already exists.


## Rename Columns


In [9]:
# Apply the column-rename JSON of each source to its case-finding and SIR frames.
(
    wwbakt_data_casefinding_renamed,
    lims_data_casefinding_renamed,
    wwbakt_sir_data_long_format_renamed,
    lims_sir_data_long_format_renamed,
) = (
    rename_columns(frame, mapping_file)
    for frame, mapping_file in (
        (wwbakt_data_casefinding, f"{paths.RENAME_DIR}/wwBakt/rename_columns.json"),
        (lims_data_casefinding, f"{paths.RENAME_DIR}/LIMS/rename_columns.json"),
        (wwbakt_sir_data_long_format, f"{paths.RENAME_DIR}/wwBakt/sir_rename_columns.json"),
        (lims_sir_data_long_format, f"{paths.RENAME_DIR}/LIMS/sir_rename_columns.json"),
    )
)

## Combine and Clean Data


In [10]:
# Combine the wwBakt and LIMS frames, first for case finding and then for SIR.
casefinding_combined, sir_data_combined = (
    base_cleaner.concat_data(wwbakt_part, lims_part)
    for wwbakt_part, lims_part in (
        (wwbakt_data_casefinding_renamed, lims_data_casefinding_renamed),
        (wwbakt_sir_data_long_format_renamed, lims_sir_data_long_format_renamed),
    )
)

# Run the shared cleaning step on both combined frames (case finding first).
# Note: there is a lab number that exists in both wwBakt and LIMS.
casefinding_cleaned, sir_data_cleaned = map(
    base_cleaner.clean_data, (casefinding_combined, sir_data_combined)
)

Column 'sample_date' successfully converted to datetime.
Column 'incubation_date' successfully converted to datetime.
Column 'result_date' successfully converted to datetime.
Column 'Analys' successfully converted to datetime.
Column 'Resultat' successfully converted to datetime.
Column 'bottle_start_date' successfully converted to datetime.
Column 'bottle_report_date' successfully converted to datetime.
Column 'sample_date' successfully converted to datetime.


## Keep only blood cultures

In [11]:
# Two passes of extract_blood_samples: the inner call works on `examination` with the
# keyword "blododling", the outer call on `sample_material` with the keyword "blod".
casefinding_only_blood = wwbakt_cleaner.extract_blood_samples(
    df=wwbakt_cleaner.extract_blood_samples(
        df=casefinding_cleaned,
        variable_name="examination",
        keyword="blododling",
    ),
    variable_name="sample_material",
    keyword="blod",
)

## Add Sample times

In [12]:
# Some lab numbers have several rows: sort by sample_datetime and keep the first
# (earliest) row per labnr before joining.
new_times = (
    lims_cleaner.clean_data(pd.read_parquet(paths.NEW_TIMES_Path))
    .sort_values("sample_datetime")
    .drop_duplicates("labnr")
)
casefinding_cleaned = casefinding_only_blood.merge(new_times, on="labnr", how="left")

Column 'sample_datetime' successfully converted to datetime.
Column 'sample_arrival_datetime' successfully converted to datetime.


In [13]:
# Size check: distinct lab numbers, distinct patients, total rows.
print('labnr', casefinding_cleaned["labnr"].nunique())
print('patienter', casefinding_cleaned["patient_id"].nunique())
print('rader', len(casefinding_cleaned))

labnr 124298
patienter 58991
rader 267868


## Remove negative results

In [14]:
# Drop rows whose bottle_outcome contains "neg" (case-insensitive); rows with a missing outcome are kept.
casefinding_cleaned = casefinding_cleaned.loc[
    lambda frame: ~frame["bottle_outcome"].str.contains("neg", case=False, na=False)
]

In [15]:
# Size check after removing negative bottles.
print('labnr', casefinding_cleaned["labnr"].nunique())
print('patienter', casefinding_cleaned["patient_id"].nunique())
print('rader', len(casefinding_cleaned))

labnr 124298
patienter 58991
rader 241590


## Add potential contaminants


In [16]:
# Species reference list: keep the classification columns and drop exact duplicate rows.
contaminants = pd.read_excel("../rename_files/temp_files/species_list.xlsx")[
    ["species", "potential_contaminant", "genus", "category_1", "category_2", "gram", "anaerobe"]
].drop_duplicates()

# Lower-case the text columns used for matching and grouping ("anaerobe" is left as is).
for text_column in ("species", "genus", "category_1", "category_2", "gram"):
    contaminants[text_column] = contaminants[text_column].str.lower()

# NOTE(review): astype(bool) turns missing values and any non-empty string into True —
# confirm the spreadsheet column only holds real True/False (or 0/1) values.
contaminants["potential_contaminant"] = contaminants["potential_contaminant"].astype(bool)



In [17]:
# Match on lower-cased species names, the same normalisation applied to the reference list.
casefinding_cleaned['species'] = casefinding_cleaned['species'].str.lower()
casefinding_cleaned_with_bacterial_class = pd.merge(
    casefinding_cleaned, contaminants.drop_duplicates(), on="species", how="left"
)

# Species without a match in the reference list get a missing flag from the left join,
# which leaves the column as object dtype. Going through the nullable "boolean" dtype
# fills those gaps with False and ends in a plain bool column, without relying on the
# deprecated silent downcasting that a bare .fillna(False) triggers on object columns.
casefinding_cleaned_with_bacterial_class["potential_contaminant"] = (
    casefinding_cleaned_with_bacterial_class["potential_contaminant"]
    .astype("boolean")
    .fillna(False)
    .astype(bool)
)


  ].fillna(False)


In [18]:
# Size check after the contaminant merge (the row count should not change with a clean lookup).
print('labnr', casefinding_cleaned_with_bacterial_class["labnr"].nunique())
print('patienter', casefinding_cleaned_with_bacterial_class["patient_id"].nunique())
print('rader', len(casefinding_cleaned_with_bacterial_class))

labnr 124298
patienter 58991
rader 241590


## Generate Rename File for Values


In [19]:
# Value-rename JSON for the case-finding data.
generate_and_save_rename_values_json(
    df=casefinding_cleaned_with_bacterial_class,
    file_path=f"{paths.RENAME_DIR}/microbiology/rename_values.json",
    limit=410,
)
# Value-rename JSON for the SIR data. It is built from the SIR frame itself, because
# this is the file that is later applied to sir_data_cleaned; it was previously
# generated from the case-finding frame.
# NOTE(review): the recorded output shows an existing file is reported as already
# existing — presumably it must be removed before a corrected one is generated.
generate_and_save_rename_values_json(
    df=sir_data_cleaned,
    file_path=f"{paths.RENAME_DIR}/microbiology/sir_rename_values.json",
    limit=600,
)

File ../rename_files/microbiology/microbiology/rename_values.json already exists.
File ../rename_files/microbiology/microbiology/sir_rename_values.json already exists.


## Rename Values


In [20]:
# Apply the value-rename JSONs; the SIR result is additionally de-duplicated.
casefinding_cleaned_renamed, sir_data_cleaned_renamed = (
    rename_values(
        casefinding_cleaned_with_bacterial_class,
        f"{paths.RENAME_DIR}/microbiology/rename_values.json",
    ),
    rename_values(
        sir_data_cleaned,
        f"{paths.RENAME_DIR}/microbiology/sir_rename_values.json",
    ).drop_duplicates(),
)

## Only keep data sampled on or after 31 May 2013 (currently disabled)

In [21]:

# NOTE: this date filter is disabled. The statement is wrapped in a string literal, so
# the cell only echoes that string and casefinding_cleaned_renamed is left unfiltered.
""" casefinding_cleaned_renamed = casefinding_cleaned_renamed[
    casefinding_cleaned_renamed["sample_date"] >= pd.to_datetime("2013-05-31")
] """

' casefinding_cleaned_renamed = casefinding_cleaned_renamed[\n    casefinding_cleaned_renamed["sample_date"] >= pd.to_datetime("2013-05-31")\n] '

## Classify contaminants


In [22]:
# Rows without a recorded bottle_outcome are treated as positive ('pos').
casefinding_cleaned_renamed["bottle_outcome"] = casefinding_cleaned_renamed[
    "bottle_outcome"
].fillna("pos")

In [23]:
# Column names handed to the classifier, kept apart from the two behavioural options.
classification_columns = {
    "outcome_col": "bottle_outcome",
    "patient_id_col": "patient_id",
    "sample_date_col": "sample_date",
    "species_col": "microorganism",
    "labnr_col": "labnr",
    "potential_contaminant_col": "potential_contaminant",
}

casefinding_microbiological_classified = wwbakt_cleaner.classify_microbiological_findings(
    casefinding_cleaned_renamed,
    method="labnr",
    outcome_positive_prefix="pos",
    **classification_columns,
)

  df["relevant"] = df["relevant"].fillna(True)


In [24]:
# Size check after classification, followed by a preview of the classified frame.
print('labnr', casefinding_microbiological_classified["labnr"].nunique())
print('patienter', casefinding_microbiological_classified["patient_id"].nunique())
print('rader', len(casefinding_microbiological_classified))
casefinding_microbiological_classified

labnr 124298
patienter 58991
rader 241590


Unnamed: 0,sample_id,patient_id,year,section,section_code,sample_date,incubation_date,result_date,sex,age,material_code,sample_material,examination,Qtnr,Analys,Resultat,Banr,microorganism,Pbn,bottle_media,bottle_start_date,bottle_report_date,TTD,bottle_outcome,department_code,department,hospital,species,labnr,bottle_nr,data_source,lid,analysis_code,analysis_name,bottle_lid,sample_datetime,sample_arrival_datetime,potential_contaminant,genus,category_1,category_2,gram,anaerobe,relevant,nr_relevant_findings,nr_non_relevant_findings,mono_poly_contamination,polymicrobial,which_polymicrobial,which_sample_ids
0,20147320,1028153,2019,OBL,514053,2019-08-23,2019-08-23,2019-08-29,male,65,65012,blood,blood_culture,,NaT,NaT,41000,Candida albicans,41000-1,Aerobic Plus,2019-08-23 10:22:00,2019-08-25 17:39:00,2 days 07:18:00,positive,LUACME,Akutmottagning Lund LUND,LUND,candida albicans,19OBL514053,bottle 1,wwBakt,,,,,2019-08-23 10:05:00,2019-08-23 00:00:00,False,candida species,fungal,fungal,fungal,,True,1.0,0.0,mono,False,,candida albicans:20018410 | candida albicans:2...
1,20067416,1045995,2018,OBH,509141,2018-06-30,2018-07-02,2018-07-03,male,64,65012,blood,blood_culture,,NaT,NaT,10200,Staphylococcus aureus,10200-1,Aerobic Plus,2018-07-01 11:43:00,2018-07-01 17:39:00,0 days 05:54:00,positive,HGORAC,Akutmottagning Grön disk HELSINGBORG,HELSINGBORG,staphylococcus aureus,18OBH509141,bottle 1,wwBakt,,,,,2018-06-30 11:30:00,2018-07-02 00:00:00,False,staphylococcus aureus,staphylococcus aureus,staphylococcus aureus,gram_positive,Aerobic,True,1.0,0.0,mono,False,,staphylococcus aureus:20067416 | staphylococcu...
2,20264082,1017781,2019,OBL,517683,2019-10-22,2019-10-22,2019-11-07,female,46,65012,blood,blood_culture,,NaT,NaT,22600,Brucella melitensis,22600-1,Aerobic Plus,2019-10-22 16:40:00,2019-10-25 03:44:00,2 days 11:06:00,positive,LUINNN,Inf klin mott LUND,LUND,brucella melitensis,19OBL517683,bottle 1,wwBakt,,,,,2019-10-22 00:00:00,2019-10-22 00:00:00,False,brucella species,other,other,gram_negative,Aerobic,True,1.0,0.0,mono,False,,brucella melitensis:20159246 | brucella melite...
3,20267737,1039037,2012,OBH,506550,2012-06-13,2012-06-13,2012-06-15,female,85,65012,blood,blood_culture,,NaT,NaT,10390,Staphylococcus species (KNS),10390-1,,NaT,NaT,NaT,pos,HGMEAC,Akutmottagning Röd disk HELSINGBORG,HELSINGBORG,staphylococcus species,12OBH506550,bottle 1,wwBakt,,,,,2012-06-13 06:30:00,2012-06-13 00:00:00,True,staphylococcus species,staphylococcus species,staphylococcus species,gram_positive,Aerobic,True,1.0,0.0,mono,False,,staphylococcus species:20139892 | staphylococc...
4,20155218,1033300,2011,OBK,503260,2011-04-30,2011-05-01,2011-05-09,female,71,65012,blood,blood_culture,,NaT,NaT,81510,Anaeroba gramnegativa stavar,81510-1,,NaT,NaT,NaT,pos,KDKIKM,Kir klin mottagning KRISTIANSTAD,KRISTIANSTAD,anaerobic gramnegative rods,11OBK503260,bottle 1,wwBakt,,,,,2011-04-30 22:00:00,2011-05-01 00:00:00,False,anaerobic species,other,other,gram_negative,Anaerobic,True,1.0,0.0,mono,False,,anaerobic gramnegative rods:20155218
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241585,20020799,1053489,2023,,,2023-04-29,2023-04-29,2023-05-02,male,66,,blood,blood_culture_2_bottles,,NaT,NaT,,Staphylococcus aureus,,,NaT,NaT,0 days 23:39:00,positive,MAON01,"Onkologen avd 1, Jan Waldenströms gata 18, MAL...",MALMÖ,staphylococcus aureus,23OBM507589,bottle 2,LIMS,23OBM507589AEA,FYND,Val av fynd/art,449263643396,2023-04-29 14:20:00,2023-04-29 00:00:00,False,staphylococcus aureus,staphylococcus aureus,staphylococcus aureus,gram_positive,Aerobic,True,1.0,0.0,mono,False,,staphylococcus aureus:20020799 | staphylococcu...
241586,20066060,1029236,2022,,,2022-01-05,2022-01-05,2022-01-11,female,72,,blood,blood_culture_2_bottles,,NaT,NaT,,Staphylococcus aureus,,,NaT,NaT,0 days 11:04:00,positive,MAKAHA,"Hjärtavdelning, Vårdavd 1, Carl Bertil Laurell...",MALMÖ,staphylococcus aureus,22OBM500306,bottle 1,LIMS,22OBM500306ANA,FYND,Val av fynd/art,446594773813,2022-01-05 10:40:00,2022-01-05 14:12:00,False,staphylococcus aureus,staphylococcus aureus,staphylococcus aureus,gram_positive,Aerobic,True,1.0,0.0,mono,False,,staphylococcus aureus:20066060 | staphylococcu...
241587,20268249,1024406,2022,,,2022-02-02,2022-03-02,2022-03-06,male,0,,blood,blood_culture_1_bottle,,NaT,NaT,,Micrococcus species,,,NaT,NaT,NaT,positive,LUBSAK,"Akutmottagning barn och ungdom Lund, Akutgatan...",LUND,micrococcus species,22OBL504718,,LIMS,22OBL504718PDB,FYND,Val av fynd/art,,2022-02-02 22:18:00,2022-03-02 00:00:00,True,micrococcus species,other,other,gram_positive,Aerobic,False,0.0,2.0,cont,False,,micrococcus species:20268249 | staphylococcus ...
241588,20150168,1058465,2021,,,2021-12-07,2021-12-07,2021-12-11,male,82,,blood,blood_culture_2_bottles,,NaT,NaT,,Aerococcus urinae,,,NaT,NaT,0 days 20:40:00,positive,KDINNN,"Infektionsmottagning Kristianstad, Centralsjuk...",KRISTIANSTAD,aerococcus urinae,21OBK511703,bottle 1,LIMS,21OBK511703ANA,FYND,Val av fynd/art,446598372307,2021-12-07 05:05:00,2021-12-07 12:04:00,False,aerococcus species,other,other,gram_positive,Aerobic,True,2.0,0.0,poly,True,Aerococcus urinae | Enterobacter cloacae-kompl...,aerococcus urinae:20084141 | aerococcus urinae...


## Add TTP Column


In [25]:
# Names of the columns add_ttp is told to work with.
ttp_columns = {
    "result_col_name": "bottle_outcome",
    "incubation_date_col_name": "incubation_date",
    "ttd_col_name": "TTD",
    "result_date_col_name": "bottle_report_date",
}

casefinding_with_ttp = wwbakt_cleaner.add_ttp(
    df=casefinding_microbiological_classified, **ttp_columns
)

In [26]:
# Size check after adding the TTP columns.
print('labnr', casefinding_with_ttp["labnr"].nunique())
print('patienter', casefinding_with_ttp["patient_id"].nunique())
print('rader', len(casefinding_with_ttp))

labnr 124298
patienter 58991
rader 241590


## Filter


In [27]:
# Distinct patient IDs before filtering (test2 further down holds the same after filtering).
# NOTE(review): neither set is read again in the visible notebook — presumably ad-hoc debugging.
test1 = set(casefinding_with_ttp.patient_id.unique().tolist())

def filter_microbiology(
    df: pd.DataFrame,
    remove_negative_samples: bool = True,
    remove_contaminants: bool = True,
):
    """Return a filtered copy of ``df``; the input frame is never modified.

    Args:
        df: Frame with a ``bottle_outcome`` column (read when
            ``remove_negative_samples`` is set) and a
            ``mono_poly_contamination`` column (read when
            ``remove_contaminants`` is set).
        remove_negative_samples: Keep only rows whose ``bottle_outcome``
            starts with "pos" (case-insensitive). Rows with a missing
            outcome are dropped.
        remove_contaminants: Drop rows whose ``mono_poly_contamination``
            equals "cont". Rows with a missing value are kept.

    Returns:
        A new DataFrame holding the rows that pass every enabled filter,
        with their original index labels.
    """
    # Start from "keep everything" and narrow the mask per enabled filter.
    keep = pd.Series(True, index=df.index)

    if remove_negative_samples:
        keep &= df["bottle_outcome"].str.lower().str.startswith("pos", na=False)

    if remove_contaminants:
        keep &= df["mono_poly_contamination"] != "cont"

    return df[keep].copy()


# Keep positive bottles only and drop findings classified as contamination.
casefinding_filtered = filter_microbiology(
    df=casefinding_with_ttp,
    remove_negative_samples=True,
    remove_contaminants=True,
)

# Distinct patient IDs that remain after filtering.
test2 = set(casefinding_filtered["patient_id"].unique().tolist())

# Size check after filtering.
print('labnr', casefinding_filtered["labnr"].nunique())
print('patienter', casefinding_filtered["patient_id"].nunique())
print('rader', len(casefinding_filtered))

labnr 103572
patienter 45852
rader 199624


## Group into Episodes

In [28]:
# Settings for the episode grouping.
# NOTE(review): `time=30` is presumably the episode window in days — confirm against determine_episode.
episode_settings = dict(
    columns_to_sort_by=["patient_id", "sample_date"],
    patient_id_col_name="patient_id",
    sample_date_col_name="sample_date",
    time=30,
)

casefinding_with_episodes = wwbakt_cleaner.determine_episode(
    df=casefinding_filtered, **episode_settings
)

In [29]:
# Size check after episode grouping ("sid" reports the number of distinct labnr values).
print('sid: ', casefinding_with_episodes["labnr"].nunique())
print('patient_ids: ', casefinding_with_episodes["patient_id"].nunique())
print('rader: ', len(casefinding_with_episodes))
print('episode id', casefinding_with_episodes["episode_id"].nunique())




sid:  103572
patient_ids:  45852
rader:  199624
episode id 56468


## Add time to sample dates

In [30]:
# Rows that have a sample_datetime value.
mask = casefinding_with_episodes["sample_datetime"].notna()
# For those rows, overwrite sample_date in place with sample_datetime; the other rows keep
# their existing sample_date. From here on sample_date can carry a time of day, which
# affects every later exact comparison on that column.
casefinding_with_episodes.loc[mask, "sample_date"] = casefinding_with_episodes.loc[mask, "sample_datetime"]

## Deduplicate Data


In [31]:
def deduplicate_microbiological(df, keep_only_index_day: bool = True):
    """Reduce episode-level microbiology data to one row per episode.

    With ``keep_only_index_day=True`` the function works on a copy of ``df`` and

    1. flags, in ``sample_taken_on_index_day``, the rows whose ``sample_date``
       equals the earliest ``sample_date`` of their
       (``episode_id``, ``patient_id``) group, and keeps only those rows;
    2. overwrites ``species``, ``category_1``, ``category_2``, ``genus`` and
       ``bacterial_class`` with ``'polymicrobial'`` on rows where
       ``mono_poly_contamination == 'poly'`` (``bacterial_class`` is created
       by this assignment if it does not exist yet);
    3. sorts by ``TTP_hours`` ascending and keeps the first row per
       ``episode_id``. Missing ``TTP_hours`` values sort last, so such a row
       is only chosen when the episode has no row with a value.

    With ``keep_only_index_day=False`` an unmodified copy of ``df`` is
    returned; no de-duplication takes place.

    Args:
        df: Frame with the columns named above.
        keep_only_index_day: Whether to apply the three steps.

    Returns:
        A new DataFrame; the input frame is never modified.
    """
    df = df.copy()

    if not keep_only_index_day:
        return df

    # NOTE(review): this is an exact-value comparison. When sample_date carries a time
    # of day (the notebook copies sample_datetime into sample_date before calling this),
    # only rows at the earliest timestamp qualify, not every row from the same calendar
    # day — confirm that this is the intended definition of "index day".
    first_sample = df.groupby(["episode_id", "patient_id"])["sample_date"].transform("min")
    df["sample_taken_on_index_day"] = df["sample_date"] == first_sample

    # Explicit copy: the filtered frame is written to below, so it must own its data
    # rather than being treated as a slice of the unfiltered frame.
    df = df[df["sample_taken_on_index_day"]].copy()

    # Polymicrobial findings are reported under a single shared label.
    poly_columns = ["species", "category_1", "category_2", "genus", "bacterial_class"]
    df.loc[df["mono_poly_contamination"] == "poly", poly_columns] = ["polymicrobial"] * len(
        poly_columns
    )

    # Keep the row with the shortest TTP in each episode.
    df = df.sort_values(by=["TTP_hours"], ascending=True)
    df = df.drop_duplicates(subset=["episode_id"], keep="first")

    return df


# Collapse the episode data using the default index-day rule.
casefinding_with_episodes_dedub = deduplicate_microbiological(casefinding_with_episodes)

In [32]:
# Size check after de-duplication ("sid" reports the number of distinct labnr values).
print('sid: ', casefinding_with_episodes_dedub["labnr"].nunique())
print('patient_ids: ', casefinding_with_episodes_dedub["patient_id"].nunique())
print('episode id', casefinding_with_episodes_dedub["episode_id"].nunique())
print('rader: ', len(casefinding_with_episodes_dedub))

sid:  56468
patient_ids:  45852
episode id 56468
rader:  56468


## Add SIR data
### TODO
- Tänker att man kan ha tre olika kolumner t.ex. SIR_S, SIR_I och SIR_R
- För varje rad i kolumnerna så skapar vi en sträng med följande format:
    species 1: antibiotika 1 | antibiotika 2 | ... \n
    species 2: antibiotika 1 | antibiotika 2 | ... 
    ...

In [34]:
# Preview of the de-duplicated frame (pandas shows a truncated head/tail view).
# NOTE(review): the recorded output contains patient-level rows — clear outputs before sharing.
casefinding_with_episodes_dedub

Unnamed: 0,sample_id,patient_id,year,section,section_code,sample_date,incubation_date,result_date,sex,age,material_code,sample_material,examination,Qtnr,Analys,Resultat,Banr,microorganism,Pbn,bottle_media,bottle_start_date,bottle_report_date,TTD,bottle_outcome,department_code,department,hospital,species,labnr,bottle_nr,data_source,lid,analysis_code,analysis_name,bottle_lid,sample_datetime,sample_arrival_datetime,potential_contaminant,genus,category_1,category_2,gram,anaerobe,relevant,nr_relevant_findings,nr_non_relevant_findings,mono_poly_contamination,polymicrobial,which_polymicrobial,which_sample_ids,TTP,TTP_hours,days_diff,episode_nr,episode_id,sample_taken_on_index_day,bacterial_class
92534,20121107,1027635,2022,,,2022-09-13 18:30:00,2022-09-13,2022-09-16,male,88,,blood,blood_culture_2_bottles,,NaT,NaT,,Escherichia coli,,,NaT,NaT,0 days 00:00:00,positive,YSMEAC,"Medicin Akutvård, , YSTAD, Lasarettet i Ystad",YSTAD,polymicrobial,22OBY504285,bottle 1,LIMS,22OBY504285ANB,FYND,Val av fynd/art,446591535427,2022-09-13 18:30:00,2022-09-13 00:00:00,False,polymicrobial,polymicrobial,polymicrobial,gram_negative,Aerobic,True,2.0,0.0,poly,True,Escherichia coli | Klebsiella pneumoniae,&klebsiella:20007527 | &klebsiella:20024211 | ...,0 days 00:00:00,0.000000e+00,0.0,1,1027635_1,True,polymicrobial
17104,20005532,1005025,2021,,,2021-10-10 14:00:00,2021-10-10,2021-10-12,male,77,,blood,blood_culture_2_bottles,,NaT,NaT,,Klebsiella aerogenes,,,NaT,NaT,0 days 00:00:00,positive,MAACIT,"Akutmottagning, Ruth Lundskogs gata 3, pl 1, M...",MALMÖ,polymicrobial,21OBM517269,bottle 1,LIMS,21OBM517269ANA,FYND,Val av fynd/art,446590215763,2021-10-10 14:00:00,2021-10-10 00:00:00,False,polymicrobial,polymicrobial,polymicrobial,gram_negative,Aerobic,True,2.0,0.0,poly,True,Enterococcus faecium | Klebsiella aerogenes,&klebsiella:20005532 | &klebsiella:20021150 | ...,0 days 00:00:00,0.000000e+00,2364.0,3,1005025_3,True,polymicrobial
134884,20218076,1040024,2022,,,2022-01-26 21:35:00,2022-01-26,2022-01-28,female,77,,blood,blood_culture_2_bottles,,NaT,NaT,,Escherichia coli,,,NaT,NaT,0 days 00:00:00,positive,KDINNN,"Infektionsmottagning Kristianstad, Centralsjuk...",KRISTIANSTAD,escherichia coli,22OBK500966,bottle 1,LIMS,22OBK500966AEA,FYND,Val av fynd/art,449254482363,2022-01-26 21:35:00,2022-01-26 00:00:00,False,escherichia coli,escherichia coli,enterobacterales,gram_negative,Aerobic,True,1.0,0.0,mono,False,,escherichia coli:20020866 | escherichia coli:2...,0 days 00:00:00,0.000000e+00,,1,1040024_1,True,
131415,20176147,1039071,2022,,,2022-12-10 14:30:00,2022-12-11,2022-12-13,male,47,,blood,blood_culture_2_bottles,,NaT,NaT,,Staphylococcus aureus,,,NaT,NaT,0 days 00:00:00,positive,ÄNMEAC,"Medicinska klin akutmottagning, , ÄNGELHOLM, Ä...",ÄNGELHOLM,staphylococcus aureus,22OB0506088,bottle 1,LIMS,22OB0506088AEA,FYND,Val av fynd/art,449276065055,2022-12-10 14:30:00,2022-12-11 13:03:00,False,staphylococcus aureus,staphylococcus aureus,staphylococcus aureus,gram_positive,Aerobic,True,1.0,0.0,mono,False,,staphylococcus aureus:20032576 | staphylococcu...,0 days 00:00:00,0.000000e+00,,1,1039071_1,True,
59942,20005604,1017664,2023,,,2023-02-01 23:10:00,2023-02-02,2023-02-03,male,74,,blood,blood_culture_2_bottles,,NaT,NaT,,Escherichia coli,,,NaT,NaT,0 days 00:00:00.000000001,positive,KDMENN,"Medicinklinikens mottagning, , KRISTIANSTAD, C...",KRISTIANSTAD,escherichia coli,23OBK501230,bottle 1,LIMS,23OBK501230ANA,FYND,Val av fynd/art,446555455218,2023-02-01 23:10:00,2023-02-02 00:00:00,False,escherichia coli,escherichia coli,enterobacterales,gram_negative,Aerobic,True,1.0,0.0,mono,False,,escherichia coli:20005604 | escherichia coli:2...,0 days 00:00:00.000000001,2.777778e-13,0.0,1,1017664_1,True,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199526,20206418,1058977,2011,OB0,502610,2011-02-23 20:50:00,2011-02-24,2011-02-28,male,50,65012,blood,blood_culture,,NaT,NaT,10390,Staphylococcus species (KNS),10390-2,,NaT,NaT,NaT,pos,LUON88,Onk klin avd 88 LUND,LUND,staphylococcus species,11OB0502610,bottle 1,wwBakt,,,,,2011-02-23 20:50:00,2011-02-24 00:00:00,True,staphylococcus species,staphylococcus species,staphylococcus species,gram_positive,Aerobic,True,1.0,0.0,mono,False,,staphylococcus species:20012819 | staphylococc...,NaT,,,1,1058977_1,True,
199576,20032566,1058990,2014,OBM,100575,2014-12-13 00:00:00,2014-12-14,2014-12-18,female,81,65012,blood,blood_culture,,NaT,NaT,29910,Veillonella parvula,29910-1,,NaT,NaT,NaT,pos,TRACME,"Akutmottagning, medicin TRELLEBORG",TRELLEBORG,veillonella parvula,14OBM100575,bottle 1,wwBakt,,,,,2014-12-13 00:00:00,2014-12-14 00:00:00,False,veillonella species,other,other,gram_negative,Anaerobic,True,1.0,0.0,mono,False,,veillonella parvula:20032566,NaT,,,1,1058990_1,True,
199578,20189816,1058991,2011,OB0,101646,2011-01-23 00:00:00,2011-01-24,2011-01-26,female,80,65012,blood,blood_culture,,NaT,NaT,23000,Escherichia coli,23000-1,,NaT,NaT,NaT,pos,MAMENN,Medicinmottagning Malmö MALMÖ,MALMÖ,escherichia coli,11OB0101646,bottle 1,wwBakt,,,,,2011-01-23 00:00:00,2011-01-24 00:00:00,False,escherichia coli,escherichia coli,enterobacterales,gram_negative,Aerobic,True,1.0,0.0,mono,False,,escherichia coli:20163475 | escherichia coli:2...,NaT,,,1,1058991_1,True,
199582,20244003,1058991,2012,OB0,107665,2012-04-08 10:00:00,2012-04-09,2012-04-11,female,82,65012,blood,blood_culture,,NaT,NaT,23000,Escherichia coli,23000-1,,NaT,NaT,NaT,pos,MAACIT,Akutmottagning MALMÖ,MALMÖ,escherichia coli,12OB0107665,bottle 1,wwBakt,,,,,2012-04-08 10:00:00,2012-04-09 00:00:00,False,escherichia coli,escherichia coli,enterobacterales,gram_negative,Aerobic,True,1.0,0.0,mono,False,,escherichia coli:20221511 | escherichia coli:2...,NaT,,441.0,2,1058991_2,True,


## Save Processed microbiology


In [33]:
# Create the output directory if it is missing. exist_ok=True replaces the separate
# existence check with one call, so re-running the cell is safe.
os.makedirs(paths.STORE_MICROBIOLOGY_PATH, exist_ok=True)

# De-duplicated episode data.
casefinding_with_episodes_dedub.to_parquet(
    f"{paths.STORE_MICROBIOLOGY_PATH}/microbiology_dedub.parquet",
)
# Filtered data with episode assignment, before de-duplication.
casefinding_with_episodes.to_parquet(
    f"{paths.STORE_MICROBIOLOGY_PATH}/microbiology_without_contaminants.parquet",
)
# Cleaned and renamed SIR data.
sir_data_cleaned_renamed.to_parquet(f"{paths.STORE_MICROBIOLOGY_PATH}/sir.parquet")