# Data Formatting

This notebook reshapes the raw note-level data into a format that is easier to work with. In the formatted data, each row represents a single admission. The `notes` column contains all of the notes for that admission except the discharge summary, joined into a single block of text; the discharge summary itself goes into the `summary` column. This layout makes the data much simpler to work with, both for us and for models, because each admission's notes are presented as one blob of text.


In [None]:
import pandas as pd

In [None]:
# Load the raw train and test splits; later cells group these rows by
# HADM_ID (presumably one row per note -- confirm against the CSV schema).
train_data, test_data = (
    pd.read_csv("./data/single-discharge-8k-train.csv"),
    pd.read_csv("./data/single-discharge-8k-test.csv"),
)

In [None]:
def format_admissions(data, progress_every=0):
    """Collapse note-level rows into one record per admission.

    Rows are grouped by ``HADM_ID``. For each admission, the ``TEXT`` of every
    row whose ``CATEGORY`` is not ``"Discharge summary"`` is ordered by
    ``CHARTDATE`` then ``CHARTTIME`` and joined with newlines. The ``TEXT`` of
    the first ``"Discharge summary"`` row becomes the summary.

    Args:
        data: DataFrame with ``HADM_ID``, ``CATEGORY``, ``CHARTDATE``,
            ``CHARTTIME`` and ``TEXT`` columns.
        progress_every: Print a progress line after every this many
            admissions. ``0`` (the default) prints nothing.

    Returns:
        dict mapping each admission id to ``{"notes": ..., "summary": ...}``,
        in order of first appearance in ``data``.

    Raises:
        ValueError: If an admission has no ``"Discharge summary"`` row.
    """
    formatted = {}

    # One groupby pass hands us each admission's rows directly, instead of
    # re-scanning the whole frame with a boolean mask twice per admission.
    # sort=False keeps admissions in first-appearance order (same order as
    # .unique()). groupby skips rows whose HADM_ID is missing by default.
    groups = data.groupby("HADM_ID", sort=False)
    for count, (admission, rows) in enumerate(groups, start=1):
        is_summary = rows["CATEGORY"] == "Discharge summary"

        summaries = rows.loc[is_summary, "TEXT"]
        if summaries.empty:
            # Fail with the offending id rather than a bare IndexError.
            raise ValueError(
                f"Admission {admission} has no 'Discharge summary' note"
            )

        note_texts = (
            rows.loc[~is_summary]
            .sort_values(["CHARTDATE", "CHARTTIME"])["TEXT"]
            .tolist()
        )

        formatted[admission] = {
            "notes": "\n".join(note_texts),
            "summary": summaries.iloc[0],
        }

        if progress_every and count % progress_every == 0:
            print(f"Reviewed {count} admissions")

    return formatted


# Kept for any later cell that still refers to the admission id arrays.
admissions_train = train_data["HADM_ID"].unique()
admissions_test = test_data["HADM_ID"].unique()

formatted_data_train = format_admissions(train_data, progress_every=1000)
print("Train data formatted")

formatted_data_test = format_admissions(test_data)
print("Test data formatted")

# One row per admission, indexed by admission id.
train_data_formatted = pd.DataFrame.from_dict(formatted_data_train, orient="index")
test_data_formatted = pd.DataFrame.from_dict(formatted_data_test, orient="index")

In [None]:
# Label the index of both formatted frames "admission"; to_csv uses the
# index name as the header of the index column.
for frame in (train_data_formatted, test_data_formatted):
    frame.index.name = "admission"

In [None]:
# Persist both formatted splits next to the raw CSVs.
for frame, destination in (
    (train_data_formatted, "./data/single-discharge-8k-train-formatted.csv"),
    (test_data_formatted, "./data/single-discharge-8k-test-formatted.csv"),
):
    frame.to_csv(destination)