In [85]:
import json
from sklearn.model_selection import train_test_split
from typing import Dict, List

In [86]:
# Load the raw ICD-coding export.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so decoding
# does not depend on the platform's default locale encoding.
# NOTE(review): the text is later tokenised with a Danish spaCy model, so
# non-ASCII characters are presumably present — confirm the file is UTF-8.
with open('data/icd_coding.json', 'r', encoding='utf-8') as f:
    icd_coding = json.load(f)

In [87]:
# Wrap every (id, payload) pair of the first top-level element in its own
# single-entry dict, so the articles form a list that can be split.
data = [{article_id: payload} for article_id, payload in icd_coding[0].items()]
len(data)

2818

In [88]:
# First hold out 20 % of the articles, then halve that hold-out set into
# validation and test. The fixed random_state makes both splits repeatable.
train_, holdout_ = train_test_split(data, test_size=0.2, random_state=42)
val_, test_ = train_test_split(holdout_, test_size=0.5, random_state=42)

In [89]:
from typing_extensions import TypedDict

class Article(TypedDict):
    """Flat record with the fields of one raw article that are used downstream."""

    id_: str
    title: str
    description: str
    body: str
    chapters: List[str]
    blocks: List[str]
    categories: List[str]


def get_relevant_data(data: List[Dict]) -> List[Article]:
    """Flatten raw ``{article_id: payload}`` mappings into ``Article`` records.

    Args:
        data: One dict per article. The last (normally the only) entry of
            each dict maps the article id to its payload. The payload must
            provide ``"MetaTags"`` (with ``"title"``, ``"description"`` and
            ``"ICD_details"``) and a non-empty ``"texts"`` mapping.

    Returns:
        One ``Article`` per input dict, in input order. ``body`` is the text
        stored under the first key of ``"texts"``.

    Raises:
        IndexError: If an article dict or its ``"texts"`` mapping is empty.
        KeyError: If a required payload key is missing.
    """
    relevant_data: List[Article] = []
    for article in data:
        # Read the entry instead of ``popitem()``-ing it. Popping emptied the
        # caller's dicts, so a second call on the same list raised KeyError.
        id_, article_data = list(article.items())[-1]

        meta_tags = article_data["MetaTags"]
        icd_details = meta_tags["ICD_details"]
        texts = article_data["texts"]

        # Only the first paragraph of the article is kept as the body.
        title_of_first_paragraph: str = list(texts.keys())[0]

        relevant_data.append(
            Article(
                id_=id_,
                title=meta_tags["title"],
                description=meta_tags["description"],
                body=texts[title_of_first_paragraph],
                chapters=icd_details["chapters"],
                blocks=icd_details["blocks"],
                # The record type declares ``categories`` but it was never
                # populated. NOTE(review): assumes categories are stored next
                # to chapters/blocks under "ICD_details" — confirm against the
                # data; an absent key yields an empty list.
                categories=icd_details.get("categories", []),
            )
        )
    return relevant_data

In [90]:
# Reduce every raw split to the flat records returned by get_relevant_data.
train, val, test = (
    get_relevant_data(raw_split) for raw_split in (train_, val_, test_)
)


In [91]:
# Collect every label that occurs in any of the three splits, so lookup
# tables built from these sets also cover validation and test articles.
all_chapters = set()
all_blocks = set()
all_categories = set()

# One loop over all splits replaces three copy-pasted loops.
for split in (train, val, test):
    for article in split:
        all_chapters.update(article["chapters"])
        all_blocks.update(article["blocks"])
        # ``all_categories`` was declared but never filled. Records without a
        # "categories" key contribute nothing.
        all_categories.update(article.get("categories", []))

In [92]:
# Sort each label set so that the label order is the same on every run.
all_chapters_sorted, all_blocks_sorted, all_categories_sorted = (
    sorted(labels) for labels in (all_chapters, all_blocks, all_categories)
)

print(
    *(
        len(ordered)
        for ordered in (all_chapters_sorted, all_blocks_sorted, all_categories_sorted)
    )
)


21 199 0


In [66]:
# Lookup tables in both directions between label codes and integer ids.
# An id is the position of the code in the corresponding sorted list.
int_to_chapter = dict(enumerate(all_chapters_sorted))
chapter_to_int = {code: idx for idx, code in enumerate(all_chapters_sorted)}
int_to_block = dict(enumerate(all_blocks_sorted))
block_to_int = {code: idx for idx, code in enumerate(all_blocks_sorted)}


#### Tokenize text

In [83]:
import spacy

nlp = spacy.load("da_core_news_sm")


def _content_tokens(doc) -> List[str]:
    """Return the lower-cased text of every token in ``doc`` that is neither punctuation nor a stop word."""
    return [
        token.text.lower()
        for token in doc
        if not (token.is_punct or token.is_stop)
    ]


def preprocess_articles(articles, pipeline, chapter_ids, block_ids) -> List[Dict]:
    """Tokenise articles and convert their label codes to integer ids.

    Args:
        articles: Records with the keys ``"id_"``, ``"title"``,
            ``"description"``, ``"body"``, ``"chapters"`` and ``"blocks"``.
        pipeline: Callable that turns a string into an iterable of tokens
            exposing ``text``, ``is_punct`` and ``is_stop``.
        chapter_ids: Mapping from chapter code to integer id.
        block_ids: Mapping from block code to integer id.

    Returns:
        One JSON-serialisable dict per article, in input order, with the keys
        ``"id"``, ``"title"``, ``"text"``, ``"chapters"``, ``"blocks"`` and
        ``"labels"``. ``"labels"`` holds the chapter ids followed by the block
        ids shifted by the number of chapters, so both share one id space.

    Raises:
        KeyError: If an article carries a code missing from the mappings.
    """
    # Block ids are placed after all chapter ids in the joint label space.
    block_offset = len(chapter_ids)

    processed = []
    for article in articles:
        # Description and body form one text; the title is kept separate.
        text = article["description"] + '. ' + article["body"]

        title_doc = pipeline(article["title"])
        text_doc = pipeline(text)

        chapters_converted = [chapter_ids[chapter] for chapter in article["chapters"]]
        blocks_converted = [block_ids[block] for block in article["blocks"]]

        # One record layout for every split. The copy-pasted loops used a
        # different key order for train/val than for test.
        processed.append(
            {
                "id": article["id_"],
                "title": _content_tokens(title_doc),
                "text": _content_tokens(text_doc),
                "chapters": chapters_converted,
                "blocks": blocks_converted,
                "labels": chapters_converted
                + [block + block_offset for block in blocks_converted],
            }
        )
    return processed


train_processed = preprocess_articles(train, nlp, chapter_to_int, block_to_int)
val_processed = preprocess_articles(val, nlp, chapter_to_int, block_to_int)
test_processed = preprocess_articles(test, nlp, chapter_to_int, block_to_int)

In [84]:
# Write each processed split as JSON Lines: one JSON object per line.
for destination, records in (
    ("data/train.json", train_processed),
    ("data/val.json", val_processed),
    ("data/test.json", test_processed),
):
    with open(destination, "w") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in records)

