In [1]:
import json
from sklearn.model_selection import train_test_split
from typing import Dict, List

In [2]:
# Read the raw ICD-coding export. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) so the result does not depend on the platform's
# default locale encoding.
with open('data/icd_coding.json', 'r', encoding='utf-8') as f:
    icd_coding = json.load(f)

In [3]:
# Wrap every (key, value) pair of the first top-level element in its own
# single-entry dict so that the entries can be addressed by position.
data = [{article_id: payload} for article_id, payload in icd_coding[0].items()]
len(data)

2818

In [4]:
import torch

# Split sizes: 80 % for training, the remainder shared by validation and test.
train_pct: float = 0.8
dataset_length: int = len(data)
train_length: int = int(dataset_length * train_pct)
val_length: int = (dataset_length - train_length) // 2
# Test receives whatever is left, i.e. one item more than validation when the
# remainder is odd, so the three sizes always add up to the dataset length.
test_length: int = dataset_length - train_length - val_length

# A seeded generator makes the random assignment reproducible.
split_generator = torch.Generator().manual_seed(42)
train_set, val_set, test_set = torch.utils.data.random_split(
    data,
    [train_length, val_length, test_length],
    generator=split_generator,
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Positions into `data` that random_split assigned to each subset.
train_idx, val_idx, test_idx = (
    subset.indices for subset in (train_set, val_set, test_set)
)

In [6]:
# Materialise every split as a plain list of the underlying `data` entries.
train_, val_, test_ = (
    [data[position] for position in positions]
    for positions in (train_idx, val_idx, test_idx)
)

In [7]:
from typing_extensions import TypedDict

# Flat record holding the fields of one article that the rest of the notebook uses.
Article = TypedDict(
    "Article",
    {
        "id_": str,
        "title": str,
        "description": str,
        "body": str,
        "chapters": List[str],
        "blocks": List[str],
        "categories": List[str],
    },
)


def get_relevant_data(data: List[Dict]) -> List[Article]:
    """Flatten raw article dicts into ``Article`` records.

    Args:
        data: Raw articles. Each element is expected to be a dict holding one
            ``{article_id: payload}`` entry, where the payload provides
            ``"MetaTags"`` (``"title"``, ``"description"``, ``"ICD_details"``)
            and ``"texts"`` (a mapping of paragraph title to paragraph text).

    Returns:
        One ``Article`` per input element, in input order. ``body`` is the
        text of the first paragraph in ``"texts"`` (an empty string when the
        article has no paragraphs).

    Raises:
        KeyError: If an article dict is empty or a required field is missing.
    """
    relevant_data: List[Article] = []
    for article in data:
        if not article:
            raise KeyError("article dict is empty")

        # Read the entry instead of popitem()-ing it: removing it would empty
        # the caller's dicts and make a second call on the same data fail.
        id_, article_data = next(iter(article.items()))

        meta_tags = article_data["MetaTags"]
        icd_details = meta_tags["ICD_details"]

        # Only the first paragraph is used as the article body.
        first_paragraph_text: str = next(iter(article_data["texts"].values()), "")

        relevant_data.append(
            Article(
                id_=id_,
                title=meta_tags["title"],
                description=meta_tags["description"],
                body=first_paragraph_text,
                chapters=icd_details["chapters"],
                blocks=icd_details["blocks"],
                # NOTE(review): assumes categories live next to chapters/blocks
                # under "ICD_details"; falls back to an empty list when absent
                # so the declared Article key is always present.
                categories=icd_details.get("categories", []),
            )
        )
    return relevant_data

In [8]:
# Reduce the raw articles of every split to the fields used downstream.
train, val, test = (
    get_relevant_data(raw_split) for raw_split in (train_, val_, test_)
)


In [9]:
# Collect every label that occurs in any of the three splits, so that the
# integer mappings built from these sets cover all articles.
all_chapters = set()
all_blocks = set()
all_categories = set()

for split in (train, val, test):
    for article in split:
        all_chapters.update(article["chapters"])
        all_blocks.update(article["blocks"])
        # Records may lack "categories"; treat a missing key as "no categories".
        all_categories.update(article.get("categories", []))

In [10]:
# Ids of the articles that ended up in the test split.
[record['id_'] for record in test]

['24701',
 '27703',
 '22070',
 '15333',
 '15194',
 '13105',
 '16205',
 '32272',
 '16165',
 '24244',
 '16517',
 '16564',
 '26022',
 '24729',
 '24284',
 '11963',
 '13366',
 '22749',
 '15786',
 '82312',
 '76532',
 '16594',
 '87443',
 '84708',
 '32177',
 '19497',
 '84840',
 '54766',
 '14542',
 '25535',
 '49138',
 '32194',
 '22067',
 '15509',
 '25270',
 '14550',
 '20621',
 '21614',
 '32191',
 '11885',
 '13851',
 '12690',
 '12314',
 '14806',
 '30997',
 '25263',
 '87140',
 '16033',
 '44059',
 '27038',
 '12739',
 '15205',
 '33692',
 '23106',
 '24251',
 '15752',
 '22097',
 '13092',
 '27007',
 '15822',
 '16032',
 '20619',
 '24779',
 '15808',
 '15998',
 '42447',
 '32132',
 '21346',
 '32091',
 '15357',
 '90086',
 '25799',
 '16455',
 '24783',
 '27287',
 '22717',
 '27460',
 '13104',
 '28039',
 '82740',
 '22712',
 '11933',
 '24765',
 '32655',
 '20618',
 '24813',
 '26395',
 '12263',
 '12273',
 '24781',
 '13702',
 '15830',
 '26010',
 '32131',
 '13711',
 '15334',
 '16583',
 '14524',
 '16669',
 '21327',


In [11]:
# Sort the label sets so that the label order is the same on every run.
all_chapters_sorted, all_blocks_sorted, all_categories_sorted = map(
    sorted, (all_chapters, all_blocks, all_categories)
)

print(len(all_chapters_sorted), len(all_blocks_sorted), len(all_categories_sorted))


21 199 0


In [12]:
# Two-way lookups between ICD codes and their integer ids (position in the
# sorted code list).
int_to_chapter = dict(enumerate(all_chapters_sorted))
chapter_to_int = dict(zip(int_to_chapter.values(), int_to_chapter.keys()))
int_to_block = dict(enumerate(all_blocks_sorted))
block_to_int = dict(zip(int_to_block.values(), int_to_block.keys()))


#### Tokenize text

In [13]:
import spacy

nlp = spacy.load("da_core_news_sm")


def _content_tokens(doc):
    """Return the lower-cased token texts of ``doc``, skipping punctuation and stop words."""
    return [
        token.text.lower()
        for token in doc
        if not (token.is_punct or token.is_stop)
    ]


def _preprocess_articles(articles):
    """Tokenize articles and convert their ICD codes to integer labels.

    Args:
        articles: Iterable of ``Article`` records (as produced by
            ``get_relevant_data``).

    Returns:
        A list with one dict per article, always with the keys ``"id"``,
        ``"title"``, ``"text"``, ``"chapters"``, ``"blocks"`` and ``"labels"``
        in that order, so all three splits serialise identically.

    Raises:
        KeyError: If an article carries a chapter or block code that is not in
            ``chapter_to_int`` / ``block_to_int``.
    """
    # Block ids are shifted past the chapter ids so both share one label space.
    chapter_count = len(int_to_chapter)

    processed = []
    for article in articles:
        # The description and the body are tokenized together as one text.
        text = article["description"] + '. ' + article["body"]

        chapters_converted = [chapter_to_int[chapter] for chapter in article["chapters"]]
        blocks_converted = [block_to_int[block] for block in article["blocks"]]

        processed.append(
            {
                "id": article["id_"],
                "title": _content_tokens(nlp(article["title"])),
                "text": _content_tokens(nlp(text)),
                "chapters": chapters_converted,
                "blocks": blocks_converted,
                # Chapter ids first, followed by the offset block ids.
                "labels": chapters_converted
                + [block + chapter_count for block in blocks_converted],
            }
        )
    return processed


train_processed = _preprocess_articles(train)
val_processed = _preprocess_articles(val)
test_processed = _preprocess_articles(test)

KeyboardInterrupt: 

In [84]:
# make jsonl file
# with open("data/train1.json", "w") as f:
#     for article in train_processed:
#         f.write(json.dumps(article) + "\n")

# # make jsonl file
# with open("data/val1.json", "w") as f:
#     for article in val_processed:
#         f.write(json.dumps(article) + "\n")

# # make jsonl file
# with open("data/test1.json", "w") as f:
#     for article in test_processed:
#         f.write(json.dumps(article) + "\n")



# Visualisation and results

In [14]:
# Rebuild the code <-> integer id lookups used to decode the predictions.
# The sorted lists come from sets, so every code appears exactly once.
int_to_chapter = {position: code for position, code in enumerate(all_chapters_sorted)}
chapter_to_int = {code: position for position, code in enumerate(all_chapters_sorted)}
int_to_block = {position: code for position, code in enumerate(all_blocks_sorted)}
block_to_int = {code: position for position, code in enumerate(all_blocks_sorted)}


In [36]:
from sklearn.metrics import multilabel_confusion_matrix, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
import json
def _split_label_ids(label_ids):
    """Translate combined label ids into ``(chapter_codes, block_codes)``.

    Ids below ``len(int_to_chapter)`` are chapter ids; all others are block
    ids that were shifted by the number of chapters when the labels were built.
    """
    chapter_count = len(int_to_chapter)
    chapter_codes, block_codes = [], []
    for label_id in label_ids:
        if label_id < chapter_count:
            chapter_codes.append(int_to_chapter[label_id])
        else:
            block_codes.append(int_to_block[label_id - chapter_count])
    return chapter_codes, block_codes


# NOTE: this rebinds `data`, which earlier in the notebook holds the raw
# article list; re-run the loading cells before re-running the split cells.
data = []
with open("output/1658943157/predictions.json", 'r', encoding="utf-8") as f:
    # The file holds one JSON object per line.
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        article = json.loads(line)
        chapter_labels, block_labels = _split_label_ids(article["labels"])
        chapter_preds, block_preds = _split_label_ids(article["predict_labels"])
        data.append(
            {
                'id': article['id'],
                "predict_scores": article["predict_scores"],
                "chapters_labels": chapter_labels,
                "chapters_preds": chapter_preds,
                "blocks_labels": block_labels,
                "blocks_preds": block_preds,
            }
        )



In [33]:
# Show the first element of `data` as a quick sanity check.
data[0]

{'id': '24701',
 'predict_scores': [0.9999, 0.6054, 0.9936],
 'chapter_labels': ['XI'],
 'chapter_preds': ['XI'],
 'block_labels': ['K90-K93'],
 'block_preds': ['K55-K64', 'K90-K93']}

In [34]:
# Chapter codes in Roman-numeral order; passing them as `classes` fixes the
# column order of the binarized label matrices.
classes = [
    'I', 'II', 'III', 'IV', 'V', 'VI', 'VII',
    'VIII', 'IX', 'X', 'XI', 'XII', 'XIII', 'XIV',
    'XV', 'XVI', 'XVII', 'XVIII', 'XIX', 'XX', 'XXI',
]
multilabel_binarizer = MultiLabelBinarizer(classes=classes)
# fit() expects an iterable of label collections, so the class list is wrapped
# in an outer list rather than passed as a flat list of strings.
multilabel_binarizer.fit([classes])

In [37]:
# Gold and predicted chapter codes, one list per prediction record.
labels = [record['chapters_labels'] for record in data]
preds = [record['chapters_preds'] for record in data]

# Binarize both with the same fitted binarizer so their columns line up.
labels_transformed, preds_transformed = (
    multilabel_binarizer.transform(code_lists) for code_lists in (labels, preds)
)

# One 2x2 confusion matrix per chapter class.
matrix = multilabel_confusion_matrix(labels_transformed, preds_transformed)
matrix

array([[[250,   5],
        [  9,  18]],

       [[268,   0],
        [  8,   6]],

       [[272,   4],
        [  4,   2]],

       [[256,   3],
        [  9,  14]],

       [[255,   3],
        [  8,  16]],

       [[256,   4],
        [ 10,  12]],

       [[272,   2],
        [  2,   6]],

       [[277,   3],
        [  0,   2]],

       [[258,   4],
        [  8,  12]],

       [[271,   3],
        [  2,   6]],

       [[244,  13],
        [ 10,  15]],

       [[263,   6],
        [  3,  10]],

       [[241,  10],
        [ 11,  20]],

       [[259,   7],
        [  7,   9]],

       [[268,   2],
        [  4,   8]],

       [[270,   1],
        [ 10,   1]],

       [[252,   5],
        [  8,  17]],

       [[253,   9],
        [ 16,   4]],

       [[251,   5],
        [  5,  21]],

       [[282,   0],
        [  0,   0]],

       [[274,   1],
        [  5,   2]]])

### Stats

In [38]:
# Records where the predicted chapters equal the gold chapters but the
# predicted blocks do not equal the gold blocks. Codes are compared as sets so
# that the same codes listed in a different order still count as a match.
counter_1 = sum(
    1
    for d in data
    if set(d['chapters_preds']) == set(d['chapters_labels'])
    and set(d['blocks_preds']) != set(d['blocks_labels'])
)
print(f"We're fully correct with chapters but miss blocks {counter_1} times out of {len(data)}")

We're fully correct with chapters but miss blocks 97 times out of 282


In [39]:
# Records with at least one predicted chapter that is also a gold chapter.
counter_2 = sum(
    any(code in record['chapters_labels'] for code in record['chapters_preds'])
    for record in data
)
print(f"We get at least one chapter correct {counter_2} times out of {len(data)}")

We get at least one chapter correct 196 times out of 282


In [40]:
# Records where both the predicted chapters and the predicted blocks equal the
# gold codes. Codes are compared as sets so that the same codes listed in a
# different order still count as a match.
counter_3 = sum(
    1
    for d in data
    if set(d['chapters_preds']) == set(d['chapters_labels'])
    and set(d['blocks_preds']) == set(d['blocks_labels'])
)
print(f"We predict everything correctly {counter_3} times out of {len(data)}")


We predict everything correctly 57 times out of 282


In [41]:
# Records where the predicted blocks equal the gold blocks, regardless of the
# chapters. Codes are compared as sets so that the same codes listed in a
# different order still count as a match.
counter_4 = sum(
    1 for d in data if set(d['blocks_preds']) == set(d['blocks_labels'])
)
print(f"We predict all the blocks correctly {counter_4} times out of {len(data)}")
# When this is higher than the "everything correct" count, the likely cause is
# records where a chapter was missed but every gold block belongs to a chapter
# that was predicted - TODO confirm.

We predict all the blocks correctly 60 times out of 282


In [42]:
# Records with at least one predicted block that is also a gold block.
counter_5 = sum(
    any(code in record['blocks_labels'] for code in record['blocks_preds'])
    for record in data
)
print(f"We get at least one block correct {counter_5} times out of {len(data)}")

We get at least one block correct 92 times out of 282


In [43]:
# Records where the predicted chapters equal the gold chapters but no block was
# predicted at all. Chapters are compared as sets so that the same codes listed
# in a different order still count as a match.
counter_6 = sum(
    1
    for d in data
    if set(d['chapters_preds']) == set(d['chapters_labels'])
    and not d['blocks_preds']
)
print(f"We predict chapter(s) correctly, but don't predict any blocks {counter_6} times out of {len(data)}")


We predict chapter(s) correctly, but don't predict any blocks 65 times out of 282
