# Make Dataset
x = (Image, Question)
y = Answer

In [52]:
import config
import pathlib
import json
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
import numpy as np

In [53]:
# Questions for which answers are collected and encoded in this notebook.
# NOTE: "instrumnets" is misspelled on purpose -- the label data and the
# ImageCLEF submission check script pose the question with this exact spelling.
questions_to_be_answered = [
    "What type of procedure is the image taken from?",
    "How many instrumnets are in the image?",
    "Have all polyps been removed?",
    "Where in the image is the abnormality?",
    "Is this finding easy to detect?",
    "Where in the image is the instrument?",
    "Is there a green/black box artifact?",
    "Are there any abnormalities in the image?",
    "Is there text?",
    "Are there any anatomical landmarks in the image?",
    "What color is the abnormality?",
    "Are there any instruments in the image?",
    "What color is the anatomical landmark?",
    "Where in the image is the anatomical landmark?",
    "How many findings are present?",
    "What is the size of the polyp?",
    "How many polyps are in the image?",
    "What type of polyp is present?",
]

## Data Cleaning

### Replace misspelled question
"How many instrumnets are in the image?"

-> No: the ImageCLEF submission check script also poses the question with this spelling error, so the misspelling is kept as is. It should make only a minimal difference in the encoding anyway.

In [54]:
# Disabled on purpose: if enabled, this cell would rewrite the label JSON in
# place, replacing the question "How many instrumnets are in the image?" with
# the correctly spelled "How many instruments are in the image?".
# It stays commented out so the misspelled question remains in the data.
# labels_json_path = list(pathlib.Path(config.data_raw_dev).rglob("*.json"))[0]
# with open(labels_json_path, "r") as f:
#     data = json.load(f)
    
# for image in data:
#     for label in image["Labels"]:
#         if label["Question"] == "How many instrumnets are in the image?":
#             label["Question"] = "How many instruments are in the image?"
            
        
# with open(labels_json_path, "w") as f:
#     json.dump(data, f, indent=True)
    

## Encoder

In [55]:
# Label file: the first *.json found anywhere below the raw dev-data directory.
labels_json_path = list(pathlib.Path(config.data_raw_dev).rglob("*.json"))[0]

# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open(labels_json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# `questions_to_be_answered` is defined once, in the cell near the top of the
# notebook; it is reused here instead of being copy-pasted a second time.
allowed_questions = set(questions_to_be_answered)

# Collect, per question, every distinct "<question>_<answer>" label that occurs
# in the data. Sets deduplicate as we go, so nothing has to be rebuilt per label.
answer_sets = {}
for image in data:
    for label in image["Labels"]:
        question = label["Question"]
        if question not in allowed_questions:
            continue
        answer_sets.setdefault(question, set()).update(
            question + "_" + answer for answer in label["Answer"]
        )

# question -> list of its distinct labels; sorted so the lists are reproducible
# between runs (set iteration order is not).
question_answers = {
    question: sorted(answers) for question, answers in answer_sets.items()
}

# Flatten all per-question label lists into the single label vocabulary.
answers_all_questions = list(question_answers.values())
labels = [name for answers in answers_all_questions for name in answers]

# Fit on one "sample" that contains every label, so each distinct
# "<question>_<answer>" string becomes one class of the binarizer.
mlb = MultiLabelBinarizer()
mlb.fit([labels])


## List of (Img, Question, Answer) tuples for each question
In the provided JSON, not all images have answers for all questions. A missing answer is encoded as an all-zero vector.

In [67]:
# Label file: the first *.json found anywhere below the raw dev-data directory.
labels_json_path = list(pathlib.Path(config.data_raw_dev).rglob("*.json"))[0]
with open(labels_json_path, "r", encoding="utf-8") as f:
    data = json.load(f)


def encode_answer(answers):
    """Binarize one collection of labels with the fitted ``mlb``.

    Parameters
    ----------
    answers : iterable of str
        Labels in the "<question>_<answer>" form that ``mlb`` was fitted on.
        An empty iterable yields an all-zero row.

    Returns
    -------
    The result of ``mlb.transform([answers])``: a 2-D indicator array with a
    single row and one column per class known to ``mlb``.
    """
    answer_binarized = mlb.transform([answers])
    return answer_binarized


# Placeholder for "this image has no answer to this question". Encoding an
# empty label set gives an all-zero row with exactly the same shape and dtype
# as every real answer vector (one column per class, not one per question).
no_answer_vector = encode_answer([])

img_q_a = []
# NOTE(review): only the first two images and only the second question are
# processed here -- this looks like a debugging subset; widen both slices to
# build the full dataset.
for image in data[0:2]:
    image_id = image["ImageID"]

    # question -> answers for this image; setdefault keeps the first occurrence
    # if a question appears more than once.
    answers_by_question = {}
    for label in image["Labels"]:
        answers_by_question.setdefault(label["Question"], label["Answer"])

    # Emit one entry per requested question, in the order of
    # questions_to_be_answered; a missing answer becomes the all-zero vector.
    for q in questions_to_be_answered[1:2]:
        if q in answers_by_question:
            answers = answers_by_question[q]
            answer_vector = encode_answer([q + "_" + answer for answer in answers])
        else:
            # Copy so that entries never share (and cannot jointly mutate) one array.
            answer_vector = no_answer_vector.copy()
            print(f"No answer for image {image_id!r}, question {q!r}; using all-zero vector")
        img_q_a.append((image_id, q, answer_vector))

print(img_q_a)

[('clb0lbwzadoyc086u0brshvx5', 'How many instrumnets are in the image?', array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0]])), ('cla820gl5s3vv071u18ipbr2h', 'How many instrumnets are in the image?', array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0]]))]


In [69]:
# Round-trip check: decode the answer vector of the first entry back into its label(s).
mlb.inverse_transform(img_q_a[0][2])

[('How many instrumnets are in the image?_1',)]

In [71]:
# Display the raw binarized answer vector stored for the first (image, question) entry.
img_q_a[0][2]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0]])