In [53]:
import config
import pathlib
import json
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
import numpy as np

In [14]:
labels_json_path = list(pathlib.Path(config.data_raw_dev).rglob("*.json"))[0]

In [15]:
with open(labels_json_path, "r") as f:
    data = json.load(f)

In [16]:
data[0]

{'ImageID': 'clb0lbwzadoyc086u0brshvx5',
 'Labels': [{'Question': 'Are there any abnormalities in the image?',
   'AnswerType': 'Text',
   'Answer': ['Polyp']},
  {'Question': 'Are there any anatomical landmarks in the image?',
   'AnswerType': 'Text',
   'Answer': ['No']},
  {'Question': 'Are there any instruments in the image?',
   'AnswerType': 'Text',
   'Answer': ['Biopsy forceps']},
  {'Question': 'Have all polyps been removed?',
   'AnswerType': 'Yes/No',
   'Answer': ['No']},
  {'Question': 'How many findings are present?',
   'AnswerType': 'Number',
   'Answer': ['2']},
  {'Question': 'How many instrumnets are in the image?',
   'AnswerType': 'Number',
   'Answer': ['1']},
  {'Question': 'How many polyps are in the image?',
   'AnswerType': 'Number',
   'Answer': ['1']},
  {'Question': 'Is there a green/black box artefact?',
   'AnswerType': 'Yes/No',
   'Answer': ['No']},
  {'Question': 'Is there text?', 'AnswerType': 'Yes/No', 'Answer': ['Yes']},
  {'Question': 'Is this find

## Not all of the questions are for Task 1

for example

  {'Question': 'Where exactly in the image is the instrument located?',
   'AnswerType': 'segmentation',
   'Answer': 'clb0lbwzadoyc086u0brshvx5_mask.png'}

   is not 

In [17]:
questions_to_be_answered = ["What type of procedure is the image taken from?",
"How many instruments are in the image?",
"Have all polyps been removed?",
"Where in the image is the abnormality?",
"Is this finding easy to detect?",
"Where in the image is the instrument?",
"Is there a green/black box artifact?",
"Are there any abnormalities in the image?",
"Is there text?",
"Are there any anatomical landmarks in the image?",
"What color is the abnormality?",
"Are there any instruments in the image?",
"What color is the anatomical landmark?",
"Where in the image is the anatomical landmark?",
"How many findings are present?",
"What is the size of the polyp?",
"How many polyps are in the image?",
"What type of polyp is present?"]

In [18]:
question_answers = {}


for image in data:
    for label in image["Labels"]:
        if label["Question"] in questions_to_be_answered:
            if label["Question"] not in question_answers:
                question_answers[label["Question"]] = []
            
            question_answers[label["Question"]] += label["Answer"]
            question_answers[label["Question"]] = list(set(question_answers[label["Question"]]))

In [19]:
question_answers

{'Are there any abnormalities in the image?': ['Polyp',
  'No',
  'Ulcerative colitis',
  'Oesophagitis',
  'Barretts'],
 'Are there any anatomical landmarks in the image?': ['Pylorus',
  'No',
  'Ileum',
  'Not relevant',
  'Cecum',
  'Z-line'],
 'Are there any instruments in the image?': ['No',
  'Injection needle',
  'Metal clip',
  'Not relevant',
  'Biopsy forceps',
  'Polyp snare',
  'Tube'],
 'Have all polyps been removed?': ['Yes', 'No', 'Not relevant'],
 'How many findings are present?': ['5',
  '0',
  '16',
  'Pink',
  '4',
  '6',
  '1',
  '3',
  '2',
  'Yellow'],
 'How many polyps are in the image?': ['5',
  '0',
  '16',
  '4',
  '6',
  '1',
  '3',
  '2'],
 'Is there text?': ['Yes', 'No'],
 'Is this finding easy to detect?': ['Yes', 'No', 'Not relevant'],
 'What color is the abnormality?': ['burgundy',
  'Pink/Red',
  'Purple',
  'Red',
  'purple',
  'Blue',
  'Ink',
  'Yellow',
  'Pink',
  'Orange',
  'Not relevant',
  'Brown',
  'Violet',
  'Pale Pink',
  'grey',
  'Grey',

Answers seem good, besides that there are two colors in "How many findings are present?"

In [20]:
for image in data:
    for label in image["Labels"]:
        if label["Question"] in questions_to_be_answered:
            if label["Question"] == "How many findings are present?":
                print(image["ImageID"], label["Answer"])

clb0lbwzadoyc086u0brshvx5 ['2']
cla820gl5s3vv071u18ipbr2h ['1']
clb0kvxuv8zrw074y9iwrgb8n ['2']
clb0kvxv69050074y4dtl6zbt ['2']
cla820glis4ij071uf0lhhgr8 ['1']
clb0lbwyrdofo086ufh4176k0 ['0']
cla820glys53f071ubs63f5yi ['1']
clb0lbwywdojc086u2yjy7mre ['1']
clb0kvxvo9100074yfqt61zy9 ['2']
clb0lbwz9doxc086u60u1cjgd ['2']
clb0kvxvs915k074y1ezpa3xh ['1']
cla820gljs4jv071uck7ne3gg ['1']
clb0lbwyudoi0086u7em053cc ['1']
clb0lbwzadoxo086uci28gh2e ['2']
clb0kvxvi90rg074y4m986v58 ['2']
cla820glas43v071u1ndvbm1h ['1']
cl8k2u1qk1ey308326dxuddxn ['1']
cla820gl9s42f071ucqtt9qsy ['1']
cl8k2u1q91ejz0832654jgqwx ['1']
clb0lbwz1dons086u60f4hs53 ['1']
cl8k2u1qk1eyb0832btf8863x ['1']
cla820gl0s3nv071u4fgd7xgq ['1']
clb0kvxva90dc074y7z3zarjw ['2']
cla820gl0s3ob071u5z04bn3r ['1']
clb0lbwy7dns4086u9wbnd0s2 ['0']
cla820glps4rz071u6kspddxw ['1']
clb0lbwytdohk086u8wb8bt1a ['1']
clb0kvxvi90rs074yekmx3ssh ['1']
cl8k2u1qr1f5r0832h09pejru ['1']
clb0kvxva90d4074y014i8jpw ['1']
cl8k2u1qe1erv08322xfkdfxq ['1']
cla820gl

There are some bad labels in there...

How many is not a question you can answer with pink, yellow 

So, we will just exclude that image

In [21]:
y_data = []
for question, answers in question_answers.items():
    for answer in answers:
        y_data.append(f"{question}_{answer}")

In [51]:
y_data

['Are there any abnormalities in the image?_Polyp',
 'Are there any abnormalities in the image?_No',
 'Are there any abnormalities in the image?_Ulcerative colitis',
 'Are there any abnormalities in the image?_Oesophagitis',
 'Are there any abnormalities in the image?_Barretts',
 'Are there any anatomical landmarks in the image?_Pylorus',
 'Are there any anatomical landmarks in the image?_No',
 'Are there any anatomical landmarks in the image?_Ileum',
 'Are there any anatomical landmarks in the image?_Not relevant',
 'Are there any anatomical landmarks in the image?_Cecum',
 'Are there any anatomical landmarks in the image?_Z-line',
 'Are there any instruments in the image?_No',
 'Are there any instruments in the image?_Injection needle',
 'Are there any instruments in the image?_Metal clip',
 'Are there any instruments in the image?_Not relevant',
 'Are there any instruments in the image?_Biopsy forceps',
 'Are there any instruments in the image?_Polyp snare',
 'Are there any instrume

In [50]:
answers = list(question_answers.values())
answers

[['Polyp', 'No', 'Ulcerative colitis', 'Oesophagitis', 'Barretts'],
 ['Pylorus', 'No', 'Ileum', 'Not relevant', 'Cecum', 'Z-line'],
 ['No',
  'Injection needle',
  'Metal clip',
  'Not relevant',
  'Biopsy forceps',
  'Polyp snare',
  'Tube'],
 ['Yes', 'No', 'Not relevant'],
 ['5', '0', '16', 'Pink', '4', '6', '1', '3', '2', 'Yellow'],
 ['5', '0', '16', '4', '6', '1', '3', '2'],
 ['Yes', 'No'],
 ['Yes', 'No', 'Not relevant'],
 ['burgundy',
  'Pink/Red',
  'Purple',
  'Red',
  'purple',
  'Blue',
  'Ink',
  'Yellow',
  'Pink',
  'Orange',
  'Not relevant',
  'Brown',
  'Violet',
  'Pale Pink',
  'grey',
  'Grey',
  'Green',
  'Black',
  'brown',
  'White'],
 ['Pink', 'Red', 'Not relevant', 'grey', 'brown', 'White', 'Yellow'],
 ['11-20mm', 'Not relevant', '< 5mm', '>20mm', '5-10mm'],
 ['Paris ip', 'Paris is', 'Paris iia', 'Not relevant'],
 ['Gastroscopy', 'Colonoscopy'],
 ['Upper-center',
  'Lower-right',
  'Center-right',
  'Upper-left',
  'Upper-right',
  'Lower-left',
  'Not relevant'

In [45]:
y_data.__len__()

112

In [46]:
y_test = [
    ["answerc11", "answerc12"],
    ["answerc2,1", "answerc22"]
]

In [48]:
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(answers)
len(mlb.classes_)

62

In [36]:
import numpy as np

Transform binary output vectors back to text labels

In [38]:
mlb.inverse_transform(np.array([[0,1,0,0]]))

[('answerc12',)]

## Build Dataset