# Make Dataset
x = (Image, Question)
y = Answer

In [78]:
import config
import pathlib
import json
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
import os
import matplotlib.pyplot as plt

In [65]:
# Questions to answer for every image; the loop that builds the dataset walks
# this list in order.
questions_to_be_answered = [
    "What type of procedure is the image taken from?",
    # The misspelling "instrumnets" is deliberate (see the data-cleaning note below).
    "How many instrumnets are in the image?",
    "Have all polyps been removed?",
    "Where in the image is the abnormality?",
    "Is this finding easy to detect?",
    "Where in the image is the instrument?",
    "Is there a green/black box artifact?",
    "Are there any abnormalities in the image?",
    "Is there text?",
    "Are there any anatomical landmarks in the image?",
    "What color is the abnormality?",
    "Are there any instruments in the image?",
    "What color is the anatomical landmark?",
    "Where in the image is the anatomical landmark?",
    "How many findings are present?",
    "What is the size of the polyp?",
    "How many polyps are in the image?",
    "What type of polyp is present?",
]

## Data Cleaning

### Replace misspelled question
"How many instrumnets are in the image?"

-> No: the ImageCLEF submission check script also poses the question with this spelling error, so the misspelled form is kept. The difference in the encoding should be minimal anyway.

In [66]:
# labels_json_path = list(pathlib.Path(config.data_raw_dev).rglob("*.json"))[0]
# with open(labels_json_path, "r") as f:
#     data = json.load(f)
    
# for image in data:
#     for label in image["Labels"]:
#         if label["Question"] == "How many instrumnets are in the image?":
#             label["Question"] = "How many instruments are in the image?"
            
        
# with open(labels_json_path, "w") as f:
#     json.dump(data, f, indent=True)
    

## Encoder

In [67]:
# Locate the labels file: the first *.json found anywhere below the raw dev directory.
json_candidates = list(pathlib.Path(config.data_raw_dev).rglob("*.json"))
if not json_candidates:
    raise FileNotFoundError(f"No *.json labels file found under {config.data_raw_dev}")
labels_json_path = json_candidates[0]

with open(labels_json_path, "r") as f:
    data = json.load(f)

# Collect, per question, the distinct class names "<question>_<answer>".
# A set per question de-duplicates on insert, so the whole answer list does not
# have to be converted list -> set -> list again for every single label.
answers_per_question = {}
for image in data:
    for label in image["Labels"]:
        question = label["Question"]
        if question not in questions_to_be_answered:
            continue
        answers_per_question.setdefault(question, set()).update(
            question + "_" + answer for answer in label["Answer"]
        )

# Sorted lists give a reproducible order (set iteration order is arbitrary).
question_answers = {
    question: sorted(classes) for question, classes in answers_per_question.items()
}

# Flatten the per-question class names into one list of all classes.
labels = []
answers_all_questions = list(question_answers.values())
for q in answers_all_questions:
    labels.extend(q)

# Fit the binarizer on a single "sample" that contains every class once.
mlb = MultiLabelBinarizer()
mlb.fit([labels])


## Dict with Img, Question, Answer pair for each question
In the provided JSON, not all images have answers for all questions.
We later need to stratify by the question "Are there any abnormalities in the image?", so we store each image's answer to that question alongside every (image, question) pair.

In [80]:
# Re-read the labels JSON: the first *.json found below the raw dev directory.
raw_dev_dir = pathlib.Path(config.data_raw_dev)
labels_json_path = list(raw_dev_dir.rglob("*.json"))[0]
with open(labels_json_path, "r") as labels_file:
    data = json.load(labels_file)
    
def encode_answer(answers):
    """Binarize the answer labels of a single sample with the fitted ``mlb``.

    Args:
        answers: iterable of label strings belonging to one sample.

    Returns:
        The result of ``mlb.transform`` for a batch that contains only this
        sample.
    """
    return mlb.transform([answers])

# Placeholder target for questions an image has no label for: all zeros, with
# exactly the shape and dtype that encode_answer() returns. The previous
# np.full_like(questions_to_be_answered, ...) produced a 1-D vector with one
# entry per *question*, which did not match the encoded answers.
# The same array object is reused for every missing answer; it is never mutated here.
no_answer_vector = np.zeros_like(encode_answer([]))

stratify_question = "Are there any abnormalities in the image?"


img_q_a = []
for image in data:
    image_id = image["ImageID"]

    # Map each question to its answers; setdefault keeps the first entry if a
    # question appears more than once for the same image.
    answers_by_question = {}
    for label in image["Labels"]:
        answers_by_question.setdefault(label["Question"], label["Answer"])

    # Every row of this image carries the image's answer to the stratify question.
    if stratify_question not in answers_by_question:
        raise ValueError(
            f"Image {image_id!r} has no answer for the stratify question {stratify_question!r}"
        )
    stratify_label = "_".join(answers_by_question[stratify_question])

    # Now do everything for this image
    # in order of questions_to_be_answered
    # if missing, answer = [0,0,0,0,....]
    for q in questions_to_be_answered:
        if q in answers_by_question:
            answers = answers_by_question[q]
            answer_vector = encode_answer([q + "_" + answer for answer in answers])
        else:
            answer_vector = no_answer_vector

        # img = plt.imread(os.path.join(config.data_raw_dev, "images", f"{image_id}.jpg" ))
        # Can't do it with the image directly, since it gets huuuuuuuge (too little memory)
        img_q_a.append([image_id, q, answer_vector, stratify_label])

In [81]:
# Sanity check: decode the answer vector of the very first (image, question) row.
_, _, first_answer_vector, _ = img_q_a[0]
mlb.inverse_transform(first_answer_vector)

[('What type of procedure is the image taken from?_Colonoscopy',)]

In [82]:
# Split the [image_id, question, answer_vector, stratify_label] rows into
# model inputs, targets and the labels used for stratified splitting.
X = [[image_id, question] for image_id, question, _, _ in img_q_a]
y = [answer_vector for _, _, answer_vector, _ in img_q_a]
stratify = [row_stratify_label for _, _, _, row_stratify_label in img_q_a]
    
# X = np.array(X)
# y = np.array(y)

In [83]:
# Show the first input pair and its target.
for preview_name, preview_value in (("X", X[0]), ("y", y[0])):
    print(f"{preview_name}: {preview_value}")

X: [array([[[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        ...,
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        ...,
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        ...,
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]],

       ...,

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        ...,
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        ...,
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        ...,
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]]], dtype=uint8), 'What type of procedure is the image taken from?']
y: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

Need to stratify by abnormality type. That means stratifying by question 12


12 Are there any abnormalities in the image? "No", "Polyp", "Ulcerative colitis", "Oesophagitis", ...

Stratify before creating datasets

In [84]:

# First split: 80 % train / 20 % hold-out, stratified on each row's abnormality label.
holdout_split = train_test_split(
    X, y, stratify, test_size=0.2, random_state=42, stratify=stratify
)
X_train, X_test, y_train, y_test, stratify_train, stratify_test = holdout_split

# Second split: halve the hold-out into test and validation, again stratified.
# NOTE(review): rows are (image, question) pairs, so a single image can end up
# with rows in more than one split — confirm that this is intended.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=42, stratify=stratify_test
)

In [85]:
# Report how many (image, question) rows ended up in each split.
print(f"len(X_train) {len(X_train)}")
print(f"len(X_test) {len(X_test)}")
print(f"len(X_val) {len(X_val)}")

len(X_train) 28800
len(X_test) 3600
len(X_val) 3600


In [86]:

# Persist every split as <name>.npy inside the processed-data directory.
os.makedirs(config.data_processed_dev, exist_ok=True)

splits_to_save = {
    "X_train.npy": X_train,
    "y_train.npy": y_train,
    "X_test.npy": X_test,
    "y_test.npy": y_test,
    "X_val.npy": X_val,
    "y_val.npy": y_val,
}
for split_file_name, split_values in splits_to_save.items():
    np.save(os.path.join(config.data_processed_dev, split_file_name), split_values)

  arr = np.asanyarray(arr)


OSError: [Errno 122] Disk quota exceeded

In [75]:
# Read the saved splits back from the processed-data directory.
# NOTE(review): allow_pickle=True unpickles file contents; only load files
# that were written by this notebook.
split_file_names = (
    "X_train.npy",
    "y_train.npy",
    "X_test.npy",
    "y_test.npy",
    "X_val.npy",
    "y_val.npy",
)
X_train, y_train, X_test, y_test, X_val, y_val = [
    np.load(os.path.join(config.data_processed_dev, split_file_name), allow_pickle=True)
    for split_file_name in split_file_names
]

In [76]:
# Wrap the training rows in a tf.data pipeline and display its element spec.
train_pairs_ds = tf.data.Dataset.from_tensor_slices(X_train)
train_pairs_ds

2024-10-27 00:46:25.163399: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-10-27 00:46:25.163430: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:135] retrieving CUDA diagnostic information for host: nickel
2024-10-27 00:46:25.163436: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:142] hostname: nickel
2024-10-27 00:46:25.163561: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:166] libcuda reported version is: 555.42.6
2024-10-27 00:46:25.163574: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:170] kernel reported version is: 555.42.6
2024-10-27 00:46:25.163577: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:249] kernel version seems to match DSO: 555.42.6


<_TensorSliceDataset element_spec=TensorSpec(shape=(2,), dtype=tf.string, name=None)>