Data is saved in /data/datasets/OmniMedVQA/OmniMedVQA 

Checkpoint is saved in /home/fayang/checkpoints/Qwen2.5-VL-3B-Instruct

# Data process

In [1]:
import json

json_path = "/data/datasets/OmniMedVQA/OmniMedVQA/QA_information/Open-access/ACRIMA.json"
with open(json_path, "r") as f:
    acrima_data = json.load(f)

print(f"Loaded {len(acrima_data)} items from {json_path}")

restricted_json_path = "/data/datasets/OmniMedVQA/OmniMedVQA/QA_information/Restricted-access/AIDA.json"
with open(restricted_json_path, "r") as f:
    aida_data = json.load(f)

print(f"Loaded {len(aida_data)} items from {restricted_json_path}")


Loaded 159 items from /data/datasets/OmniMedVQA/OmniMedVQA/QA_information/Open-access/ACRIMA.json
Loaded 340 items from /data/datasets/OmniMedVQA/OmniMedVQA/QA_information/Restricted-access/AIDA.json


In [2]:
# Example data from the open-access dataset
acrima_data[0]

{'dataset': 'ACRIMA',
 'question_id': 'ACRIMA_0000',
 'question_type': 'Modality Recognition',
 'question': 'What imaging technique was employed to obtain this picture?',
 'gt_answer': 'Fundus imaging',
 'image_path': 'Images/ACRIMA/Im553_g_ACRIMA.png',
 'option_A': 'PET scan',
 'option_B': 'CT scan',
 'option_C': 'Blood test',
 'option_D': 'Fundus imaging',
 'modality_type': 'Fundus Photography'}

In [3]:
# Example data from the restricted-access dataset
aida_data[0]

{'dataset': 'AIDA',
 'question_id': 'AIDA_0000',
 'question_type': 'Modality Recognition',
 'question': 'What imaging modality was used to capture this image?',
 'gt_answer': 'Confocal laser endomicroscopy',
 'image_path': '${dataset_root_path}/AIDA-E_1/CLE_celiachy_test/test_036_VA_26.jpg',
 'option_A': 'Angiography',
 'option_B': 'Confocal laser endomicroscopy',
 'option_C': 'Nuclear medicine imaging',
 'option_D': 'Thermography',
 'modality_type': 'Endoscopy'}

In [4]:
import os

open_access_dir = "/data/datasets/OmniMedVQA/OmniMedVQA/QA_information/Open-access"
open_access_files = os.listdir(open_access_dir)
print("Files in Open-access directory:", open_access_files)

restricted_access_dir = "/data/datasets/OmniMedVQA/OmniMedVQA/QA_information/Restricted-access"
restricted_access_files = os.listdir(restricted_access_dir)
print("Files in Restricted-access directory:", restricted_access_files)


Files in Open-access directory: ['PAD-UFES-20.json', 'Monkeypox Skin Image 2022.json', 'CoronaHack.json', 'Covid CT.json', 'ISIC2019.json', 'NLM- Malaria Data.json', 'OCT & X-Ray 2017.json', 'DeepDRiD.json', 'PALM2019.json', 'Pulmonary Chest Shenzhen.json', 'MHSMA.json', 'Mura.json', 'SARS-CoV-2 CT-scan.json', 'Covid-19 tianchi.json', 'Knee Osteoarthritis.json', 'ISIC2020.json', 'COVIDx CXR-4.json', 'OLIVES.json', 'Fitzpatrick 17k.json', 'Adam Challenge.json', 'HuSHeM.json', 'Chest CT Scan.json', 'BioMediTech.json', 'DRIMDB.json', 'Pulmonary Chest MC.json', 'Retinal OCT-C8.json', 'ALL Challenge.json', 'Blood Cell.json', 'RUS CHN.json', 'MIAS.json', 'ACRIMA.json', 'ISIC2018.json', 'ISBI2016.json', 'BreakHis.json', 'Covid19 heywhale.json', 'JSIEC.json', 'Chest X-Ray PA.json', 'Yangxi.json', 'RadImageNet.json', 'MAlig Lymph.json', 'Diabetic Retinopathy.json', 'CRC100k.json']
Files in Restricted-access directory: ['His Can Det.json', 'AIDA.json', 'GAMMA.json', 'Br35h.json', 'Cervix93.json'

In [5]:
# Combine all json files in the open-access directory
import json

open_access_data = []
for file in open_access_files:
    with open(os.path.join(open_access_dir, file), "r") as f:
        data = json.load(f)
        open_access_data.extend(data)

print(f"Total items in Open-access dataset: {len(open_access_data)}")

Total items in Open-access dataset: 88996


In [6]:
def convert_raw_data_to_sft_data(data):
    """
    Convert raw data to SFT data

    data example:
    {
        'dataset': 'ACRIMA',
        'question_id': 'ACRIMA_0000',
        'question_type': 'Modality Recognition',
        'question': 'What imaging technique was employed to obtain this picture?',
        'gt_answer': 'Fundus imaging',
        'image_path': 'Images/ACRIMA/Im553_g_ACRIMA.png',
        'option_A': 'PET scan',
        'option_B': 'CT scan',
        'option_C': 'Blood test',
        'option_D': 'Fundus imaging',
        'modality_type': 'Fundus Photography'
    }

    SFT data example:
    {
        "image": "path/to/image.jpg",
        "problem": "question",
        "solution": "answer"
    }

    Args:
        data: list of raw data
    Returns:
        list of SFT data
    """
    sft_data = []
    for item in data:
        required_keys = ["image_path", "question", "gt_answer"]
        # Find all keys that start with "option_" in this item and add them to required_keys (if not already present)
        option_keys = [k for k in item.keys() if k.startswith("option_")]
        assert len(option_keys) > 0, f"No option keys found in item: {item}"
        required_keys.extend([k for k in option_keys if k not in required_keys])
        sft_data.append({
            "image": item["image_path"],
            "problem": item["question"] + "\n" + "\n".join([f"{k[-1]}: {item[k]}" for k in option_keys]),
            "solution": "<answer> " + item["gt_answer"] + " </answer>"})

    return sft_data

# Convert open-access data to SFT data
open_access_sft_data = convert_raw_data_to_sft_data(open_access_data)


In [7]:
# Show the first item of the SFT data
open_access_sft_data[0]

{'image': 'Images/PAD-UFES-20/uncompresses_data/imgs_part_3/PAT_1449_1554_932.png',
 'problem': 'What imaging modality was used to capture the image?\nA: Positron emission tomography (PET).\nB: Dermoscopy.\nC: X-ray.\nD: CT scan.',
 'solution': '<answer> Dermoscopy. </answer>'}

In [8]:
# Save the SFT data to a json file
with open("/data/datasets/OmniMedVQA/OmniMedVQA/open_access_sft_data.json", "w") as f:
    json.dump(open_access_sft_data, f)

# Load the SFT data from a json file
with open("/data/datasets/OmniMedVQA/OmniMedVQA/open_access_sft_data.json", "r") as f:
    open_access_sft_data = json.load(f)

In [None]:
import os
from datasets import DatasetDict, Dataset, Features, Value, Image as DatasetImage
from contextlib import contextmanager
from tqdm import tqdm
from PIL import Image as PILImage

save_path = "open_access_sft_data_hf"  # relative path; will be created in current working directory

# Define dataset features, storing the image path, not a PIL Image
features = Features({
    "image": DatasetImage(),
    "problem": Value("string"),
    "solution": Value("string"),
})

@contextmanager
def pushd(path):
    prev = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(prev)

dataset_root = "/data/datasets/OmniMedVQA/OmniMedVQA"

hf_dict = {
    "image": [],
    "problem": [],
    "solution": [],
}

def load_image_from_path(image_path):
    try:
        img = PILImage.open(image_path)
        return img
    except Exception as e:
        print(f"Error loading image {image_path}: {str(e)}. Image path: {image_path}")
        return None

with pushd(dataset_root):
    for item in tqdm(open_access_sft_data):
        hf_dict["image"].append(load_image_from_path(item["image"]))
        hf_dict["problem"].append(item["problem"])
        hf_dict["solution"].append(item["solution"])

    # Place the SFT data into the 'train' split of a DatasetDict
    train_dataset = Dataset.from_dict(hf_dict, features=features)
    open_access_sft_dataset = DatasetDict({"train": train_dataset})
    



100%|██████████| 88996/88996 [00:04<00:00, 18297.76it/s]


In [35]:
with pushd(dataset_root):
    print(train_dataset[0])

    # Save the dataset to disk
    open_access_sft_dataset.save_to_disk(os.path.join(save_path))




{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1620x1620 at 0x798D0E3D29D0>, 'problem': 'What imaging modality was used to capture the image?\nA: Positron emission tomography (PET).\nB: Dermoscopy.\nC: X-ray.\nD: CT scan.', 'solution': '<answer> Dermoscopy. </answer>'}


Saving the dataset (8/138 shards):   6%|▌         | 5160/88996 [00:25<06:55, 201.78 examples/s]


TypeError: Mask must be a pyarrow.Array of type boolean

In [34]:
with pushd(dataset_root):
    print(train_dataset[5162])

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1444x1444 at 0x798D12C0B950>, 'problem': 'Is there any indication of abnormalities in these images?\nA: In this image, there are no apparent abnormalities. It represents a normal or fundus of high myopia.\nB: This image shows a severe abnormality in the fundus.\nC: The abnormalities in this image are consistent with age-related macular degeneration.\nD: The anomalies in this image indicate the presence of glaucoma.', 'solution': '<answer> In this image, there are no apparent abnormalities. It represents a normal or fundus of high myopia. </answer>'}


DatasetDict({
    train: Dataset({
        features: ['image', 'problem', 'solution'],
        num_rows: 88996
    })
})

: 