In [None]:
import io
import os
import re
import sys
from PIL import Image

import numpy as np
import pandas as pd

sys.path.append(os.path.abspath('..'))

from dataset_utility import label_col_map, filter_one_condition_scin_clinical

# Notebook configuration: choose the dataset to prepare and derive its output locations.
# Exactly one of the `dataset = ...` lines below should be active (uncommented).
# dataset = "derm12345"
dataset = "scin_clinical" # Get a service account key before running this
# dataset = "hiba"
# Everything for the selected dataset is stored under ../../datasets/<dataset>
# (a relative path, so it is resolved against the current working directory).
output_folder = f"../../datasets/{dataset}"
# Create the output folder; exist_ok=True makes re-running this cell harmless.
os.makedirs(output_folder, exist_ok=True)
# Location of the dataset's metadata CSV (written by the SCIN cell, read by the taxonomy cell).
metadata_path = os.path.join(output_folder, "metadata.csv")

In [None]:
from google.cloud import storage
from google.oauth2 import service_account

def connect_to_gcs(gcp_project, bucket_name, key_path=None):
    """
    Connect to Google Cloud Storage and return the bucket.

    Args:
        gcp_project: ID of the GCP project the storage client is created for.
        bucket_name: Name of the bucket to return a handle to.
        key_path: Optional path to a service account key JSON file. When omitted,
            the GOOGLE_APPLICATION_CREDENTIALS environment variable is used, and
            if that is unset the placeholder path below is used (edit it or pass
            `key_path` explicitly).

    Returns:
        The bucket object obtained from `client.bucket(bucket_name)`.

    Raises:
        FileNotFoundError: If the resolved key file does not exist.
    """
    # Resolve the key file: explicit argument > environment variable > placeholder
    if key_path is None:
        key_path = os.environ.get(
            "GOOGLE_APPLICATION_CREDENTIALS",
            "/path/to/your/service-account-key.json",
        )

    # Fail early with an actionable message instead of an opaque error from the loader
    if not os.path.isfile(key_path):
        raise FileNotFoundError(
            f"Service account key file not found: {key_path}. "
            "Pass key_path=... or set GOOGLE_APPLICATION_CREDENTIALS."
        )

    # Create credentials object
    credentials = service_account.Credentials.from_service_account_file(key_path)

    # Client for querying GCS
    client = storage.Client(credentials=credentials, project=gcp_project)

    # Bucket object for loading files
    return client.bucket(bucket_name)

In [None]:
def merge_columns(row, prefix):
    """
    Collapse the "YES" flags of one column family into a comma-separated string.

    Every key of `row` that starts with "<prefix>_" and whose value equals "YES"
    contributes the part of its name after that prefix; the parts are joined with
    "," in the order the keys are iterated. Returns "" when nothing matches.
    """
    marker = f"{prefix}_"
    selected = [
        name[len(marker):]
        for name in row.keys()
        # The value is only looked up for keys that carry the prefix
        if name.startswith(marker) and row[name] == "YES"
    ]
    return ','.join(selected)

In [None]:
from google.cloud import storage

# Download the SCIN metadata and images from the public GCS bucket, flatten the
# metadata to one row per image, and write it to `metadata_path`.
# NOTE(review): this branch only matches dataset == "scin"; the "scin_clinical"
# option is not matched here — confirm that is intended.
if dataset == "scin":
    gcs_bucket = connect_to_gcs("dx-scin-public", "dx-scin-public-data")

    # Load the cases and labels CSVs and join them on case_id (kept as a string)
    cases_df = pd.read_csv(io.BytesIO(gcs_bucket.blob("dataset/scin_cases.csv").download_as_bytes()), dtype={'case_id': str})
    labels_df = pd.read_csv(io.BytesIO(gcs_bucket.blob("dataset/scin_labels.csv").download_as_bytes()), dtype={'case_id': str})
    cases_df['case_id'] = cases_df['case_id'].astype(str)
    labels_df['case_id'] = labels_df['case_id'].astype(str)
    cases_and_labels_df = pd.merge(cases_df, labels_df, on='case_id')

    # Collapse each family of "<field>_*" flag columns into one comma-separated column
    for field in ['race_ethnicity', 'textures', 'body_parts', 'condition_symptoms', 'other_symptoms']:
        cases_and_labels_df[field] = cases_and_labels_df.apply(lambda row: merge_columns(row, field), axis=1)
        cases_and_labels_df.drop(columns=[col for col in cases_and_labels_df.columns if col.startswith(f"{field}_")],
                                 inplace=True)

    # Combine both symptom columns into a single 'symptoms' column
    cases_and_labels_df['symptoms'] = cases_and_labels_df['condition_symptoms'].str.cat(cases_and_labels_df['other_symptoms'],
                                                                                        sep=',', na_rep='')
    cases_and_labels_df.drop(columns=['condition_symptoms', 'other_symptoms'], inplace=True)

    # Rename 'image_1_shot_type', ... to 'image_shot_type_1', ... so the numeric
    # suffix comes last, which is the format pd.wide_to_long() expects
    cases_and_labels_df.rename(columns=lambda col: re.sub(r'image_(\d+)_(.+)', r'image_\2_\1', col), inplace=True)

    # Unpivot the per-image fields: one output row per (case, image_index)
    metadata_df = pd.wide_to_long(
        cases_and_labels_df,
        stubnames=['image_path', 'image_shot_type', 'dermatologist_gradable_for_skin_condition',
                   'dermatologist_gradable_for_fitzpatrick_skin_type', 'dermatologist_fitzpatrick_skin_type_label'],
        i=[col for col in cases_and_labels_df.columns if not re.search(r'_\d+$', col)],  # Remaining columns that are not numbered
        j='image_index',
        sep='_',
        suffix='\\d+'
    ).reset_index() # Retain the remaining columns

    # Remove rows with NaN image paths
    metadata_df = metadata_df.dropna(subset=['image_path'])

    # Download every referenced image into the output folder
    for image_path in metadata_df['image_path']:
        if not isinstance(image_path, str):  # Ensure the path is valid
            continue

        # Images are stored flat in the output folder under their original filename
        image_filename = os.path.basename(image_path)
        local_image_path = os.path.join(output_folder, image_filename)

        # Skip images that are already present. A zero-byte file is never a valid
        # image (it can be left behind by an interrupted earlier run), so it is
        # downloaded again instead of being skipped forever.
        if os.path.exists(local_image_path) and os.path.getsize(local_image_path) > 0:
            continue

        # Download and decode BEFORE touching the destination, so a failed download
        # cannot leave an empty or truncated file at local_image_path
        blob = gcs_bucket.blob(image_path)
        image_data = blob.download_as_bytes()

        # Convert to RGB and save as JPEG via a temporary file, then move it into
        # place so the final path only ever holds a completely written image
        partial_image_path = local_image_path + ".part"
        with Image.open(io.BytesIO(image_data)) as image:
            image.convert("RGB").save(partial_image_path, format="JPEG")
        os.replace(partial_image_path, local_image_path)

    # Store only the local filename in the metadata, matching the downloaded files
    metadata_df['image_path'] = metadata_df['image_path'].apply(os.path.basename)
    metadata_df.to_csv(metadata_path, index=False)

In [None]:
from datasets import load_dataset

# Export every sample of the derm12345 synthetic dataset as a JPEG in the output folder
if dataset == "derm12345":
    # Login using e.g. `huggingface-cli login` to access this dataset
    # To export the training split instead, use:
    # derm12345 = load_dataset("abdurrahimyilmaz/derm12345_synthetic_data", split="train")
    derm12345 = load_dataset(
        "abdurrahimyilmaz/derm12345_synthetic_data",
        split="test",
    )

    for sample in derm12345:
        image, image_id = sample['image'], sample['image_id']

        # The image_id is used verbatim as the filename; the data is always written as JPEG
        image_path = os.path.join(output_folder, image_id)
        image.convert("RGB").save(image_path, format="JPEG")

In [None]:
import json

# Write the sorted list of distinct class labels of the HIBA/SCIN dataset to taxonomy.json
if dataset in ("hiba", "scin_clinical"):
    metadata_df = pd.read_csv(metadata_path)
    label_col = label_col_map[dataset]

    # SCIN clinical metadata is first passed through the project's filter helper
    if dataset == "scin_clinical":
        metadata_df = filter_one_condition_scin_clinical(metadata_df)

    # Distinct non-null labels, in sorted order
    label_values = metadata_df[label_col].dropna().unique()
    classes = sorted(label_values)
    print(classes)

    taxonomy_path = os.path.join(output_folder, "taxonomy.json")
    with open(taxonomy_path, 'w') as taxonomy_file:
        json.dump(classes, taxonomy_file, indent=4)