In [None]:
# Benchmark dataset to process; also used to build the input and output paths
# and as the name of the registered data asset.
DATASET_NAME = 'Flowers-101'

# URI of the AzureML datastore that holds the benchmark datasets.
datasets_datastore_uri = "azureml://subscriptions/dbd697c3-ef40-488f-83e6-5ad4dfb78f9b/resourcegroups/rbhimani-rg/workspaces/rbhimani-eastus/datastores/datasets"

# Location of the benchmark JSONL file, relative to the datastore's `paths/` root.
jsonl_path = '/'.join(('Benchmarks', DATASET_NAME, 'Benchmark_TEST_MC.jsonl'))

In [None]:
from azureml.fsspec import AzureMachineLearningFileSystem
import pandas as pd

# Filesystem client bound to the datasets datastore; later cells use it to
# read the raw image bytes.
fs = AzureMachineLearningFileSystem(datasets_datastore_uri)

# Load the benchmark file (JSON Lines: one record per line) into a DataFrame.
df = pd.read_json(
    f"{datasets_datastore_uri}/paths/{jsonl_path}",
    lines=True,
)

In [None]:
# Text input shared by every sample: the distinct values of the `label`
# column joined into a single comma-separated string.
text_labels_array = pd.unique(df['label'])
text_labels = ','.join(text_labels_array)
print(text_labels)

In [None]:
# decode images
import base64
import os
def read_image(fs, image_path):
    """Return the full binary content of ``image_path``.

    Args:
        fs: Object exposing ``open(path, mode)`` that returns a context
            manager yielding a readable binary file.
        image_path: Path or URI of the image, passed to ``fs.open`` unchanged.

    Returns:
        Whatever ``read()`` yields for the file opened in ``"rb"`` mode.
    """
    with fs.open(image_path, "rb") as image_file:
        image_bytes = image_file.read()
    return image_bytes

# Build one output record per benchmark row: the base64-encoded image bytes,
# the comma-separated label list (`text_labels`), and the row's own label.

# Loop-invariant prefix under which this dataset's files live in the datastore.
image_root_uri = datasets_datastore_uri + '/paths/Benchmarks/' + DATASET_NAME

data_out = []
for index, row in df.iterrows():
    # Split only on the FIRST occurrence of the dataset name, so a folder or
    # file name that repeats it later in the URL is kept intact.
    url_parts = row['image_url'].split(DATASET_NAME, 1)
    if len(url_parts) != 2:
        raise ValueError(
            f"Row {index}: image_url {row['image_url']!r} does not contain {DATASET_NAME!r}"
        )
    datastore_image_path = url_parts[1]
    image_path = image_root_uri + datastore_image_path

    data_out.append({
        # Base64 text so the binary image can be stored inside JSON.
        'image': base64.b64encode(read_image(fs, image_path)).decode("utf-8"),
        'text': text_labels,
        'label': row['label'],
    })

In [None]:
# write processed jsonl file
import json
# Destination of the processed dataset: one JSON object per line.
OUTPUT_FILE_NAME = os.path.join("./output_jsonls", f'{DATASET_NAME}_processed.jsonl')

# open() does not create missing directories, so make sure the output folder
# exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(OUTPUT_FILE_NAME), exist_ok=True)

# Explicit encoding keeps the file identical regardless of the platform default.
with open(OUTPUT_FILE_NAME, 'w', encoding='utf-8') as f:
    for sample in data_out:
        f.write(json.dumps(sample) + '\n')

In [None]:
# register data asset
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
from azure.identity import DefaultAzureCredential

# Open a client for the AzureML workspace, authenticating with
# DefaultAzureCredential.
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

# Version under which the data asset is registered (for example: '1').
VERSION = "1"

# Location of the data to register. Supported forms:
#   local:     './<path>/<file>' (this will be automatically uploaded to cloud storage)
#   blob:      'wasbs://<container_name>@<account_name>.blob.core.windows.net/<path>/<file>'
#   ADLS gen2: 'abfss://<file_system>@<account_name>.dfs.core.windows.net/<path>/<file>'
#   Datastore: 'azureml://datastores/<data_store_name>/paths/<path>/<file>'
# Here it is the local JSONL file written by the previous cell.
path = OUTPUT_FILE_NAME

# Describe the asset: a single file, named after the dataset.
my_data = Data(
    name=DATASET_NAME,
    version=VERSION,
    type=AssetTypes.URI_FILE,
    path=path,
)

# Register the asset in the workspace.
ml_client.data.create_or_update(my_data)