## Hugging Face Fine-tuning Datasets

This notebook converts the `*.jsonl` template into a local Hugging Face dataset that we save and load using `save_to_disk` and `load_from_disk`, respectively. To keep things simple at the beginning, we just match the sample `HuggingFaceM4/the_cauldron` dataset format with minimal adjustments (downloading the image data instead of storing a URL, adding `system`, removing `user`, removing `source`):

```json
{
    "images": [PIL.Image],
    "texts": [
        {
            "user": "Question: How many actions are depicted in the diagram?\nChoices:\nA. 6.\nB. 4.\nC. 8.\nD. 7.\nAnswer with the letter.",
            "assistant": "Answer: D",
            "source": "TQA"
        }
    ]
}
```

In [1]:
import sys

!{sys.executable} -m pip install Pillow --quiet

In [3]:
from collections import defaultdict
import io
import json

from datasets import Dataset
from PIL import Image
import requests

# Seconds to wait for each image download before giving up; requests applies
# no timeout by default, so an unresponsive host would otherwise hang the cell.
IMAGE_REQUEST_TIMEOUT = 30


def _load_jsonl(path):
    """Return the list of JSON objects stored one per line in the file at *path*."""
    with open(path, "r", encoding="utf-8") as file:
        # Skip blank lines (e.g. a trailing newline), which json.loads rejects.
        return [json.loads(line) for line in file if line.strip()]


def _download_image(url):
    """Download *url* and return its content as a fully loaded PIL image.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(url, timeout=IMAGE_REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of handing an error page to PIL.
    response.raise_for_status()
    # response.content is the complete, content-decoded body, so no open
    # network stream is left behind for PIL to read from lazily.
    img = Image.open(io.BytesIO(response.content))
    # Decode now so a corrupt or truncated download fails here rather than
    # later while the dataset is being built.
    img.load()
    return img


def _to_columns(records):
    """Build the ``images`` / ``messages`` columns for ``Dataset.from_dict``.

    Each record must provide ``messages`` with at least three entries:
    index 0 supplies the system content, index 1 (presumably the user turn)
    carries the image URL in its first content part, and index 2 supplies
    the assistant content.
    """
    columns = defaultdict(list)
    for record in records:
        messages = record["messages"]
        img_url = messages[1]["content"][0]["image_url"]["url"]
        # Every row holds a one-element list of images and a one-element
        # list of message dicts.
        columns["images"].append([_download_image(img_url)])
        columns["messages"].append([
            {
                "system": messages[0]["content"],
                "assistant": messages[2]["content"],
            }
        ])
    return columns


# Load source data

training_data = _load_jsonl("training-data/training.jsonl")
evaluation_data = _load_jsonl("training-data/evaluation.jsonl")

# Transform to target dataset style

output_training_data = _to_columns(training_data)
train_dataset = Dataset.from_dict(output_training_data)
train_dataset.save_to_disk("training-data/training.hf")

output_evaluation_data = _to_columns(evaluation_data)
eval_dataset = Dataset.from_dict(output_evaluation_data)
eval_dataset.save_to_disk("training-data/evaluation.hf")



Saving the dataset (1/1 shards): 100%|██████████| 112/112 [00:00<00:00, 20441.32 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 17/17 [00:00<00:00, 2892.62 examples/s]
