In [1]:
import os
import json
import base64

# Libraries for handling images
from PIL import Image
from io import BytesIO

In [4]:
# Location of the source NDJSON files
INPUT_DIR = r"dataset"

# Destinations for the extracted JSON metadata and the decoded images
OUTPUT_JSON_DIR = "output/json"
OUTPUT_IMAGE_DIR = "output/images"

# Create both output folders up front; this is a no-op when they already exist
for _out_dir in (OUTPUT_JSON_DIR, OUTPUT_IMAGE_DIR):
    os.makedirs(_out_dir, exist_ok=True)


In [5]:
# Split every NDJSON record found in INPUT_DIR into two files that share one ID:
#   OUTPUT_JSON_DIR/<id>.json  - the record's metadata, without the image
#   OUTPUT_IMAGE_DIR/<id>.png  - the base64 "Image state" payload re-saved as PNG
# Records that fail to parse or lack a required key are reported and skipped.

# Running ID for the next record; it only advances after a record has been
# written successfully, so IDs stay contiguous across all input files.
game_id = 0

# os.listdir() returns entries in arbitrary order, so sort the names to make
# the ID assignment reproducible between runs and machines.
for filename in sorted(os.listdir(INPUT_DIR)):
    if not filename.endswith(".ndjson"):
        continue

    file_path = os.path.join(INPUT_DIR, filename)

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not records.
                continue

            try:
                data = json.loads(line)
                state = data["State"]

                # Remove the image payload from the structure; it is stored
                # as a separate PNG rather than inside the JSON metadata.
                image_data = state.pop("Image state", None)

                # Build the metadata before touching the disk: a missing key
                # raises here, so a bad record never leaves an orphaned PNG
                # behind under an ID that the next record would reuse.
                output_json = {
                    "id": game_id,
                    "State": {
                        "Fen notation": state["Fen notation"],
                        "Pgn notation": state["Pgn notation"]
                    },
                    "player_to_play": data["player_to_play"],
                    "Best_5_moves": data["Best_5_moves"]
                }

                # Decode and save the image, if the record carries one.
                if image_data:
                    image_bytes = base64.b64decode(image_data)
                    image_path = os.path.join(OUTPUT_IMAGE_DIR, f"{game_id}.png")
                    # The context manager releases the image's resources
                    # once the PNG has been written.
                    with Image.open(BytesIO(image_bytes)) as image:
                        image.save(image_path, "PNG")

                # Save JSON metadata
                json_path = os.path.join(OUTPUT_JSON_DIR, f"{game_id}.json")
                with open(json_path, "w", encoding="utf-8") as json_file:
                    json.dump(output_json, json_file, indent=4)

                game_id += 1  # Increment ID for next game

            except Exception as e:
                # Deliberate best-effort: report the bad entry and carry on
                # with the remaining lines instead of aborting the whole run.
                print(f"Error processing entry in {filename}: {e}")