In [219]:
import datetime
import mlcroissant as mlc

use_jsonl = False

# FileObjects and FileSets define the resources of the dataset.
distribution = [
    # GFR_Dataset is hosted on a GitHub repository:
    mlc.FileObject(
        id="github-repository",
        name="github-repository",
        description="Generalized Firing Rate Neurons repository on GitHub.",
        content_url="https://github.com/AllenInstitute/GRNN",
        encoding_format="git+https",
        sha256="main",
    ),
    # Within that repository, a FileSet lists all JSONL files:
    mlc.FileObject(
        id="gfr_dataset.json",
        name="gfr_dataset.json",
        description="JSON hosted on the GitHub repository.",
        content_url="model/gfr_dataset.json",
        contained_in=["github-repository"],
        encoding_format="application/json"
    ),
]
record_sets = [
    # RecordSets contains records in the dataset.
    mlc.RecordSet(
        id="dataset",
        name="dataset",
        # Each record has one or many fields...
        fields=[
            # Fields can be extracted from the FileObjects/FileSets.
            mlc.Field(
                id="dataset/cell_id",
                name="cell_id",
                description="Cell id of the cell.",
                data_types=mlc.DataType.INTEGER,
                source=mlc.Source(
                    file_set="gfr_dataset.json",
                    # extract=mlc.Extract(column="cell_id"),
                    # extract=mlc.Extract(json_path="x['cell_id']"),
                ),
            ),
            mlc.Field(
                id="dataset/cre-line",
                name="cre-line",
                description="Cre-line of the cell.",
                data_types=mlc.DataType.TEXT,
                source=mlc.Source(
                    file_set="gfr_dataset.json",
                    extract=mlc.Extract(column="cre-line"),
                ),
            ),
            mlc.Field(
                id="dataset/bin_size",
                name="bin_size",
                description=(
                    "Bin size."
                ),
                data_types=mlc.DataType.INTEGER,
                source=mlc.Source(
                    file_set="gfr_dataset.json",
                    extract=mlc.Extract(column="bin_size"),
                ),
            ),
            mlc.Field(
                id="dataset/actv_bin_size",
                name="actv_bin_size",
                description=(
                    "Activation bin size."
                ),
                data_types=mlc.DataType.INTEGER,
                source=mlc.Source(
                    file_set="gfr_dataset.json",
                    extract=mlc.Extract(column="actv_bin_size"),
                ),
            ),
            mlc.Field(
                id="dataset/val_evr",
                name="val_evr",
                description=(
                    "Explained variance ratio of the GFR model on the validation dataset."
                ),
                data_types=mlc.DataType.FLOAT,
                source=mlc.Source(
                    file_set="gfr_dataset.json",
                    extract=mlc.Extract(column="val_evr"),
                ),
            ),
            mlc.Field(
                id="dataset/test_evr",
                name="test_evr",
                description=(
                    "Explained variance ratio of the GFR model on the test dataset."
                ),
                data_types=mlc.DataType.FLOAT,
                source=mlc.Source(
                    file_set="gfr_dataset.json",
                    extract=mlc.Extract(column="test_evr"),
                ),
            ),
            mlc.Field(
                id="dataset/train_loss",
                name="train_loss",
                description=(
                    "Train loss of the GFR model."
                ),
                data_types=mlc.DataType.FLOAT,
                source=mlc.Source(
                    file_set="gfr_dataset.json",
                    extract=mlc.Extract(column="train_loss"),
                ),
            ),
            mlc.Field(
                id="dataset/test_loss",
                name="test_loss",
                description=(
                    "Test loss of the GFR model."
                ),
                data_types=mlc.DataType.FLOAT,
                source=mlc.Source(
                    file_set="gfr_dataset.json",
                    extract=mlc.Extract(column="test_loss"),
                ),
            ),
            mlc.Field(
                id="dataset/params",
                name="params",
                description=(
                    "Model parameters."
                ),
                sub_fields = [
                    mlc.Field(
                        id="dataset/params/a",
                        name="a",
                        description="Input current history kernel parameters.",
                        data_types=mlc.DataType.FLOAT,
                        repeated=True,
                        source=mlc.Source(
                            file_set="gfr_dataset.json",
                            # extract=mlc.Extract(json_path="x['params']['a']"),
                            # extract=mlc.Extract(column="params"),
                            # transforms=[mlc.Transform(json_path="a[0][0]")], # MUSTFIX : this is not working
                        ),
                    ),
                    mlc.Field(
                        id="dataset/params/b",
                        name="b",
                        description="Firing rate history kernel parameters.",
                        data_types=mlc.DataType.FLOAT,
                        repeated=True,
                        source=mlc.Source(
                            file_set="gfr_dataset.json",
                            # extract=mlc.Extract(json_path="x['params']['b']"),
                            # extract=mlc.Extract(column="params"),
                            # transforms=[mlc.Transform(json_path="b[0][0]")], # MUSTFIX : this is not working
                        ),
                    ),
                    mlc.Field(
                        id="dataset/params/ds",
                        name="ds",
                        description="Decay coefficients.",
                        data_types=mlc.DataType.FLOAT,
                        repeated=True,
                        source=mlc.Source(
                            file_set="gfr_dataset.json",
                            # extract=mlc.Extract(json_path="x['params']['ds']"),
                            # extract=mlc.Extract(column="params"),
                            # transforms=[mlc.Transform(json_path="ds[0]")], # MUSTFIX : this is not working
                        ),
                    ),
                    mlc.Field(
                        id="dataset/params/bin_size",
                        name="params_bin_size",
                        description="Bin size of the GFR model.",
                        data_types=mlc.DataType.INTEGER,
                        repeated=True,
                        source=mlc.Source(
                            file_set="gfr_dataset.json",
                            # extract=mlc.Extract(json_path="x['params']['bin_size']"),
                            # extract=mlc.Extract(column="params"),
                            # transforms=[mlc.Transform(json_path="bin_size")], # MUSTFIX : this is not working
                        ),
                    ),
                    mlc.Field(
                        id="dataset/params/g",
                        name="params_g",
                        description="Activation function.",
                        data_types=mlc.DataType.FLOAT,
                        repeated=True,
                        source=mlc.Source(
                            file_set="gfr_dataset.json",
                            # extract=mlc.Extract(json_path="x['params']['g']"),
                            # extract=mlc.Extract(column="params"),
                            # transforms=[mlc.Transform(json_path="g")],
                        ),
                        sub_fields = [
                            mlc.Field(
                                id="dataset/params/g/max_current",
                                name="max_current",
                                description="Maximum current.",
                                data_types=mlc.DataType.FLOAT,
                                repeated=True,
                                source=mlc.Source(
                                    # extract=mlc.Extract(json_path="x['params']['g']['max_current']")
                                    # extract=mlc.Extract(column="params"),
                                    # transforms=[mlc.Transform(json_path="params.g.max_current")],
                                ),
                            ),
                            mlc.Field(
                                id="dataset/params/g/max_firing_rate",
                                name="max_firing_rate",
                                description="Maximum firing rate.",
                                data_types=mlc.DataType.FLOAT,
                                repeated=True,
                                source=mlc.Source(
                                    file_set="gfr_dataset.json",
                                    # extract=mlc.Extract(json_path="x['params']['g']['max_firing_rate']")
                                    # extract=mlc.Extract(column="params"),
                                    # transforms=[mlc.Transform(json_path="g.max_firing_rate")],
                                ),
                            ),
                            mlc.Field(
                                id="dataset/params/g/poly_coeff",
                                name="poly_coeff",
                                description="Polynomial coefficients of the activation function.",
                                data_types=mlc.DataType.FLOAT,
                                repeated=True,
                                source=mlc.Source(
                                    file_set="gfr_dataset.json",
                                    # extract=mlc.Extract(json_path="x['params']['g']['poly_coeff']")
                                    # extract=mlc.Extract(column="params"),
                                    # transforms=[mlc.Transform(json_path="g.poly_coeff")],
                                ),
                            ),
                            mlc.Field(
                                id="dataset/params/g/b",
                                name="g_b",
                                description="Firing threshold of the activation function.",
                                data_types=mlc.DataType.FLOAT,
                                repeated=True,
                                source=mlc.Source(
                                    file_set="gfr_dataset.json",
                                    # extract=mlc.Extract(json_path="x['params']['g']['b']")
                                    # extract=mlc.Extract(column="params"),
                                    # transforms=[mlc.Transform(json_path="g.b")],
                                ),
                            ),
                            mlc.Field(
                                id="dataset/params/g/bin_size",
                                name="g_bin_size",
                                description="Bin size of the activation function.",
                                data_types=mlc.DataType.INTEGER,
                                repeated=True,
                                source=mlc.Source(
                                    file_set="gfr_dataset.json",
                                    # extract=mlc.Extract(json_path="x['params']['g']['bin_size']")
                                    # extract=mlc.Extract(column="params"),
                                    # transforms=[mlc.Transform(json_path="g.bin_size")],
                                ),
                            ),
                        ]
                    ),
                ],
            ),
        ],
    )
]

# Metadata contains information about the dataset.
metadata = mlc.Metadata(
    name="GFR_Dataset",
    # Descriptions can contain plain text or markdown.
    description=(
        "A dataset of over 1000 biologically-derived, parameterized, and differentiable neuronal models."
    ),
    cite_as=(
        "@article{gfr2024, title={A dataset of differentiable biologically-derived single neuron models}, "
        " author={Anonymous}, year={2024},"
        " eprint={2024.0000}, archivePrefix={arXiv}, primaryClass={cs.CL} }"
    ),
    url="https://github.com/Anonymous/GRNN",
    license="https://creativecommons.org/licenses/by/4.0/",
    version="1.0.0",
    distribution=distribution,
    #record_sets=record_sets,
)

In [220]:
print(metadata.issues.report())





In [221]:
import json

with open("croissant.json", "w") as f:
  content = metadata.to_json()
  content = json.dumps(content, indent=2)
  #print(content)
  f.write(content)
  f.write("\n")  # Terminate file with newline

In [217]:
dataset = mlc.Dataset(jsonld="croissant.json")
records = dataset.records(record_set="jsonl")

for i, record in enumerate(records):
  print(record)
  if i > 10:
    break

GenerationError: An error occured during the streaming generation of the dataset, more specifically during the operation Read(jsonl-files)