In [175]:
import mlcroissant as mlc

# FileObjects and FileSets define the resources of the dataset.
distribution = [
    # The GRNN data is hosted on a GitHub repository:
    mlc.FileObject(
        id="github-repository",
        name="github-repository",
        description="Generalized Firing Rate Neurons repository on GitHub.",
        content_url="https://github.com/AllenInstitute/GRNN",
        encoding_format="git+https",
        # NOTE(review): "main" is not a SHA-256 digest; presumably it is used
        # as the git ref to check out -- confirm.
        sha256="main",
    ),
    # Within that repository, a FileSet lists all JSONL files.
    # NOTE(review): reading this FileSet fails in the last cell of the notebook
    # (GenerationError in Read(jsonl-files)); verify that `model/*.jsonl`
    # matches files in the repository and that they really are JSON Lines.
    mlc.FileSet(
        id="jsonl-files",
        name="jsonl-files",
        description="JSONL files are hosted on the GitHub repository.",
        contained_in=["github-repository"],
        encoding_format="application/jsonlines",
        includes="model/*.jsonl",
    ),
]
# NOTE(review): the generic, column-derived descriptions generated below replace
# text that had been copy-pasted from another dataset ("The expected completion
# of the promt.", "The machine learning task appearing as the name of the
# file.") and did not describe these columns.
# TODO: swap them for domain-specific descriptions.


def _jsonl_column_field(column, data_type):
    """Build a top-level field read from the JSONL column named `column`.

    Args:
        column: Column to extract; also used as the field name and as the
            suffix of the field id (`jsonl/<column>`).
        data_type: `mlc.DataType` member declared for the column's values.

    Returns:
        An `mlc.Field` sourced from the `jsonl-files` FileSet.
    """
    return mlc.Field(
        id=f"jsonl/{column}",
        name=column,
        description=f"Value of the `{column}` column of the JSONL files.",
        data_types=data_type,
        source=mlc.Source(
            file_set="jsonl-files",
            # Extract the field from the column of a FileObject/FileSet:
            extract=mlc.Extract(column=column),
        ),
    )


def _params_sub_field(key, name, data_type, json_path):
    """Build a sub-field of `params` selected with a JSON path.

    Args:
        key: Suffix of the sub-field id (`jsonl/params/<key>`).
        name: Name of the sub-field.
        data_type: `mlc.DataType` member declared for the selected values.
        json_path: JSON path applied to the extracted `params` column.

    Returns:
        A repeated `mlc.Field` sourced from the `jsonl-files` FileSet.
    """
    return mlc.Field(
        id=f"jsonl/params/{key}",
        name=name,
        description=(
            f"Value at JSON path `{json_path}` of the `params` column."
        ),
        data_types=data_type,
        repeated=True,
        source=mlc.Source(
            file_set="jsonl-files",
            extract=mlc.Extract(column="params"),
            transforms=[mlc.Transform(json_path=json_path)],
        ),
    )


record_sets = [
    # RecordSets contains records in the dataset.
    mlc.RecordSet(
        id="jsonl",
        name="jsonl",
        # Each record has one or many fields...
        fields=[
            # Fields can be extracted from the FileObjects/FileSets.
            _jsonl_column_field("cell_id", mlc.DataType.INTEGER),
            _jsonl_column_field("cre-line", mlc.DataType.TEXT),
            _jsonl_column_field("bin_size", mlc.DataType.INTEGER),
            _jsonl_column_field("actv_bin_size", mlc.DataType.INTEGER),
            _jsonl_column_field("val_evr", mlc.DataType.FLOAT),
            _jsonl_column_field("test_evr", mlc.DataType.FLOAT),
            _jsonl_column_field("train_loss", mlc.DataType.FLOAT),
            _jsonl_column_field("test_loss", mlc.DataType.FLOAT),
            # `params` declares no data type or source of its own; its content
            # is only exposed through the sub-fields below.
            mlc.Field(
                id="jsonl/params",
                name="params",
                description=(
                    "Nested `params` entry of each record, exposed through"
                    " its sub-fields."
                ),
                # MUSTFIX (original author): these json_path transforms are
                # reported as not working.
                # NOTE(review): the last cell of the notebook fails earlier, in
                # Read(jsonl-files), so the paths are never exercised. Fix the
                # read step first, then check whether `repeated=True` fits
                # paths that select a single element.
                sub_fields=[
                    _params_sub_field(
                        "a", "a", mlc.DataType.FLOAT, "a[0][0]"
                    ),
                    _params_sub_field(
                        "b", "b", mlc.DataType.FLOAT, "b[0][0]"
                    ),
                    _params_sub_field(
                        "ds", "ds", mlc.DataType.FLOAT, "ds[0]"
                    ),
                    _params_sub_field(
                        "bin_size",
                        "params_bin_size",
                        mlc.DataType.INTEGER,
                        "bin_size",
                    ),
                    # TODO: add the nested `g` field (id "jsonl/params/g") once
                    # the transforms above work. Drafted sub-fields and JSON
                    # paths: max_current (g.max_current), max_firing_rate
                    # (g.max_firing_rate), poly_coeff (g.poly_coeff),
                    # g_b (g.b), g_bin_size (g.bin_size).
                ],
            ),
        ],
    )
]

# Metadata contains information about the dataset.
metadata = mlc.Metadata(
    # The name, description and citation previously described the GPT-3 paper,
    # which does not match the repository referenced by `url`, `distribution`
    # and `record_sets`.
    name="GRNN",
    # Descriptions can contain plain text or markdown.
    description=(
        "Generalized Firing Rate Neurons (GRNN) records hosted as JSONL files"
        " in the AllenInstitute/GRNN GitHub repository. Each record provides"
        " the columns cell_id, cre-line, bin_size, actv_bin_size, val_evr,"
        " test_evr, train_loss, test_loss and params."
    ),
    # TODO: placeholder derived from the repository URL -- replace with the
    # citation of the associated publication once confirmed.
    cite_as=(
        "@misc{grnn, title={Generalized Firing Rate Neurons (GRNN)},"
        " author={{Allen Institute}},"
        " url={https://github.com/AllenInstitute/GRNN} }"
    ),
    url="https://github.com/AllenInstitute/GRNN",
    distribution=distribution,
    record_sets=record_sets,
)

In [56]:
# Print the report of the issues attached to the metadata object.
issues_report = metadata.issues.report()
print(issues_report)





In [176]:
import json

# Serialize the metadata as JSON indented by two spaces and store it in
# croissant.json.
with open("croissant.json", "w") as out_file:
    content = json.dumps(metadata.to_json(), indent=2)
    # Terminate file with newline
    out_file.write(content + "\n")

In [177]:
# Load the dataset from the JSON-LD file written above and select the "jsonl"
# record set.
dataset = mlc.Dataset(jsonld="croissant.json")
records = dataset.records(record_set="jsonl")

# Preview at most the first 12 records; zip() stops as soon as the range is
# exhausted, so no further record is pulled from `records`.
for _, record in zip(range(12), records):
    print(record)

GenerationError: An error occured during the streaming generation of the dataset, more specifically during the operation Read(jsonl-files)