# Submission
> Submission example to LEPISZCZE benchmark

- title-block-banner: true

In [None]:
#| default_exp submission

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
import datasets
import numpy as np

from embeddings.evaluator.evaluation_results import Predictions
from embeddings.evaluator.leaderboard import get_dataset_task
from embeddings.evaluator.submission import AveragedSubmission
from embeddings.utils.utils import get_installed_packages


Besides making it easy to train models, the library also provides many helpers for creating a submission to the leaderboard.

We start with a couple of names.

In [None]:
# Identifier of the dataset, passed to `datasets.load_dataset` and recorded in the submission.
DATASET_NAME = "clarin-pl/polemo2-official"
# Name of the dataset column that is read as the gold labels.
TARGET_COLUMN_NAME = "target"

We also want to gather all hyperparameters for each submission. Here we collect a few example parameters for presentation purposes.

In [None]:
# Placeholder hyperparameters recorded with the submission; replace them with
# the values actually used for your model.
hparams = {"hparam_name_1": 0.2, "hparam_name_2": 0.1}

We do the same with the installed Python packages, using one of the helper functions.

In [None]:
# Collect the installed packages as "name==version" strings so that the
# environment is recorded with the submission.
packages = get_installed_packages()
# Preview only the first ten entries; the full list is long.
packages[:10]

['absl-py==1.4.0',
 'aiofiles==22.1.0',
 'aiohttp==3.8.4',
 'aiosignal==1.3.1',
 'aiosqlite==0.18.0',
 'alembic==1.9.3',
 'anyio==3.6.2',
 'appdirs==1.4.4',
 'argon2-cffi-bindings==21.2.0',
 'argon2-cffi==21.3.0']

The next step is related to datasets and predictions.

In [None]:
# Load the dataset; no config name is passed, so the dataset's default
# configuration is used.
dataset = datasets.load_dataset(DATASET_NAME)
# Display the DatasetDict to inspect the available splits and their sizes.
dataset


No config specified, defaulting to: polemo2-official/all_text
Found cached dataset polemo2-official (/root/.cache/huggingface/datasets/clarin-pl___polemo2-official/all_text/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70)
100%|██████████| 3/3 [00:00<00:00, 828.48it/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'target'],
        num_rows: 6573
    })
    validation: Dataset({
        features: ['text', 'target'],
        num_rows: 823
    })
    test: Dataset({
        features: ['text', 'target'],
        num_rows: 820
    })
})

In [None]:
# Gold labels of the test split as a NumPy array; every prediction run below
# is paired with these.
y_true = np.array(dataset["test"][TARGET_COLUMN_NAME])
# Preview the first ten labels.
y_true[:10]

array([1, 2, 2, 2, 2, 0, 0, 0, 1, 3])

Note that we do not store just a single prediction for each object: we want to calculate standard deviations for each object, hence we need more than one prediction.

In [None]:
# Five placeholder prediction runs (stand-ins for real model outputs), each
# paired with the same gold labels. A seeded generator keeps the example
# reproducible across "Restart Kernel -> Run All".
rng = np.random.default_rng(seed=42)
predictions = [
    Predictions(
        y_true=y_true,
        # Random labels drawn uniformly from {0, 1, 2, 3}; `high` is exclusive.
        y_pred=rng.integers(low=0, high=4, size=len(y_true)),
    )
    for _ in range(5)
]

Finally, we can create a submission that gathers all of this information.

In [None]:
# Resolve the derived arguments first, then assemble the submission from the
# predictions and the metadata gathered above.
dataset_version = dataset["train"].info.version.version_str
task = get_dataset_task(DATASET_NAME)

submission = AveragedSubmission.from_predictions(
    submission_name="my-great-submission",  # put your submission here!
    dataset_name=DATASET_NAME,
    dataset_version=dataset_version,
    embedding_name="my-great-model",  # put your embedding name here!
    predictions=predictions,
    hparams=hparams,
    packages=packages,
    task=task,
)

We can even save our submission.

In [None]:
# Write the submission into the "my-great-submission" directory. With
# compress=False the output stays plain, human-readable JSON; a separate
# predictions file is written next to the main file.
submission.save_json(
    root="my-great-submission",
    filename="my-great-model.json",
    compress=False,
)

In [None]:
# List the files that were written to the submission directory.
!ls my-great-submission

my-great-model.json  my-great-submission_predictions.json


In [None]:
# Print the saved submission file to inspect its contents.
!cat my-great-submission/my-great-model.json

{
  "submission_name": "my-great-submission",
  "dataset_name": "clarin-pl/polemo2-official",
  "dataset_version": "0.0.0",
  "embedding_name": "my-great-model",
  "hparams": {
    "hparam_name_1": 0.2,
    "hparam_name_2": 0.1
  },
  "packages": [
    "absl-py==1.4.0",
    "aiofiles==22.1.0",
    "aiohttp==3.8.4",
    "aiosignal==1.3.1",
    "aiosqlite==0.18.0",
    "alembic==1.9.3",
    "anyio==3.6.2",
    "appdirs==1.4.4",
    "argon2-cffi-bindings==21.2.0",
    "argon2-cffi==21.3.0",
    "arrow==1.2.3",
    "asttokens==2.2.1",
    "astunparse==1.6.3",
    "async-timeout==4.0.2",
    "attrs==22.2.0",
    "babel==2.11.0",
    "backcall==0.2.0",
    "beautifulsoup4==4.11.2",
    "black==21.12b0",
    "bleach==6.0.0",
    "cachetools==5.3.0",
    "catalogue==2.0.8",
    "certifi==2022.12.7",
    "cffi==1.15.1",
    "charset-normalizer==3.0.1",
    "click==8.0.4",
    "cmaes==0.9.1",
    "colorlog==6.7.0",
    "comm==0.1.2",
    "contourpy==1.0.7",
    "coverage==6.2",
    "cycler==0.11