In [None]:
from datasets import load_dataset
import pymorphy3
from transformers import AutoTokenizer

# Module-level morphological analyzer, created once and reused by check_word_is_noun
morph = pymorphy3.MorphAnalyzer()

def check_word_is_noun(w):
    """Tell whether the first parse the analyzer returns for ``w`` is a noun.

    Args:
        w: The word to analyze; passed unchanged to ``morph.parse``.

    Returns:
        True if the part-of-speech tag of the first parse equals ``'NOUN'``,
        False otherwise.
    """
    candidates = morph.parse(w)
    first_candidate = candidates[0]
    part_of_speech = first_candidate.tag.POS
    return part_of_speech == 'NOUN'

# Names of the dataset subsets that the later filtering step keeps
subsets = [
    "350_zagadok",
    "ostrova",
    "ugadaj_slova",
    "umnyasha",
]

# Load the ru-word-games dataset
dataset = load_dataset("artemsnegirev/ru-word-games")

In [5]:
# Filter the dataset: keep only the selected subsets, then only rows whose
# answer is recognised as a noun.
# NOTE: this cell rebinds `dataset` (from the full DatasetDict to the
# train/test split), so re-run the loading cell before re-running this one.
dataset = dataset.filter(lambda x: x["subset"] in subsets)
dataset = dataset.filter(lambda x: check_word_is_noun(x["answer"]))

# Class encode and split.
# `stratify_by_column` requires a ClassLabel column, hence the encoding step.
# A fixed seed keeps the shuffled split identical across kernel restarts, so
# the saved train/test partitions are reproducible.
SPLIT_SEED = 42
dataset = dataset.class_encode_column("subset")
dataset = dataset["train"].train_test_split(
    test_size=0.1,
    stratify_by_column="subset",
    seed=SPLIT_SEED,
)

print(f"Train size: {len(dataset['train'])}")
print(f"Test size: {len(dataset['test'])}")
print(dataset)

Train size: 23075
Test size: 2564
DatasetDict({
    train: Dataset({
        features: ['subset', 'answer', 'prompt'],
        num_rows: 23075
    })
    test: Dataset({
        features: ['subset', 'answer', 'prompt'],
        num_rows: 2564
    })
})


In [7]:
# Build the tokenizer from the pretrained checkpoint named below
checkpoint_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

def tokenize_function(examples, *, max_length=128):
    """Tokenize a batch of (prompt, answer) pairs to a fixed length.

    Args:
        examples: Mapping with ``"prompt"`` and ``"answer"`` entries; both are
            handed to the tokenizer as the first and second text inputs.
        max_length: Length every encoded pair is truncated/padded to.
            Keyword-only so that extra positional arguments supplied by a
            caller can never be mistaken for it. Defaults to 128.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        examples["prompt"],
        examples["answer"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

# Tokenize every split in batches, then drop the raw columns so that only the
# tokenizer outputs remain (avoids issues during training)
raw_columns = ["prompt", "answer", "subset"]
tokenized_dataset = dataset.map(tokenize_function, batched=True).remove_columns(raw_columns)

# Have the dataset return torch tensors when indexed
tokenized_dataset.set_format("torch")

first_train_row = tokenized_dataset['train'][0]
print(f"Tokenized train sample: {first_train_row}")


Map: 100%|██████████| 23075/23075 [00:03<00:00, 7080.35 examples/s]
Map: 100%|██████████| 2564/2564 [00:00<00:00, 7585.82 examples/s]

Tokenized train sample: {'input_ids': tensor([  101, 79524, 38539, 60645, 65067,   545, 54696,   102,   572, 23479,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     




In [None]:
# Persist the tokenized splits to disk so they can be loaded later
tokenized_dataset_dir = "../data/tokenized_dataset"
tokenized_dataset.save_to_disk(tokenized_dataset_dir)

Saving the dataset (1/1 shards): 100%|██████████| 23075/23075 [00:00<00:00, 961403.86 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2564/2564 [00:00<00:00, 366549.49 examples/s]


In [10]:
# Round-trip check: read the saved dataset back and show its first train row

from datasets import load_from_disk

reloaded_dataset = load_from_disk("../data/tokenized_dataset")
first_reloaded_row = reloaded_dataset['train'][0]
print(f"Reloaded dataset sample: {first_reloaded_row}")

Reloaded dataset sample: {'input_ids': tensor([  101, 79524, 38539, 60645, 65067,   545, 54696,   102,   572, 23479,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,    

In [12]:
# Display the first example of the reloaded test split
first_test_row = reloaded_dataset["test"][0]
first_test_row

{'input_ids': tensor([   101,    526,  90668,  10351, 109910,  23657,    543,  19710,  42876,
            117,    541,  59781,  12016,  10122,  44977,    119,    102,    553,
          87230,  10433,  10851,    102,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0

In [14]:
import wandb

# Start a Weights & Biases run under the given project and run name
wandb.init(
    name="train-transformer-from-scratch",
    project="ru-word-games",
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhse-octavarium[0m ([33mhse-octavarium-mckinsey-company[0m). Use [1m`wandb login --relogin`[0m to force relogin
