In [19]:
from datasets import load_dataset, load_from_disk
import numpy as np
from transformers import AutoTokenizer
import torch
import os

# Text sources ("domains"). A domain's position in this list is the integer
# id written into `domain_ids` when the datasets are preprocessed.
domains = ['dzen', 'lenta', 'lj', 'ok', 'reddit', 'twitter', 'vk']

# Project root. Defaults to the original hard-coded location; set the
# DOREMI_ROOT environment variable to run the notebook on another machine
# without editing the code.
project_root = os.environ.get("DOREMI_ROOT", "/root/diplom_doremi/doremi")
# Input: one saved DatasetDict folder per domain.
data_dir = f"{project_root}/data_in_datasets"
# Output: one tokenized DatasetDict folder per domain.
data_preprocessed_dir = f"{project_root}/data_preprocessed"

# Class names and ids.
# NOTE(review): presumably labels[i] is encoded as labels_ids[i] in the
# `target` column -- confirm against the code that built the datasets.
labels = ['neutral', 'joy', 'anger', 'sadness', 'surprise', 'fear']
labels_ids = [3, 2, 0, 4, 5, 1]


In [20]:
# Shared tokenizer; preprocess_data reads it as a module-level global.
# NOTE(review): several domain names (dzen, lenta, ok, vk) suggest
# non-English sources -- confirm an English uncased vocabulary is intended.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [21]:
# Load the first domain to inspect its on-disk layout (splits and columns).
sample_path = f"{data_dir}/{domains[0]}"
dataset = load_from_disk(sample_path)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'target'],
        num_rows: 16978
    })
    test: Dataset({
        features: ['text', 'target'],
        num_rows: 4245
    })
})

In [22]:
def preprocess_data(data, domain=None, max_length=128, **kwargs):
    """Tokenize a batch of texts and tag every row with its domain id.

    Written to be passed to ``Dataset.map(..., batched=True,
    fn_kwargs={'domain': <id>})``. Uses the module-level ``tokenizer``.

    Args:
        data: Batch mapping whose ``"text"`` entry holds the texts to encode.
        domain: Id of the domain the batch belongs to. Required; it only
            has a ``None`` default so that existing keyword-style calls
            (``fn_kwargs={'domain': ...}``) keep working unchanged.
        max_length: Length every sequence is padded/truncated to. Defaults
            to 128, the value that used to be hard-coded.
        **kwargs: Accepted and ignored, as before.

    Returns:
        The tokenizer's encoding with an added ``domain_ids`` entry: a
        NumPy array of shape ``(len(data["text"]), 1)`` filled with
        ``domain``.

    Raises:
        KeyError: If ``domain`` is not supplied. Same exception type as the
            former ``kwargs['domain']`` lookup, but with a message that
            says what is missing.
    """
    if domain is None:
        raise KeyError(
            "preprocess_data needs a domain id, e.g. fn_kwargs={'domain': 0}"
        )

    # take a batch of texts
    text = data["text"]
    # encode them; fixed-length padding keeps every row the same width
    encoding = tokenizer(
        text,
        return_attention_mask=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    # one domain id per row, as a column vector
    encoding['domain_ids'] = np.full((len(text), 1), domain)
    return encoding

In [16]:
# Trial run on the train split of the currently loaded `dataset`.
# The domain id is 0 to match domains[0]; the previous value 7 lies outside
# the valid ids 0..6 for the seven-element `domains` list.
encoded_dataset = dataset['train'].map(
    preprocess_data,
    batched=True,
    remove_columns=dataset['train'].column_names,
    fn_kwargs={'domain': 0},
)


Map:   0%|          | 0/16978 [00:00<?, ? examples/s]

Map: 100%|██████████| 16978/16978 [00:14<00:00, 1174.45 examples/s]


In [17]:
# Show the encoded split's summary (column names and row count).
encoded_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'domain_ids'],
    num_rows: 16978
})

In [25]:
# Tokenize every domain and save it under data_preprocessed_dir.
# The id stored in `domain_ids` is the domain's position in `domains`.
# NOTE(review): the `target` column is dropped here, so the saved datasets
# carry no labels -- confirm that is intended.
for domain_id, domain in enumerate(domains):
    dataset = load_from_disk(f"{data_dir}/{domain}")
    for split in ('train', 'test'):
        tokenized = dataset[split].map(
            preprocess_data,
            batched=True,
            remove_columns=['text', 'target'],
            fn_kwargs={'domain': domain_id},
        )
        # token_type_ids is produced by the map itself, so it can only be
        # dropped afterwards
        dataset[split] = tokenized.remove_columns('token_type_ids')
    dataset.save_to_disk(f"{data_preprocessed_dir}/{domain}")
    

Map:   0%|          | 0/16978 [00:00<?, ? examples/s]

Map: 100%|██████████| 16978/16978 [00:02<00:00, 6328.09 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 16978/16978 [00:00<00:00, 581788.34 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4245/4245 [00:00<00:00, 412320.42 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2249/2249 [00:00<00:00, 269467.80 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 563/563 [00:00<00:00, 115641.19 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2428/2428 [00:00<00:00, 309819.60 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 607/607 [00:00<00:00, 114951.35 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 20173/20173 [00:00<00:00, 579770.42 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5044/5044 [00:00<00:00, 416835.51 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 168980/168980 [00:00<00:00, 307624.18 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 42245/42245 [0

In [26]:
# Reload one saved domain to check what was written to disk.
check_path = f"{data_preprocessed_dir}/{domains[1]}"
test = load_from_disk(check_path)
test

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'domain_ids'],
        num_rows: 2249
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'domain_ids'],
        num_rows: 563
    })
})