In [1]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, RobertaTokenizer

In [2]:
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', cache_dir='../cache')
tokenizer = RobertaTokenizer.from_pretrained('roberta-large', cache_dir='../cache')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

In [3]:
# use pandas to read simCSE-wiki.txt
# download from https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/wiki1m_for_simcse.txt
wiki_text_file = 'wiki1m_for_simcse.txt'
wiki = pd.read_csv(wiki_text_file, sep = '\t', header = None)
wiki.columns = ['text']
# use Dataset.from_pandas to convert to dataset
wiki_dataset = Dataset.from_pandas(wiki, split= "train")
wiki_dataset

Dataset({
    features: ['text'],
    num_rows: 995447
})

In [4]:
def prepare_features(examples):
    
    total = len(examples['text'])
    # total = batch_size
    
    # Avoid "None" fields 
    for idx in range(total):
        if examples['text'][idx] is None:
            examples['text'][idx] = " "

    # add prompt
    sentences = ["This sentence : \"[ "+s+" ]\" means [MASK] !" for s in examples['text']]   # BERT
    # sentences = ["This sentence : \"["+s+"]\" means <mask> !" for s in examples['text']]     # Roberta

    # set max_length here:
    sent_features = tokenizer(sentences, max_length=64, truncation=True, padding="max_length")

    return sent_features

In [5]:
train_dataset = wiki_dataset.map(prepare_features, batched=True, remove_columns=['text'], batch_size=5000)

Map:   0%|          | 0/995447 [00:00<?, ? examples/s]

In [6]:
# train_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask']) # BERT
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask']) # Roberta

In [7]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 995447
})

In [8]:
# save to disk for reuse
train_dataset.save_to_disk("wiki_for_robertalarge_77_with_mask")

Saving the dataset (0/1 shards):   0%|          | 0/995447 [00:00<?, ? examples/s]

***