# OpenWebText Data Preparation

In [1]:
!pip install tiktoken
!pip install datasets



In [None]:
# Disk-usage triage: list every regular file on the filesystem together with its
# allocated size (`ls -s`), sort numerically in descending order and show the
# first 100 lines.
!sudo find / -type f -exec ls -s {} + | sort -n -r | head -100

In [None]:
# Recursively delete the Hugging Face datasets cache directory under /root/.cache.
# NOTE(review): presumably run to reclaim disk space; datasets loaded afterwards
# would need to be downloaded/prepared again — confirm before re-running.
! sudo rm -rf /root/.cache/huggingface/datasets

In [2]:
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset

# Number of worker processes. The same value is used for loading the dataset
# and for the tokenization `map` further down.
num_proc = 8
num_proc_load_dataset = num_proc

# Load OpenWebText through Hugging Face `datasets`. Only the first 60% of the
# train split is requested; the full-corpus call is kept below for reference.
# dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)
dataset = load_dataset(
    "openwebtext",
    split="train[:60%]",
    num_proc=num_proc_load_dataset,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.33k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8013769 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/50 [00:00<?, ?it/s]

In [3]:
# Hold out 0.05% of the loaded documents as a validation split. The fixed seed
# keeps the partition identical across runs.
split_dataset = dataset.train_test_split(test_size=0.0005, seed=2357, shuffle=True)
# `train_test_split` names the held-out part 'test'; expose it as 'val' instead.
split_dataset['val'] = split_dataset.pop('test')

# GPT-2 BPE tokenizer used by `process` below.
enc = tiktoken.get_encoding("gpt2")

# Output directory for the token files. `exist_ok=True` makes the cell safe to
# re-run and avoids the check-then-create race of a separate existence test.
base_path = '/content/data'
os.makedirs(base_path, exist_ok=True)


def process(example):
    """Tokenize one document and terminate it with the end-of-text token.

    Args:
        example: a dataset row; its 'text' field is the raw document.

    Returns:
        A dict with the token ids under 'ids' and their count under 'len'.
    """
    # encode_ordinary encodes the raw text; the end-of-text id is added explicitly.
    token_ids = [*enc.encode_ordinary(example['text']), enc.eot_token]
    return {'ids': token_ids, 'len': len(token_ids)}

# Tokenize both splits in parallel. The raw 'text' column is dropped so only
# the 'ids' / 'len' columns produced by `process` remain.
tokenized = split_dataset.map(
    function=process,
    num_proc=num_proc,
    remove_columns=['text'],
    desc="tokenizing the splits",
)

tokenizing the splits (num_proc=8):   0%|          | 0/4805856 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=8):   0%|          | 0/2405 [00:00<?, ? examples/s]

In [5]:
# Write every split as one flat binary file of token ids (`<split>.bin`).
directory_path = '/content/data'
os.makedirs(directory_path, exist_ok=True)

# Two bytes per token: uint16 holds ids up to 65535, which covers the GPT-2
# ids produced above (the end-of-text id is the largest one emitted).
dtype = np.uint16
# Each split is copied into the memmap in this many contiguous shards, so only
# one shard's ids are materialized at a time.
total_batches = 256

for split, dset in tokenized.items():
    # Total number of tokens in the split. Cast to a plain Python int: an
    # np.uint64 scalar mixed with signed integers can be promoted to float64.
    arr_len = int(np.sum(dset['len'], dtype=np.uint64))
    filename = os.path.join(directory_path, f'{split}.bin')
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))

    idx = 0  # write cursor into `arr`
    for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
        # Write to file in a sharded manner.
        batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['ids'])
        arr[idx : idx + len(arr_batch)] = arr_batch
        idx += len(arr_batch)
    arr.flush()

    # The shards together must fill the file exactly; anything else means the
    # 'len' column and the written ids disagree.
    assert idx == arr_len, f'{filename}: wrote {idx} tokens, expected {arr_len}'

writing /content/data/train.bin: 100%|██████████| 256/256 [04:08<00:00,  1.03it/s]
writing /content/data/val.bin: 100%|██████████| 256/256 [00:00<00:00, 375.42it/s]


# **CNN Daily Mail Dataset Preparation**

In [None]:
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset # huggingface datasets


In [None]:
num_proc = 8

num_proc_load_dataset = num_proc

# Names of the extra special tokens. Every name must be unique: they are used
# as dictionary keys below, so a repeated name silently overwrites the earlier
# entry and its id is never registered.
# PAD_TOKEN is defined but not registered in the encoding; the `process` cell
# pads with the end-of-text id instead.
PAD_TOKEN = '<|pad|>'
SEP_1_TOKEN = '<|sep1|>'
SEP_2_TOKEN = '<|sep2|>'

# Rebuild the GPT-2 encoding with the separator tokens added. Their ids are the
# ones the `process` cell appends numerically (50257 for this dataset).
enc = tiktoken.get_encoding("gpt2")
enc = tiktoken.Encoding(
    name="gpt2_with_sp_tokens",
    pat_str=enc._pat_str,
    mergeable_ranks=enc._mergeable_ranks,
    special_tokens={
        **enc._special_tokens,
        SEP_1_TOKEN : 50257,
        SEP_2_TOKEN : 50258
    }
)

In [None]:
# Load configuration '3.0.0' of CNN/DailyMail with the shared worker count.
dataset = load_dataset(
    "cnn_dailymail",
    name='3.0.0',
    num_proc=num_proc_load_dataset,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Setting num_proc from 8 to 3 for the train split as it only contains 3 shards.


Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Setting num_proc from 8 back to 1 for the validation split to disable multiprocessing as it only contains one shard.


Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Setting num_proc from 8 back to 1 for the test split to disable multiprocessing as it only contains one shard.


Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
def process(example):
    """Build one fixed-length summarization sample from a CNN/DailyMail row.

    Token layout: article ids, separator id 50257, highlight ids, end-of-text
    id, then end-of-text padding up to 1024 entries.

    Args:
        example: a dataset row with 'article' and 'highlights' text fields.

    Returns:
        A dict with
        - 'data': the padded list of 1024 ids, or the one-element placeholder
          [0] when the unpadded sample exceeds 1024 tokens (such rows are
          removed by the length filter applied after `map`);
        - 'data_len': the unpadded sample length;
        - 'article_lens': one-element list holding the article length
          including its separator.
    """
    article_ids = enc.encode_ordinary(example['article']) # encode_ordinary ignores any special tokens
    article_ids.append(50257)

    highlights_ids = enc.encode_ordinary(example['highlights'])
    highlights_ids.append(enc.eot_token)

    # Both operands are lists of token ids. A raw string cannot be added to a
    # list (TypeError), so the separator id appended above is the only marker
    # between article and summary, as in the other dataset cells.
    data = article_ids + highlights_ids
    if len(data) > 1024:
        # Placeholder for over-long samples; filtered out afterwards.
        text = [0]
    else:
        # Pad sequences to length 1024
        text = [enc.eot_token]*1024
        text[:len(data)] = data

    out = {'data': text, 'data_len': len(data), 'article_lens': [len(article_ids)]}

    return out

# Tokenize every split in parallel, keeping only the columns produced by `process`.
tokenized = dataset.map(
    function=process,
    num_proc=num_proc,
    remove_columns=['article','highlights','id'],
    desc="tokenizing the splits",
)

# `process` emits a one-element placeholder for samples longer than 1024 tokens,
# so keeping only rows whose 'data' has exactly 1024 entries drops those samples.
for split_name in ('train', 'validation', 'test'):
    tokenized[split_name] = tokenized[split_name].filter(lambda data: len(data['data']) == 1024)


tokenizing the splits (num_proc=8):   0%|          | 0/287113 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=8):   0%|          | 0/13368 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=8):   0%|          | 0/11490 [00:00<?, ? examples/s]

Filter:   0%|          | 0/287113 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13368 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
# Save each split as two NumPy files: `<split>` holds the padded token matrix
# and `<split>_lens` the per-sample article lengths (np.save appends `.npy`).
directory_path = '/content/data/11685-HW5-Data/DailyMail-Dataset'
# np.save does not create missing parent directories, so make sure they exist.
os.makedirs(directory_path, exist_ok=True)
for split, dset in tokenized.items():
    filename = os.path.join(directory_path, f'{split}')
    np.save(filename, np.array(dset['data']))

    filename = os.path.join(directory_path, f'{split}_lens')
    np.save(filename, np.array(dset['article_lens']))

# Q-A Dataset Preparation

In [None]:
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset # huggingface datasets

# One worker count drives both dataset loading and the tokenization map.
num_proc = num_proc_load_dataset = 8

In [None]:
# Names of the extra special tokens. Every name must be unique: they are used
# as dictionary keys below, so a repeated name silently overwrites the earlier
# entry and its id is never registered.
# PAD_TOKEN is defined but not registered in the encoding; the `process` cell
# pads with the end-of-text id instead.
PAD_TOKEN = '<|pad|>'
SEP_1_TOKEN = '<|sep1|>'
SEP_2_TOKEN = '<|sep2|>'

# Rebuild the GPT-2 encoding with the separator tokens added. The `process`
# cell for this dataset appends id 50258 numerically.
enc = tiktoken.get_encoding("gpt2")
enc = tiktoken.Encoding(
    name="gpt2_with_sp_tokens",
    pat_str=enc._pat_str,
    mergeable_ranks=enc._mergeable_ranks,
    special_tokens={
        **enc._special_tokens,
        SEP_1_TOKEN : 50257,
        SEP_2_TOKEN : 50258
    }
)

In [None]:
# Load SQuAD with the shared worker count.
dataset = load_dataset(
    "squad",
    num_proc=num_proc_load_dataset,
)

def process(example):
    """Turn one SQuAD row into a fixed-length token sequence.

    Token layout: context ids, id 50258, question ids, id 50258, ids of the
    first answer text, end-of-text id, then end-of-text padding up to 1024.

    Args:
        example: a dataset row with 'context', 'question' and 'answers' fields.

    Returns:
        A dict with
        - 'data': the padded list of 1024 ids, or [0] when the unpadded
          sample is longer than 1024 tokens;
        - 'data_len': the unpadded sample length;
        - 'context_lens': one-element list with the combined length of the
          context and question parts, separators included.
    """
    separator_id = 50258

    # encode_ordinary ignores any special tokens in the text.
    context_ids = enc.encode_ordinary(example['context']) + [separator_id]
    question_ids = enc.encode_ordinary(example['question']) + [separator_id]
    # Only the first answer text is used.
    answers_ids = enc.encode_ordinary(example['answers']['text'][0]) + [enc.eot_token]

    data = context_ids + question_ids + answers_ids
    prompt_len = len(context_ids) + len(question_ids)

    if len(data) <= 1024:
        # Pad input sequences to length of 1024
        text = data + [enc.eot_token] * (1024 - len(data))
    else:
        text = [0]

    return {'data': text, 'data_len': len(data), 'context_lens': [prompt_len]}

# Tokenize every split in parallel, keeping only the columns produced by `process`.
tokenized = dataset.map(
    function=process,
    num_proc=num_proc,
    remove_columns=['id', 'title', 'context', 'question', 'answers'],
    desc="tokenizing the splits",
)

# Drop examples with input lengths greater than 1024: `process` gives them a
# one-element placeholder, so only rows with exactly 1024 entries are kept.
for split_name in ('train', 'validation'):
    tokenized[split_name] = tokenized[split_name].filter(lambda data: len(data['data']) == 1024)


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Setting num_proc from 8 back to 1 for the validation split to disable multiprocessing as it only contains one shard.


Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=8):   0%|          | 0/87599 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=8):   0%|          | 0/10570 [00:00<?, ? examples/s]

Filter:   0%|          | 0/87599 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
directory_path = '/content/data/11685-HW5-Data/QA-Dataset'
# np.save does not create missing parent directories, so make sure they exist.
os.makedirs(directory_path, exist_ok=True)
# Save each split as two NumPy files: `<split>` holds the padded token matrix
# and `<split>_lens` the per-sample prompt lengths (np.save appends `.npy`).
for split, dset in tokenized.items():
    filename = os.path.join(directory_path, f'{split}')
    # Convert once and reuse the array for both the shape report and the save.
    data_arr = np.array(dset['data'])
    print(data_arr.shape)
    np.save(filename, data_arr)

    filename = os.path.join(directory_path, f'{split}_lens')
    np.save(filename, np.array(dset['context_lens']))

(87598, 1024)
(10570, 1024)


# Sentiment Analysis Dataset Preparation


In [None]:
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset # huggingface datasets
from datasets import DatasetDict

In [None]:
# Names of the extra special tokens. Every name must be unique: they are used
# as dictionary keys below, so repeated names collapse into a single entry and
# only the last id is registered.
# PAD_TOKEN is defined but not registered in the encoding; the `process` cell
# pads with the end-of-text id instead.
PAD_TOKEN = '<|pad|>'
SEP_1_TOKEN = '<|sep1|>'
SEP_2_TOKEN = '<|sep2|>'
SEP_3_TOKEN = '<|sep3|>'

# Rebuild the GPT-2 encoding with the separator tokens added. The `process`
# cell for this dataset appends id 50259 numerically.
enc = tiktoken.get_encoding("gpt2")
enc = tiktoken.Encoding(
    name="gpt2_with_sp_tokens",
    pat_str=enc._pat_str,
    mergeable_ranks=enc._mergeable_ranks,
    special_tokens={
        **enc._special_tokens,
        SEP_1_TOKEN : 50257,
        SEP_2_TOKEN : 50258,
        SEP_3_TOKEN : 50259
    }
)

In [None]:
# Load the 'sentences_66agree' configuration of financial_phrasebank.
# `num_proc_load_dataset` is not defined in this section; it comes from an
# earlier cell of the notebook.
dataset = load_dataset(
    'financial_phrasebank',
    name='sentences_66agree',
    num_proc=num_proc_load_dataset,
)

Downloading data:   0%|          | 0.00/339k [00:00<?, ?B/s]

Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split:   0%|          | 0/4217 [00:00<?, ? examples/s]

In [None]:
def process(example):
    """Turn one financial_phrasebank row into a fixed-length token sequence.

    Token layout: sentence ids, separator id 50259, ids of the label rendered
    with `str`, end-of-text id, then end-of-text padding up to 1024 entries.

    Args:
        example: a dataset row with 'sentence' and 'label' fields.

    Returns:
        A dict with
        - 'data': the padded list of 1024 ids, or [0] when the unpadded
          sample is longer than 1024 tokens;
        - 'data_len': the unpadded sample length;
        - 'sentence_lens': one-element list with the sentence length
          including its separator.
    """
    # encode_ordinary ignores any special tokens in the text.
    sentence_ids = enc.encode_ordinary(example['sentence']) + [50259]
    sentiment_label_ids = enc.encode_ordinary(str(example['label'])) + [enc.eot_token]

    data = sentence_ids + sentiment_label_ids

    if len(data) <= 1024:
        # Pad sequences to length 1024
        text = data + [enc.eot_token] * (1024 - len(data))
    else:
        text = [0]

    return {'data': text, 'data_len': len(data), 'sentence_lens': [len(sentence_ids)]}

# Tokenize the dataset in parallel, keeping only the columns produced by `process`.
tokenized = dataset.map(
    function=process,
    num_proc=num_proc,
    remove_columns=['sentence','label'],
    desc="tokenizing the splits",
)


tokenizing the splits (num_proc=8):   0%|          | 0/4217 [00:00<?, ? examples/s]

In [None]:
# Drop over-long examples: `process` gives samples longer than 1024 tokens a
# one-element placeholder, so only rows with exactly 1024 entries are kept.
complete_Dataset = tokenized['train'].filter(lambda data: len(data['data']) == 1024)
# The dataset ships with a single 'train' split; hold out 10% for validation.
# A fixed seed makes the partition reproducible across kernel restarts (the
# same seed value is used for the OpenWebText split earlier in this notebook).
train_test_split = complete_Dataset.train_test_split(test_size=0.1, seed=2357)
tokenized = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test']
})

Filter:   0%|          | 0/4217 [00:00<?, ? examples/s]

In [None]:
directory_path = '/content/data/11685-HW5-Data/Sentiment-Analysis-Dataset'
# np.save does not create missing parent directories, so make sure they exist.
os.makedirs(directory_path, exist_ok=True)
# Save each split as two NumPy files: `<split>` holds the padded token matrix
# and `<split>_lens` the per-sample sentence lengths (np.save appends `.npy`).
for split, dset in tokenized.items():
    filename = os.path.join(directory_path, f'{split}')
    # Convert once and reuse the array for both the shape report and the save.
    data_arr = np.array(dset['data'])
    print(data_arr.shape)
    np.save(filename, data_arr)

    filename = os.path.join(directory_path, f'{split}_lens')
    np.save(filename, np.array(dset['sentence_lens']))

(3795, 1024)
(422, 1024)


# NER Dataset Preparation

In [None]:
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset # huggingface datasets

In [None]:
# Names of the extra special tokens. Every name must be unique: they are used
# as dictionary keys below, so repeated names collapse into a single entry and
# only the last id is registered.
# PAD_TOKEN is defined but not registered in the encoding; the `process` cell
# pads with the end-of-text id instead.
PAD_TOKEN = '<|pad|>'
SEP_1_TOKEN = '<|sep1|>'
SEP_2_TOKEN = '<|sep2|>'
SEP_3_TOKEN = '<|sep3|>'
SEP_4_TOKEN = '<|sep4|>'

# Rebuild the GPT-2 encoding with the separator tokens added. The `process`
# cell for this dataset appends id 50260 numerically.
enc = tiktoken.get_encoding("gpt2")
enc = tiktoken.Encoding(
    name="gpt2_with_sp_tokens",
    pat_str=enc._pat_str,
    mergeable_ranks=enc._mergeable_ranks,
    special_tokens={
        **enc._special_tokens,
        SEP_1_TOKEN : 50257,
        SEP_2_TOKEN : 50258,
        SEP_3_TOKEN : 50259,
        SEP_4_TOKEN : 50260
    }
)

In [None]:
# Load CoNLL-2003. `num_proc_load_dataset` is not defined in this section; it
# comes from an earlier cell of the notebook.
dataset = load_dataset(
    'conll2003',
    num_proc=num_proc_load_dataset,
)

Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Setting num_proc from 8 back to 1 for the validation split to disable multiprocessing as it only contains one shard.


Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Setting num_proc from 8 back to 1 for the test split to disable multiprocessing as it only contains one shard.


Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
def process(example):
    """Turn one CoNLL-2003 row into a fixed-length token sequence.

    The word list and the tag list are each rendered as a bracketed,
    comma-separated string ("[a, b, c]") before encoding. Token layout: encoded
    word string, separator id 50260, encoded tag string, end-of-text id, then
    end-of-text padding up to 1024 entries.

    Args:
        example: a dataset row with 'tokens' (strings) and 'ner_tags' fields.

    Returns:
        A dict with
        - 'data': the padded list of 1024 ids, or [0] when the unpadded
          sample is longer than 1024 tokens;
        - 'data_len': the unpadded sample length;
        - 'tokens_lens': one-element list with the length of the encoded word
          string including its separator.
    """
    tokens_str = "[" + ", ".join(example['tokens']) + "]"
    ner_tags_str = "[" + ", ".join(str(tag) for tag in example['ner_tags']) + "]"

    # encode_ordinary ignores any special tokens in the text.
    token_ids = enc.encode_ordinary(tokens_str) + [50260]
    ner_tags_ids = enc.encode_ordinary(ner_tags_str) + [enc.eot_token]

    data = token_ids + ner_tags_ids

    if len(data) <= 1024:
        # Pad sequences to length 1024
        text = data + [enc.eot_token] * (1024 - len(data))
    else:
        text = [0]

    return {'data': text, 'data_len': len(data), 'tokens_lens': [len(token_ids)]}

# Tokenize every split in parallel, keeping only the columns produced by `process`.
tokenized = dataset.map(
    function=process,
    num_proc=num_proc,
    remove_columns=['id','tokens','pos_tags','chunk_tags','ner_tags'],
    desc="tokenizing the splits",
)

# `process` emits a one-element placeholder for samples longer than 1024 tokens,
# so keeping only rows whose 'data' has exactly 1024 entries drops those samples.
for split_name in ('train', 'validation', 'test'):
    tokenized[split_name] = tokenized[split_name].filter(lambda data: len(data['data']) == 1024)

tokenizing the splits (num_proc=8):   0%|          | 0/14041 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=8):   0%|          | 0/3250 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=8):   0%|          | 0/3453 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14041 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
directory_path = '/content/data/11685-HW5-Data/NER-Dataset'
# np.save does not create missing parent directories, so make sure they exist.
os.makedirs(directory_path, exist_ok=True)
# Save each split as two NumPy files: `<split>` holds the padded token matrix
# and `<split>_lens` the per-sample lengths of the word part (np.save appends `.npy`).
for split, dset in tokenized.items():
    filename = os.path.join(directory_path, f'{split}')
    # Convert once and reuse the array for both the shape report and the save.
    data_arr = np.array(dset['data'])
    print(data_arr.shape)
    np.save(filename, data_arr)

    filename = os.path.join(directory_path, f'{split}_lens')
    np.save(filename, np.array(dset['tokens_lens']))

(14041, 1024)
(3250, 1024)
(3453, 1024)
