In [2]:
from transformers import GPT2Tokenizer
import numpy as np

import os

# Pretrained GPT-2 tokenizer (fetched/cached by `from_pretrained`).
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Sample sentence used for the size comparison below.
text = "Hello, world! This is an example of tokenization."

# Encode the sentence into a list of integer token ids.
tokens = tokenizer.encode(text)

# Persist the raw sentence as UTF-8 text.
with open("raw_text.txt", mode="w", encoding="utf-8") as raw_file:
    raw_file.write(text)

# Persist the token ids as a flat binary dump. uint16 stores each id in
# 2 bytes, so it can only represent ids up to 65535.
tokens_np = np.asarray(tokens, dtype=np.uint16)
tokens_np.tofile("tokenized_data.bin")

# On-disk size of each representation, in bytes.
raw_text_size = os.stat("raw_text.txt").st_size
tokenized_size = os.stat("tokenized_data.bin").st_size

print(f"Raw text size: {raw_text_size} bytes")
print(f"Tokenized size: {tokenized_size} bytes")

  from .autonotebook import tqdm as notebook_tqdm


Raw text size: 49 bytes
Tokenized size: 24 bytes


In [5]:
# Number of bytes in the UTF-8 encoding of the sample text.
len(text.encode("utf-8"))

49

In [9]:
# Total bytes occupied by the token array (element count * itemsize).
# `nbytes` tracks the array's dtype instead of hard-coding 2 bytes per id.
tokens_np.nbytes

24

In [10]:
# Size ratio of the raw text file to the tokenized binary file, computed from
# the measured sizes rather than numbers copied from the printed output.
raw_text_size / tokenized_size

2.0416666666666665

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
class PretrainDataset(Dataset):
    """Map-style dataset producing next-token-prediction samples.

    Each item wraps one row's ``'text'`` field with the tokenizer's BOS/EOS
    tokens, tokenizes it, truncates or right-pads the ids to ``max_length``,
    and returns input/target sequences shifted by one position together with
    a mask marking which target positions should contribute to the loss.

    Args:
        df: Table-like object providing ``.shape[0]`` (row count) and
            ``.iloc[index]`` returning a row indexable by ``'text'``
            (presumably a pandas DataFrame -- confirm against the caller).
        tokenizer: Callable returning a mapping with an ``'input_ids'`` list
            for a string, and exposing ``bos_token`` / ``eos_token``.
        max_length: Fixed length of the padded/truncated id sequence; the
            returned tensors have ``max_length - 1`` elements.
        padding: Token id used to fill positions past the end of the text.
            Defaults to 0, the value this class has always used.
    """

    def __init__(self, df, tokenizer, max_length=512, padding=0):
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return self.df.shape[0]

    def __getitem__(self, index: int):
        """Build the ``(X, Y, loss_mask)`` int64 tensors for row ``index``.

        Returns:
            X: ids ``[0 : max_length - 1]`` (model input).
            Y: ids ``[1 : max_length]`` (targets, shifted left by one).
            loss_mask: aligned with ``Y``; 1 where the target is a real token,
                0 where it is padding (excluded from the loss).
        """
        sample = self.df.iloc[index]
        text = f"{self.tokenizer.bos_token}{str(sample['text'])}{self.tokenizer.eos_token}"

        # Index the encoding directly instead of through `.data`, so both
        # BatchEncoding objects and plain dicts are accepted.
        # Truncation keeps the first `max_length` ids, so an over-long text
        # loses its tail (including the EOS appended above).
        input_ids = self.tokenizer(text)["input_ids"][:self.max_length]
        text_len = len(input_ids)

        # Remaining positions up to `max_length` are filled with the pad id.
        padding_len = self.max_length - text_len
        input_ids = input_ids + [self.padding] * padding_len

        # 1 = real token, 0 = padding (no loss is computed there).
        loss_mask = [1] * text_len + [0] * padding_len

        # Build each tensor straight from the Python lists; every tensor owns
        # its own storage, and no NumPy round trip is needed.
        X = torch.tensor(input_ids[:-1], dtype=torch.long)
        Y = torch.tensor(input_ids[1:], dtype=torch.long)
        mask = torch.tensor(loss_mask[1:], dtype=torch.long)
        return X, Y, mask