In [1]:
import torch
from transformers import AutoTokenizer
from datasets import load_dataset

# Environment sanity check: report the installed torch version and confirm
# that the imports above succeeded.
print("torch version:", torch.__version__)
print("env ok")


  from .autonotebook import tqdm as notebook_tqdm


torch version: 2.9.1+cpu
env ok


In [2]:
# Step 1: basic imports

from datasets import load_dataset
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
import re
import os


In [3]:
# Step 2: load raw dataset

# Load the "wikitext-2-raw-v1" configuration of the "wikitext" dataset.
raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
# Bare last expression: displays the available splits and their row counts.
raw_dataset


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Generating test split: 100%|████████████████████████████████████████████| 4358/4358 [00:00<00:00, 544675.85 examples/s]
Generating train split: 100%|████████████████████████████████████████| 36718/36718 [00:00<00:00, 1669139.07 examples/s]
Generating validation split: 100%|██████████████████████████████

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [4]:
# Step 3: collect raw texts

def collect_texts(dataset_dict, max_samples_per_split=None):
    """
    Gather the 'text' field of every split into one flat list.

    Args:
        dataset_dict: mapping of split name -> indexable dataset whose rows
            are dicts containing a 'text' key.
        max_samples_per_split: optional cap on how many leading rows are read
            from each split; None reads every row.

    Returns:
        list of the non-None 'text' values, ordered by split and then by row.
    """
    collected = []
    for _split, split_ds in dataset_dict.items():
        limit = len(split_ds)
        if max_samples_per_split is not None:
            limit = min(limit, max_samples_per_split)
        # Rows are read one index at a time, only up to the per-split limit.
        row_texts = (split_ds[idx]["text"] for idx in range(limit))
        collected.extend(text for text in row_texts if text is not None)
    return collected

# Trial run: read at most 5000 rows from each split.
texts_raw = collect_texts(raw_dataset, max_samples_per_split=5000)

# Show how many texts were collected and the first 200 characters of the first one.
len(texts_raw), texts_raw[0][:200]


(13118, '')

In [5]:
# Step 4: define cleaning function

def clean_text(text: str, min_words: int = 50) -> "str | None":
    """
    Normalize one raw document and filter out documents that are too short.

    Steps:
      - lowercase
      - collapse every run of whitespace into a single space and trim the ends
      - drop documents with fewer than `min_words` whitespace-separated words

    Args:
        text: raw document text.
        min_words: minimum number of words a document must contain to be kept.
            Defaults to 50, the threshold that was previously hard-coded.

    Returns:
        The cleaned text, or None when the document has fewer than
        `min_words` words.
    """
    # Lowercase first, then squeeze whitespace so duplicates that differ only
    # in case or spacing end up as identical strings.
    normalized = re.sub(r"\s+", " ", text.lower()).strip()

    if len(normalized.split()) < min_words:
        return None

    return normalized


In [6]:
# Step 5: clean + deduplicate

def clean_and_dedup(texts):
    """
    Clean every text with clean_text and keep only the first copy of each result.

    Texts that clean_text rejects (returns None for) are skipped. The order of
    first appearance is preserved in the returned list.
    """
    # A dict keeps insertion order, so it serves as both the "seen" set and
    # the ordered output.
    unique_docs = {}

    for raw in tqdm(texts, desc="Cleaning texts"):
        doc = clean_text(raw)
        if doc is not None and doc not in unique_docs:
            unique_docs[doc] = None

    return list(unique_docs)

# Clean and deduplicate the collected raw texts.
texts_clean = clean_and_dedup(texts_raw)

# Show how many documents remain and the first 200 characters of the first one.
len(texts_clean), texts_clean[0][:200]


Cleaning texts: 100%|█████████████████████████████████████████████████████████| 13118/13118 [00:00<00:00, 74696.65it/s]


(4810,
 'robert boulter is an english film , television and theatre actor . he had a guest @-@ starring role on the television series the bill in 2000 . this was followed by a starring role in the play herons ')

In [7]:
# Step 6: load tokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Some tokenizers do not define a pad_token, so it has to be set manually;
# here the EOS token is reused as the padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Display both tokens to confirm the pad token is now set.
tokenizer.pad_token, tokenizer.eos_token


('<|endoftext|>', '<|endoftext|>')

In [8]:
# Step 7: tokenize and chunk

def tokenize_texts(texts, tokenizer, block_size=512, min_chunk_len=32):
    """
    Encode documents into token IDs and cut them into fixed-size chunks.

    Each document is encoded on its own and sliced into consecutive windows of
    at most `block_size` tokens; chunks never span two documents. Any chunk
    shorter than `min_chunk_len` tokens is discarded.

    Returns:
        list of chunks, each chunk being a list of token IDs.
    """
    chunks = []

    for doc in tqdm(texts, desc="Tokenizing texts"):
        token_ids = tokenizer.encode(doc, add_special_tokens=True)
        # Consecutive, non-overlapping windows over this document only.
        windows = (
            token_ids[start : start + block_size]
            for start in range(0, len(token_ids), block_size)
        )
        chunks.extend(window for window in windows if len(window) >= min_chunk_len)

    return chunks

# Tokenize the cleaned documents into chunks of up to 512 tokens, dropping chunks under 32 tokens.
tokenized_chunks = tokenize_texts(texts_clean, tokenizer, block_size=512, min_chunk_len=32)

# Show the number of chunks and the first 20 token IDs of the first chunk.
len(tokenized_chunks), tokenized_chunks[0][:20]


Tokenizing texts: 100%|██████████████████████████████████████████████████████████| 4810/4810 [00:01<00:00, 3209.58it/s]


(4813,
 [305,
  4835,
  275,
  2852,
  353,
  318,
  281,
  46932,
  2646,
  837,
  5581,
  290,
  21421,
  8674,
  764,
  339,
  550,
  257,
  8319,
  2488])

In [9]:
# Step 8: custom Dataset

class PretrainDataset(Dataset):
    """
    Map-style dataset over pre-tokenized chunks for pretraining.

    Every item is a single token ID sequence, returned as a torch.long tensor.
    """

    def __init__(self, tokenized_chunks):
        # Keep a reference to the chunk list; tensors are built per item on access.
        self.data = tokenized_chunks

    def __len__(self):
        """Number of stored chunks."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the chunk at position `idx` as a torch.long tensor."""
        return torch.tensor(self.data[idx], dtype=torch.long)
    

# Wrap the tokenized chunks in the dataset and preview the first 20 token IDs of item 0.
dataset = PretrainDataset(tokenized_chunks)
len(dataset), dataset[0][:20]


(4813,
 tensor([  305,  4835,   275,  2852,   353,   318,   281, 46932,  2646,   837,
          5581,   290, 21421,  8674,   764,   339,   550,   257,  8319,  2488]))

In [10]:
# Step 9: define collate_fn for padding

def collate_fn(batch, pad_token_id=None):
    """
    Right-pad a batch of variable-length token ID tensors to a common length.

    Args:
        batch: list[torch.Tensor], each tensor of shape [seq_len] holding
            integer token IDs.
        pad_token_id: ID written into padded positions. When None (the default,
            which is how DataLoader invokes this function with a single
            argument) the notebook-level `tokenizer.pad_token_id` is used.
            Passing it explicitly removes the dependency on that global.

    Returns:
        dict with
          - "input_ids": torch.long tensor [batch_size, max_len], padded on the right
          - "attention_mask": torch.long tensor [batch_size, max_len],
            1 for real tokens and 0 for padding
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    # Longest sequence in this batch determines the padded width.
    lengths = [seq.size(0) for seq in batch]
    max_len = max(lengths)

    # Start from "everything is padding" and copy each real sequence in place;
    # this avoids building and concatenating a pad tensor per row.
    input_ids = torch.full((len(batch), max_len), pad_token_id, dtype=torch.long)
    attention_masks = torch.zeros((len(batch), max_len), dtype=torch.long)

    for row, (seq, length) in enumerate(zip(batch, lengths)):
        input_ids[row, :length] = seq
        attention_masks[row, :length] = 1

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks
    }


In [11]:
# Step 10: create DataLoader

batch_size = 8

loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    # Seed the shuffle so a fresh kernel produces the same batch order on every
    # run; without it the sampled batches differ between runs.
    generator=torch.Generator().manual_seed(42),
)

# Pull a single batch as a smoke test and show the tensor shapes.
batch = next(iter(loader))
batch["input_ids"].shape, batch["attention_mask"].shape


(torch.Size([8, 260]), torch.Size([8, 260]))

In [12]:
# Step 11: save sample batches

def save_sample_batches(loader, num_batches=5, save_path="sample_dataset.pt"):
    """
    Persist the first few batches of a loader to disk.

      - draws up to num_batches batches from loader (fewer if it runs out)
      - writes them as a list of batches with torch.save
      - returns the path that was written
    """
    # zip() stops as soon as range() is exhausted, so no more than
    # num_batches batches are ever requested from the loader.
    samples = [batch for _, batch in zip(range(num_batches), loader)]

    torch.save(samples, save_path)
    return save_path

# Save up to 5 batches from the loader and display the path that was written.
save_path = save_sample_batches(loader, num_batches=5, save_path="sample_dataset.pt")
save_path


'sample_dataset.pt'

In [13]:
import importlib
import data_collection_preprocessing as dcp
# Reload the module so edits made after its first import take effect in this kernel.
importlib.reload(dcp)

# Debug mode: small data
dcp.main(dev_mode=True)



[INFO] Running in DEV mode.
[INFO] Trying OpenWebText (dylanebert/openwebtext) first...
[DEV MODE] Loading dylanebert/openwebtext with at most 5000 examples.


Downloading data:   0%|                                                                      | 0/80 [00:00<?, ?files/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Downloading data:   1%|▊                                                             | 1/80 [00:08<11:16,  8.56s/files]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Downloading data:   2%|█▌                                                            | 2/80 [00:17<11:31,  8.87s/files]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip inst

Loaded 5000 raw documents from dylanebert/openwebtext.


Cleaning & deduplicating: 100%|██████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 4403.80it/s]


After cleaning & deduplication: 5000 documents remain.
Using tokenizer: gpt2 (pad_token_id=50256)


Tokenizing:   0%|                                                                             | 0/5000 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1209 > 1024). Running this sequence through the model will result in indexing errors
Tokenizing:  76%|█████████████████████████████████████████████████▎               | 3795/5000 [00:07<00:02, 505.07it/s]

Total tokenized chunks: 10000
DataLoader is ready.
Saved sample batches to sample_dataset.pt





In [14]:
# Full (non-dev) run of the same pipeline; relies on `dcp` imported in the previous cell.
dcp.main(dev_mode=False)


[INFO] Running in FULL mode (approx >1GB raw text).
[INFO] Trying OpenWebText (dylanebert/openwebtext) first...
[FULL MODE] Loading dylanebert/openwebtext with at most 250000 examples.
Loaded 250000 raw documents from dylanebert/openwebtext.


Cleaning & deduplicating: 100%|██████████████████████████████████████████████| 250000/250000 [00:56<00:00, 4403.70it/s]


After cleaning & deduplication: 250000 documents remain.
Using tokenizer: gpt2 (pad_token_id=50256)


Tokenizing:   0%|                                                                           | 0/250000 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1209 > 1024). Running this sequence through the model will result in indexing errors
Tokenizing:  15%|█████████▍                                                    | 38086/250000 [01:26<07:59, 441.79it/s]


Total tokenized chunks: 100000
DataLoader is ready.
Saved sample batches to sample_dataset.pt


In [15]:
# Completion marker: printed once all preceding cells have run.
print("Job Done")

Job Done
