In [1]:
import os
import torch
import pickle
import pytz
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
from pympler import asizeof
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# 加载数据集
dataset = load_dataset("Salesforce/wikitext", "wikitext-103-v1")
print("The rows of dataset:")
print(f"  train:{dataset['train'].num_rows}, test:{dataset['test'].num_rows}, validation:{dataset['validation'].num_rows}")

# 加载预训练的分词器和模型
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

# GPT-2分词器没有pad_token，设置它
tokenizer.pad_token = tokenizer.eos_token

Device: cuda


Using the latest cached version of the dataset since Salesforce/wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-103-v1' at /home/lyj/.cache/huggingface/datasets/Salesforce___wikitext/wikitext-103-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Tue Dec  3 03:28:42 2024).


The rows of dataset:
  train:1801350, test:4358, validation:3760


In [3]:
def filter_embeddings(embeddings, attention_mask):
    # 计算batch中每个样本的序列长度
    origin_seq_len = attention_mask.sum(dim=1)

    filtered_embeddings = []
    for emb, seq_len in zip(embeddings, origin_seq_len):
        if seq_len == 0:
            continue
        # 去除多余padding, tensor转为numpy
        filtered_embeddings.append(emb[:seq_len].detach().cpu().numpy())

    tokens_num = sum(origin_seq_len).item()
    return filtered_embeddings, tokens_num

def get_embeddings(text, model, tokenizer, device):
    inputs = tokenizer(
        text,
        padding=True,  # 自动填充到最长序列长度
        truncation=True,  # 截断超过最大长度的序列
        max_length=5000,  # 设置最大长度
        return_tensors='pt'  # 返回 PyTorch 张量
    ).to(device)

    attention_mask = inputs['attention_mask']  # padding部分标记为0

    with torch.no_grad():
        embeddings  = model.transformer.wte(inputs['input_ids'])
    
    return filter_embeddings(embeddings, attention_mask)

In [4]:
def save_embeddings_to_file(embeddings, start_idx, end_idx):
    directory_path = './embeddings'
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)

    # 获取当前东八区时间
    cst = pytz.timezone('Asia/Shanghai')
    now_cst = datetime.now(cst)
    time_string = now_cst.strftime("%Y-%m-%d_%H:%M:%S")

    file_name = f"{directory_path}/embeddings_{start_idx}_{end_idx}_{time_string}.pkl"
    with open(file_name, 'wb') as f:
        pickle.dump(embeddings, f)
    

def get_and_save_embeddings(model, tokenizer, dataset, start_idx, batch_size, batch_num, device):
    end_idx = batch_size * batch_num + start_idx
    all_embeddings = []
    all_tokens_num = 0

    for cur_idx in range(start_idx, end_idx, batch_size):
        texts = dataset['train'][cur_idx : cur_idx + batch_size]['text']
        embeddings, tokens_num = get_embeddings(texts, model, tokenizer, device)

        all_embeddings = all_embeddings + embeddings
        all_tokens_num = all_tokens_num + tokens_num

        if int(cur_idx / batch_size) % 500 == 0:
            print(f"Current sample idx({cur_idx}) has done.")

    # 保存当前 (batch_num * batch) 个样本的 embeddings
    save_embeddings_to_file(all_embeddings, start_idx, end_idx - 1)
    return all_tokens_num


def get_and_save_all_embeddings(model, tokenizer, dataset, batch_size, device):
    all_tokens_num = 0
    batch_num = 2000

    for i in range(0, len(dataset['train']), batch_size * batch_num):
    # for i in range(0, 100000, batch_size * batch_num):
        tokens_num = get_and_save_embeddings(model, tokenizer, dataset, i, batch_size, batch_num, device)
        all_tokens_num = all_tokens_num + tokens_num
        print(f"All tokens num: {all_tokens_num}.")


model.eval()
batch_size = 16
all_embeddings = get_and_save_all_embeddings(model, tokenizer, dataset, batch_size, device)

Current sample idx(0) has done.
Current sample idx(8000) has done.
Current sample idx(16000) has done.
Current sample idx(24000) has done.
All tokens num: 2070212.
Current sample idx(32000) has done.
Current sample idx(40000) has done.
Current sample idx(48000) has done.
Current sample idx(56000) has done.
All tokens num: 4136703.
Current sample idx(64000) has done.
Current sample idx(72000) has done.
Current sample idx(80000) has done.
Current sample idx(88000) has done.
All tokens num: 6182838.
Current sample idx(96000) has done.
Current sample idx(104000) has done.
Current sample idx(112000) has done.
Current sample idx(120000) has done.
All tokens num: 8279582.
Current sample idx(128000) has done.
Current sample idx(136000) has done.
Current sample idx(144000) has done.
Current sample idx(152000) has done.
All tokens num: 10335686.
Current sample idx(160000) has done.
Current sample idx(168000) has done.
Current sample idx(176000) has done.
Current sample idx(184000) has done.
All 

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided []

In [14]:
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
