In [12]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from datasets import load_dataset

# Load the "train" split of the Open-Orca/OpenOrca dataset.
dataset = load_dataset("Open-Orca/OpenOrca", split="train")

# Initialize the tokenizer. The checkpoint loaded here is
# Qwen/Qwen2-0.5B-Instruct (the original comment called it a Llama tokenizer).
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

# Constant: the end-of-sequence token string of the tokenizer loaded above.
END_OF_TEXT_TOKEN = tokenizer.eos_token

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes question/response rows on access.

    Each item is a dict with:
        'input_ids':    token ids of ``question + response``.
        'output_ids':   token ids of ``question + response + ' ' + eos_token``
                        with the first id dropped (presumably so that
                        ``output_ids[i]`` is the next-token target for
                        ``input_ids[i]`` -- confirm against the training loop).
        'question_len': number of ids produced by encoding the question alone.
    """

    def __init__(self, dataset, tokenizer=None):
        """Store the rows and the tokenizer used to encode them.

        Args:
            dataset: indexable collection of rows with 'question' and
                'response' string fields.
            tokenizer: object providing ``encode(text)`` and ``eos_token``.
                When omitted, the notebook-level ``tokenizer`` variable is
                used, so existing ``CustomDataset(dataset)`` calls keep working.
        """
        self.dataset = dataset
        # The parameter shadows the notebook-level name, hence the explicit
        # globals() lookup for the fallback.
        self.tokenizer = tokenizer if tokenizer is not None else globals()["tokenizer"]

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its ids and question length."""
        row = self.dataset[idx]
        question = row['question']
        response = row['response']

        input_text = question + response
        # Take EOS from this instance's tokenizer instead of a module-level
        # constant, so the dataset stays self-consistent if a different
        # tokenizer is passed in.
        output_text = input_text + ' ' + self.tokenizer.eos_token

        input_ids = self.tokenizer.encode(input_text)
        output_ids = self.tokenizer.encode(output_text)[1:]  # drop the first token

        # NOTE(review): output_text carries an extra ' ' + EOS that input_text
        # lacks, so output_ids can be longer than input_ids; the collate step
        # then pads the inputs to match. Confirm this alignment is intended.
        return {
            'input_ids': input_ids,
            'output_ids': output_ids,
            'question_len': len(self.tokenizer.encode(question))
        }

# Custom collate function
def collate_fn(batch, tokenizer=None):
    """Pad a batch of dataset items to a common length and build a mask.

    Args:
        batch: list of dicts with 'input_ids' and 'output_ids' (sequences of
            token ids) and 'question_len' (int). The items are not modified.
        tokenizer: object exposing ``pad_token_id``. When omitted, the
            notebook-level ``tokenizer`` variable is used, so
            ``DataLoader(..., collate_fn=collate_fn)`` keeps working.
            ``eos_token_id`` serves as pad id if ``pad_token_id`` is None.

    Returns:
        (input_ids, output_ids, masks): three ``torch.long`` tensors of shape
        ``(len(batch), max_len)``, where ``max_len`` is the longest input or
        output sequence in the batch.
    """
    if tokenizer is None:
        # The parameter shadows the notebook-level name; look it up explicitly.
        tokenizer = globals()["tokenizer"]

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        # Some tokenizers define no pad token; reuse EOS so padding still works.
        pad_id = getattr(tokenizer, 'eos_token_id', None)

    max_len = max(
        max(len(item['input_ids']), len(item['output_ids'])) for item in batch
    )

    input_rows, output_rows, mask_rows = [], [], []
    for item in batch:
        # Copy before padding: the original `+=` extended the caller's lists
        # in place, so reused items would grow on every pass.
        input_ids = list(item['input_ids'])
        output_ids = list(item['output_ids'])
        output_len = len(output_ids)
        # Clamp so a question longer than the target cannot yield a mask row
        # wider than max_len.
        q_len = min(item['question_len'], output_len)

        input_rows.append(input_ids + [pad_id] * (max_len - len(input_ids)))
        output_rows.append(output_ids + [pad_id] * (max_len - output_len))

        # Mask: 0 over the first q_len positions, 1 up to the unpadded output
        # length (this includes the trailing ' ' and EOS ids), 0 over padding.
        # NOTE(review): output_ids is already shifted by one token, so the
        # first response token appears to sit at index q_len - 1 and receives
        # mask 0 here -- confirm whether that is intended.
        mask_rows.append(
            [0] * q_len + [1] * (output_len - q_len) + [0] * (max_len - output_len)
        )

    return (
        torch.tensor(input_rows, dtype=torch.long),
        torch.tensor(output_rows, dtype=torch.long),
        torch.tensor(mask_rows, dtype=torch.long),
    )

# Create the dataset wrapper and a DataLoader that batches two rows at a time
# through the custom collate function.
# NOTE(review): CustomDataset and collate_fn are defined again in a later cell
# (In [13]); whichever cell ran last determines the definitions in the kernel.
custom_dataset = CustomDataset(dataset)
dataloader = DataLoader(custom_dataset, batch_size=2, collate_fn=collate_fn)

# Example: fetch a single batch, print its three tensors in full, then stop.
for batch in dataloader:
    input_ids, output_ids, masks = batch
    print("Input IDs:", input_ids)
    print("Output IDs:", output_ids)
    print("Masks:", masks)
    break


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Input IDs: tensor([[  2610,    686,    387,   2661,    264,   7271,    315,    264,   3383,
           1156,     11,   1221,   1045,   1946,    315,    279,   3383,    624,
           1986,   3383,    374,    911,   1667,    279,   5189,  11652,    323,
          33437,    279,  11652,    311,  11765,   7662,  23752,    320,     49,
           5262,      8,  23725,   2576,    315,    279,   1352,    320,  11501,
             11,  24283,   1633,    568,    576,  68399,  23725,   2576,   7907,
           1969,    387,   1741,    429,    279,  23725,   2576,  29257,  12322,
            279,   5944,    323,  52694,    315,    279,   1946,  11652,     13,
            576,   1946,    374,    264,  11652,    323,    279,   2550,    374,
            264,   1140,    315,  23725,   2576,    315,    279,   1352,    508,
          11501,     11,  24283,     11,   1633,     60,    429,  12322,    279,
          11871,   3042,    304,    279,  11652,     13,   3197,    264,  11652,
            702, 

In [3]:
# Example: fetch a single batch and print only the tensor shapes.
# NOTE(review): the output recorded for this cell (batch of 32; width 576 for
# the outputs vs. 919 for inputs and masks) does not match the code in this
# notebook, which builds the DataLoader with batch_size=2 and pads all three
# tensors to one shared max_len. Together with the out-of-order execution
# count (In [3]) this output looks stale -- re-run after a kernel restart.
for batch in dataloader:
    input_ids, output_ids, masks = batch
    print("Input IDs:", input_ids.shape)
    print("Output IDs:", output_ids.shape)
    print("Masks:", masks.shape)
    break

Input IDs: torch.Size([32, 919])
Output IDs: torch.Size([32, 576])
Masks: torch.Size([32, 919])


In [13]:
import os

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from datasets import load_dataset



class CustomDataset(Dataset):
    """Wrap question/response rows and tokenize them when indexed.

    Items are dicts holding 'input_ids' (ids of question + response),
    'output_ids' (ids of the same text followed by ' ' and the tokenizer's
    EOS token, minus the first id) and 'question_len' (number of ids obtained
    by encoding the question alone).
    """

    def __init__(self, dataset, tokenizer):
        # dataset: indexable rows exposing 'question' and 'response' fields.
        # tokenizer: provides encode() and the eos_token string.
        self.tokenizer = tokenizer
        self.dataset = dataset

    def __len__(self):
        """Number of rows in the wrapped dataset."""
        n_rows = len(self.dataset)
        return n_rows

    def __getitem__(self, idx):
        """Tokenize row ``idx`` into input ids, target ids and question length."""
        record = self.dataset[idx]
        prompt, answer = record['question'], record['response']

        prompt_and_answer = prompt + answer
        target_text = prompt_and_answer + ' ' + self.tokenizer.eos_token

        encode = self.tokenizer.encode
        sample = {'input_ids': encode(prompt_and_answer)}
        sample['output_ids'] = encode(target_text)[1:]  # drop the first token
        sample['question_len'] = len(encode(prompt))
        return sample


# Custom collate function
def collate_fn(batch, tokenizer):
    """Pad a batch of dataset items to a common length and build a mask.

    Args:
        batch: list of dicts with 'input_ids' and 'output_ids' (sequences of
            token ids) and 'question_len' (int). The items are not modified.
        tokenizer: object exposing ``pad_token_id``; ``eos_token_id`` serves
            as pad id if ``pad_token_id`` is None.

    Returns:
        (input_ids, output_ids, masks): three ``torch.long`` tensors of shape
        ``(len(batch), max_len)``, where ``max_len`` is the longest input or
        output sequence in the batch.
    """
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        # Some tokenizers define no pad token; reuse EOS so padding still works.
        pad_id = getattr(tokenizer, 'eos_token_id', None)

    max_len = max(
        max(len(item['input_ids']), len(item['output_ids'])) for item in batch
    )

    input_rows, output_rows, mask_rows = [], [], []
    for item in batch:
        # Copy before padding: the original `+=` extended the caller's lists
        # in place, so reused items would grow on every pass.
        input_ids = list(item['input_ids'])
        output_ids = list(item['output_ids'])
        output_len = len(output_ids)
        # Clamp so a question longer than the target cannot yield a mask row
        # wider than max_len.
        q_len = min(item['question_len'], output_len)

        input_rows.append(input_ids + [pad_id] * (max_len - len(input_ids)))
        output_rows.append(output_ids + [pad_id] * (max_len - output_len))

        # Mask: 0 over the first q_len positions, 1 up to the unpadded output
        # length (this includes the trailing ' ' and EOS ids), 0 over padding.
        # NOTE(review): output_ids is already shifted by one token, so the
        # first response token appears to sit at index q_len - 1 and receives
        # mask 0 here -- confirm whether that is intended.
        mask_rows.append(
            [0] * q_len + [1] * (output_len - q_len) + [0] * (max_len - output_len)
        )

    return (
        torch.tensor(input_rows, dtype=torch.long),
        torch.tensor(output_rows, dtype=torch.long),
        torch.tensor(mask_rows, dtype=torch.long),
    )


# Load the "train" split of the Open-Orca/OpenOrca dataset.
dataset = load_dataset("Open-Orca/OpenOrca", split="train")

# Initialize the tokenizer. The checkpoint loaded here is
# Qwen/Qwen2-0.5B-Instruct (the original comment called it a Llama tokenizer).
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

# Constant: the end-of-sequence token string of the tokenizer loaded above.
END_OF_TEXT_TOKEN = tokenizer.eos_token

# Create the dataset wrapper and a DataLoader that batches two rows at a time;
# the lambda binds the tokenizer as collate_fn's second argument.
custom_dataset = CustomDataset(dataset, tokenizer)
dataloader = DataLoader(custom_dataset, batch_size=2, collate_fn=lambda x: collate_fn(x, tokenizer))

# Example: fetch a single batch, print its three tensors in full, then stop.
for batch in dataloader:
    input_ids, output_ids, masks = batch
    print("Input IDs:", input_ids)
    print("Output IDs:", output_ids)
    print("Masks:", masks)
    break

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Input IDs: tensor([[  2610,    686,    387,   2661,    264,   7271,    315,    264,   3383,
           1156,     11,   1221,   1045,   1946,    315,    279,   3383,    624,
           1986,   3383,    374,    911,   1667,    279,   5189,  11652,    323,
          33437,    279,  11652,    311,  11765,   7662,  23752,    320,     49,
           5262,      8,  23725,   2576,    315,    279,   1352,    320,  11501,
             11,  24283,   1633,    568,    576,  68399,  23725,   2576,   7907,
           1969,    387,   1741,    429,    279,  23725,   2576,  29257,  12322,
            279,   5944,    323,  52694,    315,    279,   1946,  11652,     13,
            576,   1946,    374,    264,  11652,    323,    279,   2550,    374,
            264,   1140,    315,  23725,   2576,    315,    279,   1352,    508,
          11501,     11,  24283,     11,   1633,     60,    429,  12322,    279,
          11871,   3042,    304,    279,  11652,     13,   3197,    264,  11652,
            702, 

In [18]:
# Get a batch without a for-loop: iter() creates a new iterator over the
# DataLoader and next() pulls one batch from it.
batch = next(iter(dataloader))
input_ids, output_ids, masks = batch
# Print only the shapes of the three tensors.
print("Input IDs:", input_ids.shape)
print("Output IDs:", output_ids.shape)
print("Masks:", masks.shape)

Input IDs: torch.Size([2, 194])
Output IDs: torch.Size([2, 194])
Masks: torch.Size([2, 194])


In [23]:
# Keep the iterator in a variable so later cells can keep pulling batches
# from it; re-running this cell creates a new iterator.
dataloader_iter = iter(dataloader)
input_ids, output_ids, masks = next(dataloader_iter)
# Print only the shapes of the three tensors.
print("Input IDs:", input_ids.shape)
print("Output IDs:", output_ids.shape)
print("Masks:", masks.shape)

Input IDs: torch.Size([2, 194])
Output IDs: torch.Size([2, 194])
Masks: torch.Size([2, 194])


In [29]:
# Pull the next batch from the iterator created in the previous cell.
# NOTE(review): this cell is not idempotent -- every run advances
# dataloader_iter. The execution counts (In [23] -> In [29]) suggest it was
# run several times, so the recorded shape may not belong to the second batch.
input_ids, output_ids, masks = next(dataloader_iter)
print("Input IDs:", input_ids.shape)
print("Output IDs:", output_ids.shape)
print("Masks:", masks.shape)

Input IDs: torch.Size([2, 458])
Output IDs: torch.Size([2, 458])
Masks: torch.Size([2, 458])
