In [1]:
import json
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn.functional as F

In [2]:
class CustomDataset(Dataset):
    """Map-style dataset backed by a JSON Lines file held fully in memory.

    Every non-blank line of the file is parsed as one JSON value and stored as
    a sample. Samples are read with the keys ``"role"``, ``"text"``,
    ``"audio_feature"`` and ``"position"``; the keys are only looked up when a
    sample is accessed, not while loading.
    """

    def __init__(self, file_path):
        """Load all records from ``file_path``.

        Args:
            file_path: Path to a UTF-8 encoded file with one JSON value per line.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        self.data = []
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # hold no record; json.loads would raise on them, so skip.
                if not line.strip():
                    continue
                # Each remaining line is assumed to be one JSON object.
                self.data.append(json.loads(line))

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict.

        The result has the keys ``"role"``, ``"text"``, ``"audio_feature"``
        and ``"position_feature"``. ``"audio_feature"`` is converted with
        ``torch.tensor``; the other values are passed through as stored.
        Note that the stored ``"position"`` value is exposed under the key
        ``"position_feature"``.

        Raises:
            KeyError: If the stored sample lacks one of the expected keys.
        """
        sample = self.data[idx]
        return {
            "role": sample["role"],
            "text": sample["text"],
            "audio_feature": torch.tensor(sample["audio_feature"]),
            "position_feature": sample["position"],
        }

In [3]:
# Load every record of the JSON Lines feature file into memory.
feature_path = '/content/feature.jsonl'
dataset = CustomDataset(file_path=feature_path)

In [4]:
from torch.utils.data import DataLoader, random_split

# Total number of samples available for splitting.
dataset_size = len(dataset)
print(dataset_size)

# Split proportions: 85% training, 10% test, and the remainder (about 5%)
# validation. Validation takes whatever is left after the two int()
# truncations, so the three sizes always sum to dataset_size.
train_size = int(0.85 * dataset_size)
test_size = int(0.1 * dataset_size)
val_size = dataset_size - train_size - test_size

# Seeded generator so the partition is the same on every Restart & Run All;
# without it random_split draws from the global RNG and the split changes
# between runs.
split_seed = 42
split_generator = torch.Generator().manual_seed(split_seed)

# First split: training set versus a temporary hold-out set
# (validation + test together).
train_dataset, temp_dataset = random_split(
    dataset, [train_size, dataset_size - train_size], generator=split_generator
)

# Second split: divide the hold-out set into validation and test sets.
val_dataset, test_dataset = random_split(
    temp_dataset, [val_size, test_size], generator=split_generator
)

# Build the DataLoaders; only the training loader reshuffles each epoch.
batch_size = 64  # number of samples per batch
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


1144
