In [1]:
import os
import json
import numpy as np

In [2]:
class DatasetMixer:
    """Mix stage-1 and stage-2 JSON datasets at a fixed ratio and save the result.

    The train and validation splits of both stages are loaded from JSON files,
    randomly sub-sampled without replacement so that stage-1 items make up
    ``mix_ratio["stage1"]`` of the target size, shuffled, and written to
    ``<output_dir>/train.json`` and ``<output_dir>/valid.json``.
    """

    def __init__(
        self,
        stage1_train_path: str,
        stage1_valid_path: str,
        stage2_train_path: str,
        stage2_valid_path: str,
        mix_ratio: dict = {"stage1": 0.7, "stage2": 0.3},
        target_train_size: int = 80000,  # desired total number of training items
        target_valid_size: int = 1000,    # desired total number of validation items
        output_dir: str = "stage2/data_mixed",
        seed: int = None
    ):
        """Initialise the dataset mixer.

        Args:
            stage1_train_path: Path to the stage-1 training JSON file.
            stage1_valid_path: Path to the stage-1 validation JSON file.
            stage2_train_path: Path to the stage-2 training JSON file.
            stage2_valid_path: Path to the stage-2 validation JSON file.
            mix_ratio: Mapping from stage name to its share of the mixed set.
                Must contain the key ``"stage1"`` and its values must sum to 1.
            target_train_size: Desired total size of the mixed training set.
            target_valid_size: Desired total size of the mixed validation set.
            output_dir: Directory the mixed files are written to (created if
                missing).
            seed: Optional seed for reproducible sampling. When ``None`` the
                global ``np.random`` state is used, exactly as before.

        Raises:
            ValueError: If the ratios do not sum to 1, the ``"stage1"`` key is
                missing, or any ratio lies outside ``[0, 1]``.
        """
        self.stage1_train_path = stage1_train_path
        self.stage1_valid_path = stage1_valid_path
        self.stage2_train_path = stage2_train_path
        self.stage2_valid_path = stage2_valid_path
        # Copy so that neither the shared default dict nor the caller's dict is
        # aliased by this instance.
        self.mix_ratio = dict(mix_ratio)
        self.target_train_size = target_train_size
        self.target_valid_size = target_valid_size
        self.output_dir = output_dir
        self.seed = seed
        # A private RandomState keeps seeded runs reproducible without touching
        # the global numpy random state; None means "use the global state".
        self._rng = np.random.RandomState(seed) if seed is not None else None

        # The ratios must sum to 1 (within floating-point tolerance).
        if abs(sum(self.mix_ratio.values()) - 1) > 1e-6:
            raise ValueError("混合比例之和必须为1")

        # Fail fast on configurations that would otherwise only blow up later,
        # inside process()/mix_data(), with a less helpful error.
        if "stage1" not in self.mix_ratio:
            raise ValueError("混合比例必须包含 'stage1' 键")
        if any(not (0 <= value <= 1) for value in self.mix_ratio.values()):
            raise ValueError("混合比例必须在0到1之间")

        # Make sure the output directory exists.
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    def _read_json(path):
        """Load and return the JSON document stored at ``path`` (UTF-8)."""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _sample_indices(rng, population, count):
        """Return ``count`` distinct random indices drawn from ``range(population)``."""
        if population == 0:
            # Nothing to draw from; avoids relying on how numpy treats an
            # empty population.
            return []
        return rng.choice(population, count, replace=False)

    def load_data(self):
        """Load the four input datasets into attributes and print their sizes."""
        print("正在加载数据集...")

        # Stage-1 data
        self.stage1_train = self._read_json(self.stage1_train_path)
        self.stage1_valid = self._read_json(self.stage1_valid_path)

        # Stage-2 data
        self.stage2_train = self._read_json(self.stage2_train_path)
        self.stage2_valid = self._read_json(self.stage2_valid_path)

        print(f"✓ Stage1 训练集: {len(self.stage1_train)} 条数据")
        print(f"✓ Stage1 验证集: {len(self.stage1_valid)} 条数据")
        print(f"✓ Stage2 训练集: {len(self.stage2_train)} 条数据")
        print(f"✓ Stage2 验证集: {len(self.stage2_valid)} 条数据")

    def analyze_types(self, data):
        """Count and print how many items of each ``type`` the dataset contains.

        Args:
            data: Iterable of dict-like items; an item without a ``'type'``
                key is counted under ``'unknown'``.

        Returns:
            dict mapping each type name to its number of occurrences.
        """
        type_counts = {}
        for item in data:
            type_name = item.get('type', 'unknown')
            type_counts[type_name] = type_counts.get(type_name, 0) + 1

        print("\n数据类型分布:")
        for type_name, count in type_counts.items():
            percentage = (count / len(data)) * 100
            print(f"- {type_name}: {count} 条 ({percentage:.2f}%)")

        return type_counts

    def mix_data(self, data1, data2, ratio1, target_size):
        """Randomly sample from two datasets and return them shuffled together.

        Args:
            data1: First dataset (indexable sequence).
            data2: Second dataset (indexable sequence).
            ratio1: Target share of ``data1`` items in the mixed result.
            target_size: Desired size of the mixed result.

        Returns:
            A new shuffled list. When either dataset is too small for its
            share, each share is capped at the available amount, so the result
            can be shorter than ``target_size``.
        """
        # Round instead of truncating: float products such as 100 * 0.29
        # evaluate to 28.999..., which int() alone would turn into 28.
        target_size1 = int(round(target_size * ratio1))
        target_size2 = target_size - target_size1

        # Never request more samples than a dataset holds. Each share is
        # capped independently (the ratio is not re-balanced).
        if target_size1 > len(data1) or target_size2 > len(data2):
            print("警告：目标数量超过可用数据量，将按最大可用量采样")
            target_size1 = min(len(data1), target_size1)
            target_size2 = min(len(data2), target_size2)

        print(f"数据集1大小: {len(data1)}, 采样数量: {target_size1}")
        print(f"数据集2大小: {len(data2)}, 采样数量: {target_size2}")

        # Use the seeded generator when one was configured, otherwise the
        # global numpy random state.
        rng = getattr(self, "_rng", None)
        if rng is None:
            rng = np.random

        # Sample without replacement.
        selected1 = self._sample_indices(rng, len(data1), target_size1)
        selected2 = self._sample_indices(rng, len(data2), target_size2)

        # Merge both selections, then shuffle so the stages are interleaved.
        mixed_data = [data1[idx] for idx in selected1]
        mixed_data.extend(data2[idx] for idx in selected2)
        rng.shuffle(mixed_data)

        print(f"混合后数据集大小: {len(mixed_data)}")
        return mixed_data

    def process(self):
        """Run the full pipeline: load, mix, report type statistics and save."""
        # 1. Load the data.
        self.load_data()

        # 2. Mix the training sets.
        print("\n混合训练集...")
        mixed_train = self.mix_data(
            self.stage1_train,
            self.stage2_train,
            self.mix_ratio["stage1"],
            self.target_train_size
        )

        # 3. Mix the validation sets.
        print("\n混合验证集...")
        mixed_valid = self.mix_data(
            self.stage1_valid,
            self.stage2_valid,
            self.mix_ratio["stage1"],
            self.target_valid_size
        )

        # 4. Report the type distribution of both mixed sets.
        print("\n训练集类型分布:")
        self.analyze_types(mixed_train)

        print("\n验证集类型分布:")
        self.analyze_types(mixed_valid)

        # 5. Save the mixed data. The directory is re-ensured here in case it
        #    was removed after construction.
        os.makedirs(self.output_dir, exist_ok=True)
        train_path = os.path.join(self.output_dir, 'train.json')
        valid_path = os.path.join(self.output_dir, 'valid.json')

        with open(train_path, 'w', encoding='utf-8') as f:
            json.dump(mixed_train, f, ensure_ascii=False, indent=4)

        with open(valid_path, 'w', encoding='utf-8') as f:
            json.dump(mixed_valid, f, ensure_ascii=False, indent=4)

        print("\n数据保存完成：")
        print(f"- 混合训练集: {train_path} ({len(mixed_train)} 条数据)")
        print(f"- 混合验证集: {valid_path} ({len(mixed_valid)} 条数据)")

In [3]:
def main():
    """Build a DatasetMixer with the project's stage-1/stage-2 paths and run it."""
    # All mixer settings gathered in one place, then passed as keyword arguments.
    settings = {
        "stage1_train_path": 'stage1/data_final/train.json',
        "stage1_valid_path": 'stage1/data_final/valid.json',
        "stage2_train_path": 'stage2/data_final/train.json',
        "stage2_valid_path": 'stage2/data_final/valid.json',
        "mix_ratio": {"stage1": 0.7, "stage2": 0.3},
        "target_train_size": 80000,  # desired total number of training items
        "target_valid_size": 1000,   # desired total number of validation items
        "output_dir": 'stage2/data_mixed',
    }

    DatasetMixer(**settings).process()



In [4]:
# Entry-point guard: run the mixing pipeline only when this code is executed
# directly (as a script or as a notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

正在加载数据集...
✓ Stage1 训练集: 80000 条数据
✓ Stage1 验证集: 1000 条数据
✓ Stage2 训练集: 41520 条数据
✓ Stage2 验证集: 1000 条数据

混合训练集...
数据集1大小: 80000, 采样数量: 56000
数据集2大小: 41520, 采样数量: 24000
混合后数据集大小: 80000

混合验证集...
数据集1大小: 1000, 采样数量: 700
数据集2大小: 1000, 采样数量: 300
混合后数据集大小: 1000

训练集类型分布:

数据类型分布:
- dialogue: 6030 条 (7.54%)
- translation: 9178 条 (11.47%)
- instruction: 56000 条 (70.00%)
- story_generation: 8792 条 (10.99%)

验证集类型分布:

数据类型分布:
- instruction: 700 条 (70.00%)
- translation: 124 条 (12.40%)
- story_generation: 111 条 (11.10%)
- dialogue: 65 条 (6.50%)

数据保存完成：
- 混合训练集: stage2/data_mixed/train.json (80000 条数据)
- 混合验证集: stage2/data_mixed/valid.json (1000 条数据)
