In [1]:
import json
import re
import random
from datasketch import MinHash, MinHashLSH
import jieba

# Cell output: ModuleNotFoundError: No module named 'jieba'  (the jieba package must be installed before this cell can run)

In [None]:
# === 1. Configuration ===
# City names used to filter and stratify the northwest tourism dataset and to
# compute the per-city coverage in the quality report.
TARGET_CITIES = [
    '西安',
    '兰州',
    '敦煌',
    '乌鲁木齐',
    '银川',
    '西宁',
    '张掖',
    '嘉峪关',
]
# Upper bound on the number of samples written to the final dataset.
MAX_SAMPLES = 14_000
# Similarity threshold passed to the MinHash LSH index during de-duplication.
DUPE_THRESHOLD = 0.85

# === 2. Load the Chinese tourism supervised dataset ===
def load_cn_tourism(file_path):
    """Load prompt/response pairs from a JSONL file as chat conversations.

    Every non-blank line of the file is parsed as a JSON object that must
    provide ``prompt`` and ``response`` keys. Blank or whitespace-only lines
    (for example a trailing newline at the end of the file) are skipped
    instead of aborting the whole load.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file.

    Returns:
        A list of ``{"conversations": [user_turn, assistant_turn]}`` dicts.
        The assistant content is the response prefixed with ``"/no_think "``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``prompt`` or ``response``.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # json.loads("") raises; an empty line carries no record.
                continue
            item = json.loads(line)
            # Convert to the conversation layout and add the instruction prefix.
            data.append({
                "conversations": [
                    {"role": "user", "content": item['prompt']},
                    {"role": "assistant", "content": f"/no_think {item['response']}"}
                ]
            })
    print(f"Loaded {len(data)} Chinese tourism QA pairs")
    return data

# === 3. Load and sample the northwest tourism dataset ===
def load_nw_tourism(file_path, sample_size=4000, min_per_city=50, cities=None):
    """Load the northwest dataset, keep city-related items and sample them.

    The file must contain a JSON array of objects with ``instruction`` and
    ``output`` keys and an optional ``input`` key. Items are kept when their
    instruction or output mentions one of the target cities. Sampling then
    runs in two phases over a shuffled copy of the kept items:

    1. Stratification: an item is taken while the first city it mentions that
       is still below ``min_per_city`` needs more samples.
    2. Top-up: the items not taken in phase 1 fill the sample until it holds
       ``sample_size`` items.

    Args:
        file_path: Path of the UTF-8 encoded JSON file.
        sample_size: Maximum number of items to return.
        min_per_city: Per-city quota that phase 1 tries to reach.
        cities: City names to look for; defaults to ``TARGET_CITIES``.

    Returns:
        A list of ``{"conversations": [user_turn, assistant_turn]}`` dicts.
        The user content is the instruction, followed by a newline and the
        input when the input is non-blank; the assistant content is the
        output prefixed with ``"/no_think "``.

    Raises:
        KeyError: If a sampled item lacks ``instruction`` or ``output``.
    """
    if cities is None:
        cities = TARGET_CITIES

    with open(file_path, 'r', encoding='utf-8') as f:
        nw_data = json.load(f)

    # City keyword filter. The text is compared as-is: lowering only the
    # haystack would make any city name containing upper-case letters
    # impossible to match.
    def contains_target_city(text):
        text = str(text)
        return any(city in text for city in cities)

    filtered_data = [
        item for item in nw_data
        if contains_target_city(item.get('instruction', '')) or
           contains_target_city(item.get('output', ''))
    ]

    sampled_data = []
    leftovers = []
    city_counts = {city: 0 for city in cities}

    random.shuffle(filtered_data)

    # Phase 1: fill the per-city quota.
    for item in filtered_data:
        if len(sampled_data) >= sample_size:
            break
        # ensure_ascii=False keeps non-ASCII characters literal; the default
        # escapes them to \uXXXX, so a Chinese city name would never match.
        item_text = json.dumps(item, ensure_ascii=False)
        for city in cities:
            if city in item_text and city_counts[city] < min_per_city:
                sampled_data.append(item)
                city_counts[city] += 1
                break
        else:
            leftovers.append(item)

    # Phase 2: top up with the remaining city-related items so the quota acts
    # as a minimum per city rather than a cap on the whole sample.
    remaining = sample_size - len(sampled_data)
    if remaining > 0:
        sampled_data.extend(leftovers[:remaining])

    print(f"Sampled {len(sampled_data)} NW tourism items")

    # Convert to the standard conversation layout.
    converted_data = []
    for item in sampled_data:
        user_content = item['instruction']
        # `input` may be missing, empty or null; only append real content.
        extra_input = item.get('input') or ''
        if str(extra_input).strip():
            user_content += f"\n{extra_input}"

        converted_data.append({
            "conversations": [
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": f"/no_think {item['output']}"}
            ]
        })
    return converted_data

# === 4. MinHash de-duplication ===
def deduplicate(data, threshold=DUPE_THRESHOLD):
    """Drop items whose assistant reply looks like a near-duplicate.

    For every item a MinHash signature is built from the first 50 jieba
    tokens of the assistant reply (``item['conversations'][1]['content']``).
    An item is kept only when the LSH index returns no candidate for its
    signature; kept items are inserted into the index, so the first
    occurrence wins and the input order is preserved.

    Args:
        data: List of ``{"conversations": [...]}`` dicts.
        threshold: Similarity threshold handed to ``MinHashLSH``.

    Returns:
        A new list with the items that were kept.
    """
    if not data:
        # Nothing to index; also avoids dividing by zero in the summary.
        print("Deduplication: 0/0 items kept (0.00% duplication)")
        return []

    # The LSH index and every signature must use the same permutation count.
    num_perm = 128
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    unique_data = []

    for idx, item in enumerate(data):
        assistant_content = item['conversations'][1]['content']

        # Build the MinHash fingerprint from the jieba tokens.
        m = MinHash(num_perm=num_perm)
        words = jieba.lcut(assistant_content)[:50]
        for word in words:
            m.update(word.encode('utf-8'))

        # Keep the item only when no indexed signature is a candidate match.
        if not lsh.query(m):
            lsh.insert(f"item_{idx}", m)
            unique_data.append(item)

    dup_rate = (1 - len(unique_data)/len(data)) * 100
    print(f"Deduplication: {len(unique_data)}/{len(data)} items kept ({dup_rate:.2f}% duplication)")
    return unique_data

# === 5. Build the LoRA training format ===
def convert_to_lora_format(data):
    """Convert conversations to the record layout used for LoRA fine-tuning.

    Each input item contributes one record: its ``conversations`` turns
    preceded by a fixed system turn under ``messages``, plus a ``no_think``
    flag set to ``True``. (The original author targets Qwen2.5 with this
    multi-turn layout.)

    Args:
        data: Iterable of dicts with a ``conversations`` sequence.

    Returns:
        A list of ``{"messages": [...], "no_think": True}`` dicts.
    """
    def with_system_turn(turns):
        # A fresh system dict per record so records never share a turn object.
        return [{"role": "system", "content": "你是一名专业的旅游助手"}] + list(turns)

    return [
        {
            "messages": with_system_turn(record['conversations']),
            "no_think": True,
        }
        for record in data
    ]

# === 6. Generate the quality report ===
def generate_quality_report(data, output_file, cities=None):
    """Write a Markdown report with basic statistics about the dataset.

    The report contains the number of samples, the mean length of the last
    message of each item measured in jieba tokens, how many items mention
    each city, and the share of items that mention each tourism keyword.
    An empty dataset produces a report with zero values.

    Args:
        data: List of ``{"messages": [...]}`` dicts; the last message of
            each item is treated as the assistant reply.
        output_file: Path of the Markdown file to write (UTF-8).
        cities: City names to count; defaults to ``TARGET_CITIES``.
    """
    if cities is None:
        cities = TARGET_CITIES

    total_items = len(data)
    word_counts = []
    city_coverage = {city: 0 for city in cities}

    keywords = ["景点", "酒店", "交通", "美食", "攻略", "行程", "推荐", "门票"]
    keyword_coverage = {kw: 0 for kw in keywords}

    for item in data:
        # ensure_ascii=False keeps Chinese text literal; the default escapes
        # it to \uXXXX, so no city or keyword would ever be found.
        content = json.dumps(item, ensure_ascii=False)

        for city in cities:
            if city in content:
                city_coverage[city] += 1

        for kw in keywords:
            if kw in content:
                keyword_coverage[kw] += 1

        assistant_text = item['messages'][-1]['content']
        word_counts.append(len(jieba.lcut(assistant_text)))

    def percent(count):
        # Guard against an empty dataset.
        return (count / total_items) * 100 if total_items else 0.0

    avg_words = sum(word_counts) / len(word_counts) if word_counts else 0.0

    parts = [
        "# 旅游数据集质量报告\n\n",
        f"**总样本量**: {total_items}\n",
        f"**平均回复长度**: {avg_words:.1f} 词\n\n",
        "## 城市覆盖统计\n",
    ]
    parts.extend(
        f"- {city}: {count} 条 ({percent(count):.1f}%)\n"
        for city, count in city_coverage.items()
    )
    parts.append("\n## 关键词覆盖率\n")
    parts.extend(
        f"- {kw}: {percent(count):.1f}%\n"
        for kw, count in keyword_coverage.items()
    )
    report = "".join(parts)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"Quality report saved to {output_file}")

# === Main pipeline ===
# Runs the whole preparation: load both datasets, merge, de-duplicate, convert
# to the LoRA layout, write the JSONL dataset and the Markdown quality report.
if __name__ == "__main__":
    # Load the datasets
    cn_data = load_cn_tourism("chinese_tourism.jsonl")
    nw_data = load_nw_tourism("northwest_tourism.json")
    
    # Merge and de-duplicate
    combined_data = cn_data + nw_data
    deduped_data = deduplicate(combined_data)
    
    # Convert to the LoRA layout.
    # NOTE(review): the slice keeps the first MAX_SAMPLES items, and the
    # northwest items come after the Chinese ones in combined_data, so they
    # are the ones dropped when the cap is hit. Confirm this is intended.
    lora_data = convert_to_lora_format(deduped_data[:MAX_SAMPLES])
    
    # Save the final dataset, one JSON object per line, keeping non-ASCII text literal
    with open("tourism_lora_data.jsonl", 'w', encoding='utf-8') as f:
        for item in lora_data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')
    print(f"Saved {len(lora_data)} LoRA-formatted items")
    
    # Generate the quality report
    generate_quality_report(lora_data, "data_quality.md")