In [1]:
import json
import re
import random
from datasketch import MinHash, MinHashLSH
import jieba

In [4]:
# === 1. 配置参数 ===
TARGET_CITIES = [
    '西安', '兰州', '敦煌', '乌鲁木齐', '银川', '西宁', '张掖', '嘉峪关',
    '天水', '酒泉', '武威', '金昌', '白银', '庆阳', '平凉', '定西', '陇南'
]
MAX_SAMPLES = 14000
DUPE_THRESHOLD = 0.85

# === 2. 加载中文旅游监督数据集 ===
def load_cn_tourism(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                item = json.loads(line)
                data.append({
                    "conversations": [
                        {"role": "user", "content": item['prompt']},
                        {"role": "assistant", "content": f"/no_think {item['response']}"}
                    ]
                })
            except Exception as e:
                print(f"Error loading CN item: {e}")
                continue
    print(f"Loaded {len(data)} Chinese tourism QA pairs")
    return data

# === 3. 加载并抽样西北数据集 ===
def load_nw_tourism(file_path, sample_size=4000):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            nw_data = json.load(f)
        print(f"Loaded {len(nw_data)} NW tourism items")
    except Exception as e:
        print(f"Error loading NW data: {e}")
        return []
    
    # 自动提取数据中的所有城市
    all_cities = set()
    for item in nw_data:
        # 从instruction中提取城市
        instruction = item.get('instruction', '')
        city_match = re.search(r'^(.*?)(市|地区|景区|景点|旅游)', instruction)
        if city_match and len(city_match.group(1)) >= 2:
            all_cities.add(city_match.group(1))
        
        # 从output中提取城市
        output = item.get('output', '')
        cities_in_output = re.findall(r'([\u4e00-\u9fa5]{2,5}?(?:市|地区|景区|景点))', output)
        for city in cities_in_output:
            all_cities.add(city.replace('市', '').replace('地区', ''))
    
    print(f"Detected cities in NW data: {list(all_cities)[:10]}... (total: {len(all_cities)})")
    
    # 创建城市映射：将检测到的城市映射到目标城市
    city_mapping = {}
    for city in all_cities:
        # 检查是否属于目标城市
        for target_city in TARGET_CITIES:
            if target_city in city:
                city_mapping[city] = target_city
                break
        # 如果未匹配，保留原城市名
        if city not in city_mapping:
            city_mapping[city] = city
    
    # 按城市分组数据
    city_items = {city: [] for city in set(city_mapping.values())}
    for item in nw_data:
        # 确定城市
        instruction = item.get('instruction', '')
        for origin_city, mapped_city in city_mapping.items():
            if origin_city in instruction:
                city_items[mapped_city].append(item)
                break
        else:
            # 如果未匹配，分配到"其他"类别
            city_items.setdefault('其他', []).append(item)
    
    # 分层抽样
    sampled_items = []
    min_per_city = max(1, sample_size // len(city_items))
    
    for city, items in city_items.items():
        # 确保每个城市至少有min_per_city条数据
        if len(items) > min_per_city:
            sampled_items.extend(random.sample(items, min_per_city))
        else:
            sampled_items.extend(items)
    
    # 如果不足，补充随机样本
    if len(sampled_items) < sample_size:
        remaining = sample_size - len(sampled_items)
        all_items = [item for sublist in city_items.values() for item in sublist]
        sampled_items.extend(random.sample(all_items, min(remaining, len(all_items))))
    
    print(f"Sampled {len(sampled_items)} NW tourism items from {len(city_items)} categories")
    
    # 转换为标准格式
    converted_data = []
    for item in sampled_items:
        # 处理可能的空字段
        instruction = item.get('instruction', '')
        input_text = item.get('input', '')
        output = item.get('output', '')
        
        # 构建用户内容
        user_content = instruction
        if input_text.strip():
            user_content += f"\n{input_text}"
        
        # 处理空输出
        if not output.strip():
            output = "该旅游信息暂缺，请咨询当地旅游局"
        
        converted_data.append({
            "conversations": [
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": f"/no_think {output}"}
            ]
        })
    
    return converted_data

# === 4. MinHash去重 ===
def deduplicate(data, threshold=DUPE_THRESHOLD):
    lsh = MinHashLSH(threshold=threshold, num_perm=128)
    unique_data = []
    
    for idx, item in enumerate(data):
        assistant_content = item['conversations'][1]['content']
        
        # 创建MinHash指纹
        m = MinHash(num_perm=128)
        words = jieba.lcut(assistant_content)[:50]  # 使用结巴分词
        for word in words:
            m.update(word.encode('utf-8'))
        
        # 检查是否重复
        if not lsh.query(m):
            lsh.insert(f"item_{idx}", m)
            unique_data.append(item)
    
    dup_rate = (1 - len(unique_data)/len(data)) * 100
    print(f"Deduplication: {len(unique_data)}/{len(data)} items kept ({dup_rate:.2f}% duplication)")
    return unique_data

# === 5. 生成LoRA训练格式 ===
def convert_to_lora_format(data):
    lora_data = []
    for item in data:
        # 构建多轮对话格式
        messages = [
            {"role": "system", "content": "你是一名专业的旅游助手，熟悉中国各地旅游景点和行程规划"},
            {"role": "user", "content": item['conversations'][0]['content']},
            {"role": "assistant", "content": item['conversations'][1]['content']}
        ]
        
        lora_data.append({
            "messages": messages,
            "no_think": True  # 启用快速响应模式
        })
    return lora_data

# === 6. 生成质量报告 ===
def generate_quality_report(data, output_file):
    # 1. 基本统计
    total_items = len(data)
    word_counts = []
    city_coverage = {city: 0 for city in TARGET_CITIES}
    
    # 2. 关键词分析
    keywords = ["景点", "酒店", "交通", "美食", "攻略", "行程", "推荐", "门票"]
    keyword_coverage = {kw: 0 for kw in keywords}
    
    # 3. 遍历数据
    for item in data:
        content = json.dumps(item)
        
        # 城市覆盖统计
        for city in TARGET_CITIES:
            if city in content:
                city_coverage[city] += 1
        
        # 关键词统计
        for kw in keywords:
            if kw in content:
                keyword_coverage[kw] += 1
        
        # 词数统计
        assistant_text = item['messages'][-1]['content']
        word_counts.append(len(jieba.lcut(assistant_text)))
    
    # 4. 生成报告
    report = f"# 旅游数据集质量报告\n\n"
    report += f"**总样本量**: {total_items}\n"
    report += f"**平均回复长度**: {sum(word_counts)/len(word_counts):.1f} 词\n\n"
    
    report += "## 城市覆盖统计\n"
    for city, count in city_coverage.items():
        coverage_pct = (count / total_items) * 100
        report += f"- {city}: {count} 条 ({coverage_pct:.1f}%)\n"
    
    report += "\n## 关键词覆盖率\n"
    for kw, count in keyword_coverage.items():
        coverage_pct = (count / total_items) * 100
        report += f"- {kw}: {coverage_pct:.1f}%\n"
    
    # 5. 保存报告
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"Quality report saved to {output_file}")

# === 主流程 ===
if __name__ == "__main__":
    # 加载数据集
    cn_data = load_cn_tourism("merge.jsonl")
    nw_data = load_nw_tourism("LLMTourism.json")
    
    # 合并数据
    combined_data = cn_data + nw_data
    print(f"Combined data: {len(combined_data)} items")
    
    # 去重
    deduped_data = deduplicate(combined_data)
    
    # 转换为LoRA格式
    lora_data = convert_to_lora_format(deduped_data[:MAX_SAMPLES])
    
    # 保存最终数据集
    output_file = "tourism_lora_data3.jsonl"
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in lora_data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')
    print(f"Saved {len(lora_data)} items to {output_file}")
    
    # 生成质量报告
    generate_quality_report(lora_data, "data_quality3.md")
    
    # 打印样本检查
    print("\nSample data:")
    print(json.dumps(lora_data[0], indent=2, ensure_ascii=False))

Loaded 9007 Chinese tourism QA pairs
Loaded 53280 NW tourism items
Detected cities in NW data: ['木盖库勒', '份乃至国际', '胜利纪念园景区', '也是游客和', '而西北部', '以了解白银', '在藏于汉中', '县姜州花湖景区', '构成了长春', '院所在的城']... (total: 38788)
Sampled 4000 NW tourism items from 35076 categories
Combined data: 13007 items
Deduplication: 12627/13007 items kept (2.92% duplication)
Saved 12627 items to tourism_lora_data3.jsonl
Quality report saved to data_quality3.md

Sample data:
{
  "messages": [
    {
      "role": "system",
      "content": "你是一名专业的旅游助手，熟悉中国各地旅游景点和行程规划"
    },
    {
      "role": "user",
      "content": "请问南宁饭店是几星级酒店？"
    },
    {
      "role": "assistant",
      "content": "/no_think 南宁饭店是一家４星级酒店，位于南宁市中心，拥有现代化的设施和优质的服务。酒店提供舒适的客房、美味的餐饮和完善的会议设施。我们的客房宽敞明亮，装饰精美，配备了舒适的床铺、豪华的卫浴设施以及免费的无线网络连接。酒店的餐厅供应各种美食，包括当地特色菜肴和国际美食。我们还为商务旅客提供先进的会议设施，包括宽敞的会议室和高科技的音视频设备。无论您是来南宁旅行还是商务出差，南宁饭店都会给您提供一个舒适、便利的住宿体验。欢迎您的光临！"
    }
  ],
  "no_think": true
}


In [5]:
city_check = ['天水', '张掖', '西安', '南宁']  # 您关心的城市

with open('tourism_lora_data3.jsonl', 'r', encoding='utf-8') as f:
    counts = {city:0 for city in city_check}
    for line in f:
        data = json.loads(line)
        text = str(data)
        for city in city_check:
            if city in text:
                counts[city] += 1

print("当前数据城市分布:")
for city, count in counts.items():
    print(f"{city}: {count}条")

当前数据城市分布:
天水: 73条
张掖: 60条
西安: 133条
南宁: 638条


In [6]:
# 检查西北数据是否真的被包含
import pandas as pd
df = pd.read_json('tourism_lora_data3.jsonl', lines=True)

# 检查目标城市出现次数
for city in ['天水', '张掖', '兰州']:
    count = df['messages'].apply(lambda x: city in str(x)).sum()
    print(f"{city}: {count}条")

天水: 73条
张掖: 60条
兰州: 175条


In [7]:
# 更新generate_quality_report函数（替换原函数）
def generate_quality_report(data, output_file):
    # 真实城市列表（与检测逻辑保持一致）
    REAL_CITIES = ['西安','兰州','敦煌','乌鲁木齐','银川','西宁',
                  '张掖','嘉峪关','天水','酒泉','武威','金昌',
                  '白银','庆阳','平凉','定西','陇南','南宁']
    
    # 关键词列表
    KEYWORDS = ["景点", "酒店", "交通", "美食", "攻略", 
               "行程", "推荐", "门票", "住宿", "餐厅"]
    
    # 初始化统计
    city_counts = {city: 0 for city in REAL_CITIES}
    keyword_counts = {kw: 0 for kw in KEYWORDS}
    total_length = 0
    
    for item in data:
        # 提取所有文本内容
        messages = item.get('messages', [])
        text = " ".join([msg['content'] for msg in messages if isinstance(msg, dict)])
        
        # 统计城市
        for city in REAL_CITIES:
            if city in text:
                city_counts[city] += 1
        
        # 统计关键词
        for kw in KEYWORDS:
            if kw in text:
                keyword_counts[kw] += 1
        
        # 计算回复长度
        if len(messages) >= 3:  # 确保有assistant回复
            assistant_text = messages[2]['content']
            total_length += len(jieba.lcut(assistant_text))
    
    # 生成报告
    report = [
        "# 旅游数据集质量报告",
        f"**总样本量**: {len(data)}",
        f"**平均回复长度**: {total_length/len(data):.1f} 词\n",
        "## 城市覆盖统计"
    ]
    
    # 按城市数量降序排列
    sorted_cities = sorted(
        [(city, count) for city, count in city_counts.items() if count > 0],
        key=lambda x: -x[1]
    )
    
    for city, count in sorted_cities:
        report.append(f"- {city}: {count} 条 ({count/len(data)*100:.1f}%)")
    
    report.extend([
        "\n## 关键词覆盖率",
        "| 关键词 | 出现次数 | 覆盖率 |",
        "|--------|---------|--------|"
    ])
    
    for kw in KEYWORDS:
        count = keyword_counts[kw]
        coverage = count/len(data)*100
        report.append(f"| {kw} | {count} | {coverage:.1f}% |")
    
    # 保存报告
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("\n".join(report))
    
    print(f"重新生成的质量报告已保存到 {output_file}")

In [8]:
# 加载现有数据
with open('tourism_lora_data3.jsonl', 'r', encoding='utf-8') as f:
    lora_data = [json.loads(line) for line in f]

# 生成新报告
generate_quality_report(lora_data, 'data_quality_fixed.md')

# 检查天水数据示例
tianshui_samples = [
    item for item in lora_data 
    if '天水' in str(item.get('messages', []))
][:3]
print("天水数据示例:")
for sample in tianshui_samples:
    print(json.dumps(sample, indent=2, ensure_ascii=False))
    print("---")

# 检查"景点"关键词出现情况
jingdian_samples = [
    item for item in lora_data 
    if '景点' in str(item.get('messages', []))
]
print(f"包含'景点'的数据量: {len(jingdian_samples)}")

重新生成的质量报告已保存到 data_quality_fixed.md
天水数据示例:
{
  "messages": [
    {
      "role": "system",
      "content": "你是一名专业的旅游助手，熟悉中国各地旅游景点和行程规划"
    },
    {
      "role": "user",
      "content": "请问百里柳江的门票价格是多少呢？"
    },
    {
      "role": "assistant",
      "content": "/no_think 百里柳江的门票价格根据不同类型的船只而有所不同。柳航1号船的船票价格为80元，而画舫龙船的船票则是100元。此外，如果您选择乘坐白天水上公交游览百里柳江，单程票价为3元。\n百里柳江是一个美丽的水上景区，游客可以选择不同的船只来欣赏江边美景。柳航1号船适合喜欢舒适体验的游客，而画舫龙船则展现了独特的文化韵味。白天水上公交则是一种便捷的交通选择，可以让您舒适地游览整个景区。\n在购买门票时，您可以根据个人的喜好和需求选择适合自己的船只。不同类型的船票会提供不同的船上体验，您可以根据自己的偏好做出选择。\n如果您想要更加深入地了解百里柳江的游览项目和服务，也可以前往景区的服务中心或官方网站查询更多信息。希望您在百里柳江的游览中有一个愉快而难忘的体验！"
    }
  ],
  "no_think": true
}
---
{
  "messages": [
    {
      "role": "system",
      "content": "你是一名专业的旅游助手，熟悉中国各地旅游景点和行程规划"
    },
    {
      "role": "user",
      "content": "古象温泉度假村有哪些主要景点呢？"
    },
    {
      "role": "assistant",
      "content": "/no_think 在来宾市象州古象温泉度假村，您将发现各种迷人的景点和设施。首先是19个不同类型的温泉泡池，包括特色温泉泡池、温泉游泳池、儿童戏水池以及独立贵宾温泉池。这些泡池可同时容纳1500多人泡浴，无论您是寻求放松抑或享受水疗体验，都能找到合适的选择。\n除