## 数据集制作

In [1]:
# docx --> txt
from docx import Document

# 读取 .docx 文件
def docx_to_text(docx_file, txt_file):
    doc = Document(docx_file)
    with open(txt_file, 'w', encoding='utf-8') as f:
        for para in doc.paragraphs:
            f.write(para.text + '\n')

# 调用函数转换
docx_file = 'xingfa.docx'  # 你的 .docx 文件路径
txt_file = 'output.txt'    # 转换后的 .txt 文件路径
# docx_to_text(docx_file, txt_file)

print(f'已将 {docx_file} 转换为 {txt_file}')

已将 xingfa.docx 转换为 output.txt


In [2]:
# 使用bert进行chunk切分
! git clone https://www.modelscope.cn/tiansz/bert-base-chinese.git

正克隆到 'bert-base-chinese'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 20 (delta 3), reused 0 (delta 0), pack-reused 0[K
接收对象中: 100% (20/20), 153.57 KiB | 1.14 MiB/s, 完成.
处理 delta 中: 100% (3/3), 完成.


In [4]:
import torch
from transformers import BertTokenizer, BertModel
import re
import os
from scipy.spatial.distance import cosine


def get_sentence_embedding(sentence, model, tokenizer):
    """
    获取句子的嵌入表示

    参数:
    sentence (str): 输入句子
    model (BertModel): 预训练的BERT模型
    tokenizer (BertTokenizer): BERT分词器

    返回:
    numpy.ndarray: 句子的嵌入向量
    """
    # 使用分词器处理输入句子，并转换为模型输入格式
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # 使用模型获取输出，不计算梯度
    with torch.no_grad():
        outputs = model(**inputs)
    # 返回最后一层隐藏状态的平均值作为句子嵌入
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()


def split_text_by_semantic(text, max_length, similarity_threshold=0.5):
    """
    基于语义相似度对文本进行分块

    参数:
    text (str): 输入的长文本
    max_length (int): 每个文本块的最大长度（以BERT分词器的token为单位）
    similarity_threshold (float): 语义相似度阈值，默认为0.5

    返回:
    list: 分割后的文本块列表
    """
    # 加载BERT模型和分词器
    tokenizer = BertTokenizer.from_pretrained('./bert-base-chinese')
    model = BertModel.from_pretrained('./bert-base-chinese')
    model.eval()  # 设置模型为评估模式

    # 按句子分割文本（使用常见的中文标点符号）
    sentences = re.split(r'(。|！|？|；)', text)
    # 重新组合句子和标点
    sentences = [s + p for s, p in zip(sentences[::2], sentences[1::2]) if s]

    chunks = []
    current_chunk = sentences[0]
    # 获取当前chunk的嵌入表示
    current_embedding = get_sentence_embedding(current_chunk, model, tokenizer)

    for sentence in sentences[1:]:
        # 获取当前句子的嵌入表示
        sentence_embedding = get_sentence_embedding(sentence, model, tokenizer)
        # 计算当前chunk和当前句子的余弦相似度
        similarity = 1 - cosine(current_embedding, sentence_embedding)

        # 如果相似度高于阈值且合并后不超过最大长度，则合并
        if similarity > similarity_threshold and len(tokenizer.tokenize(current_chunk + sentence)) <= max_length:
            current_chunk += sentence
            # 更新当前chunk的嵌入表示
            current_embedding = (current_embedding + sentence_embedding) / 2
        else:
            # 否则，保存当前chunk并开始新的chunk
            chunks.append(current_chunk)
            current_chunk = sentence
            current_embedding = sentence_embedding

    # 添加最后一个chunk
    if current_chunk:
        chunks.append(current_chunk)

    return chunks


def read_text_file(file_path):
    """
    读取文本文件

    参数:
    file_path (str): 文件路径

    返回:
    str: 文件内容
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def save_chunks_to_files(chunks, output_dir):
    """
    将分割后的文本块保存到文件

    参数:
    chunks (list): 文本块列表
    output_dir (str): 输出目录路径
    """
    # 如果输出目录不存在，则创建
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # 将每个文本块保存为单独的文件
    for i, chunk in enumerate(chunks):
        chunk_file_path = os.path.join(output_dir, f"chunk_{i + 1}.txt")
        with open(chunk_file_path, 'w', encoding='utf-8') as file:
            file.write(chunk)
        print(f"已保存第 {i + 1} 个文本块到 {chunk_file_path}")


# 主程序

# 设置输入和输出路径
input_file_path = './output.txt'  # 替换为你的长文本文件路径
output_dir = './saveChunk/'  # 替换为你希望保存文本块的目录路径

# 读取长文本
long_text = read_text_file(input_file_path)

# 设置每个文本块的最大分词数量和相似度阈值
max_length = 2048  # 可根据需要调整
similarity_threshold = 0.5  # 可根据需要调整

# 分割长文本
text_chunks = split_text_by_semantic(long_text, max_length, similarity_threshold)

# 保存分割后的文本块到指定目录
save_chunks_to_files(text_chunks, output_dir)




已保存第 1 个文本块到 ./saveChunk/chunk_1.txt
已保存第 2 个文本块到 ./saveChunk/chunk_2.txt
已保存第 3 个文本块到 ./saveChunk/chunk_3.txt
已保存第 4 个文本块到 ./saveChunk/chunk_4.txt
已保存第 5 个文本块到 ./saveChunk/chunk_5.txt
已保存第 6 个文本块到 ./saveChunk/chunk_6.txt
已保存第 7 个文本块到 ./saveChunk/chunk_7.txt
已保存第 8 个文本块到 ./saveChunk/chunk_8.txt
已保存第 9 个文本块到 ./saveChunk/chunk_9.txt
已保存第 10 个文本块到 ./saveChunk/chunk_10.txt
已保存第 11 个文本块到 ./saveChunk/chunk_11.txt
已保存第 12 个文本块到 ./saveChunk/chunk_12.txt
已保存第 13 个文本块到 ./saveChunk/chunk_13.txt
已保存第 14 个文本块到 ./saveChunk/chunk_14.txt
已保存第 15 个文本块到 ./saveChunk/chunk_15.txt
已保存第 16 个文本块到 ./saveChunk/chunk_16.txt
已保存第 17 个文本块到 ./saveChunk/chunk_17.txt
已保存第 18 个文本块到 ./saveChunk/chunk_18.txt
已保存第 19 个文本块到 ./saveChunk/chunk_19.txt
已保存第 20 个文本块到 ./saveChunk/chunk_20.txt
已保存第 21 个文本块到 ./saveChunk/chunk_21.txt
已保存第 22 个文本块到 ./saveChunk/chunk_22.txt
已保存第 23 个文本块到 ./saveChunk/chunk_23.txt
已保存第 24 个文本块到 ./saveChunk/chunk_24.txt
已保存第 25 个文本块到 ./saveChunk/chunk_25.txt
已保存第 26 个文本块到 ./saveChunk/chunk_26.txt
已保存第

### 使用API进行数据集生成

In [5]:
import json
import os
import time
import re
from typing import List, Dict
from openai import OpenAI
import logging
import backoff
import pyarrow as pa
import pyarrow.parquet as pq
from concurrent.futures import ThreadPoolExecutor, as_completed

# 设置日志
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# 从环境变量中获取 API 密钥
api_key = "???"

# 初始化 OpenAI 客户端
client = OpenAI(base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", api_key=api_key)

def read_file(file_path: str) -> str:
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

@backoff.on_exception(backoff.expo, Exception, max_tries=3)
def generate_single_entry(text: str) -> Dict:
    prompt = f"""
    基于以下文本，生成1个用于指令数据集的高质量条目。条目应该直接关联到给定的文本内容，提出相关的问题或任务。
    请确保生成多样化的指令类型，例如：
    - 分析类："分析..."
    - 比较类："比较..."
    - 解释类："解释..."
    - 评价类："评价..."
    - 问答类："为什么..."

    文本内容：
    {text}

    请以下面的格式生成条目，确保所有字段都有适当的内容：
    {{
        "instruction": "使用上述多样化的指令类型之一，提出一个具体的、与文本相关的问题或任务",
        "input": "如果需要额外的上下文信息，请在这里提供，否则留空",
        "output": "对instruction的详细回答或任务的完成结果"
    }}
    确保所有生成的内容都与给定的文本直接相关，生成的是有效的JSON格式，并且内容高质量、准确、详细。
    """

    try:
        response = client.chat.completions.create(
            model="qwen-plus-0919",  # 尝试使用自己选择的 模型
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,  # 增加温度以提高多样性
            max_tokens=4098
        )
        logger.info(f"API 响应: {response.choices[0].message.content}")

        json_match = re.search(r'\{.*\}', response.choices[0].message.content, re.DOTALL)
        if json_match:
            entry = json.loads(json_match.group())
            required_keys = ['instruction', 'input', 'output']
            if isinstance(entry, dict) and all(key in entry for key in required_keys):
                # 根据 input 是否为空来设置 text 字段
                if entry['input'].strip():
                    entry[
                        'text'] = f"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.### Instruction: {entry['instruction']}\n### Input: {entry['input']}\n### Response: {entry['output']}"
                else:
                    entry[
                        'text'] = f"Below is an instruction that describes a task. Write a response that appropriately completes the request.### Instruction: {entry['instruction']}\n### Input: {entry['input']}\n### Response: {entry['output']}"

                logger.info("成功生成完整条目")
                return entry
            else:
                logger.warning("JSON 解析成功，但缺少必要字段")
                return {}
        else:
            logger.error("无法从API响应中提取有效的JSON")
            return {}

    except Exception as e:
        logger.error(f"生成条目时发生错误: {str(e)}")
        return {}

def process_file(file_path: str, entries_per_file: int) -> List[Dict]:
    dataset = []
    try:
        text = read_file(file_path)
        for j in range(entries_per_file):
            logger.info(f"  生成第 {j + 1}/{entries_per_file} 个条目")
            entry = generate_single_entry(text)
            if entry and all(key in entry for key in ['instruction', 'input', 'output', 'text']):
                dataset.append(entry)
                logger.info(f"  成功生成 1 个完整条目")
            else:
                logger.warning(f"  跳过不完整的条目")
            time.sleep(2)  # 在请求之间增加延迟到2秒
    except Exception as e:
        logger.error(f"处理文件 {file_path} 时发生未知异常: {str(e)}")
    return dataset

def generate_dataset(folder_path: str, entries_per_file: int = 2) -> List[Dict]:
    dataset = []
    files = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path) if filename.endswith(".txt")]

    with ThreadPoolExecutor(max_workers=4) as executor:  # 调整 max_workers 数量以适应你的硬件资源
        futures = [executor.submit(process_file, file_path, entries_per_file) for file_path in files]
        for future in as_completed(futures):
            try:
                dataset.extend(future.result())
            except Exception as e:
                logger.error(f"处理未来任务时发生未知异常: {str(e)}")

    return dataset

def save_dataset_as_parquet(dataset: List[Dict], output_file: str):
    schema = pa.schema([
        ('instruction', pa.string()),
        ('input', pa.string()),
        ('output', pa.string()),
        ('text', pa.string())
    ])

    arrays = [
        pa.array([entry['instruction'] for entry in dataset]),
        pa.array([entry['input'] for entry in dataset]),
        pa.array([entry['output'] for entry in dataset]),
        pa.array([entry['text'] for entry in dataset])
    ]

    table = pa.Table.from_arrays(arrays, schema=schema)
    pq.write_table(table, output_file)

if __name__ == "__main__":
    input_folder = "./saveChunk"  # 指定输入文件夹路径
    output_file = "instruction_dataset.parquet"

    logger.info("开始生成数据集")
    dataset = generate_dataset(input_folder, entries_per_file=10)
    save_dataset_as_parquet(dataset, output_file)
    logger.info(f"数据集已生成并保存到 {output_file}")
    logger.info(f"共生成 {len(dataset)} 个有效条目")


2024-10-01 14:20:59,770 - INFO - 开始生成数据集
2024-10-01 14:20:59,772 - INFO -   生成第 1/10 个条目
2024-10-01 14:20:59,772 - INFO -   生成第 1/10 个条目
2024-10-01 14:20:59,772 - INFO -   生成第 1/10 个条目
2024-10-01 14:20:59,776 - INFO -   生成第 1/10 个条目
2024-10-01 14:21:00,020 - INFO - HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 400 Bad Request"
2024-10-01 14:21:00,023 - ERROR - 生成条目时发生错误: Error code: 400 - {'error': {'code': 'AccessDenied.Unpurchased', 'param': None, 'message': 'Access to model denied. Please make sure you are eligible for using the model.', 'type': 'AccessDenied.Unpurchased'}, 'id': 'chatcmpl-fede870d-2f57-9b99-8f51-ee15ecae7e9e'}
2024-10-01 14:21:00,039 - INFO - HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 400 Bad Request"
2024-10-01 14:21:00,040 - INFO - HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 400 Bad Request"
2024-10-01 14:21:00

In [6]:
import json
import pandas as pd
from rich import print

# Load local Parquet dataset
parquet_file_path = './instruction_dataset.parquet'
dataset = pd.read_parquet(parquet_file_path)

# Print the columns of the dataset to verify
print("Dataset Columns:", dataset.columns)

# Convert dataset to list of dictionaries
json_data = dataset.to_dict(orient='list')

# Print the first row to verify
print("First Row:", {key: json_data[key][0] for key in json_data})

# Initialize list to store Alpaca format data
alpaca_data = []

# Process each entry in the dataset
for i in range(len(json_data['instruction'])):
    instruction = json_data['instruction'][i]
    input_text = json_data['input'][i]
    original_output = json_data['output'][i]

    # Create Alpaca format entry
    alpaca_entry = {
        "instruction": instruction,
        "input": input_text,
        "output": original_output
    }
    alpaca_data.append(alpaca_entry)

# Save in Alpaca format
with open("alpaca_dataset.json", "w") as file:
    json.dump(alpaca_data, file, indent=4)

print("Alpaca format dataset saved as 'alpaca_dataset.json'")


IndexError: list index out of range