# Polish

## prompt

In [None]:
# Read your prompt from prompt.txt, or write it directly into system_prompt here.
system_prompt = ""
with open("prompt.txt", mode='r', encoding='utf-8') as prompt_file:
    system_prompt = system_prompt + prompt_file.read()
print(system_prompt)

## Load data

In [None]:
# Load the dataset you want to process and assign it to `dataset`.
# Later cells index it by integer position and read each row's 'text' field.

dataset = None

In [None]:
# Run this cell to visualize part of the result.
# For example, a dataset loaded through the Hugging Face API displays its column names (features).
dataset

DatasetDict({
    train: Dataset({
        features: ['year', 'month', 'date', 'page', 'slice', 'text', 'containing'],
        num_rows: 28240
    })
})

## API

调用LLM提供的API

为加快调用速度，此处使用异步编程

In [None]:

import asyncio
import json
import os

import openai
from datasets import load_dataset
from tqdm.asyncio import tqdm

In [None]:
# Select a subset of the data; here all of it is used.
# The dataset displayed above is a DatasetDict (a dict of splits): on such an
# object len() counts splits rather than rows and integer indexing raises
# KeyError. When a "train" split is present, take the rows from it; any other
# object is indexed directly, exactly as before.
records = dataset["train"] if isinstance(dataset, dict) and "train" in dataset else dataset

small_batch = [records[idx] for idx in range(len(records))]
print(type(small_batch), len(small_batch))

<class 'list'> 28240


In [None]:
# Call the API as specified by your provider.
# Reference: https://api-docs.deepseek.com/zh-cn/api/create-chat-completion
# The endpoint and key are read from the environment so that the secret is
# never saved inside the notebook. Set DEEPSEEK_BASE_URL and DEEPSEEK_API_KEY
# before starting the kernel; unset variables fall back to the empty-string
# placeholders this cell used before.
BASE_URL = os.environ.get("DEEPSEEK_BASE_URL", "")
API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")

async def async_query_openai(query_message):
    """Send one user message to the chat-completions endpoint and return the reply.

    Args:
        query_message: Full prompt text; it is sent as the content of a single
            "user" message.

    Returns:
        The ``content`` of the first returned choice, or the string 'null'
        when the response is falsy or carries no choices.
    """
    from openai import AsyncOpenAI

    # A client is still created per call (so edits to BASE_URL / API_KEY take
    # effect on the next request), but the context manager now closes it on
    # exit instead of leaving one open client behind for every request.
    async with AsyncOpenAI(base_url=BASE_URL, api_key=API_KEY) as client:
        response = await client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {
                    "role": "user",
                    "content": query_message
                }
            ],
            # The parameters below can be tuned as needed.
            max_tokens=4096,
            temperature=0.01,
            top_p=0.7,
            # top_k=50,
            frequency_penalty=1,
            # stop=["<|eot_id|>"],
            stream=False
        )

    # Guard against an empty `choices` list, which would otherwise raise
    # IndexError on the line below; reuse the existing 'null' sentinel.
    if not response or not response.choices:
        return 'null'
    return response.choices[0].message.content



# Process a single text.
async def process_text(text):
    """Prefix `text` with the global system prompt and return the model's reply."""
    full_prompt = system_prompt + text
    reply = await async_query_openai(full_prompt)
    return reply

# Persist results by writing them to a .jsonl file.
def write_results_to_jsonl(rawtexts, results, filename):
    """Append one JSON line per (raw, result) pair to `filename`.

    Each line is an object with the keys "origin" and "response". Pairs are
    formed with zip, so surplus items of the longer sequence are ignored.
    Non-ASCII characters are written as-is (UTF-8), not escaped.
    """
    # Append mode: repeated calls add to the file instead of overwriting it.
    with open(filename, mode='a', encoding='utf-8') as sink:
        for pair in zip(rawtexts, results):
            record = dict(zip(("origin", "response"), pair))
            json.dump(record, sink, ensure_ascii=False)
            sink.write('\n')

# Main routine: query the model batch by batch and persist each batch's results.
async def main(batch_size=500, output_path='results.jsonl'):
    """Process `small_batch` in consecutive batches and append results to a file.

    Each batch is sent through `process_all_texts` and written with
    `write_results_to_jsonl` before the next batch starts, so batches that
    completed are already on disk if a later batch fails.

    Args:
        batch_size: Number of rows per batch (default 500).
        output_path: JSONL file the results are appended to
            (default 'results.jsonl').

    Raises:
        ValueError: If `batch_size` is smaller than 1. A negative step would
            otherwise make the loop silently process nothing.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    import nest_asyncio
    nest_asyncio.apply()

    for start in tqdm(range(0, len(small_batch), batch_size)):
        batch_no = start // batch_size + 1
        current_batch = small_batch[start:start + batch_size]
        print(f"Processing batch {batch_no}...")
        results = await process_all_texts(current_batch)
        write_results_to_jsonl(current_batch, results, output_path)
        print(f"Batch {batch_no} written to {output_path}")
        


# Process every text of one batch concurrently.
async def process_all_texts(onebatch, max_concurrency=10000):
    """Run `process_text` on the 'text' field of every row in `onebatch`.

    Args:
        onebatch: Iterable of rows; each row must support ``row['text']``.
        max_concurrency: Upper bound on the number of `process_text` calls in
            flight at once. The default of 10000 keeps the previous behaviour;
            lower it if the API provider enforces rate limits.

    Returns:
        A list of results in the same order as the rows of `onebatch`.

    Raises:
        ValueError: If `max_concurrency` is smaller than 1 (a zero-sized
            semaphore would block every task forever).
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")
    semaphore = asyncio.Semaphore(max_concurrency)

    async def process_one_text(text):
        async with semaphore:
            return await process_text(text)

    tasks = [process_one_text(row['text']) for row in onebatch]
    # gather waits for all tasks and preserves input order. Without
    # return_exceptions, the first failing request propagates its exception.
    return await asyncio.gather(*tasks)

In [None]:
# Start the main routine.
await main()

## 输出结果

读取输出的结果用于检查、后续处理

In [66]:
import jsonlines

file_jsonl_path = "results.jsonl"

# Collect the "response" field of every record stored in the results file.
extract_results = []

with open(file_jsonl_path, encoding='utf-8') as results_file:
    reader = jsonlines.Reader(results_file)
    extract_results.extend(record['response'] for record in reader)