# 调用API实现baselines

In [27]:
from typing import List
from typing import Optional
from typing import Dict

import os
import json
import jsonlines
from pathlib import Path

import httpx
from openai import OpenAI

import requests

import time
import re
from tqdm import tqdm

## 主流程

In [22]:
# 你的API密钥
api_key = "sk-gNToXJPO4Ih8NDZnDloSXR780dQDEZ2Epu49wcVA1sWb4Ecw"

# 缓存ID，假设从创建缓存的响应中获取
cache_id = "cache-ezjtmpuoc6di11gdsdm1"

# 发送DELETE请求删除缓存
response = requests.delete(
    url=f"https://api.moonshot.cn/v1/caching/{cache_id}",
    headers={
        "Authorization": f"Bearer {api_key}"
    }
)

# 打印响应状态码
print(response.status_code)

# 如果需要，也可以打印响应内容
print(response.text)

404
{"error":{"message":"context cache not found: cache-ezjtmpuoc6di11gdsdm1","type":"resource_not_found_error"}}


In [28]:
# 你的API密钥
api_key = "sk-gNToXJPO4Ih8NDZnDloSXR780dQDEZ2Epu49wcVA1sWb4Ecw"


# 读取文本文件内容
def load_file_content(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


# 创建缓存
def create_cache(file_content):
    data = {
        "model": "moonshot-v1",
        "messages": [
            {
                "role": "system",
                "content": file_content
            }
        ],
        "name": "example_cache",
        "ttl": 3600  # 缓存有效期，单位为秒
    }
    response = requests.post(
        url="https://api.moonshot.cn/v1/caching",
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        },
        json=data
    )
    cache_response = json.loads(response.text)
    return cache_response['id']


# 检查缓存是否存在且未过期
def check_cache(cache_id):
    response = requests.get(
        url=f"https://api.moonshot.cn/v1/caching/{cache_id}",
        headers={
            "Authorization": f"Bearer {api_key}"
        }
    )
    if response.status_code == 200:
        return True
    return False


# 重新加载数据并更新缓存
def reload_and_update_cache(file_path, cache_id):
    new_file_content = load_file_content(file_path)
    data = {
        "model": "moonshot-v1",
        "messages": [
            {
                "role": "system",
                "content": new_file_content
            }
        ],
        "name": "example_cache",
        "ttl": 3600  # 缓存有效期，单位为秒
    }
    response = requests.put(
        url=f"https://api.moonshot.cn/v1/caching/{cache_id}",
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        },
        json=data
    )
    return response.status_code == 200


# 使用缓存内容并添加问题
def use_cache_with_question(cache_id, question):
    data = {
        "model": "moonshot-v1-32k",
        "messages": [
            {
                "role": "cache",
                "content": f"cache_id={cache_id};reset_ttl=3600",
            },
            {
                "role": "user",
                "content": question
            }
        ],
        "max_tokens": 8192,
    }
    response = requests.post(
        url="https://api.moonshot.cn/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        },
        json=data
    )
    return response.json()['choices'][0]['message']['content']

## 处理数据去除相同的规则

In [29]:
with open('../ft_data/RJUA/RJUA_train_v1.json', 'r', encoding='utf-8') as file:
    answer = json.load(file)

rules = []
for i in answer:
    rules.append(i['input'])

unique_rules = list(set(rules))
print(len(unique_rules))
print(unique_rules[0])

70
肾错构瘤临床表现：1.绝大多数错构瘤患者没有明显的症状。2.一些比较大的错构瘤，因为压迫十二指肠、胃等器官而出现消化道的不适症状。3.当较大体积的错构瘤突然破裂时，患者会出现腰腹疼痛和血尿等症状，严重的大出血患者可以在腹部触及到包块，甚至有休克症状。4.肾外表现：面部蝶形分布的皮脂腺瘤、癫痫、智力减退等。


## Main

In [30]:
# 主流程
file_path = 'stanford_alpaca/prompts.txt'
file_content = load_file_content(file_path)
print(f"File{file_path} success read")
cache_id = create_cache(file_content)
print(f"cache id:{cache_id}")
print("cache cuccess")

# # 模拟缓存过期
# time.sleep(3600)

# 检查缓存是否存在且未过期
if not check_cache(cache_id):
    print("检查不到缓存")
    # 重新加载数据并更新缓存
    if reload_and_update_cache(file_path, cache_id):
        print("缓存更新成功")
    else:
        print("缓存更新失败")

print("缓存已存在")

answer = unique_rules

for idx, value in tqdm(enumerate(answer), total=len(answer), desc="Processing"):
    # for num, context in enumerate(answer[idx]["reference"]):
    # 使用缓存内容并添加问题
    context = value
    question = f"请回答关于文件内容的问题，其中[[CONTEXT]]代表的数据为{context}"
    response = use_cache_with_question(cache_id, question)
    judgement = {
        "id": idx,
        "rule": value,
        "generatrion": response,
    }
    # 打开文件以进行写入，如果文件不存在，会创建文件
    with jsonlines.open('stanford_alpaca/RJUA_all_v1.jsonl', mode='a') as writer:
        writer.write(judgement)


Filestanford_alpaca/prompts.txt success read
cache id:cache-f16xkiuoc6di11gkmxq1
cache cuccess
缓存已存在


Processing: 100%|██████████| 70/70 [23:47<00:00, 20.40s/it]


## 对生成结果进行格式调整

In [31]:
import random


def extract_json_objects(text):
    """提取所有合法JSON对象"""
    # 使用正则表达式匹配最外层大括号结构（处理换行和空格）
    pattern = r'\{[^{}]*\}'
    matches = re.findall(pattern, text, re.DOTALL)
    
    results = []
    for match in matches:
        try:
            # 清理可能的尾随逗号（处理类似 }, { 的情况）
            cleaned = match.strip().rstrip(',')
            # 转换为Python字典验证格式
            data = json.loads(cleaned)
            results.append(data)
        except json.JSONDecodeError as e:
            print(f"发现无效JSON片段：{match[:50]}... （错误信息：{e}）")
    return results


instructions = []
with open('stanford_alpaca/RJUA_all_v1.jsonl', 'r', encoding='utf-8') as file:
    data = jsonlines.Reader(file)
    for idx, item in enumerate(data):
        rule = item['rule']
        generation = item['generatrion']
        g = f'[{generation}]'
        try:
            g = extract_json_objects(g)
        except json.JSONDecodeError:
            print(idx)
            continue
        for i in g:
            instructions.append({
                "instruction": i['instruction'],
                "input": rule,
                "output": i['output']
            })

# 设置随机种子（可选，用于保证结果可复现）
random.seed(42)

# 随机打乱列表
random.shuffle(instructions)

# 划分比例（例如 80% 作为训练集，20% 作为测试集）
train_ratio = 0.8
train_size = int(len(instructions) * train_ratio)

# 划分训练集和测试集
train_set = instructions[:train_size]
test_set = instructions[train_size:]

with open('stanford_alpaca/RJUA_stanford_alpaca_train_v1.json', 'w', encoding='utf-8') as file:
    json.dump(train_set, file, ensure_ascii=False, indent=4)

with open('stanford_alpaca/RJUA_stanford_alpaca_test_v1.json', 'w', encoding='utf-8') as file:
    json.dump(test_set, file, ensure_ascii=False, indent=4)

In [None]:
s = "{\n    \"instruction\": \"解释海啸橙色警报意味着什么？\",\n    \"output\": \"海啸橙色警报表示预计海啸波幅将在1至3米之间，需要采取紧急防范措施。\"\n}, {\n    \"instruction\": \"如果海啸橙色警报发布，我们应该怎么做？\",\n    \"output\": \"应迅速撤离至高地或安全区域，避免前往沿海地带。\"\n}, {\n    \"instruction\": \"海啸橙色警报是由什么引起的？\",\n    \"output\": \"海啸橙色警报通常由地震或其他因素引起。\"\n}, {\n    \"instruction\": \"海啸橙色警报的波幅范围是多少？\",\n    \"output\": \"海啸橙色警报的波幅范围是1米到3米。\"\n}, {\n    \"instruction\": \"海啸橙色警报发布后，哪些区域会受到影响？\",\n    \"output\": \"海啸橙色警报发布后，我国沿岸区域会受到影响。\"\n}, {\n"


import re
import json

def extract_json_objects(text):
    """提取所有合法JSON对象"""
    # 使用正则表达式匹配最外层大括号结构（处理换行和空格）
    pattern = r'\{[^{}]*\}'
    matches = re.findall(pattern, text, re.DOTALL)
    
    results = []
    for match in matches:
        try:
            # 清理可能的尾随逗号（处理类似 }, { 的情况）
            cleaned = match.strip().rstrip(',')
            # 转换为Python字典验证格式
            data = json.loads(cleaned)
            results.append(data)
        except json.JSONDecodeError as e:
            print(f"发现无效JSON片段：{match[:50]}... （错误信息：{e}）")
    return results

# 示例数据（模拟包含多个JSON对象的字符串）
input_str = s

# 执行提取
extracted = extract_json_objects(input_str)

# 打印结果
print(f"共找到{len(extracted)}个有效对象：")
for i, item in enumerate(extracted, 1):
    print(f"对象{i}: {json.dumps(item, ensure_ascii=False)}")

共找到5个有效对象：
对象1: {"instruction": "解释海啸橙色警报意味着什么？", "output": "海啸橙色警报表示预计海啸波幅将在1至3米之间，需要采取紧急防范措施。"}
对象2: {"instruction": "如果海啸橙色警报发布，我们应该怎么做？", "output": "应迅速撤离至高地或安全区域，避免前往沿海地带。"}
对象3: {"instruction": "海啸橙色警报是由什么引起的？", "output": "海啸橙色警报通常由地震或其他因素引起。"}
对象4: {"instruction": "海啸橙色警报的波幅范围是多少？", "output": "海啸橙色警报的波幅范围是1米到3米。"}
对象5: {"instruction": "海啸橙色警报发布后，哪些区域会受到影响？", "output": "海啸橙色警报发布后，我国沿岸区域会受到影响。"}


JSONDecodeError: Extra data: line 4 column 2 (char 91)