In [None]:
!pip install openai==0.28 jsonlines datasets ptcompletion

In [None]:
# Log in to the Hugging Face Hub from the shell.
# NOTE(review): presumably required for the dataset load / private push_to_hub
# calls in later cells — confirm.
!huggingface-cli login

# 分块

In [None]:
from datasets import load_dataset

def sliding_window_split(text, block_size=4000, overlap=1000):
    """
    Split text into fixed-size character windows that overlap each other.

    The first window covers ``text[0:block_size]``. Every following window
    starts ``overlap`` characters before the end of the previous one, so
    consecutive windows share exactly ``overlap`` characters. The last window
    may be shorter than ``block_size``.

    Args:
        text (str): Text to split. ``None`` or an empty string yields ``[]``.
        block_size (int): Number of characters per window; must be positive.
        overlap (int): Number of characters shared by consecutive windows;
            must satisfy ``0 <= overlap < block_size``.

    Returns:
        list: The windows, in order of appearance in ``text``.

    Raises:
        ValueError: If ``block_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``block_size``.
    """
    if block_size <= 0:
        raise ValueError("block_size must be a positive integer")
    # overlap >= block_size would never advance the window (infinite loop);
    # a negative overlap would silently skip text between windows.
    if not 0 <= overlap < block_size:
        raise ValueError("overlap must satisfy 0 <= overlap < block_size")
    if not text:
        return []

    chunks = []
    start = 0
    end = 0
    # Keep emitting windows until the previous window reached the end of text.
    while end < len(text):
        end = start + block_size
        chunks.append(text[start:end])
        # Step back so the next window re-reads the last `overlap` characters.
        start = end - overlap
    return chunks

# Load (or create) the source dataset
dataset = load_dataset('xxxx')


def _with_chunks(example):
    """Return the example's text and meta plus its sliding-window chunks."""
    return {
        'text': example['text'],
        'meta': example['meta'],
        'chunks': sliding_window_split(example['text']),
    }


# Apply the sliding-window chunking to every example
chunked_dataset = dataset.map(_with_chunks)

# Sample check: character length of the second chunk of the first train example
first_train_example = chunked_dataset['train'][0]
print(len(first_train_example['chunks'][1]))


In [None]:
# Upload the chunked dataset to the Hugging Face Hub as a private repository.
# NOTE(review): the repo id passed here is an empty string — presumably redacted;
# set the real repo id before running this cell.
chunked_dataset.push_to_hub("", private=True)

In [None]:
#等等等等

# 提示词

In [None]:
# Extraction prompt (v6). The text block to analyze is expected to be appended
# after the trailing "Text block:" line.
# The literal is a RAW string on purpose: it contains LaTeX and JSON escapes
# (`\angle`, `\Gamma`, `\overline`, `\n`, `theorem\lemma\example`). In a normal
# string `\a` becomes the BEL control character and `\n` becomes a real line
# break inside the JSONL examples, which corrupts what the model is shown.
promptv6=r'''Analyze the text block from a mathematics book and extract content according to the following specifications:
1. **Content to Include**: Only include theorems with clear proofs, and clear, explicit examples and exercises.
2. **Problem*: Start with a brief description of the problem context, providing its background, such as "This is a problem about abstract algebra, group theory, abelian groups.". The problem should be as detailed as possible. Each problem must be completely self-contained, providing all the contextual information one needs to understand and solve the problem. Assume common math knowledge but ensure that any specific context variables or lemma, theorems to this problem are explicitly included. If a problem or solution refers to another theorem, use the full content of that theorem, avoiding phrases like 'theorem\lemma\example 3'.
3. **Solution**: If and only if a problem has a clear and complete solution, include it in the solution field. Solutions should be provided in a step-by-step format, like"Step1:...\nStep2:...", ensuring clarity and completeness. The solution should be as comprehensive as possible, containing all the background, details and computation that you can find in the context.
4. **Exclusivity**: Include as much problems as you can find that meet these criteria and exclude any that do not.
5. **Is or isnot Euclid Geometry**: If the problem is a Geometry problem, please output true in the 'isGeometry' field. Otherwise, output false. Remember both Geometry and not Geometry are welcome to output.
6. Must not contain words like 'lemma3.1' or 'example3' or 'theorm2'. The reader won't understand it. Solutions should be provided in a step-by-step format

Example of unexpected output:
{"problem": "Fill in the two similar calculations in the proof of Lemma 1.18.", "isGeometry": false, "solution": "null", "answer": "null"} (The problem refer to a Lemma but don't use the full content of it, you must avoiding phrases like 'theorem\lemma\example 3')
{"problem": "Prove the inversion distance formula.", "isGeometry": false, "solution": "null", "answer": "null"}(The problem refer to a formula but don't explain it in detail)
{"problem": "Problem 1.37 (BAMO 1999/2). Let $O=(0,0), A=(0, a)$, and $B=(0, b)$, where $0<a<b$ are reals. Let $\Gamma$ be a circle with diameter $\overline{A B}$ and let $P$ be any other point on $\Gamma$. Line $P A$ meets the $x$-axis again at $Q$. Prove that $\angle B Q P=\angle B O P$.", "isGeometry": true, "solution": "null", "answer": "null"}(must not contain phrase like Problem 1.37,solution must be step-by-step)

Example of expected output:(please don't repeat it in your answer)
{}
{"problem": "This is a problem about geometry, area of triangle. Prove that $m(A B C)=m(B C A)=m(C A B)$ and $m(A C B)=m(B A C)=m(C B A)$. ", "isGeometry": true, "solution": "null", "answer": "null"}
{"problem": "This is a basic arithmetic problem. If x + 2 = 4, what is x?", "isGeometry": false, "solution": "Step1: Subtracting 2 from both sides of the equation\n Step2: We have x = 2.", "answer": "2"}
{"problem": "This is a problem about number theory, prime numbers. Every even integer greater than 2 can be expressed as the sum of two primes. Proof:", "isGeometry": false, "solution": "null", "answer": "null"}

Return the extracted content in JSONL format, with each entry containing 'problem', 'isGeometry', 'solution', and 'answer' (if applicable). If no content meets these criteria above, return an empty JSONL line."
Remember, You Must not use words like 'lemma3.1' or 'example3' or 'theorem2'. Output as much problems as you can find that meet these criteria.
Text block:
'''


# 并发提取

In [None]:
import openai
# Redirect the module-level openai client configuration (pre-1.0 style, matching
# the openai==0.28 pin in the install cell) to the DeepSeek endpoint.
openai.api_base = "https://api.deepseek.com/"
# Load the train split of the source dataset.
# NOTE(review): '----' looks like a redacted dataset id — fill in before running.
# NOTE(review): `load_dataset` is imported in the chunking cell, so this cell
# relies on that cell having been executed first.
d=load_dataset('----',split='train')

class deepseekTask(OpenAITask):
    """Task subclass that accepts every completion and returns it unmodified.

    NOTE(review): `OpenAITask` is not imported in the visible cells; presumably
    it comes from `ptcompletion` — confirm and import it before this cell runs.
    """

    def validate(self, completion:str):
        """Decide whether a generated completion has the intended format.

        No format check is applied here: every completion is accepted.

        Returns:
            bool: Always True.
        """
        return True

    def postprocess(self, completion:str):
        """Wrap the raw completion text in a result dict.

        Returns:
            dict: The completion stored under the key 'genrated_output'. The
            key's spelling is kept as-is because the result-writing cell reads
            exactly this key.
        """
        wrapped = {'genrated_output': completion}
        return wrapped



In [None]:
import random

# Bounds of the candidate index range and the number of indices to draw
range_start = 0
range_end = 10000
number_of_samples = 1000

# Draw `number_of_samples` distinct indices from [range_start, range_end] (inclusive)
random_numbers = random.sample(
    range(range_start, range_end + 1),
    number_of_samples,
)

print(random_numbers)


def _chat_messages(sample_index):
    """Build the system + user message pair for one sampled index."""
    system_message = {"role": "system", "content": "You are a helpful assistant expert in MATH."}
    user_message = {"role": "user", "content": generate_prompt(sample_index)}
    return [system_message, user_message]


# One conversation per sampled index, in sampling order
messages_list = [_chat_messages(sample_index) for sample_index in random_numbers]

# NOTE(review): generate_prompt, generation_config, model and api_key are not
# defined in the visible cells — this cell relies on them existing in the kernel.
# Task ids are positions in messages_list (not the sampled indices themselves);
# only the first 500 tasks are kept.
tasks = [
    deepseekTask(
        id=position,
        messages=conversation,
        generation_config=generation_config,
        model=model,
        api_key=api_key,
    )
    for position, conversation in enumerate(messages_list)
][:500]


In [None]:
import json

def append_to_jsonl(file_path, idx, output):
    """Append one task result as a single JSON line to a JSONL file.

    The record holds the sampled index (looked up in the global
    `random_numbers` at position `idx`), the original problem text taken from
    the global `similar` mapping, and the given output.

    NOTE(review): `similar` is not defined in the visible cells; it must exist
    in the kernel and be indexable by the sampled index, with a "problem_ta"
    field — confirm where it comes from.

    Args:
        file_path: Path of the JSONL file; opened in append mode.
        idx: Position in `random_numbers` of the sample this output belongs to.
        output: JSON-serializable task output to store.
    """
    sample_id = random_numbers[idx]
    record = {
        "id": sample_id,
        "ori_problem": similar[sample_id]["problem_ta"],
        "output": output,
    }
    # One JSON document per line keeps the file valid JSONL
    line = json.dumps(record) + '\n'
    with open(file_path, 'a') as file:
        file.write(line)

In [None]:
from ptcompletion import TaskQueue
# Queue that runs the tasks with the limits given below
# (exact semantics of each limit are defined by ptcompletion's TaskQueue).
tq = TaskQueue(requests_per_minute=200, max_rounds=3, max_requests_per_proc=16, log_file='tasks_1000.log')

# Submit the tasks in batches of 500. The number of batches is derived from
# len(tasks) rather than hard-coded, so no empty batches are submitted and no
# tasks beyond a fixed batch count are silently skipped.
batch_size = 500
for batch_start in range(0, len(tasks), batch_size):
    completed_tasks = tq.start(tasks[batch_start:batch_start + batch_size])
    # NOTE(review): results are matched to samples by their position in
    # `completed_tasks`; this assumes tq.start returns every task of the batch
    # in submission order — confirm, otherwise ids in the output file misalign.
    for idx, t in enumerate(completed_tasks):
        append_to_jsonl('/content/drive/MyDrive/A.jsonl', idx + batch_start, t.result['genrated_output'])
    # Drop the reference to this batch's results before the next batch starts
    del completed_tasks