In [None]:
import os
import sys

# Directory that contains the `optimal_sampling` package imported below.
# The previous literal was a truncated path fused with the full path
# ("…/optimal_samplin" + "/diancpfs/…"), which looks like a paste error and
# cannot be the intended directory. Set OPTIMAL_SAMPLING_PACKAGE_DIR in the
# environment to point at a different checkout.
OPTIMAL_SAMPLING_PACKAGE_DIR = os.environ.get(
    "OPTIMAL_SAMPLING_PACKAGE_DIR",
    "/diancpfs/user/guobin/verl/recipe/optimal_sampling/optimal_sampling_package",
)

# Guarded so that re-running the cell does not stack duplicate entries.
if OPTIMAL_SAMPLING_PACKAGE_DIR not in sys.path:
    sys.path.insert(0, OPTIMAL_SAMPLING_PACKAGE_DIR)

In [None]:
from optimal_sampling import OptimalSamplingV1


# System prompt given to the teacher ("oracle") side of the sampler.
# Raw strings keep the LaTeX backslash in \boxed{} literal.
TEACHER_SYSTEM_PROMPT = (
    r"You are an analytical thinker who breaks down problems methodically. "
    r"You have insight into effective solution strategies. "
    r"Demonstrate clear thinking that naturally reaches the correct answer, and put your final answer within \boxed{}"
)

# System prompt given to the student (theta) side of the sampler.
STUDENT_SYSTEM_PROMPT = r"Please reason step by step, and put your final answer within \boxed{}"

# Build the sampler. Teacher and student currently use the same checkpoint;
# they differ only in their system prompts and in the prompts passed to
# generate().
# NOTE(review): gpu_memory_utilization / max_num_seqs look like inference
# engine settings forwarded by OptimalSamplingV1 — confirm against the package.
sampler = OptimalSamplingV1(
    model_teacher="Qwen/Qwen3-1.7B",
    model_theta="Qwen/Qwen3-1.7B",
    teacher_system_prompt=TEACHER_SYSTEM_PROMPT,
    theta_system_prompt=STUDENT_SYSTEM_PROMPT,
    alpha_method="kl_symmetry",
    track_alpha_stats=False,
    enable_chat_template=True,
    gpu_memory_utilization=0.4,
    max_num_seqs=3,
)


In [None]:
# Two hand-picked math problems with reference ("oracle") solutions.
# All strings are raw so that LaTeX backslashes stay literal; inside a raw
# string a command must therefore be written with a single backslash
# (the previous `\\frac` in problem1 produced two literal backslashes).
problem1 = r"The operation $\otimes$ is defined for all nonzero numbers by $a \otimes b = \frac{a^{2}}{b}$. Determine $[(1 \otimes 2) \otimes 3] - [1 \otimes (2 \otimes 3)]$."
oracle_answer1 = r"1. **Apply the operation $\otimes$ to the innermost parentheses first:** \[ (1 \otimes 2) \otimes 3 = \left(\frac{1^2}{2}\right) \otimes 3 = \frac{1}{2} \otimes 3 \] \[ 1 \otimes (2 \otimes 3) = 1 \otimes \left(\frac{2^2}{3}\right) = 1 \otimes \frac{4}{3} \]  2. **Calculate each part using the definition of $\otimes$:** \[ \frac{1}{2} \otimes 3 = \frac{\left(\frac{1}{2}\right)^2}{3} = \frac{\frac{1}{4}}{3} = \frac{1}{12} \] \[ 1 \otimes \frac{4}{3} = \frac{1^2}{\frac{4}{3}} = \frac{1}{\frac{4}{3}} = \frac{3}{4} \]  3. **Subtract the two results:** \[ \left(\frac{1}{12}\right) - \left(\frac{3}{4}\right) = \frac{1}{12} - \frac{9}{12} = -\frac{8}{12} = -\frac{2}{3} \]  4. **Conclude with the final answer:** \[ \boxed{-\frac{2}{3}} \]"

problem2 = r"Doug constructs a square window using $8$ equal-size panes of glass. The ratio of the height to width for each pane is $5 : 2$, and the borders around and between the panes are $2$ inches wide. In inches, what is the side length of the square window?"
# NOTE(review): this reference solution stops after deriving the width and
# height expressions and never states a final answer — confirm whether the
# truncation is intentional before relying on it as an oracle.
oracle_answer2 = r"1. **Identify the dimensions of each pane**: Given that the ratio of the height to the width of each pane is $5:2$, let the height of each pane be $5x$ inches and the width be $2x$ inches.  2. **Calculate the total dimensions of the window**: The window is constructed with $8$ panes arranged in $2$ rows and $4$ columns. The borders between and around the panes are $2$ inches wide.  3. **Calculate the total width of the window**: - There are $4$ panes in each row, each with a width of $2x$ inches. - There are $3$ borders of $2$ inches each between the $4$ panes and $2$ borders of $2$ inches each on the sides of the window. - Therefore, the total width of the window is $4(2x) + 5(2) = 8x + 10$ inches.  4. **Calculate the total height of the window**: - There are $2$ panes in each column, each with a height of $5x$ inches. - There is $1$ border of $2$ inches between the $2$ panes and $2$ borders of $2$ inches each at the top and bottom of the window. - Therefore, the total height of the window is $2(5x) + 3(2) = 10x + 6$ inches."

# sampler.theta_system_prompt=(
#     "Given the problem and correct answer, "
#     "generate detailed reasoning steps."
# ),
#
# sampler.teacher_system_prompt="You are a math problem solver."

# Prompt layouts. Only the template text is parsed by str.format, so braces
# inside the substituted LaTeX are inserted untouched.
TEACHER_PROMPT_TEMPLATE = "Problem: {problem}\nAnswer: {answer}\n"
STUDENT_PROMPT_TEMPLATE = "Problem: {problem}"

# The teacher sees the reference answer (answer-conditioned generation).
teacher_prompt1 = TEACHER_PROMPT_TEMPLATE.format(problem=problem1, answer=oracle_answer1)
teacher_prompt2 = TEACHER_PROMPT_TEMPLATE.format(problem=problem2, answer=oracle_answer2)

# The student must not see the answer (it has to reason directly).
student_prompt1 = STUDENT_PROMPT_TEMPLATE.format(problem=problem1)
student_prompt2 = STUDENT_PROMPT_TEMPLATE.format(problem=problem2)

# Generate with optimal mixing (balancing quality against staying on-policy).
# The two lists are index-aligned: entry i of each refers to the same problem.
teacher_batch = [teacher_prompt1, teacher_prompt2]  # received by the teacher
student_batch = [student_prompt1, student_prompt2]  # received by the student

outputs = sampler.generate(
    prompts=teacher_batch,
    theta_prompts=student_batch,
    max_tokens=4096,
    temperature=0.8,
)

In [None]:
# Display the first entry of the generated texts (index 0 of the batch).
outputs.generated_texts[0]

In [None]:
# Display the second entry of the generated texts (index 1 of the batch).
outputs.generated_texts[1]

In [None]:
import datasets

# How many leading training examples to turn into prompt pairs.
NUM_EXAMPLES = 10

data = datasets.load_dataset("agentica-org/DeepScaleR-Preview-Dataset", split="train")

# Build index-aligned prompt lists:
#   prompts_s — the bare problem statement (student side)
#   prompts_t — the problem followed by the reference solution as a hint
#               (teacher side)
prompts_t, prompts_s = [], []
# min() keeps the loop in range if the split has fewer rows than requested.
for idx in range(min(NUM_EXAMPLES, len(data))):
    example = data[idx]  # fetch the row once instead of once per field
    prompt_s = example['problem']
    prompt_t = f"{prompt_s}\n\n##Hint\n{example['solution']}"

    prompts_s.append(prompt_s)
    prompts_t.append(prompt_t)


In [None]:
# Sampling settings for the dataset batch.
generation_kwargs = {"max_tokens": 1024, "temperature": 0.8}

# prompts_t goes to the teacher, prompts_s to the student; the two lists are
# index-aligned. This rebinds `outputs`, replacing any earlier result.
outputs = sampler.generate(
    prompts=prompts_t,
    theta_prompts=prompts_s,
    **generation_kwargs,
)

In [None]:
# Print each question next to its generated answer. Pairing the two sequences
# with zip() removes the hard-coded example count, so the loop stays correct
# if the number of prompts changes.
for i, (question, answer) in enumerate(zip(prompts_s, outputs.generated_texts)):
    print(f"#Index {i}\nQuestion:{question}\nAnswer:{answer}\n\n")

In [None]:
# Display the model configuration held by the sampler's underlying engine.
# NOTE(review): the attribute chain suggests `sampler.llm` wraps an inference
# engine object — confirm against the optimal_sampling package.
sampler.llm.llm_engine.model_config