In [1]:
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
from unsloth import is_bfloat16_supported
import torch
# Sequence budget shared by the model loader below.
max_seq_length = 2 * 4096  # can increase for longer reasoning traces
# LoRA rank; reused as the adapter rank, the vLLM max LoRA rank and lora_alpha.
lora_rank = 64  # larger rank = smarter, but slower

# Load the base model and tokenizer (4-bit weights, vLLM-backed generation).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/meta-Llama-3.1-8B-Instruct",
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # False for LoRA 16bit
    fast_inference=True,  # enable vLLM fast inference
    max_lora_rank=lora_rank,
    gpu_memory_utilization=0.6,  # reduce if out of memory
)

# Wrap the loaded model with LoRA adapters; `model` is rebound to the PEFT model.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,  # any number > 0; suggested 8, 16, 32, 64, 128
    target_modules=[
        # attention projections (remove these four if out of memory)
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # MLP projections
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=lora_rank,
    use_gradient_checkpointing="unsloth",  # enable long context finetuning
    random_state=3407,
)

INFO 03-08 06:10:08 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.6: Fast Llama patching. Transformers: 4.49.0. vLLM: 0.7.3.
   \\   /|    NVIDIA H100 PCIe. Num GPUs = 1. Max memory: 79.097 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/meta-llama-3.1-8b-instruct-unsloth-bnb-4bit with actual GPU utilization = 59.59%
Unsloth: Your GPU has CUDA compute capability 9.0 with VRAM = 79.1 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 8192. Num Sequences = 226.
Unsloth: vLLM's KV Cache can use up to 40.81 GB. Also swap space = 6 GB.
INFO 03-08 06:10:17 config.py:549] This model supports multiple tasks: {'classify', 'embed', 'reward', 



INFO 03-08 06:10:18 loader.py:1089] Loading weights with BitsAndBytes quantization.  May take a while ...
INFO 03-08 06:10:19 weight_utils.py:254] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-08 06:10:23 model_runner.py:1115] Loading model weights took 5.5976 GB
INFO 03-08 06:10:23 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-08 06:10:25 worker.py:267] Memory profiling takes 2.03 seconds
INFO 03-08 06:10:25 worker.py:267] the current vLLM instance can use total_gpu_memory (79.10GiB) x gpu_memory_utilization (0.60) = 47.14GiB
INFO 03-08 06:10:25 worker.py:267] model weights take 5.60GiB; non_torch_memory takes 0.14GiB; PyTorch activation peak memory takes 1.13GiB; the rest of the memory reserved for KV Cache is 40.27GiB.
INFO 03-08 06:10:25 executor_base.py:111] # cuda blocks: 20619, # CPU blocks: 3072
INFO 03-08 06:10:25 executor_base.py:116] Maximum concurrency for 8192 tokens per request: 40.27x
INFO 03-08 06:10:29 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error 

Capturing CUDA graph shapes: 100%|██████████| 32/32 [00:22<00:00,  1.44it/s]

INFO 03-08 06:10:51 model_runner.py:1562] Graph capturing finished in 22 secs, took 0.83 GiB
INFO 03-08 06:10:51 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 28.72 seconds



Unsloth 2025.3.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
# NOTE(review): `re`, `load_dataset`, `Dataset`, `search`, `get_question_answer`
# and `get_question_count` are not referenced in the visible cells; they are kept
# in case later cells rely on them.
import re

from datasets import load_dataset, Dataset

from search_module import (
    search,
    get_question_answer,
    get_question_count,
)
from rl_helpers import get_qa_dataset

# `get_qa_dataset` returns two values, unpacked here as the train and test splits.
(train_dataset, test_dataset) = get_qa_dataset()

<a name="Train"></a>
### Train the model

Now set up the GRPO trainer and its configuration.

In [5]:
import os

# Project name for Weights & Biases logging. The training config in this
# notebook sets report_to to "none", so this presumably has no effect until
# reporting is switched on - confirm before relying on it.
os.environ.update(WANDB_PROJECT="bootstrap-search-rl")

In [6]:
import UnslothGRPOTrainerTemp

# GRPO run configuration, grouped by concern.
# NOTE(review): the recorded training banner in this notebook reports
# "Batch size per device = 16" while this cell requests 8 - either the saved
# output comes from a different configuration or the trainer adjusts the value;
# confirm before comparing runs.
training_args = UnslothGRPOTrainerTemp.UnslothGRPOConfig(
    # --- rollout / generation ---
    use_vllm=True,  # use vLLM for fast inference
    use_agentic_generate=True,  # use agentic generation
    num_generations=8,  # decrease if out of memory
    max_prompt_length=1024,
    max_completion_length=1024,
    # --- optimizer and learning-rate schedule ---
    optim="paged_adamw_8bit",
    learning_rate=5e-6,
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    max_grad_norm=0.1,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    # --- numeric precision: bf16 when supported, otherwise fp16 ---
    bf16=is_bfloat16_supported(),
    fp16=not is_bfloat16_supported(),
    # --- batching ---
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  # increase to 4 for smoother training
    # --- run length, logging and checkpoints ---
    # num_train_epochs = 1,  # use instead of max_steps for a full training run
    max_steps=101,
    save_steps=50,
    logging_steps=1,
    report_to="none",  # can use Weights & Biases
    output_dir="full_local_training",
)

In [7]:


import rl_helpers
# importlib.reload(rl_helpers)


def agentic_generate(
    prompts: list[str],
    generate_fn,
    max_generations: int = 6,
):
    """Roll out a batch of prompts through the agent loop in ``rl_helpers``.

    Args:
        prompts: Prompt strings to run the agent on.
        generate_fn: Generation callable, forwarded unchanged to
            ``rl_helpers.run_agent``.
        max_generations: Forwarded unchanged to ``rl_helpers.run_agent``;
            presumably an upper bound on generation rounds per prompt -
            confirm in ``rl_helpers``.

    Returns:
        Whatever ``rl_helpers.run_agent`` returns for this batch.
    """
    # Look `run_agent` up on the module at call time instead of relying on a
    # notebook-global alias that is only bound further down in this cell. This
    # keeps the function usable as soon as it is defined and picks up a
    # reloaded `rl_helpers` without re-running the cell.
    return rl_helpers.run_agent(generate_fn, tokenizer, prompts, max_generations)


# Attached as a plain attribute (not a bound method), so it is called without
# `self`. Presumably consumed by the trainer when `use_agentic_generate` is
# enabled - confirm in UnslothGRPOTrainerTemp.
model.agentic_generate = agentic_generate


from vllm import SamplingParams

# Sampling settings used only for the verifier generations below.
verifier_sampling_params = SamplingParams(temperature=0.1, top_p=0.95, max_tokens=4096)


def verifier_generate_fn(inputs):
    """Generate with the notebook's model using the verifier sampling settings.

    `inputs` is passed straight through to ``model.fast_generate`` and its
    result is returned unchanged.
    """
    verifier_outputs = model.fast_generate(
        inputs, sampling_params=verifier_sampling_params
    )
    return verifier_outputs


# Notebook-level handles onto the rl_helpers entry points.
run_agent = rl_helpers.run_agent
reward_formatting = rl_helpers.reward_formatting
# The correctness reward is built from the verifier generation function and the tokenizer.
reward_correctness = rl_helpers.build_reward_correctness_fn(
    verifier_generate_fn,
    tokenizer,
)

import UnslothGRPOTrainerTemp

# Build the GRPO trainer from the LoRA model, tokenizer, the two reward
# functions, the training config and the training split defined in earlier cells.
trainer = UnslothGRPOTrainerTemp.UnslothGRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[reward_correctness, reward_formatting],
    args=training_args,
    train_dataset=train_dataset,
)

In [8]:
# Start training with the `trainer` built in the previous cell. The call is left
# as the cell's last expression so its return value is displayed; per-step
# metrics (loss, reward, KL, per-reward-function means) appear in the output.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 608 | Num Epochs = 1 | Total steps = 101
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 1 x 1) = 16
 "-____-"     Trainable parameters = 167,772,160/4,796,452,864 (3.50% trained)


["What was the cause of the Lunar Module Pilot's dizziness during his postflight physical examination?", "What was the cause of the Lunar Module Pilot's dizziness during his postflight physical examination?", "What was the cause of the Lunar Module Pilot's dizziness during his postflight physical examination?", "What was the cause of the Lunar Module Pilot's dizziness during his postflight physical examination?", "What was the cause of the Lunar Module Pilot's dizziness during his postflight physical examination?", "What was the cause of the Lunar Module Pilot's dizziness during his postflight physical examination?", "What was the cause of the Lunar Module Pilot's dizziness during his postflight physical examination?", "What was the cause of the Lunar Module Pilot's dizziness during his postflight physical examination?", 'What was the maximum differential pressure in the ascent stage oxygen tank 2?', 'What was the maximum differential pressure in the ascent stage oxygen tank 2?', 'What

  completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 67.80it/s, est. speed input: 9817.59 toks/s, output: 135.64 toks/s]


rewards_per_func: tensor([0.2500, 0.2625], device='cuda:0')


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / reward_correctness,rewards / reward_formatting
1,0.0,0.5125,0.695523,177.375,0.0,0.25,0.2625
2,0.0,0.64375,0.697078,189.3125,0.0,0.25,0.39375
3,0.0005,0.68125,0.840199,215.5,0.012084,0.375,0.30625
4,0.0001,0.49375,0.660192,199.25,0.002776,0.1875,0.30625
5,0.0003,0.51875,0.539054,213.75,0.007726,0.125,0.39375
6,0.0002,0.325,0.495304,185.5625,0.003768,0.0625,0.2625
7,0.0002,0.58125,0.542687,221.1875,0.004105,0.1875,0.39375
8,0.0002,0.83125,0.824206,240.125,0.004447,0.4375,0.39375
9,0.002,0.51875,0.539054,200.875,0.050854,0.125,0.39375
10,0.0015,0.51875,0.517217,175.875,0.036272,0.125,0.39375


['What was the problem with the canned sandwich spreads?', 'What was the problem with the canned sandwich spreads?', 'What was the problem with the canned sandwich spreads?', 'What was the problem with the canned sandwich spreads?', 'What was the problem with the canned sandwich spreads?', 'What was the problem with the canned sandwich spreads?', 'What was the problem with the canned sandwich spreads?', 'What was the problem with the canned sandwich spreads?', 'What was the title of the first supplement for Apollo 7?', 'What was the title of the first supplement for Apollo 7?', 'What was the title of the first supplement for Apollo 7?', 'What was the title of the first supplement for Apollo 7?', 'What was the title of the first supplement for Apollo 7?', 'What was the title of the first supplement for Apollo 7?', 'What was the title of the first supplement for Apollo 7?', 'What was the title of the first supplement for Apollo 7?']


  completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 86.07it/s, est. speed input: 14396.31 toks/s, output: 172.21 toks/s]


rewards_per_func: tensor([0.2500, 0.3937], device='cuda:0')
Unsloth: Will smartly offload gradients to save VRAM!
["What is the probable cause of the problem with the pressure transducer's plating?", "What is the probable cause of the problem with the pressure transducer's plating?", "What is the probable cause of the problem with the pressure transducer's plating?", "What is the probable cause of the problem with the pressure transducer's plating?", "What is the probable cause of the problem with the pressure transducer's plating?", "What is the probable cause of the problem with the pressure transducer's plating?", "What is the probable cause of the problem with the pressure transducer's plating?", "What is the probable cause of the problem with the pressure transducer's plating?", 'Where did the spacecraft land?', 'Where did the spacecraft land?', 'Where did the spacecraft land?', 'Where did the spacecraft land?', 'Where did the spacecraft land?', 'Where did the spacecraft land?', '

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 87.91it/s, est. speed input: 13433.45 toks/s, output: 175.88 toks/s]


rewards_per_func: tensor([0.3750, 0.3062], device='cuda:0')
['What was the reason for the substitution of the backup Command Module Pilot?', 'What was the reason for the substitution of the backup Command Module Pilot?', 'What was the reason for the substitution of the backup Command Module Pilot?', 'What was the reason for the substitution of the backup Command Module Pilot?', 'What was the reason for the substitution of the backup Command Module Pilot?', 'What was the reason for the substitution of the backup Command Module Pilot?', 'What was the reason for the substitution of the backup Command Module Pilot?', 'What was the reason for the substitution of the backup Command Module Pilot?', 'What was the objective of television in earth orbit that could not be achieved?', 'What was the objective of television in earth orbit that could not be achieved?', 'What was the objective of television in earth orbit that could not be achieved?', 'What was the objective of television in earth orb

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 16.61it/s, est. speed input: 2705.32 toks/s, output: 100.85 toks/s]


rewards_per_func: tensor([0.1875, 0.3063], device='cuda:0')
['What was the targeted impact point of the discarded S-IVB stage on the lunar surface?', 'What was the targeted impact point of the discarded S-IVB stage on the lunar surface?', 'What was the targeted impact point of the discarded S-IVB stage on the lunar surface?', 'What was the targeted impact point of the discarded S-IVB stage on the lunar surface?', 'What was the targeted impact point of the discarded S-IVB stage on the lunar surface?', 'What was the targeted impact point of the discarded S-IVB stage on the lunar surface?', 'What was the targeted impact point of the discarded S-IVB stage on the lunar surface?', 'What was the targeted impact point of the discarded S-IVB stage on the lunar surface?', 'Where did the landing of the spacecraft occur?', 'Where did the landing of the spacecraft occur?', 'Where did the landing of the spacecraft occur?', 'Where did the landing of the spacecraft occur?', 'Where did the landing of t

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 79.63it/s, est. speed input: 11626.86 toks/s, output: 159.33 toks/s]


rewards_per_func: tensor([0.1250, 0.3937], device='cuda:0')
['What was the reason for the roll-pitch coupling observed during the passive thermal control mode maneuver?', 'What was the reason for the roll-pitch coupling observed during the passive thermal control mode maneuver?', 'What was the reason for the roll-pitch coupling observed during the passive thermal control mode maneuver?', 'What was the reason for the roll-pitch coupling observed during the passive thermal control mode maneuver?', 'What was the reason for the roll-pitch coupling observed during the passive thermal control mode maneuver?', 'What was the reason for the roll-pitch coupling observed during the passive thermal control mode maneuver?', 'What was the reason for the roll-pitch coupling observed during the passive thermal control mode maneuver?', 'What was the reason for the roll-pitch coupling observed during the passive thermal control mode maneuver?', 'What was the change made to the cryogenic oxygen tank desi

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 77.94it/s, est. speed input: 12951.76 toks/s, output: 155.96 toks/s]


rewards_per_func: tensor([0.0625, 0.2625], device='cuda:0')
['What section of the document discusses the performance of the lunar module?', 'What section of the document discusses the performance of the lunar module?', 'What section of the document discusses the performance of the lunar module?', 'What section of the document discusses the performance of the lunar module?', 'What section of the document discusses the performance of the lunar module?', 'What section of the document discusses the performance of the lunar module?', 'What section of the document discusses the performance of the lunar module?', 'What section of the document discusses the performance of the lunar module?', "What caused the attitude error bias in the spacecraft's attitude error loop?", "What caused the attitude error bias in the spacecraft's attitude error loop?", "What caused the attitude error bias in the spacecraft's attitude error loop?", "What caused the attitude error bias in the spacecraft's attitude e

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 16.77it/s, est. speed input: 3267.06 toks/s, output: 96.89 toks/s]


rewards_per_func: tensor([0.1875, 0.3937], device='cuda:0')
['What is the consequence of the O-ring entering the chamfer in the breech assembly?', 'What is the consequence of the O-ring entering the chamfer in the breech assembly?', 'What is the consequence of the O-ring entering the chamfer in the breech assembly?', 'What is the consequence of the O-ring entering the chamfer in the breech assembly?', 'What is the consequence of the O-ring entering the chamfer in the breech assembly?', 'What is the consequence of the O-ring entering the chamfer in the breech assembly?', 'What is the consequence of the O-ring entering the chamfer in the breech assembly?', 'What is the consequence of the O-ring entering the chamfer in the breech assembly?', 'What was the cause of the leakage in the Ascent stage tank shutoff valve?', 'What was the cause of the leakage in the Ascent stage tank shutoff valve?', 'What was the cause of the leakage in the Ascent stage tank shutoff valve?', 'What was the cause 

Processed prompts: 100%|██████████| 16/16 [00:02<00:00,  7.64it/s, est. speed input: 1428.26 toks/s, output: 84.52 toks/s]


rewards_per_func: tensor([0.4375, 0.3937], device='cuda:0')
['What was the predicted rupture range for the helium tank on the spacecraft?', 'What was the predicted rupture range for the helium tank on the spacecraft?', 'What was the predicted rupture range for the helium tank on the spacecraft?', 'What was the predicted rupture range for the helium tank on the spacecraft?', 'What was the predicted rupture range for the helium tank on the spacecraft?', 'What was the predicted rupture range for the helium tank on the spacecraft?', 'What was the predicted rupture range for the helium tank on the spacecraft?', 'What was the predicted rupture range for the helium tank on the spacecraft?', 'What type of corrective action is being taken to prevent electrolyte shorts associated with the battery anomaly?', 'What type of corrective action is being taken to prevent electrolyte shorts associated with the battery anomaly?', 'What type of corrective action is being taken to prevent electrolyte short

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 77.29it/s, est. speed input: 12679.28 toks/s, output: 154.67 toks/s]


rewards_per_func: tensor([0.1250, 0.3937], device='cuda:0')
["What modification was made to the Lunar Module 16-mm Camera's lens bumper to prevent future problems?", "What modification was made to the Lunar Module 16-mm Camera's lens bumper to prevent future problems?", "What modification was made to the Lunar Module 16-mm Camera's lens bumper to prevent future problems?", "What modification was made to the Lunar Module 16-mm Camera's lens bumper to prevent future problems?", "What modification was made to the Lunar Module 16-mm Camera's lens bumper to prevent future problems?", "What modification was made to the Lunar Module 16-mm Camera's lens bumper to prevent future problems?", "What modification was made to the Lunar Module 16-mm Camera's lens bumper to prevent future problems?", "What modification was made to the Lunar Module 16-mm Camera's lens bumper to prevent future problems?", 'What was the heart rate range of the Command Module Pilot during the entry phase?', 'What was the 

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 84.44it/s, est. speed input: 14667.82 toks/s, output: 168.95 toks/s]


rewards_per_func: tensor([0.1250, 0.3937], device='cuda:0')
['What could have caused the current surge experienced by battery 2?', 'What could have caused the current surge experienced by battery 2?', 'What could have caused the current surge experienced by battery 2?', 'What could have caused the current surge experienced by battery 2?', 'What could have caused the current surge experienced by battery 2?', 'What could have caused the current surge experienced by battery 2?', 'What could have caused the current surge experienced by battery 2?', 'What could have caused the current surge experienced by battery 2?', 'How long did it take to charge battery A in the command module?', 'How long did it take to charge battery A in the command module?', 'How long did it take to charge battery A in the command module?', 'How long did it take to charge battery A in the command module?', 'How long did it take to charge battery A in the command module?', 'How long did it take to charge battery A in

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 84.95it/s, est. speed input: 13629.68 toks/s, output: 169.97 toks/s]


rewards_per_func: tensor([0.1250, 0.3062], device='cuda:0')
['What was used as insulation in the S-Il stage to reduce weight?', 'What was used as insulation in the S-Il stage to reduce weight?', 'What was used as insulation in the S-Il stage to reduce weight?', 'What was used as insulation in the S-Il stage to reduce weight?', 'What was used as insulation in the S-Il stage to reduce weight?', 'What was used as insulation in the S-Il stage to reduce weight?', 'What was used as insulation in the S-Il stage to reduce weight?', 'What was used as insulation in the S-Il stage to reduce weight?', 'What factors affect the degree of attenuator crushing in the described scenario?', 'What factors affect the degree of attenuator crushing in the described scenario?', 'What factors affect the degree of attenuator crushing in the described scenario?', 'What factors affect the degree of attenuator crushing in the described scenario?', 'What factors affect the degree of attenuator crushing in the descr

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 77.19it/s, est. speed input: 13670.53 toks/s, output: 159.29 toks/s]


rewards_per_func: tensor([0.3750, 0.4375], device='cuda:0')
["What was the daily caloric intake of the crew's flight menus?", "What was the daily caloric intake of the crew's flight menus?", "What was the daily caloric intake of the crew's flight menus?", "What was the daily caloric intake of the crew's flight menus?", "What was the daily caloric intake of the crew's flight menus?", "What was the daily caloric intake of the crew's flight menus?", "What was the daily caloric intake of the crew's flight menus?", "What was the daily caloric intake of the crew's flight menus?", 'What was the nominal rating of the secondary lithium hydroxide cartridge for usage capacity?', 'What was the nominal rating of the secondary lithium hydroxide cartridge for usage capacity?', 'What was the nominal rating of the secondary lithium hydroxide cartridge for usage capacity?', 'What was the nominal rating of the secondary lithium hydroxide cartridge for usage capacity?', 'What was the nominal rating of the

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 92.33it/s, est. speed input: 13671.01 toks/s, output: 184.73 toks/s]


rewards_per_func: tensor([0.5000, 0.4375], device='cuda:0')
['What was the condition that would keep the inlet valve in the closed position despite being within design tolerances?', 'What was the condition that would keep the inlet valve in the closed position despite being within design tolerances?', 'What was the condition that would keep the inlet valve in the closed position despite being within design tolerances?', 'What was the condition that would keep the inlet valve in the closed position despite being within design tolerances?', 'What was the condition that would keep the inlet valve in the closed position despite being within design tolerances?', 'What was the condition that would keep the inlet valve in the closed position despite being within design tolerances?', 'What was the condition that would keep the inlet valve in the closed position despite being within design tolerances?', 'What was the condition that would keep the inlet valve in the closed position despite being

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 89.38it/s, est. speed input: 13776.17 toks/s, output: 178.83 toks/s]


rewards_per_func: tensor([0.1875, 0.2188], device='cuda:0')
['What caused the crew difficulty in establishing acceptable initial conditions for the passive thermal control mode?', 'What caused the crew difficulty in establishing acceptable initial conditions for the passive thermal control mode?', 'What caused the crew difficulty in establishing acceptable initial conditions for the passive thermal control mode?', 'What caused the crew difficulty in establishing acceptable initial conditions for the passive thermal control mode?', 'What caused the crew difficulty in establishing acceptable initial conditions for the passive thermal control mode?', 'What caused the crew difficulty in establishing acceptable initial conditions for the passive thermal control mode?', 'What caused the crew difficulty in establishing acceptable initial conditions for the passive thermal control mode?', 'What caused the crew difficulty in establishing acceptable initial conditions for the passive thermal con

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 67.92it/s, est. speed input: 13282.11 toks/s, output: 135.91 toks/s]


rewards_per_func: tensor([0.1250, 0.4812], device='cuda:0')
['What was the speed of the velocity correction made by the lunar module reaction control system?', 'What was the speed of the velocity correction made by the lunar module reaction control system?', 'What was the speed of the velocity correction made by the lunar module reaction control system?', 'What was the speed of the velocity correction made by the lunar module reaction control system?', 'What was the speed of the velocity correction made by the lunar module reaction control system?', 'What was the speed of the velocity correction made by the lunar module reaction control system?', 'What was the speed of the velocity correction made by the lunar module reaction control system?', 'What was the speed of the velocity correction made by the lunar module reaction control system?', 'What was the status of the Advanced Technology Satellite on the day of photography?', 'What was the status of the Advanced Technology Satellite on

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 73.91it/s, est. speed input: 11038.77 toks/s, output: 152.53 toks/s]


rewards_per_func: tensor([0.2500, 0.3062], device='cuda:0')
['What was the amplitude of the vibrations at the spacecraft during S-II boost?', 'What was the amplitude of the vibrations at the spacecraft during S-II boost?', 'What was the amplitude of the vibrations at the spacecraft during S-II boost?', 'What was the amplitude of the vibrations at the spacecraft during S-II boost?', 'What was the amplitude of the vibrations at the spacecraft during S-II boost?', 'What was the amplitude of the vibrations at the spacecraft during S-II boost?', 'What was the amplitude of the vibrations at the spacecraft during S-II boost?', 'What was the amplitude of the vibrations at the spacecraft during S-II boost?', 'What type of aircraft was staged from Patrick AFB, Florida?', 'What type of aircraft was staged from Patrick AFB, Florida?', 'What type of aircraft was staged from Patrick AFB, Florida?', 'What type of aircraft was staged from Patrick AFB, Florida?', 'What type of aircraft was staged from 

  completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 80.87it/s, est. speed input: 10493.32 toks/s, output: 161.81 toks/s]


rewards_per_func: tensor([0.1250, 0.2188], device='cuda:0')
['What was the initial pitch of the spacecraft?', 'What was the initial pitch of the spacecraft?', 'What was the initial pitch of the spacecraft?', 'What was the initial pitch of the spacecraft?', 'What was the initial pitch of the spacecraft?', 'What was the initial pitch of the spacecraft?', 'What was the initial pitch of the spacecraft?', 'What was the initial pitch of the spacecraft?', 'How much propellant was used for the initial separation from the S-IVB, the turnaround maneuver, docking and ejection?', 'How much propellant was used for the initial separation from the S-IVB, the turnaround maneuver, docking and ejection?', 'How much propellant was used for the initial separation from the S-IVB, the turnaround maneuver, docking and ejection?', 'How much propellant was used for the initial separation from the S-IVB, the turnaround maneuver, docking and ejection?', 'How much propellant was used for the initial separation fr

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 74.12it/s, est. speed input: 12631.60 toks/s, output: 148.32 toks/s]


rewards_per_func: tensor([0.0625, 0.3937], device='cuda:0')
['What caused the performance of fuel cells 1 and 3 to degrade within 3 minutes after the Oxygen tank 2 pressure dropped?', 'What caused the performance of fuel cells 1 and 3 to degrade within 3 minutes after the Oxygen tank 2 pressure dropped?', 'What caused the performance of fuel cells 1 and 3 to degrade within 3 minutes after the Oxygen tank 2 pressure dropped?', 'What caused the performance of fuel cells 1 and 3 to degrade within 3 minutes after the Oxygen tank 2 pressure dropped?', 'What caused the performance of fuel cells 1 and 3 to degrade within 3 minutes after the Oxygen tank 2 pressure dropped?', 'What caused the performance of fuel cells 1 and 3 to degrade within 3 minutes after the Oxygen tank 2 pressure dropped?', 'What caused the performance of fuel cells 1 and 3 to degrade within 3 minutes after the Oxygen tank 2 pressure dropped?', 'What caused the performance of fuel cells 1 and 3 to degrade within 3 minutes

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 80.93it/s, est. speed input: 13133.06 toks/s, output: 167.00 toks/s]


rewards_per_func: tensor([0.2500, 0.2625], device='cuda:0')
['What types of data were used to evaluate the structural integrity of the spacecraft during launch and translunar injection?', 'What types of data were used to evaluate the structural integrity of the spacecraft during launch and translunar injection?', 'What types of data were used to evaluate the structural integrity of the spacecraft during launch and translunar injection?', 'What types of data were used to evaluate the structural integrity of the spacecraft during launch and translunar injection?', 'What types of data were used to evaluate the structural integrity of the spacecraft during launch and translunar injection?', 'What types of data were used to evaluate the structural integrity of the spacecraft during launch and translunar injection?', 'What types of data were used to evaluate the structural integrity of the spacecraft during launch and translunar injection?', 'What types of data were used to evaluate the stru

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 69.72it/s, est. speed input: 14359.23 toks/s, output: 143.84 toks/s]


rewards_per_func: tensor([0.0625, 0.4812], device='cuda:0')
['What was the reason for the reaction control isolation valve failure?', 'What was the reason for the reaction control isolation valve failure?', 'What was the reason for the reaction control isolation valve failure?', 'What was the reason for the reaction control isolation valve failure?', 'What was the reason for the reaction control isolation valve failure?', 'What was the reason for the reaction control isolation valve failure?', 'What was the reason for the reaction control isolation valve failure?', 'What was the reason for the reaction control isolation valve failure?', 'What was the primary function of the potable water tank in the spacecraft?', 'What was the primary function of the potable water tank in the spacecraft?', 'What was the primary function of the potable water tank in the spacecraft?', 'What was the primary function of the potable water tank in the spacecraft?', 'What was the primary function of the potab

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 76.27it/s, est. speed input: 12299.49 toks/s, output: 152.65 toks/s]


rewards_per_func: tensor([0.4375, 0.5250], device='cuda:0')
['How much water was withdrawn from the potable tank after powering down?', 'How much water was withdrawn from the potable tank after powering down?', 'How much water was withdrawn from the potable tank after powering down?', 'How much water was withdrawn from the potable tank after powering down?', 'How much water was withdrawn from the potable tank after powering down?', 'How much water was withdrawn from the potable tank after powering down?', 'How much water was withdrawn from the potable tank after powering down?', 'How much water was withdrawn from the potable tank after powering down?', 'What was the velocity change produced by the first transearth midcourse correction?', 'What was the velocity change produced by the first transearth midcourse correction?', 'What was the velocity change produced by the first transearth midcourse correction?', 'What was the velocity change produced by the first transearth midcourse corre

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 71.17it/s, est. speed input: 9244.89 toks/s, output: 142.42 toks/s]


rewards_per_func: tensor([0.3125, 0.2188], device='cuda:0')
['What was the reason for the minus 0.5-ft/sec translation maneuver after service module jettison?', 'What was the reason for the minus 0.5-ft/sec translation maneuver after service module jettison?', 'What was the reason for the minus 0.5-ft/sec translation maneuver after service module jettison?', 'What was the reason for the minus 0.5-ft/sec translation maneuver after service module jettison?', 'What was the reason for the minus 0.5-ft/sec translation maneuver after service module jettison?', 'What was the reason for the minus 0.5-ft/sec translation maneuver after service module jettison?', 'What was the reason for the minus 0.5-ft/sec translation maneuver after service module jettison?', 'What was the reason for the minus 0.5-ft/sec translation maneuver after service module jettison?', 'What was the result of extending the handle from 5/16 to 3/8 inch from the valve locked position?', 'What was the result of extending the 

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 74.51it/s, est. speed input: 13245.02 toks/s, output: 149.12 toks/s]


rewards_per_func: tensor([0.3125, 0.4812], device='cuda:0')
['How long did the installed primary lithium hydroxide cartridge last during the mission?', 'How long did the installed primary lithium hydroxide cartridge last during the mission?', 'How long did the installed primary lithium hydroxide cartridge last during the mission?', 'How long did the installed primary lithium hydroxide cartridge last during the mission?', 'How long did the installed primary lithium hydroxide cartridge last during the mission?', 'How long did the installed primary lithium hydroxide cartridge last during the mission?', 'How long did the installed primary lithium hydroxide cartridge last during the mission?', 'How long did the installed primary lithium hydroxide cartridge last during the mission?', 'What caused the center engine to experience a premature cutoff?', 'What caused the center engine to experience a premature cutoff?', 'What caused the center engine to experience a premature cutoff?', 'What caus

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 91.33it/s, est. speed input: 13677.72 toks/s, output: 182.74 toks/s]


rewards_per_func: tensor([0.2500, 0.3500], device='cuda:0')
["What was the net integrated dose of the Command Module Pilot's dosimeter?", "What was the net integrated dose of the Command Module Pilot's dosimeter?", "What was the net integrated dose of the Command Module Pilot's dosimeter?", "What was the net integrated dose of the Command Module Pilot's dosimeter?", "What was the net integrated dose of the Command Module Pilot's dosimeter?", "What was the net integrated dose of the Command Module Pilot's dosimeter?", "What was the net integrated dose of the Command Module Pilot's dosimeter?", "What was the net integrated dose of the Command Module Pilot's dosimeter?", 'What was the cause of the degradation of fuel cells 1 and 3 after the Oxygen tank 2 pressure dropped?', 'What was the cause of the degradation of fuel cells 1 and 3 after the Oxygen tank 2 pressure dropped?', 'What was the cause of the degradation of fuel cells 1 and 3 after the Oxygen tank 2 pressure dropped?', 'What wa

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 70.79it/s, est. speed input: 11694.09 toks/s, output: 154.91 toks/s]


rewards_per_func: tensor([0.3750, 0.3062], device='cuda:0')
["What was the time of the lunar module's landing?", "What was the time of the lunar module's landing?", "What was the time of the lunar module's landing?", "What was the time of the lunar module's landing?", "What was the time of the lunar module's landing?", "What was the time of the lunar module's landing?", "What was the time of the lunar module's landing?", "What was the time of the lunar module's landing?", 'How will the suit pressure transducers be reassembled for Apollo 15 and subsequent missions?', 'How will the suit pressure transducers be reassembled for Apollo 15 and subsequent missions?', 'How will the suit pressure transducers be reassembled for Apollo 15 and subsequent missions?', 'How will the suit pressure transducers be reassembled for Apollo 15 and subsequent missions?', 'How will the suit pressure transducers be reassembled for Apollo 15 and subsequent missions?', 'How will the suit pressure transducers be 

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 90.66it/s, est. speed input: 13674.29 toks/s, output: 181.41 toks/s]


rewards_per_func: tensor([0.4375, 0.3500], device='cuda:0')
['What was the result of the postflight inspection of the apex cover jettison system?', 'What was the result of the postflight inspection of the apex cover jettison system?', 'What was the result of the postflight inspection of the apex cover jettison system?', 'What was the result of the postflight inspection of the apex cover jettison system?', 'What was the result of the postflight inspection of the apex cover jettison system?', 'What was the result of the postflight inspection of the apex cover jettison system?', 'What was the result of the postflight inspection of the apex cover jettison system?', 'What was the result of the postflight inspection of the apex cover jettison system?', 'What was the reason for discontinuing the use of the auxiliary dump nozzle on the Apollo mission?', 'What was the reason for discontinuing the use of the auxiliary dump nozzle on the Apollo mission?', 'What was the reason for discontinuing th

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 67.35it/s, est. speed input: 14255.08 toks/s, output: 138.96 toks/s]


rewards_per_func: tensor([0.2500, 0.5688], device='cuda:0')
['What is the cause of the deep repetitive transients on the phase modulated downlink carrier?', 'What is the cause of the deep repetitive transients on the phase modulated downlink carrier?', 'What is the cause of the deep repetitive transients on the phase modulated downlink carrier?', 'What is the cause of the deep repetitive transients on the phase modulated downlink carrier?', 'What is the cause of the deep repetitive transients on the phase modulated downlink carrier?', 'What is the cause of the deep repetitive transients on the phase modulated downlink carrier?', 'What is the cause of the deep repetitive transients on the phase modulated downlink carrier?', 'What is the cause of the deep repetitive transients on the phase modulated downlink carrier?', 'How many helicopters were staged from Patrick AFB, Florida, for the Apollo 13 mission?', 'How many helicopters were staged from Patrick AFB, Florida, for the Apollo 13 mi

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 46.68it/s, est. speed input: 8722.07 toks/s, output: 128.82 toks/s]


rewards_per_func: tensor([0.2500, 0.3500], device='cuda:0')
['What year was the document approved by the National Aeronautics and Space Administration?', 'What year was the document approved by the National Aeronautics and Space Administration?', 'What year was the document approved by the National Aeronautics and Space Administration?', 'What year was the document approved by the National Aeronautics and Space Administration?', 'What year was the document approved by the National Aeronautics and Space Administration?', 'What year was the document approved by the National Aeronautics and Space Administration?', 'What year was the document approved by the National Aeronautics and Space Administration?', 'What year was the document approved by the National Aeronautics and Space Administration?', 'What system was used for the transearth injection maneuver?', 'What system was used for the transearth injection maneuver?', 'What system was used for the transearth injection maneuver?', 'What 

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 90.62it/s, est. speed input: 13793.34 toks/s, output: 181.33 toks/s]


rewards_per_func: tensor([0.2500, 0.3937], device='cuda:0')
['Who controlled the spacecraft during the midcourse maneuver?', 'Who controlled the spacecraft during the midcourse maneuver?', 'Who controlled the spacecraft during the midcourse maneuver?', 'Who controlled the spacecraft during the midcourse maneuver?', 'Who controlled the spacecraft during the midcourse maneuver?', 'Who controlled the spacecraft during the midcourse maneuver?', 'Who controlled the spacecraft during the midcourse maneuver?', 'Who controlled the spacecraft during the midcourse maneuver?', 'What is the purpose of Tables D-I and D-II?', 'What is the purpose of Tables D-I and D-II?', 'What is the purpose of Tables D-I and D-II?', 'What is the purpose of Tables D-I and D-II?', 'What is the purpose of Tables D-I and D-II?', 'What is the purpose of Tables D-I and D-II?', 'What is the purpose of Tables D-I and D-II?', 'What is the purpose of Tables D-I and D-II?']


Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 47.12it/s, est. speed input: 8097.97 toks/s, output: 127.13 toks/s]


rewards_per_func: tensor([0.3125, 0.4375], device='cuda:0')
['What was the difference between the two sets of angles that pointed the antenna boresight axis away from the line of sight to the ground station?', 'What was the difference between the two sets of angles that pointed the antenna boresight axis away from the line of sight to the ground station?', 'What was the difference between the two sets of angles that pointed the antenna boresight axis away from the line of sight to the ground station?', 'What was the difference between the two sets of angles that pointed the antenna boresight axis away from the line of sight to the ground station?', 'What was the difference between the two sets of angles that pointed the antenna boresight axis away from the line of sight to the ground station?', 'What was the difference between the two sets of angles that pointed the antenna boresight axis away from the line of sight to the ground station?', 'What was the difference between the two sets

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 79.99it/s, est. speed input: 13185.01 toks/s, output: 165.05 toks/s]


rewards_per_func: tensor([0.5000, 0.4812], device='cuda:0')
['What was the main difficulty in coordinating the training activities for the Apollo 13 crew?', 'What was the main difficulty in coordinating the training activities for the Apollo 13 crew?', 'What was the main difficulty in coordinating the training activities for the Apollo 13 crew?', 'What was the main difficulty in coordinating the training activities for the Apollo 13 crew?', 'What was the main difficulty in coordinating the training activities for the Apollo 13 crew?', 'What was the main difficulty in coordinating the training activities for the Apollo 13 crew?', 'What was the main difficulty in coordinating the training activities for the Apollo 13 crew?', 'What was the main difficulty in coordinating the training activities for the Apollo 13 crew?', "What was the reason for the Lunar Module Pilot's heart rate increase?", "What was the reason for the Lunar Module Pilot's heart rate increase?", "What was the reason for 

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 75.47it/s, est. speed input: 13202.96 toks/s, output: 150.99 toks/s]


rewards_per_func: tensor([0.5000, 0.5250], device='cuda:0')
['How many ampere-hours of energy remained in the lunar module batteries at the time of undocking?', 'How many ampere-hours of energy remained in the lunar module batteries at the time of undocking?', 'How many ampere-hours of energy remained in the lunar module batteries at the time of undocking?', 'How many ampere-hours of energy remained in the lunar module batteries at the time of undocking?', 'How many ampere-hours of energy remained in the lunar module batteries at the time of undocking?', 'How many ampere-hours of energy remained in the lunar module batteries at the time of undocking?', 'How many ampere-hours of energy remained in the lunar module batteries at the time of undocking?', 'How many ampere-hours of energy remained in the lunar module batteries at the time of undocking?', 'What was the accuracy of the position of the command module at landing?', 'What was the accuracy of the position of the command module at 

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 75.80it/s, est. speed input: 12010.95 toks/s, output: 151.67 toks/s]


rewards_per_func: tensor([0.3750, 0.4375], device='cuda:0')
['When were the first two balloons launched?', 'When were the first two balloons launched?', 'When were the first two balloons launched?', 'When were the first two balloons launched?', 'When were the first two balloons launched?', 'When were the first two balloons launched?', 'When were the first two balloons launched?', 'When were the first two balloons launched?', 'What was the primary reason for the potable water quantity measurement not being essential for flight safety or mission success?', 'What was the primary reason for the potable water quantity measurement not being essential for flight safety or mission success?', 'What was the primary reason for the potable water quantity measurement not being essential for flight safety or mission success?', 'What was the primary reason for the potable water quantity measurement not being essential for flight safety or mission success?', 'What was the primary reason for the potabl

Processed prompts: 100%|██████████| 16/16 [00:02<00:00,  7.77it/s, est. speed input: 1540.10 toks/s, output: 64.58 toks/s]


rewards_per_func: tensor([0.1875, 0.3937], device='cuda:0')
['What system was used to separate the command module from the service module?', 'What system was used to separate the command module from the service module?', 'What system was used to separate the command module from the service module?', 'What system was used to separate the command module from the service module?', 'What system was used to separate the command module from the service module?', 'What system was used to separate the command module from the service module?', 'What system was used to separate the command module from the service module?', 'What system was used to separate the command module from the service module?', 'What was the weight of the spacecraft at lift-off?', 'What was the weight of the spacecraft at lift-off?', 'What was the weight of the spacecraft at lift-off?', 'What was the weight of the spacecraft at lift-off?', 'What was the weight of the spacecraft at lift-off?', 'What was the weight of the s

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 73.71it/s, est. speed input: 13405.66 toks/s, output: 147.51 toks/s]


rewards_per_func: tensor([0.3125, 0.6125], device='cuda:0')
['What was the result of the experiment involving the photographs?', 'What was the result of the experiment involving the photographs?', 'What was the result of the experiment involving the photographs?', 'What was the result of the experiment involving the photographs?', 'What was the result of the experiment involving the photographs?', 'What was the result of the experiment involving the photographs?', 'What was the result of the experiment involving the photographs?', 'What was the result of the experiment involving the photographs?', 'What was the predicted tank condition at the time of descent engine firing for lunar descent?', 'What was the predicted tank condition at the time of descent engine firing for lunar descent?', 'What was the predicted tank condition at the time of descent engine firing for lunar descent?', 'What was the predicted tank condition at the time of descent engine firing for lunar descent?', 'What w

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 74.72it/s, est. speed input: 12803.40 toks/s, output: 149.52 toks/s]


rewards_per_func: tensor([0.3125, 0.5250], device='cuda:0')
['How much electrical energy was available from the lunar module batteries at the time of undocking?', 'How much electrical energy was available from the lunar module batteries at the time of undocking?', 'How much electrical energy was available from the lunar module batteries at the time of undocking?', 'How much electrical energy was available from the lunar module batteries at the time of undocking?', 'How much electrical energy was available from the lunar module batteries at the time of undocking?', 'How much electrical energy was available from the lunar module batteries at the time of undocking?', 'How much electrical energy was available from the lunar module batteries at the time of undocking?', 'How much electrical energy was available from the lunar module batteries at the time of undocking?', 'What type of flotation equipment was preferable for ease of donning and egress?', 'What type of flotation equipment was pr

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 87.07it/s, est. speed input: 14084.03 toks/s, output: 174.20 toks/s]


rewards_per_func: tensor([0.3125, 0.4375], device='cuda:0')
["What was the reason for the Lunar Module Pilot's increased heart rate during the entry phase?", "What was the reason for the Lunar Module Pilot's increased heart rate during the entry phase?", "What was the reason for the Lunar Module Pilot's increased heart rate during the entry phase?", "What was the reason for the Lunar Module Pilot's increased heart rate during the entry phase?", "What was the reason for the Lunar Module Pilot's increased heart rate during the entry phase?", "What was the reason for the Lunar Module Pilot's increased heart rate during the entry phase?", "What was the reason for the Lunar Module Pilot's increased heart rate during the entry phase?", "What was the reason for the Lunar Module Pilot's increased heart rate during the entry phase?", 'What was the effect of rotating the fuel cell selector knob?', 'What was the effect of rotating the fuel cell selector knob?', 'What was the effect of rotating th

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 84.68it/s, est. speed input: 14720.11 toks/s, output: 169.43 toks/s]


rewards_per_func: tensor([0.1250, 0.4812], device='cuda:0')
['What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the reason for the poor star visibility during the command module alignment?', 'What was the reason for the poor star visibility during the command module alignment?', 'What was the reason for the poor star vi

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 78.87it/s, est. speed input: 13522.69 toks/s, output: 162.73 toks/s]


rewards_per_func: tensor([0.1875, 0.3937], device='cuda:0')
['What was the estimated minimum temperature reached by the inertial measurement unit before power-up?', 'What was the estimated minimum temperature reached by the inertial measurement unit before power-up?', 'What was the estimated minimum temperature reached by the inertial measurement unit before power-up?', 'What was the estimated minimum temperature reached by the inertial measurement unit before power-up?', 'What was the estimated minimum temperature reached by the inertial measurement unit before power-up?', 'What was the estimated minimum temperature reached by the inertial measurement unit before power-up?', 'What was the estimated minimum temperature reached by the inertial measurement unit before power-up?', 'What was the estimated minimum temperature reached by the inertial measurement unit before power-up?', 'What was the energy release from the S-IVB impact equivalent to?', 'What was the energy release from the S

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 85.26it/s, est. speed input: 14090.27 toks/s, output: 170.59 toks/s]


rewards_per_func: tensor([0.3125, 0.4375], device='cuda:0')
['What type of communications were used after translunar injection?', 'What type of communications were used after translunar injection?', 'What type of communications were used after translunar injection?', 'What type of communications were used after translunar injection?', 'What type of communications were used after translunar injection?', 'What type of communications were used after translunar injection?', 'What type of communications were used after translunar injection?', 'What type of communications were used after translunar injection?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable limit of carbon dioxide buildup in the lunar module atmosphere?', 'What was the allowable lim

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 51.70it/s, est. speed input: 7781.06 toks/s, output: 103.56 toks/s]


rewards_per_func: tensor([0.4375, 0.5688], device='cuda:0')
['What was the electrostatic potential energy provided by a stored charge of 0.04 coulomb at a potential of 4 million volts?', 'What was the electrostatic potential energy provided by a stored charge of 0.04 coulomb at a potential of 4 million volts?', 'What was the electrostatic potential energy provided by a stored charge of 0.04 coulomb at a potential of 4 million volts?', 'What was the electrostatic potential energy provided by a stored charge of 0.04 coulomb at a potential of 4 million volts?', 'What was the electrostatic potential energy provided by a stored charge of 0.04 coulomb at a potential of 4 million volts?', 'What was the electrostatic potential energy provided by a stored charge of 0.04 coulomb at a potential of 4 million volts?', 'What was the electrostatic potential energy provided by a stored charge of 0.04 coulomb at a potential of 4 million volts?', 'What was the electrostatic potential energy provided by 

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 69.98it/s, est. speed input: 13559.10 toks/s, output: 140.05 toks/s]


rewards_per_func: tensor([0.3125, 0.4375], device='cuda:0')
['Which Apollo mission report was cancelled?', 'Which Apollo mission report was cancelled?', 'Which Apollo mission report was cancelled?', 'Which Apollo mission report was cancelled?', 'Which Apollo mission report was cancelled?', 'Which Apollo mission report was cancelled?', 'Which Apollo mission report was cancelled?', 'Which Apollo mission report was cancelled?', 'What was the objective of the television in earth orbit that could not be achieved due to cloud cover?', 'What was the objective of the television in earth orbit that could not be achieved due to cloud cover?', 'What was the objective of the television in earth orbit that could not be achieved due to cloud cover?', 'What was the objective of the television in earth orbit that could not be achieved due to cloud cover?', 'What was the objective of the television in earth orbit that could not be achieved due to cloud cover?', 'What was the objective of the television

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 71.30it/s, est. speed input: 12076.92 toks/s, output: 147.15 toks/s]


rewards_per_func: tensor([0.1250, 0.4812], device='cuda:0')
['What was the title of the first supplement for Apollo 10?', 'What was the title of the first supplement for Apollo 10?', 'What was the title of the first supplement for Apollo 10?', 'What was the title of the first supplement for Apollo 10?', 'What was the title of the first supplement for Apollo 10?', 'What was the title of the first supplement for Apollo 10?', 'What was the title of the first supplement for Apollo 10?', 'What was the title of the first supplement for Apollo 10?', 'What was the cause of the cracking in the window shades?', 'What was the cause of the cracking in the window shades?', 'What was the cause of the cracking in the window shades?', 'What was the cause of the cracking in the window shades?', 'What was the cause of the cracking in the window shades?', 'What was the cause of the cracking in the window shades?', 'What was the cause of the cracking in the window shades?', 'What was the cause of the crac

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 85.11it/s, est. speed input: 14129.35 toks/s, output: 170.29 toks/s]


rewards_per_func: tensor([0.3750, 0.5250], device='cuda:0')
['What could have caused the light to illuminate after the battery was reconnected to the bus?', 'What could have caused the light to illuminate after the battery was reconnected to the bus?', 'What could have caused the light to illuminate after the battery was reconnected to the bus?', 'What could have caused the light to illuminate after the battery was reconnected to the bus?', 'What could have caused the light to illuminate after the battery was reconnected to the bus?', 'What could have caused the light to illuminate after the battery was reconnected to the bus?', 'What could have caused the light to illuminate after the battery was reconnected to the bus?', 'What could have caused the light to illuminate after the battery was reconnected to the bus?', 'What was the maximum oscillation measured during the S-IVB thrust periods?', 'What was the maximum oscillation measured during the S-IVB thrust periods?', 'What was the m

Processed prompts: 100%|██████████| 16/16 [00:02<00:00,  6.45it/s, est. speed input: 1496.46 toks/s, output: 85.04 toks/s]


rewards_per_func: tensor([0.1250, 0.6562], device='cuda:0')
['What caused the shorted condition in the temperature switch wires?', 'What caused the shorted condition in the temperature switch wires?', 'What caused the shorted condition in the temperature switch wires?', 'What caused the shorted condition in the temperature switch wires?', 'What caused the shorted condition in the temperature switch wires?', 'What caused the shorted condition in the temperature switch wires?', 'What caused the shorted condition in the temperature switch wires?', 'What caused the shorted condition in the temperature switch wires?', 'What was the reason for the lower temperatures in the command module structure?', 'What was the reason for the lower temperatures in the command module structure?', 'What was the reason for the lower temperatures in the command module structure?', 'What was the reason for the lower temperatures in the command module structure?', 'What was the reason for the lower temperatures

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 78.83it/s, est. speed input: 13333.75 toks/s, output: 162.66 toks/s]


rewards_per_func: tensor([0.1250, 0.4812], device='cuda:0')
['What was the normal temperature for engine package temperatures during the peak engine activity period after the oxygen tank incident?', 'What was the normal temperature for engine package temperatures during the peak engine activity period after the oxygen tank incident?', 'What was the normal temperature for engine package temperatures during the peak engine activity period after the oxygen tank incident?', 'What was the normal temperature for engine package temperatures during the peak engine activity period after the oxygen tank incident?', 'What was the normal temperature for engine package temperatures during the peak engine activity period after the oxygen tank incident?', 'What was the normal temperature for engine package temperatures during the peak engine activity period after the oxygen tank incident?', 'What was the normal temperature for engine package temperatures during the peak engine activity period after t

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 79.67it/s, est. speed input: 13300.54 toks/s, output: 164.38 toks/s]


rewards_per_func: tensor([0.3750, 0.3937], device='cuda:0')
['What was the equilibrium potential for a conventional jet aircraft?', 'What was the equilibrium potential for a conventional jet aircraft?', 'What was the equilibrium potential for a conventional jet aircraft?', 'What was the equilibrium potential for a conventional jet aircraft?', 'What was the equilibrium potential for a conventional jet aircraft?', 'What was the equilibrium potential for a conventional jet aircraft?', 'What was the equilibrium potential for a conventional jet aircraft?', 'What was the equilibrium potential for a conventional jet aircraft?', 'What was the expected outcome of the telluric current system in relation to the launch effects?', 'What was the expected outcome of the telluric current system in relation to the launch effects?', 'What was the expected outcome of the telluric current system in relation to the launch effects?', 'What was the expected outcome of the telluric current system in relation 

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 67.97it/s, est. speed input: 12319.45 toks/s, output: 144.48 toks/s]


rewards_per_func: tensor([0.1250, 0.5250], device='cuda:0')
['What was proven for the first time 3 days prior to the flight?', 'What was proven for the first time 3 days prior to the flight?', 'What was proven for the first time 3 days prior to the flight?', 'What was proven for the first time 3 days prior to the flight?', 'What was proven for the first time 3 days prior to the flight?', 'What was proven for the first time 3 days prior to the flight?', 'What was proven for the first time 3 days prior to the flight?', 'What was proven for the first time 3 days prior to the flight?', 'What was the method used to acquire the sun and moon alignment for the midcourse maneuver?', 'What was the method used to acquire the sun and moon alignment for the midcourse maneuver?', 'What was the method used to acquire the sun and moon alignment for the midcourse maneuver?', 'What was the method used to acquire the sun and moon alignment for the midcourse maneuver?', 'What was the method used to acquir

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 73.47it/s, est. speed input: 15233.20 toks/s, output: 146.99 toks/s]


rewards_per_func: tensor([0.0000, 0.5688], device='cuda:0')
['Why was it difficult to perform a normal platform alignment using a star reference?', 'Why was it difficult to perform a normal platform alignment using a star reference?', 'Why was it difficult to perform a normal platform alignment using a star reference?', 'Why was it difficult to perform a normal platform alignment using a star reference?', 'Why was it difficult to perform a normal platform alignment using a star reference?', 'Why was it difficult to perform a normal platform alignment using a star reference?', 'Why was it difficult to perform a normal platform alignment using a star reference?', 'Why was it difficult to perform a normal platform alignment using a star reference?', 'What was the nickel content found in the command module hot water port?', 'What was the nickel content found in the command module hot water port?', 'What was the nickel content found in the command module hot water port?', 'What was the nick

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 17.04it/s, est. speed input: 3125.91 toks/s, output: 102.38 toks/s]


rewards_per_func: tensor([0.6875, 0.5250], device='cuda:0')
['What was the reason for the first abort in the Apollo program?', 'What was the reason for the first abort in the Apollo program?', 'What was the reason for the first abort in the Apollo program?', 'What was the reason for the first abort in the Apollo program?', 'What was the reason for the first abort in the Apollo program?', 'What was the reason for the first abort in the Apollo program?', 'What was the reason for the first abort in the Apollo program?', 'What was the reason for the first abort in the Apollo program?', 'What is one of the modifications being made to the descent batteries to minimize the hazards associated with electrolyte leakage?', 'What is one of the modifications being made to the descent batteries to minimize the hazards associated with electrolyte leakage?', 'What is one of the modifications being made to the descent batteries to minimize the hazards associated with electrolyte leakage?', 'What is one

  completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 65.48it/s, est. speed input: 13757.66 toks/s, output: 135.11 toks/s]


rewards_per_func: tensor([0.5000, 0.6125], device='cuda:0')
['What was the purpose of the cycle of the antenna in the auto-reacquisition mode?', 'What was the purpose of the cycle of the antenna in the auto-reacquisition mode?', 'What was the purpose of the cycle of the antenna in the auto-reacquisition mode?', 'What was the purpose of the cycle of the antenna in the auto-reacquisition mode?', 'What was the purpose of the cycle of the antenna in the auto-reacquisition mode?', 'What was the purpose of the cycle of the antenna in the auto-reacquisition mode?', 'What was the purpose of the cycle of the antenna in the auto-reacquisition mode?', 'What was the purpose of the cycle of the antenna in the auto-reacquisition mode?', 'Who was substituted as the Command Module Pilot for the prime crew counterpart two days before the launch?', 'Who was substituted as the Command Module Pilot for the prime crew counterpart two days before the launch?', 'Who was substituted as the Command Module Pilo

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 66.43it/s, est. speed input: 15070.86 toks/s, output: 132.89 toks/s]


rewards_per_func: tensor([0.0625, 0.7000], device='cuda:0')
['What was the time of the lunar module undocking before entry?', 'What was the time of the lunar module undocking before entry?', 'What was the time of the lunar module undocking before entry?', 'What was the time of the lunar module undocking before entry?', 'What was the time of the lunar module undocking before entry?', 'What was the time of the lunar module undocking before entry?', 'What was the time of the lunar module undocking before entry?', 'What was the time of the lunar module undocking before entry?', 'What was the suggested alternative for telecasting during dynamic events?', 'What was the suggested alternative for telecasting during dynamic events?', 'What was the suggested alternative for telecasting during dynamic events?', 'What was the suggested alternative for telecasting during dynamic events?', 'What was the suggested alternative for telecasting during dynamic events?', 'What was the suggested alternativ

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 89.37it/s, est. speed input: 14171.25 toks/s, output: 178.81 toks/s]


rewards_per_func: tensor([0.3125, 0.5250], device='cuda:0')
["Where did the lunar module, including the radioisotope thermoelectric fuel capsule, impact after re-entering the Earth's atmosphere?", "Where did the lunar module, including the radioisotope thermoelectric fuel capsule, impact after re-entering the Earth's atmosphere?", "Where did the lunar module, including the radioisotope thermoelectric fuel capsule, impact after re-entering the Earth's atmosphere?", "Where did the lunar module, including the radioisotope thermoelectric fuel capsule, impact after re-entering the Earth's atmosphere?", "Where did the lunar module, including the radioisotope thermoelectric fuel capsule, impact after re-entering the Earth's atmosphere?", "Where did the lunar module, including the radioisotope thermoelectric fuel capsule, impact after re-entering the Earth's atmosphere?", "Where did the lunar module, including the radioisotope thermoelectric fuel capsule, impact after re-entering the Earth's a

Processed prompts: 100%|██████████| 16/16 [00:01<00:00, 14.21it/s, est. speed input: 3031.09 toks/s, output: 97.80 toks/s]


rewards_per_func: tensor([0.3750, 0.6562], device='cuda:0')
['What was the purpose of the S-IVB stage impact on the lunar surface?', 'What was the purpose of the S-IVB stage impact on the lunar surface?', 'What was the purpose of the S-IVB stage impact on the lunar surface?', 'What was the purpose of the S-IVB stage impact on the lunar surface?', 'What was the purpose of the S-IVB stage impact on the lunar surface?', 'What was the purpose of the S-IVB stage impact on the lunar surface?', 'What was the purpose of the S-IVB stage impact on the lunar surface?', 'What was the purpose of the S-IVB stage impact on the lunar surface?', 'Who controlled the pitch attitude during the manual descent propulsion maneuver?', 'Who controlled the pitch attitude during the manual descent propulsion maneuver?', 'Who controlled the pitch attitude during the manual descent propulsion maneuver?', 'Who controlled the pitch attitude during the manual descent propulsion maneuver?', 'Who controlled the pitch a

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 69.63it/s, est. speed input: 12070.00 toks/s, output: 148.04 toks/s]


rewards_per_func: tensor([0.3750, 0.6125], device='cuda:0')
['What was the absorbed dose from ionizing radiation?', 'What was the absorbed dose from ionizing radiation?', 'What was the absorbed dose from ionizing radiation?', 'What was the absorbed dose from ionizing radiation?', 'What was the absorbed dose from ionizing radiation?', 'What was the absorbed dose from ionizing radiation?', 'What was the absorbed dose from ionizing radiation?', 'What was the absorbed dose from ionizing radiation?', 'What was the heart rate of the Lunar Module Pilot during the entry phase?', 'What was the heart rate of the Lunar Module Pilot during the entry phase?', 'What was the heart rate of the Lunar Module Pilot during the entry phase?', 'What was the heart rate of the Lunar Module Pilot during the entry phase?', 'What was the heart rate of the Lunar Module Pilot during the entry phase?', 'What was the heart rate of the Lunar Module Pilot during the entry phase?', 'What was the heart rate of the Lunar

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 80.12it/s, est. speed input: 13221.75 toks/s, output: 165.51 toks/s]


rewards_per_func: tensor([0.6250, 0.6562], device='cuda:0')
['What was the cause of the translation change experienced by the vehicle?', 'What was the cause of the translation change experienced by the vehicle?', 'What was the cause of the translation change experienced by the vehicle?', 'What was the cause of the translation change experienced by the vehicle?', 'What was the cause of the translation change experienced by the vehicle?', 'What was the cause of the translation change experienced by the vehicle?', 'What was the cause of the translation change experienced by the vehicle?', 'What was the cause of the translation change experienced by the vehicle?', 'What was the effect of the current surge on battery 2?', 'What was the effect of the current surge on battery 2?', 'What was the effect of the current surge on battery 2?', 'What was the effect of the current surge on battery 2?', 'What was the effect of the current surge on battery 2?', 'What was the effect of the current surge

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 67.33it/s, est. speed input: 15317.86 toks/s, output: 134.69 toks/s]


rewards_per_func: tensor([0.0000, 0.6125], device='cuda:0')
['How many hours did the Lunar Module Pilot sleep during the second sleep period?', 'How many hours did the Lunar Module Pilot sleep during the second sleep period?', 'How many hours did the Lunar Module Pilot sleep during the second sleep period?', 'How many hours did the Lunar Module Pilot sleep during the second sleep period?', 'How many hours did the Lunar Module Pilot sleep during the second sleep period?', 'How many hours did the Lunar Module Pilot sleep during the second sleep period?', 'How many hours did the Lunar Module Pilot sleep during the second sleep period?', 'How many hours did the Lunar Module Pilot sleep during the second sleep period?', 'What is the estimated electrostatic potential of a Saturn V launch vehicle?', 'What is the estimated electrostatic potential of a Saturn V launch vehicle?', 'What is the estimated electrostatic potential of a Saturn V launch vehicle?', 'What is the estimated electrostatic p

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 85.00it/s, est. speed input: 13483.65 toks/s, output: 170.07 toks/s]


rewards_per_func: tensor([0.3750, 0.4812], device='cuda:0')
['What was the maximum leakage rate through the shutoff valve into the ascent oxygen tank 2?', 'What was the maximum leakage rate through the shutoff valve into the ascent oxygen tank 2?', 'What was the maximum leakage rate through the shutoff valve into the ascent oxygen tank 2?', 'What was the maximum leakage rate through the shutoff valve into the ascent oxygen tank 2?', 'What was the maximum leakage rate through the shutoff valve into the ascent oxygen tank 2?', 'What was the maximum leakage rate through the shutoff valve into the ascent oxygen tank 2?', 'What was the maximum leakage rate through the shutoff valve into the ascent oxygen tank 2?', 'What was the maximum leakage rate through the shutoff valve into the ascent oxygen tank 2?', 'What was the reason for the use of low power transmissions, backup voice, and omnidirectional antennas in the S-band communications?', 'What was the reason for the use of low power trans

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 81.60it/s, est. speed input: 14647.26 toks/s, output: 163.25 toks/s]


rewards_per_func: tensor([0.8125, 0.6562], device='cuda:0')
['What contaminants were found in the cabin and suit loop pressure transducers?', 'What contaminants were found in the cabin and suit loop pressure transducers?', 'What contaminants were found in the cabin and suit loop pressure transducers?', 'What contaminants were found in the cabin and suit loop pressure transducers?', 'What contaminants were found in the cabin and suit loop pressure transducers?', 'What contaminants were found in the cabin and suit loop pressure transducers?', 'What contaminants were found in the cabin and suit loop pressure transducers?', 'What contaminants were found in the cabin and suit loop pressure transducers?', 'What was the date of the nominal launch and exit environment test of the AS-102 spacecraft?', 'What was the date of the nominal launch and exit environment test of the AS-102 spacecraft?', 'What was the date of the nominal launch and exit environment test of the AS-102 spacecraft?', 'What 

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 68.82it/s, est. speed input: 14056.47 toks/s, output: 137.72 toks/s]


rewards_per_func: tensor([0.3125, 0.5687], device='cuda:0')
['What was the cause of the battery malfunction in the Descent Battery 2?', 'What was the cause of the battery malfunction in the Descent Battery 2?', 'What was the cause of the battery malfunction in the Descent Battery 2?', 'What was the cause of the battery malfunction in the Descent Battery 2?', 'What was the cause of the battery malfunction in the Descent Battery 2?', 'What was the cause of the battery malfunction in the Descent Battery 2?', 'What was the cause of the battery malfunction in the Descent Battery 2?', 'What was the cause of the battery malfunction in the Descent Battery 2?', 'What was the supplemental configuration used to improve the carbon dioxide removal capability in the lunar module suit circuit?', 'What was the supplemental configuration used to improve the carbon dioxide removal capability in the lunar module suit circuit?', 'What was the supplemental configuration used to improve the carbon dioxide r

Processed prompts: 100%|██████████| 16/16 [00:02<00:00,  5.37it/s, est. speed input: 1109.95 toks/s, output: 63.51 toks/s]


rewards_per_func: tensor([0.2500, 0.6562], device='cuda:0')
['What was the plan for a safe and quick return to earth?', 'What was the plan for a safe and quick return to earth?', 'What was the plan for a safe and quick return to earth?', 'What was the plan for a safe and quick return to earth?', 'What was the plan for a safe and quick return to earth?', 'What was the plan for a safe and quick return to earth?', 'What was the plan for a safe and quick return to earth?', 'What was the plan for a safe and quick return to earth?', 'What was changed in the timing signal for the roll engines during service module separation?', 'What was changed in the timing signal for the roll engines during service module separation?', 'What was changed in the timing signal for the roll engines during service module separation?', 'What was changed in the timing signal for the roll engines during service module separation?', 'What was changed in the timing signal for the roll engines during service module s

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 65.50it/s, est. speed input: 14051.38 toks/s, output: 131.08 toks/s]


rewards_per_func: tensor([0.4375, 0.6562], device='cuda:0')
['What was the lunar module used for during the return to Earth?', 'What was the lunar module used for during the return to Earth?', 'What was the lunar module used for during the return to Earth?', 'What was the lunar module used for during the return to Earth?', 'What was the lunar module used for during the return to Earth?', 'What was the lunar module used for during the return to Earth?', 'What was the lunar module used for during the return to Earth?', 'What was the lunar module used for during the return to Earth?', 'How many ships were available for an Atlantic Ocean landing?', 'How many ships were available for an Atlantic Ocean landing?', 'How many ships were available for an Atlantic Ocean landing?', 'How many ships were available for an Atlantic Ocean landing?', 'How many ships were available for an Atlantic Ocean landing?', 'How many ships were available for an Atlantic Ocean landing?', 'How many ships were availa

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 91.68it/s, est. speed input: 13299.04 toks/s, output: 183.43 toks/s]


rewards_per_func: tensor([0.2500, 0.5250], device='cuda:0')
['What was the reason for the unsuccessful attempt at passive thermal control mode at 7:43:02?', 'What was the reason for the unsuccessful attempt at passive thermal control mode at 7:43:02?', 'What was the reason for the unsuccessful attempt at passive thermal control mode at 7:43:02?', 'What was the reason for the unsuccessful attempt at passive thermal control mode at 7:43:02?', 'What was the reason for the unsuccessful attempt at passive thermal control mode at 7:43:02?', 'What was the reason for the unsuccessful attempt at passive thermal control mode at 7:43:02?', 'What was the reason for the unsuccessful attempt at passive thermal control mode at 7:43:02?', 'What was the reason for the unsuccessful attempt at passive thermal control mode at 7:43:02?', 'How much water was used from the lunar module tanks between activation of the sublimator and undocking?', 'How much water was used from the lunar module tanks between act

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 69.28it/s, est. speed input: 15039.53 toks/s, output: 138.61 toks/s]


rewards_per_func: tensor([0.4375, 0.7000], device='cuda:0')
['What system failed prior to launch due to fuel leakage into the point sensor module within the tank?', 'What system failed prior to launch due to fuel leakage into the point sensor module within the tank?', 'What system failed prior to launch due to fuel leakage into the point sensor module within the tank?', 'What system failed prior to launch due to fuel leakage into the point sensor module within the tank?', 'What system failed prior to launch due to fuel leakage into the point sensor module within the tank?', 'What system failed prior to launch due to fuel leakage into the point sensor module within the tank?', 'What system failed prior to launch due to fuel leakage into the point sensor module within the tank?', 'What system failed prior to launch due to fuel leakage into the point sensor module within the tank?', 'What was the reason for the unsuccessful passive thermal control modes at 7:43:02 and 32:21:49?', 'What wa

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 75.20it/s, est. speed input: 14856.91 toks/s, output: 150.44 toks/s]


rewards_per_func: tensor([0.3750, 0.5688], device='cuda:0')
['What was the outcome of the pilot describing function experiment (T-029)?', 'What was the outcome of the pilot describing function experiment (T-029)?', 'What was the outcome of the pilot describing function experiment (T-029)?', 'What was the outcome of the pilot describing function experiment (T-029)?', 'What was the outcome of the pilot describing function experiment (T-029)?', 'What was the outcome of the pilot describing function experiment (T-029)?', 'What was the outcome of the pilot describing function experiment (T-029)?', 'What was the outcome of the pilot describing function experiment (T-029)?', 'What happened to the propellant isolation valves on quad C after the oxygen tank pressure loss?', 'What happened to the propellant isolation valves on quad C after the oxygen tank pressure loss?', 'What happened to the propellant isolation valves on quad C after the oxygen tank pressure loss?', 'What happened to the prop

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 59.73it/s, est. speed input: 10594.79 toks/s, output: 119.54 toks/s]


rewards_per_func: tensor([0.5625, 0.6562], device='cuda:0')
['What was the primary use of water in the lunar module during the abort phase of the mission?', 'What was the primary use of water in the lunar module during the abort phase of the mission?', 'What was the primary use of water in the lunar module during the abort phase of the mission?', 'What was the primary use of water in the lunar module during the abort phase of the mission?', 'What was the primary use of water in the lunar module during the abort phase of the mission?', 'What was the primary use of water in the lunar module during the abort phase of the mission?', 'What was the primary use of water in the lunar module during the abort phase of the mission?', 'What was the primary use of water in the lunar module during the abort phase of the mission?', 'What was the purpose of the procedure developed by the crew for the lunar module environmental control system?', 'What was the purpose of the procedure developed by the c

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 75.80it/s, est. speed input: 15075.22 toks/s, output: 151.64 toks/s]


rewards_per_func: tensor([0.0625, 0.6125], device='cuda:0')
['What was the purpose of coating the inside of the battery case with epoxy paint?', 'What was the purpose of coating the inside of the battery case with epoxy paint?', 'What was the purpose of coating the inside of the battery case with epoxy paint?', 'What was the purpose of coating the inside of the battery case with epoxy paint?', 'What was the purpose of coating the inside of the battery case with epoxy paint?', 'What was the purpose of coating the inside of the battery case with epoxy paint?', 'What was the purpose of coating the inside of the battery case with epoxy paint?', 'What was the purpose of coating the inside of the battery case with epoxy paint?', 'What was the configuration of the spacecraft power supply before entry?', 'What was the configuration of the spacecraft power supply before entry?', 'What was the configuration of the spacecraft power supply before entry?', 'What was the configuration of the spacecr

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 19.83it/s, est. speed input: 4617.13 toks/s, output: 90.84 toks/s]


rewards_per_func: tensor([0.3750, 0.5250], device='cuda:0')
['What was the first indication of a problem in cryogenic oxygen tank 2?', 'What was the first indication of a problem in cryogenic oxygen tank 2?', 'What was the first indication of a problem in cryogenic oxygen tank 2?', 'What was the first indication of a problem in cryogenic oxygen tank 2?', 'What was the first indication of a problem in cryogenic oxygen tank 2?', 'What was the first indication of a problem in cryogenic oxygen tank 2?', 'What was the first indication of a problem in cryogenic oxygen tank 2?', 'What was the first indication of a problem in cryogenic oxygen tank 2?', 'What was powered up in the command module about 6-1/2 hours before entry?', 'What was powered up in the command module about 6-1/2 hours before entry?', 'What was powered up in the command module about 6-1/2 hours before entry?', 'What was powered up in the command module about 6-1/2 hours before entry?', 'What was powered up in the command mod

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 72.41it/s, est. speed input: 11788.91 toks/s, output: 144.90 toks/s]


rewards_per_func: tensor([0.2500, 0.5250], device='cuda:0')
['What was the launch date of the mission PA-1?', 'What was the launch date of the mission PA-1?', 'What was the launch date of the mission PA-1?', 'What was the launch date of the mission PA-1?', 'What was the launch date of the mission PA-1?', 'What was the launch date of the mission PA-1?', 'What was the launch date of the mission PA-1?', 'What was the launch date of the mission PA-1?', 'What was the location of the command module retrieval?', 'What was the location of the command module retrieval?', 'What was the location of the command module retrieval?', 'What was the location of the command module retrieval?', 'What was the location of the command module retrieval?', 'What was the location of the command module retrieval?', 'What was the location of the command module retrieval?', 'What was the location of the command module retrieval?']


Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 48.69it/s, est. speed input: 8881.31 toks/s, output: 131.45 toks/s]


rewards_per_func: tensor([0.0625, 0.5688], device='cuda:0')
['What is the publication status of the "Reaction Control System Performance" report for the Apollo 8 mission?', 'What is the publication status of the "Reaction Control System Performance" report for the Apollo 8 mission?', 'What is the publication status of the "Reaction Control System Performance" report for the Apollo 8 mission?', 'What is the publication status of the "Reaction Control System Performance" report for the Apollo 8 mission?', 'What is the publication status of the "Reaction Control System Performance" report for the Apollo 8 mission?', 'What is the publication status of the "Reaction Control System Performance" report for the Apollo 8 mission?', 'What is the publication status of the "Reaction Control System Performance" report for the Apollo 8 mission?', 'What is the publication status of the "Reaction Control System Performance" report for the Apollo 8 mission?', 'What was the cause of the series of master

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 83.25it/s, est. speed input: 14044.60 toks/s, output: 166.56 toks/s]


rewards_per_func: tensor([0.4375, 0.5250], device='cuda:0')
['What was the planned preflight usage of fuel for the reaction control system?', 'What was the planned preflight usage of fuel for the reaction control system?', 'What was the planned preflight usage of fuel for the reaction control system?', 'What was the planned preflight usage of fuel for the reaction control system?', 'What was the planned preflight usage of fuel for the reaction control system?', 'What was the planned preflight usage of fuel for the reaction control system?', 'What was the planned preflight usage of fuel for the reaction control system?', 'What was the planned preflight usage of fuel for the reaction control system?', 'What was the likely cause of the problem that occurred in the Apollo 12 and 13 systems?', 'What was the likely cause of the problem that occurred in the Apollo 12 and 13 systems?', 'What was the likely cause of the problem that occurred in the Apollo 12 and 13 systems?', 'What was the like

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 61.00it/s, est. speed input: 10645.70 toks/s, output: 122.05 toks/s]


rewards_per_func: tensor([0.1250, 0.5250], device='cuda:0')
['What was the reason for the immediate abort of the Apollo 13 mission?', 'What was the reason for the immediate abort of the Apollo 13 mission?', 'What was the reason for the immediate abort of the Apollo 13 mission?', 'What was the reason for the immediate abort of the Apollo 13 mission?', 'What was the reason for the immediate abort of the Apollo 13 mission?', 'What was the reason for the immediate abort of the Apollo 13 mission?', 'What was the reason for the immediate abort of the Apollo 13 mission?', 'What was the reason for the immediate abort of the Apollo 13 mission?', 'What was used as insulation in the S-Il stage to reduce weight?', 'What was used as insulation in the S-Il stage to reduce weight?', 'What was used as insulation in the S-Il stage to reduce weight?', 'What was used as insulation in the S-Il stage to reduce weight?', 'What was used as insulation in the S-Il stage to reduce weight?', 'What was used as in

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 89.89it/s, est. speed input: 13844.02 toks/s, output: 179.86 toks/s]


rewards_per_func: tensor([0.5000, 0.4812], device='cuda:0')
['What was the reason for the unfavorable spacecraft attitude delayed communications signal lockup?', 'What was the reason for the unfavorable spacecraft attitude delayed communications signal lockup?', 'What was the reason for the unfavorable spacecraft attitude delayed communications signal lockup?', 'What was the reason for the unfavorable spacecraft attitude delayed communications signal lockup?', 'What was the reason for the unfavorable spacecraft attitude delayed communications signal lockup?', 'What was the reason for the unfavorable spacecraft attitude delayed communications signal lockup?', 'What was the reason for the unfavorable spacecraft attitude delayed communications signal lockup?', 'What was the reason for the unfavorable spacecraft attitude delayed communications signal lockup?', 'What could have caused the large fluctuations superimposed on the record at site 6?', 'What could have caused the large fluctuatio

Processed prompts: 100%|██████████| 16/16 [00:01<00:00, 12.65it/s, est. speed input: 2639.21 toks/s, output: 85.41 toks/s]


rewards_per_func: tensor([0.4375, 0.5250], device='cuda:0')
['Why was star observation through the command module optics poor?', 'Why was star observation through the command module optics poor?', 'Why was star observation through the command module optics poor?', 'Why was star observation through the command module optics poor?', 'Why was star observation through the command module optics poor?', 'Why was star observation through the command module optics poor?', 'Why was star observation through the command module optics poor?', 'Why was star observation through the command module optics poor?', 'What was the cause of the shorting of the instrumentation power supply in the service propulsion auxiliary propellant gaging system?', 'What was the cause of the shorting of the instrumentation power supply in the service propulsion auxiliary propellant gaging system?', 'What was the cause of the shorting of the instrumentation power supply in the service propulsion auxiliary propellant gagi

Processed prompts: 100%|██████████| 16/16 [00:02<00:00,  6.16it/s, est. speed input: 1212.81 toks/s, output: 64.34 toks/s]


rewards_per_func: tensor([0.7500, 0.6125], device='cuda:0')
['What was the likely cause of the anomaly in the helium tank pressure rise rate?', 'What was the likely cause of the anomaly in the helium tank pressure rise rate?', 'What was the likely cause of the anomaly in the helium tank pressure rise rate?', 'What was the likely cause of the anomaly in the helium tank pressure rise rate?', 'What was the likely cause of the anomaly in the helium tank pressure rise rate?', 'What was the likely cause of the anomaly in the helium tank pressure rise rate?', 'What was the likely cause of the anomaly in the helium tank pressure rise rate?', 'What was the likely cause of the anomaly in the helium tank pressure rise rate?', 'What is the most likely candidate for the cause of the problem in the Apollo l3?', 'What is the most likely candidate for the cause of the problem in the Apollo l3?', 'What is the most likely candidate for the cause of the problem in the Apollo l3?', 'What is the most likel

Processed prompts: 100%|██████████| 16/16 [00:03<00:00,  4.84it/s, est. speed input: 1008.77 toks/s, output: 69.07 toks/s]


rewards_per_func: tensor([0.1875, 0.5688], device='cuda:0')
['What was the alternative solution implemented to address the insufficient lithium hydroxide cartridges in the lunar module?', 'What was the alternative solution implemented to address the insufficient lithium hydroxide cartridges in the lunar module?', 'What was the alternative solution implemented to address the insufficient lithium hydroxide cartridges in the lunar module?', 'What was the alternative solution implemented to address the insufficient lithium hydroxide cartridges in the lunar module?', 'What was the alternative solution implemented to address the insufficient lithium hydroxide cartridges in the lunar module?', 'What was the alternative solution implemented to address the insufficient lithium hydroxide cartridges in the lunar module?', 'What was the alternative solution implemented to address the insufficient lithium hydroxide cartridges in the lunar module?', 'What was the alternative solution implemented to 

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 81.11it/s, est. speed input: 14670.93 toks/s, output: 162.27 toks/s]


rewards_per_func: tensor([0.3750, 0.5250], device='cuda:0')
['What was the cause of the large fluctuations in the record at site 6?', 'What was the cause of the large fluctuations in the record at site 6?', 'What was the cause of the large fluctuations in the record at site 6?', 'What was the cause of the large fluctuations in the record at site 6?', 'What was the cause of the large fluctuations in the record at site 6?', 'What was the cause of the large fluctuations in the record at site 6?', 'What was the cause of the large fluctuations in the record at site 6?', 'What was the cause of the large fluctuations in the record at site 6?', 'What was the condition of the interior surfaces of the command module during the postrecovery inspection?', 'What was the condition of the interior surfaces of the command module during the postrecovery inspection?', 'What was the condition of the interior surfaces of the command module during the postrecovery inspection?', 'What was the condition of t

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 68.37it/s, est. speed input: 13175.47 toks/s, output: 141.11 toks/s]


rewards_per_func: tensor([0.4375, 0.5687], device='cuda:0')
['How many ampere hours remained in the lunar module batteries at the time of undocking?', 'How many ampere hours remained in the lunar module batteries at the time of undocking?', 'How many ampere hours remained in the lunar module batteries at the time of undocking?', 'How many ampere hours remained in the lunar module batteries at the time of undocking?', 'How many ampere hours remained in the lunar module batteries at the time of undocking?', 'How many ampere hours remained in the lunar module batteries at the time of undocking?', 'How many ampere hours remained in the lunar module batteries at the time of undocking?', 'How many ampere hours remained in the lunar module batteries at the time of undocking?', 'What was the time of landing for the spacecraft in the South Pacific?', 'What was the time of landing for the spacecraft in the South Pacific?', 'What was the time of landing for the spacecraft in the South Pacific?', 

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 83.80it/s, est. speed input: 14504.21 toks/s, output: 167.67 toks/s]


rewards_per_func: tensor([0.5000, 0.6562], device='cuda:0')
['What was the predicted rupture range for the helium bottle pressure?', 'What was the predicted rupture range for the helium bottle pressure?', 'What was the predicted rupture range for the helium bottle pressure?', 'What was the predicted rupture range for the helium bottle pressure?', 'What was the predicted rupture range for the helium bottle pressure?', 'What was the predicted rupture range for the helium bottle pressure?', 'What was the predicted rupture range for the helium bottle pressure?', 'What was the predicted rupture range for the helium bottle pressure?', 'What was the purpose of the lunar module powerup for the final midcourse correction maneuver?', 'What was the purpose of the lunar module powerup for the final midcourse correction maneuver?', 'What was the purpose of the lunar module powerup for the final midcourse correction maneuver?', 'What was the purpose of the lunar module powerup for the final midcours

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 88.61it/s, est. speed input: 13928.09 toks/s, output: 177.28 toks/s]


rewards_per_func: tensor([0.0625, 0.5688], device='cuda:0')
["What was the reason for the flight control team's decision to use the cryogenic tank fans more often than scheduled?", "What was the reason for the flight control team's decision to use the cryogenic tank fans more often than scheduled?", "What was the reason for the flight control team's decision to use the cryogenic tank fans more often than scheduled?", "What was the reason for the flight control team's decision to use the cryogenic tank fans more often than scheduled?", "What was the reason for the flight control team's decision to use the cryogenic tank fans more often than scheduled?", "What was the reason for the flight control team's decision to use the cryogenic tank fans more often than scheduled?", "What was the reason for the flight control team's decision to use the cryogenic tank fans more often than scheduled?", "What was the reason for the flight control team's decision to use the cryogenic tank fans more oft

Processed prompts: 100%|██████████| 16/16 [00:02<00:00,  7.54it/s, est. speed input: 1728.05 toks/s, output: 86.26 toks/s]


rewards_per_func: tensor([0.4375, 0.6562], device='cuda:0')
['What was the final calibration value for the Gyro fixed drift in deg/hr?', 'What was the final calibration value for the Gyro fixed drift in deg/hr?', 'What was the final calibration value for the Gyro fixed drift in deg/hr?', 'What was the final calibration value for the Gyro fixed drift in deg/hr?', 'What was the final calibration value for the Gyro fixed drift in deg/hr?', 'What was the final calibration value for the Gyro fixed drift in deg/hr?', 'What was the final calibration value for the Gyro fixed drift in deg/hr?', 'What was the final calibration value for the Gyro fixed drift in deg/hr?', 'What component was identified as the possible cause of the shift in the scan-limit functions?', 'What component was identified as the possible cause of the shift in the scan-limit functions?', 'What component was identified as the possible cause of the shift in the scan-limit functions?', 'What component was identified as the po

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 74.47it/s, est. speed input: 14644.01 toks/s, output: 149.00 toks/s]


rewards_per_func: tensor([0.0000, 0.5687], device='cuda:0')
['What was the cause of the reaction control isolation valve failure in the spacecraft?', 'What was the cause of the reaction control isolation valve failure in the spacecraft?', 'What was the cause of the reaction control isolation valve failure in the spacecraft?', 'What was the cause of the reaction control isolation valve failure in the spacecraft?', 'What was the cause of the reaction control isolation valve failure in the spacecraft?', 'What was the cause of the reaction control isolation valve failure in the spacecraft?', 'What was the cause of the reaction control isolation valve failure in the spacecraft?', 'What was the cause of the reaction control isolation valve failure in the spacecraft?', 'What was the purpose of the screening test for future flight tanks?', 'What was the purpose of the screening test for future flight tanks?', 'What was the purpose of the screening test for future flight tanks?', 'What was the 

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 65.98it/s, est. speed input: 15821.09 toks/s, output: 132.01 toks/s]


rewards_per_func: tensor([0.4375, 0.6562], device='cuda:0')
['What was the condition of the valve when the handle was extended from 5/16 to 3/8 inch from the valve locked position?', 'What was the condition of the valve when the handle was extended from 5/16 to 3/8 inch from the valve locked position?', 'What was the condition of the valve when the handle was extended from 5/16 to 3/8 inch from the valve locked position?', 'What was the condition of the valve when the handle was extended from 5/16 to 3/8 inch from the valve locked position?', 'What was the condition of the valve when the handle was extended from 5/16 to 3/8 inch from the valve locked position?', 'What was the condition of the valve when the handle was extended from 5/16 to 3/8 inch from the valve locked position?', 'What was the condition of the valve when the handle was extended from 5/16 to 3/8 inch from the valve locked position?', 'What was the condition of the valve when the handle was extended from 5/16 to 3/8 in

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 82.71it/s, est. speed input: 14629.15 toks/s, output: 165.47 toks/s]


rewards_per_func: tensor([0.4375, 0.7000], device='cuda:0')
['What was the reason for the Apollo 13 mission being aborted?', 'What was the reason for the Apollo 13 mission being aborted?', 'What was the reason for the Apollo 13 mission being aborted?', 'What was the reason for the Apollo 13 mission being aborted?', 'What was the reason for the Apollo 13 mission being aborted?', 'What was the reason for the Apollo 13 mission being aborted?', 'What was the reason for the Apollo 13 mission being aborted?', 'What was the reason for the Apollo 13 mission being aborted?', 'What was the reason for terminating efforts to install the tunnel hatch?', 'What was the reason for terminating efforts to install the tunnel hatch?', 'What was the reason for terminating efforts to install the tunnel hatch?', 'What was the reason for terminating efforts to install the tunnel hatch?', 'What was the reason for terminating efforts to install the tunnel hatch?', 'What was the reason for terminating efforts to

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 74.94it/s, est. speed input: 15217.33 toks/s, output: 149.92 toks/s]


rewards_per_func: tensor([0.6250, 0.6562], device='cuda:0')
['What was jammed between the lock and unlock positions on the postlanding ventilation valve unlock handle?', 'What was jammed between the lock and unlock positions on the postlanding ventilation valve unlock handle?', 'What was jammed between the lock and unlock positions on the postlanding ventilation valve unlock handle?', 'What was jammed between the lock and unlock positions on the postlanding ventilation valve unlock handle?', 'What was jammed between the lock and unlock positions on the postlanding ventilation valve unlock handle?', 'What was jammed between the lock and unlock positions on the postlanding ventilation valve unlock handle?', 'What was jammed between the lock and unlock positions on the postlanding ventilation valve unlock handle?', 'What was jammed between the lock and unlock positions on the postlanding ventilation valve unlock handle?', "What is the term for the acute angle formed at the intersection of

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 70.64it/s, est. speed input: 12082.26 toks/s, output: 141.40 toks/s]


rewards_per_func: tensor([0.2500, 0.5250], device='cuda:0')
['Where was the command module inspected after reaction control system deactivation and pyrotechnic safing?', 'Where was the command module inspected after reaction control system deactivation and pyrotechnic safing?', 'Where was the command module inspected after reaction control system deactivation and pyrotechnic safing?', 'Where was the command module inspected after reaction control system deactivation and pyrotechnic safing?', 'Where was the command module inspected after reaction control system deactivation and pyrotechnic safing?', 'Where was the command module inspected after reaction control system deactivation and pyrotechnic safing?', 'Where was the command module inspected after reaction control system deactivation and pyrotechnic safing?', 'Where was the command module inspected after reaction control system deactivation and pyrotechnic safing?', 'What was the reason for inhibiting all further overboard urine dum

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 51.68it/s, est. speed input: 8393.85 toks/s, output: 103.42 toks/s]


rewards_per_func: tensor([0.3750, 0.6562], device='cuda:0')
['When did the crew training for Apollo 13 commence?', 'When did the crew training for Apollo 13 commence?', 'When did the crew training for Apollo 13 commence?', 'When did the crew training for Apollo 13 commence?', 'When did the crew training for Apollo 13 commence?', 'When did the crew training for Apollo 13 commence?', 'When did the crew training for Apollo 13 commence?', 'When did the crew training for Apollo 13 commence?', 'What is the time range for the Apollo 7 mission reports?', 'What is the time range for the Apollo 7 mission reports?', 'What is the time range for the Apollo 7 mission reports?', 'What is the time range for the Apollo 7 mission reports?', 'What is the time range for the Apollo 7 mission reports?', 'What is the time range for the Apollo 7 mission reports?', 'What is the time range for the Apollo 7 mission reports?', 'What is the time range for the Apollo 7 mission reports?']


Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 86.12it/s, est. speed input: 13343.42 toks/s, output: 172.30 toks/s]


rewards_per_func: tensor([0.3125, 0.5250], device='cuda:0')
['Who was exposed to rubella 8 days before the flight?', 'Who was exposed to rubella 8 days before the flight?', 'Who was exposed to rubella 8 days before the flight?', 'Who was exposed to rubella 8 days before the flight?', 'Who was exposed to rubella 8 days before the flight?', 'Who was exposed to rubella 8 days before the flight?', 'Who was exposed to rubella 8 days before the flight?', 'Who was exposed to rubella 8 days before the flight?', 'How much propellant was consumed by the reaction control system in the service module?', 'How much propellant was consumed by the reaction control system in the service module?', 'How much propellant was consumed by the reaction control system in the service module?', 'How much propellant was consumed by the reaction control system in the service module?', 'How much propellant was consumed by the reaction control system in the service module?', 'How much propellant was consumed by the 

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 85.26it/s, est. speed input: 13620.35 toks/s, output: 170.58 toks/s]


rewards_per_func: tensor([0.3125, 0.6562], device='cuda:0')
['What was the duration of earth weather photography following translunar injection?', 'What was the duration of earth weather photography following translunar injection?', 'What was the duration of earth weather photography following translunar injection?', 'What was the duration of earth weather photography following translunar injection?', 'What was the duration of earth weather photography following translunar injection?', 'What was the duration of earth weather photography following translunar injection?', 'What was the duration of earth weather photography following translunar injection?', 'What was the duration of earth weather photography following translunar injection?', 'Why did the miswired valve pass the functional checks during buildup and checkout?', 'Why did the miswired valve pass the functional checks during buildup and checkout?', 'Why did the miswired valve pass the functional checks during buildup and check

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 72.43it/s, est. speed input: 13991.87 toks/s, output: 144.94 toks/s]


rewards_per_func: tensor([0.5000, 0.7000], device='cuda:0')
['What method was used to obtain potable water for the crew?', 'What method was used to obtain potable water for the crew?', 'What method was used to obtain potable water for the crew?', 'What method was used to obtain potable water for the crew?', 'What method was used to obtain potable water for the crew?', 'What method was used to obtain potable water for the crew?', 'What method was used to obtain potable water for the crew?', 'What method was used to obtain potable water for the crew?', 'What was the result of using the lunar module platform for the undocking maneuver?', 'What was the result of using the lunar module platform for the undocking maneuver?', 'What was the result of using the lunar module platform for the undocking maneuver?', 'What was the result of using the lunar module platform for the undocking maneuver?', 'What was the result of using the lunar module platform for the undocking maneuver?', 'What was the

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 66.47it/s, est. speed input: 14527.67 toks/s, output: 137.12 toks/s]


rewards_per_func: tensor([0.3750, 0.7000], device='cuda:0')
['What was the duration of the third descent propulsion operation?', 'What was the duration of the third descent propulsion operation?', 'What was the duration of the third descent propulsion operation?', 'What was the duration of the third descent propulsion operation?', 'What was the duration of the third descent propulsion operation?', 'What was the duration of the third descent propulsion operation?', 'What was the duration of the third descent propulsion operation?', 'What was the duration of the third descent propulsion operation?', 'What was the reason for substituting the Command Module Pilot in the Apollo 13 crew?', 'What was the reason for substituting the Command Module Pilot in the Apollo 13 crew?', 'What was the reason for substituting the Command Module Pilot in the Apollo 13 crew?', 'What was the reason for substituting the Command Module Pilot in the Apollo 13 crew?', 'What was the reason for substituting the C

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 16.02it/s, est. speed input: 2520.36 toks/s, output: 77.10 toks/s]


rewards_per_func: tensor([0.4375, 0.6125], device='cuda:0')
['What type of support was provided by the Department of Defense for the Apollo 13 mission?', 'What type of support was provided by the Department of Defense for the Apollo 13 mission?', 'What type of support was provided by the Department of Defense for the Apollo 13 mission?', 'What type of support was provided by the Department of Defense for the Apollo 13 mission?', 'What type of support was provided by the Department of Defense for the Apollo 13 mission?', 'What type of support was provided by the Department of Defense for the Apollo 13 mission?', 'What type of support was provided by the Department of Defense for the Apollo 13 mission?', 'What type of support was provided by the Department of Defense for the Apollo 13 mission?', 'What was the shift in the Z-axis accelerometer bias after the long cold soak?', 'What was the shift in the Z-axis accelerometer bias after the long cold soak?', 'What was the shift in the Z-axis

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 80.05it/s, est. speed input: 15075.20 toks/s, output: 160.15 toks/s]


rewards_per_func: tensor([0.1875, 0.7000], device='cuda:0')
['What was the achieved pericynthion altitude at translunar injection?', 'What was the achieved pericynthion altitude at translunar injection?', 'What was the achieved pericynthion altitude at translunar injection?', 'What was the achieved pericynthion altitude at translunar injection?', 'What was the achieved pericynthion altitude at translunar injection?', 'What was the achieved pericynthion altitude at translunar injection?', 'What was the achieved pericynthion altitude at translunar injection?', 'What was the achieved pericynthion altitude at translunar injection?', 'What was the purpose of the second midcourse correction maneuver?', 'What was the purpose of the second midcourse correction maneuver?', 'What was the purpose of the second midcourse correction maneuver?', 'What was the purpose of the second midcourse correction maneuver?', 'What was the purpose of the second midcourse correction maneuver?', 'What was the purp

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 90.32it/s, est. speed input: 14006.58 toks/s, output: 180.72 toks/s]


rewards_per_func: tensor([0.5625, 0.6562], device='cuda:0')
['What type of corrective action is being taken to prevent future sensor problems?', 'What type of corrective action is being taken to prevent future sensor problems?', 'What type of corrective action is being taken to prevent future sensor problems?', 'What type of corrective action is being taken to prevent future sensor problems?', 'What type of corrective action is being taken to prevent future sensor problems?', 'What type of corrective action is being taken to prevent future sensor problems?', 'What type of corrective action is being taken to prevent future sensor problems?', 'What type of corrective action is being taken to prevent future sensor problems?', 'What was the reason for the divergent coning angle during the passive thermal control mode attempt at 7:43:02?', 'What was the reason for the divergent coning angle during the passive thermal control mode attempt at 7:43:02?', 'What was the reason for the divergent 

Processed prompts: 100%|██████████| 16/16 [00:01<00:00, 10.52it/s, est. speed input: 2387.37 toks/s, output: 94.15 toks/s]


rewards_per_func: tensor([0.4375, 0.6562], device='cuda:0')
['What was the immediate response of the crew after the loss of oxygen and primary power in the service module?', 'What was the immediate response of the crew after the loss of oxygen and primary power in the service module?', 'What was the immediate response of the crew after the loss of oxygen and primary power in the service module?', 'What was the immediate response of the crew after the loss of oxygen and primary power in the service module?', 'What was the immediate response of the crew after the loss of oxygen and primary power in the service module?', 'What was the immediate response of the crew after the loss of oxygen and primary power in the service module?', 'What was the immediate response of the crew after the loss of oxygen and primary power in the service module?', 'What was the immediate response of the crew after the loss of oxygen and primary power in the service module?', 'What was the shift in the Z-axis a

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 59.11it/s, est. speed input: 10418.63 toks/s, output: 118.30 toks/s]


rewards_per_func: tensor([0.0625, 0.5250], device='cuda:0')
['What was the purpose of the check of the platform alignment accuracy?', 'What was the purpose of the check of the platform alignment accuracy?', 'What was the purpose of the check of the platform alignment accuracy?', 'What was the purpose of the check of the platform alignment accuracy?', 'What was the purpose of the check of the platform alignment accuracy?', 'What was the purpose of the check of the platform alignment accuracy?', 'What was the purpose of the check of the platform alignment accuracy?', 'What was the purpose of the check of the platform alignment accuracy?', 'How long did the suit compressor operate during entry?', 'How long did the suit compressor operate during entry?', 'How long did the suit compressor operate during entry?', 'How long did the suit compressor operate during entry?', 'How long did the suit compressor operate during entry?', 'How long did the suit compressor operate during entry?', 'How lo

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 71.43it/s, est. speed input: 11737.13 toks/s, output: 151.84 toks/s]


rewards_per_func: tensor([0.6250, 0.5250], device='cuda:0')
['What was the result of the miswiring of the fuel valve closing coil in the reaction control system?', 'What was the result of the miswiring of the fuel valve closing coil in the reaction control system?', 'What was the result of the miswiring of the fuel valve closing coil in the reaction control system?', 'What was the result of the miswiring of the fuel valve closing coil in the reaction control system?', 'What was the result of the miswiring of the fuel valve closing coil in the reaction control system?', 'What was the result of the miswiring of the fuel valve closing coil in the reaction control system?', 'What was the result of the miswiring of the fuel valve closing coil in the reaction control system?', 'What was the result of the miswiring of the fuel valve closing coil in the reaction control system?', 'What was the throttle profile for the transearth injection maneuver?', 'What was the throttle profile for the tran

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 50.41it/s, est. speed input: 11537.19 toks/s, output: 122.90 toks/s]


rewards_per_func: tensor([0.1875, 0.6562], device='cuda:0')
['What was the launch date of the Apollo 4 spacecraft?', 'What was the launch date of the Apollo 4 spacecraft?', 'What was the launch date of the Apollo 4 spacecraft?', 'What was the launch date of the Apollo 4 spacecraft?', 'What was the launch date of the Apollo 4 spacecraft?', 'What was the launch date of the Apollo 4 spacecraft?', 'What was the launch date of the Apollo 4 spacecraft?', 'What was the launch date of the Apollo 4 spacecraft?', 'What caused an unexpected reversal in the lunar module yaw rate during passive thermal control?', 'What caused an unexpected reversal in the lunar module yaw rate during passive thermal control?', 'What caused an unexpected reversal in the lunar module yaw rate during passive thermal control?', 'What caused an unexpected reversal in the lunar module yaw rate during passive thermal control?', 'What caused an unexpected reversal in the lunar module yaw rate during passive thermal control

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 82.18it/s, est. speed input: 14582.86 toks/s, output: 164.42 toks/s]


rewards_per_func: tensor([0.3750, 0.6562], device='cuda:0')
['Where was the command module docked to the lunar module?', 'Where was the command module docked to the lunar module?', 'Where was the command module docked to the lunar module?', 'Where was the command module docked to the lunar module?', 'Where was the command module docked to the lunar module?', 'Where was the command module docked to the lunar module?', 'Where was the command module docked to the lunar module?', 'Where was the command module docked to the lunar module?', 'What was added to the inlet of the cabin fan to reduce the amount of free lunar dust in the cabin?', 'What was added to the inlet of the cabin fan to reduce the amount of free lunar dust in the cabin?', 'What was added to the inlet of the cabin fan to reduce the amount of free lunar dust in the cabin?', 'What was added to the inlet of the cabin fan to reduce the amount of free lunar dust in the cabin?', 'What was added to the inlet of the cabin fan to re

Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 36.11it/s, est. speed input: 6772.70 toks/s, output: 117.70 toks/s]


rewards_per_func: tensor([0.3125, 0.6125], device='cuda:0')
['What was the location where the spacecraft was expected to land?', 'What was the location where the spacecraft was expected to land?', 'What was the location where the spacecraft was expected to land?', 'What was the location where the spacecraft was expected to land?', 'What was the location where the spacecraft was expected to land?', 'What was the location where the spacecraft was expected to land?', 'What was the location where the spacecraft was expected to land?', 'What was the location where the spacecraft was expected to land?', 'What was the cause of the inflight failure in tank 2?', 'What was the cause of the inflight failure in tank 2?', 'What was the cause of the inflight failure in tank 2?', 'What was the cause of the inflight failure in tank 2?', 'What was the cause of the inflight failure in tank 2?', 'What was the cause of the inflight failure in tank 2?', 'What was the cause of the inflight failure in tank 2

  completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
Processed prompts: 100%|██████████| 16/16 [00:00<00:00, 83.15it/s, est. speed input: 14475.87 toks/s, output: 166.38 toks/s]


rewards_per_func: tensor([0.0625, 0.6562], device='cuda:0')


TrainOutput(global_step=101, training_loss=0.01657797145482646, metrics={'train_runtime': 4469.3474, 'train_samples_per_second': 0.362, 'train_steps_per_second': 0.023, 'total_flos': 0.0, 'train_loss': 0.01657797145482646})

<a name="Inference"></a>
### Inference
Now let's benchmark the model we trained!

In [9]:
from vllm import SamplingParams
import rl_helpers
# Sampling settings passed to model.fast_generate by the evaluation generators below.
sampling_params = SamplingParams(temperature=0.5, top_p=0.95, max_tokens=4096)

# LoRA requests already loaded by this cell, keyed by checkpoint path.
# Re-running the cell resets the cache, so a retrained checkpoint saved under
# the same path is picked up again.
_LORA_REQUEST_CACHE = {}


def eval_generate_fn(inputs, lora_path="full_local_training/checkpoint-101"):
    """Generate completions for `inputs` with the trained LoRA adapter applied.

    The LoRA request for `lora_path` is created on the first call and reused
    afterwards, instead of calling `model.load_lora` on every generation
    round of the evaluation.

    Args:
        inputs: Prompts forwarded unchanged to `model.fast_generate`.
        lora_path: Checkpoint directory passed to `model.load_lora`.
            Defaults to the checkpoint this notebook evaluates.

    Returns:
        Whatever `model.fast_generate` returns for these inputs.
    """
    if lora_path not in _LORA_REQUEST_CACHE:
        # Load once per path; later calls reuse the same request object.
        _LORA_REQUEST_CACHE[lora_path] = model.load_lora(lora_path)
    return model.fast_generate(
        inputs,
        sampling_params = sampling_params,
        lora_request = _LORA_REQUEST_CACHE[lora_path],
    )


# Run the helper's evaluation with the LoRA-backed generator defined above,
# using reward_correctness as the verifier. The return value is the cell output.
rl_helpers.run_eval(tokenizer=tokenizer, verify_fn=reward_correctness, generate_fn=eval_generate_fn)

Processed prompts: 100%|██████████| 68/68 [00:14<00:00,  4.76it/s, est. speed input: 1376.87 toks/s, output: 608.06 toks/s]
Processed prompts: 100%|██████████| 63/63 [00:14<00:00,  4.39it/s, est. speed input: 3351.46 toks/s, output: 379.62 toks/s]
Processed prompts: 100%|██████████| 15/15 [00:11<00:00,  1.32it/s, est. speed input: 1578.72 toks/s, output: 172.81 toks/s]
Processed prompts: 100%|██████████| 13/13 [00:11<00:00,  1.18it/s, est. speed input: 1944.44 toks/s, output: 132.12 toks/s]
Processed prompts: 100%|██████████| 7/7 [00:11<00:00,  1.60s/it, est. speed input: 1308.80 toks/s, output: 90.55 toks/s]
Processed prompts: 100%|██████████| 68/68 [00:02<00:00, 23.64it/s, est. speed input: 4671.56 toks/s, output: 108.82 toks/s] 

RESULTS:
percentage of correct answers: 0.5294117647058824





[{'messages': [{'role': 'system',
    'content': 'Cutting Knowledge Date: December 2023\nToday Date: 08 Mar 2025\n\nWhen you receive a tool call response, use the output to format an answer to the original user question.\n\nYou are a helpful assistant with tool calling capabilities.\n'},
   {'role': 'user',
    'content': 'You are a research assistant, and you use the search_corpus tool to find answers to questions.\nGiven a question, answer it using by doing searches using the search_corpus tool.\nTo use the search_corpus tool, respond with a JSON for a function call with its proper arguments.\n\nYou may also reason in any message, thinking step by step about how to answer the question. Wrap your reasoning in <reasoning> and </reasoning> tags.\n\n{\n  "type": "function",\n  "function": {\n    "name": "search_corpus",\n    "description": "Search over the knowledge corpus with a given query",\n    "parameters": {\n      "type": "object",\n      "properties": {\n        "query": {\n     

In [13]:
# eval w/o lora
def eval_generate_fn(inputs):
    """Generate completions for `inputs` without passing any LoRA request.

    Forwards `inputs` to `model.fast_generate` with the notebook-level
    `sampling_params` and returns whatever that call returns.
    """
    generation_kwargs = {"sampling_params": sampling_params}
    return model.fast_generate(inputs, **generation_kwargs)


# Run the same evaluation with the generator defined in this cell (no LoRA request),
# using reward_correctness as the verifier. The return value is the cell output.
rl_helpers.run_eval(tokenizer=tokenizer, verify_fn=reward_correctness, generate_fn=eval_generate_fn)

Processed prompts: 100%|██████████| 68/68 [00:09<00:00,  7.44it/s, est. speed input: 2153.78 toks/s, output: 904.77 toks/s] 
Processed prompts: 100%|██████████| 36/36 [00:09<00:00,  3.87it/s, est. speed input: 2884.02 toks/s, output: 378.38 toks/s]
Processed prompts: 100%|██████████| 11/11 [00:06<00:00,  1.78it/s, est. speed input: 2231.76 toks/s, output: 206.84 toks/s]
Processed prompts: 100%|██████████| 9/9 [00:06<00:00,  1.38it/s, est. speed input: 2454.10 toks/s, output: 171.91 toks/s]
Processed prompts: 100%|██████████| 6/6 [00:06<00:00,  1.08s/it, est. speed input: 2015.55 toks/s, output: 126.05 toks/s]
Processed prompts: 100%|██████████| 68/68 [00:01<00:00, 45.90it/s, est. speed input: 7927.94 toks/s, output: 147.15 toks/s] 

RESULTS:
percentage of correct answers: 0.22058823529411764





[{'messages': [{'role': 'system',
    'content': 'Cutting Knowledge Date: December 2023\nToday Date: 08 Mar 2025\n\nWhen you receive a tool call response, use the output to format an answer to the original user question.\n\nYou are a helpful assistant with tool calling capabilities.\n'},
   {'role': 'user',
    'content': 'You are a research assistant, and you use the search_corpus tool to find answers to questions.\nGiven a question, answer it using by doing searches using the search_corpus tool.\nTo use the search_corpus tool, respond with a JSON for a function call with its proper arguments.\n\nYou may also reason in any message, thinking step by step about how to answer the question. Wrap your reasoning in <reasoning> and </reasoning> tags.\n\n{\n  "type": "function",\n  "function": {\n    "name": "search_corpus",\n    "description": "Search over the knowledge corpus with a given query",\n    "parameters": {\n      "type": "object",\n      "properties": {\n        "query": {\n     