## P3O Demo (Decoupled Rewards/Targets)

This notebook demonstrates:
- Building an Archetype with a Template
- Broadcasting to a population
- Using P3O with either a `rewards_provider` or a `targets_provider` (the latter paired with a `reward_fn`)



In [None]:
from agent_torch.core.llm.archetype import Archetype
from agent_torch.core.llm.mock_llm import MockLLM
import agent_torch.core.llm.template as lm
import agent_torch.populations.astoria as astoria
from agent_torch.optim import P3O

# Minimal Template
class DemoTemplate(lm.Template):
    # Demo prompt template: a system prompt, three declared variables, and a
    # prompt string whose placeholders use the same names as those variables.
    system_prompt = "Demo P3O Template"
    # age and gender are declared learnable=True; soc_code is learnable=False.
    # NOTE(review): presumably only learnable variables are tuned by P3O --
    # confirm against the Template / P3O documentation.
    age = lm.Variable(desc="agent age", learnable=True)
    gender = lm.Variable(desc="agent gender", learnable=True)
    soc_code = lm.Variable(desc="job id", learnable=False)
    def __prompt__(self):
        # Stores the prompt text on the instance instead of returning it.
        # NOTE(review): the {age}/{gender}/{soc_code} placeholders are
        # presumably filled in by lm.Template -- confirm.
        self.prompt_string = "You are {age} and a {gender}. Job {soc_code}."

# Construct the archetype from the demo template and a MockLLM instance.
# NOTE(review): n_arch=2 presumably sets the number of archetype instances --
# confirm against the Archetype API.
arch = Archetype(prompt=DemoTemplate(), llm=MockLLM(), n_arch=2)

# Sample once before broadcasting (single-shot); the result is discarded.
arch.sample()

# Broadcast to the astoria population module, then sample again.
arch.broadcast(population=astoria)
_ = arch.sample()  # result bound to a throwaway name; not used afterwards
print("Broadcasted and sampled.")


In [None]:
# Rewards provider variant

def rewards_provider(group_keys, group_preds, arch):
    """Score every prediction by how close it is to 0.6.

    ``group_keys`` and ``arch`` are part of the signature but are not read
    here. The result is a list with one reward per entry of ``group_preds``:
    1.0 minus the squared difference between that entry and 0.6, so an entry
    equal to 0.6 scores 1.0.
    """
    rewards = []
    for pred in group_preds:
        deviation = pred - 0.6
        rewards.append(1.0 - deviation**2)
    return rewards

# Build a P3O optimizer for the archetype, passing the rewards_provider
# callback and enabling verbose output.
opt = P3O(archetype=arch, rewards_provider=rewards_provider, verbose=True)
# Two rounds of: sample, optimizer step, then clear gradients.
for _ in range(2):
    arch.sample()
    opt.step()
    opt.zero_grad()
print("Rewards provider optimization done.")


In [None]:
# Targets provider variant

# Known target values keyed by string id; anything missing falls back to 0.5.
lookup = {"13-2099.01": 0.72}

def targets_provider(group_keys, arch):
    """Return one target per group key.

    Each key is converted with ``str`` and looked up in ``lookup``; keys that
    are not present get 0.5. ``arch`` is part of the signature but is not
    read here.
    """
    targets = []
    for key in group_keys:
        targets.append(lookup.get(str(key), 0.5))
    return targets

# Rebind opt to a new P3O configured with targets_provider and a reward_fn
# that returns 1.0 minus the squared difference of its two arguments.
# NOTE(review): y is presumably the prediction and t the matching target --
# confirm against the P3O API.
opt = P3O(archetype=arch, targets_provider=targets_provider, reward_fn=lambda y, t: 1.0 - (y - t)**2)
# Two rounds of: sample, optimizer step, then clear gradients.
for _ in range(2):
    arch.sample()
    opt.step()
    opt.zero_grad()
print("Targets provider optimization done.")
