In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules
# before each cell executes, so edits to local source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<!--
  Notebook styling for ipywidget output: makes the widget output background
  transparent, and points the Jupyter widget text color and font size at the
  VS Code editor theme variables.
-->
<style>
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [None]:
import art
from dotenv import load_dotenv
import openai

# Load variables from a local .env file into the process environment
# (presumably credentials such as the W&B API key -- TODO confirm).
load_dotenv()


# Create the Unsloth-backed ART API object. The `wandb_project` argument
# presumably selects the Weights & Biases project used for logging -- confirm.
api = art.UnslothAPI(wandb_project="agent-reinforcement-training")
# Fetch the model registered under this name, or create it from the given
# base model if it does not exist yet (behavior inferred from the method name).
# NOTE(review): `_get_or_create_model` is underscore-prefixed, i.e. a private
# method of the API object; check whether a public equivalent exists.
# The top-level `await` relies on the notebook kernel's running event loop.
model = await api._get_or_create_model(
    name="yes-or-no-unsloth-001",
    base_model="unsloth/Qwen2.5-14B-Instruct",
)


async def rollout(client: openai.AsyncOpenAI, prompt: str) -> art.Trajectory:
    """Sample one completion for ``prompt`` and score it.

    The prompt is sent as a single user message to ``model.name`` through
    ``client`` with a 100-token cap. The reply text is scored by exact,
    case-sensitive string match: "maybe" -> 1.0, "no" -> 0.75, "yes" -> 0.5,
    anything else -> 0.0.

    Args:
        client: Async OpenAI-compatible client used to request the completion.
        prompt: Text of the user message.

    Returns:
        An ``art.Trajectory`` holding the user message followed by the sampled
        choice, together with the computed reward.

    Raises:
        AssertionError: If the returned message content is not a string.
    """
    user_turn = {"role": "user", "content": prompt}
    conversation: art.Messages = [user_turn]

    response = await client.chat.completions.create(
        messages=conversation, model=model.name, max_tokens=100
    )
    first_choice = response.choices[0]
    reply = first_choice.message.content
    assert isinstance(reply, str)

    # Exact-match reward table; any reply not listed here scores 0.0.
    reward_by_reply = {"yes": 0.5, "no": 0.75, "maybe": 1.0}
    return art.Trajectory(
        messages_and_choices=[*conversation, first_choice],
        reward=reward_by_reply.get(reply, 0.0),
    )


openai_client = await model.openai_client()
for i in range(await model.get_iteration(), 1_000):
    train_groups = await art.gather_trajectories(
        (
            (rollout(openai_client, prompt) for _ in range(32))
            for prompt in [
                f"{prefix} with {', '.join([f"'{w}'" if use_quotes else w for w in words]) if len(words) == 3 else f'{words[0]}' + (f' or {words[1]}' if len(words) > 1 else '')}"
                for prefix in ["respond", "just respond"]
                for use_quotes in [True, False]
                for words in [
                    ["yes", "no", "maybe"],
                    ["maybe", "yes", "no"],
                    ["no", "yes", "maybe"],
                    ["yes", "maybe", "no"],
                    ["yes", "no"],
                    ["maybe", "no"],
                    ["no", "maybe"],
                    ["no", "yes"],
                    ["yes", "no"],
                ]
            ]
        ),
        pbar_desc="train",
        stream_chat_completions=8,
    )
    await model.tune(
        train_groups,
        config=art.TuneConfig(
            lr=1e-4, sequence_length=8192, plot_tensors=False, verbosity=2
        ),
    )

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-31 21:49:34 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.18: Fast Qwen2 patching. Transformers: 4.49.0. vLLM: 0.7.3.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.109 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-14b-instruct-unsloth-bnb-4bit with actual GPU utilization = 61.44%
Unsloth: Your GPU has CUDA compute capability 9.0 with VRAM = 79.11 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 8192. Num Sequences = 320.
Unsloth: vLLM's KV Cache can use up t



INFO 03-31 21:49:43 weight_utils.py:254] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 03-31 21:49:47 model_runner.py:1115] Loading model weights took 10.5640 GB
INFO 03-31 21:49:47 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-31 21:49:49 worker.py:267] Memory profiling takes 1.73 seconds
INFO 03-31 21:49:49 worker.py:267] the current vLLM instance can use total_gpu_memory (79.11GiB) x gpu_memory_utilization (0.61) = 48.60GiB
INFO 03-31 21:49:49 worker.py:267] model weights take 10.56GiB; non_torch_memory takes 0.15GiB; PyTorch activation peak memory takes 1.84GiB; the rest of the memory reserved for KV Cache is 36.04GiB.
INFO 03-31 21:49:50 executor_base.py:111] # cuda blocks: 12302, # CPU blocks: 2048
INFO 03-31 21:49:50 executor_base.py:116] Maximum concurrency for 8192 tokens per request: 24.03x
INFO 03-31 21:49:52 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory erro

Capturing CUDA graph shapes: 100%|██████████| 43/43 [00:31<00:00,  1.38it/s]

INFO 03-31 21:50:23 model_runner.py:1562] Graph capturing finished in 31 secs, took 8.10 GiB
INFO 03-31 21:50:23 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 35.35 seconds



Unsloth 2025.3.18 patched 48 layers with 48 QKV layers, 48 O layers and 48 MLP layers.


train:   0%|          | 0/1152 [00:00<?, ?it/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbradhilton[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Prepared tuning data with 2 sequences of length 8192


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 3 | Total steps = 300,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 137,625,600/14,000,000,000 (0.98% trained)


{'loss': 0.2918, 'grad_norm': 0.2424517124891281, 'learning_rate': 5e-06, 'epoch': 1e-05}
Unsloth: Will smartly offload gradients to save VRAM!
{'loss': 0.0975, 'grad_norm': 0.13497601449489594, 'learning_rate': 5e-06, 'epoch': 2e-05}


train:   0%|          | 0/1152 [00:00<?, ?it/s]

In [1]:
import atexit

def cleanup_function():
    """Write the literal text "test" to ./test.txt, replacing any existing content.

    Placeholder body: real cleanup steps belong here.
    """
    marker_path = "./test.txt"
    with open(marker_path, "w") as marker_file:
        marker_file.write("test")

# Schedule cleanup_function to run when the interpreter (kernel) exits.
# atexit.register returns the function it was given, which is why the cell
# echoes the function object as its output.
atexit.register(cleanup_function)

<function __main__.cleanup_function()>