In [1]:
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [21]:
!uv pip list | grep -E 'torch|triton|vllm|unsloth|transformers|xformers|accelerate'

[2mUsing Python 3.12.11 environment at: /home/abangubuntu/MinesweeperGPT/.venv[0m
accelerate                               1.7.0
torch                                    2.6.0+cu124
torchao                                  0.12.0
torchaudio                               2.6.0+cu124
torchvision                              0.21.0+cu124
transformers                             4.51.3
triton                                   3.2.0
unsloth                                  2025.5.7
unsloth-zoo                              2025.8.8
vllm                                     0.8.5.post1
xformers                                 0.0.29.post2


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [2]:
import torch
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 08-23 19:13:17 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 08-23 19:13:17 [__init__.py:239] Automatically detected platform cuda.


In [3]:
max_seq_length = 1024  # Can increase for longer reasoning traces
lora_rank = 32         # Larger rank = smarter, but slower

# Load model + tokenizer with vLLM acceleration
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-1.7B",
    max_seq_length = max_seq_length,
    load_in_4bit = True,       # False for LoRA 16bit
    fast_inference = True,      # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.7, # Reduce if out of memory
)

Unsloth: Patching vLLM v1 graph capture
Unsloth: Patching vLLM v0 graph capture
==((====))==  Unsloth 2025.5.7: Fast Qwen3 patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
   \\   /|    Quadro T1000 with Max-Q Design. Num GPUs = 1. Max memory: 4.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen3-1.7b-unsloth-bnb-4bit with actual GPU utilization = 55.83%
Unsloth: Your GPU has CUDA compute capability 7.5 with VRAM = 4.0 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 1024. Num Sequences = 128.
Unsloth: vLLM's KV Cache can use up to 0.77 GB. Also swap space = 0 GB.
INFO 08-23 19:13:35 [config.py:717] This model supports multiple tasks: {'classify', 'reward', 'sc

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 08-23 19:13:40 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 08-23 19:13:40 [model_runner.py:1140] Model loading took 1.3985 GiB and 3.064188 seconds
INFO 08-23 19:13:54 [worker.py:287] Memory profiling takes 13.24 seconds
INFO 08-23 19:13:54 [worker.py:287] the current vLLM instance can use total_gpu_memory (4.00GiB) x gpu_memory_utilization (0.56) = 2.23GiB
INFO 08-23 19:13:54 [worker.py:287] model weights take 1.40GiB; non_torch_memory takes 0.02GiB; PyTorch activation peak memory takes 0.70GiB; the rest of the memory reserved for KV Cache is 0.11GiB.
INFO 08-23 19:13:54 [executor_base.py:112] # cuda blocks: 66, # CPU blocks: 0
INFO 08-23 19:13:54 [executor_base.py:117] Maximum concurrency for 1024 tokens per request: 1.03x
INFO 08-23 19:13:54 [vllm_utils.py:671] Unsloth: Running patched vLLM v0 `capture_model`.
INFO 08-23 19:13:54 [model_runner.py:1450] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the mod

Capturing CUDA graph shapes:   0%|          | 0/19 [00:00<?, ?it/s]

INFO 08-23 19:14:28 [model_runner.py:1592] Graph capturing finished in 34 secs, took 0.40 GiB
INFO 08-23 19:14:28 [vllm_utils.py:678] Unsloth: Patched vLLM v0 graph capture finished in 34 secs.
INFO 08-23 19:14:28 [llm_engine.py:437] init engine (profile, create kv cache, warmup model) took 47.91 seconds
Unsloth: Just some info: will skip parsing ['pre_feedforward_layernorm', 'post_feedforward_layernorm']
Unsloth: Just some info: will skip parsing ['pre_feedforward_layernorm', 'post_feedforward_layernorm']


In [7]:
from src.globals import TRAINING_ROWS, TRAINING_COLS
from src.finetuning.prompt import SYSTEM_PROMPT

In [8]:
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
#     {"role": "user", "content": """
# * * * * * *
# * * F * * *
# * * 3 3 4 *
# 1 1 1 0 3 *
# 0 0 0 0 3 *
# 0 0 0 0 2 *
# """},
#     {"role": "assistant", "content": "row: 2, col: 4, action: flag"},
#     {"role": "user", "content": """
# 0 1 F 1 2 F
# 0 1 1 1 2 F
# 0 0 0 0 2 2
# 1 2 1 1 1 F
# * * F 2 3 3
# * * * * F F
# """},
#     {"role": "assistant", "content": "row: 6, col: 3, action: reveal"},
    {"role": "user", "content": """
0 0 0 0 0 0
2 2 1 0 1 1
F F 1 0 2 F
3 3 2 0 3 F
1 F 2 1 4 F
* * * * * *
"""}
]

# Apply chat template, enabling thinking mode
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)

# Tokenize input
inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Generate
generated_ids = model.generate(
    **inputs,
    max_new_tokens=512,
)

# Extract only the new tokens
output_ids = generated_ids[0][len(inputs.input_ids[0]):].tolist()
decoded_output = tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
print("output:", decoded_output)

# # Extract solution block
# import re
# match = re.search(r"<SOLUTION>(.*?)</SOLUTION>", decoded_output, re.DOTALL)
# if match:
#     move = match.group(1).strip()
# else:
#     move = decoded_output  # fallback if tags missing

# print("final move:", move)



output: row: 1, col: 5, action: reveal


In [9]:
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],  # Remove QKVO if out of memory
    lora_alpha=lora_rank,
    use_gradient_checkpointing="unsloth",  # Enable long context finetuning
    random_state=3407,
)

Unsloth 2025.5.7 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [13]:
# load dataset
from src.finetuning.dataset import MinesweeperDatasetLoader
from datasets import Dataset
from pprint import pprint

dataset_loader = MinesweeperDatasetLoader()
dataset: Dataset = dataset_loader.to_hf_dataset()
for i in range(5):
    pprint(dataset[i])

Using data directory: /home/abangubuntu/MinesweeperGPT/data
Data directory exists: /home/abangubuntu/MinesweeperGPT/data
Found 3 game directories in /home/abangubuntu/MinesweeperGPT/data
{'hidden_state': ['0 1 M 1 2 M',
                  '0 1 1 1 2 M',
                  '0 0 0 0 2 2',
                  '1 2 1 1 1 M',
                  'M 4 M 2 3 3',
                  'M M 2 2 M M'],
 'messages': [{'content': 'You are a Minesweeper assistant.\n'
                          'The game board is always 6x6 in size.\n'
                          'You will be given ONLY the current board state as '
                          'input from the user.\n'
                          '\n'
                          'Your task: Suggest exactly ONE valid next move for '
                          'the minesweeper board given by the user.\n'
                          '\n'
                          'Move format rules (must follow exactly one of these '
                          'two):\n'
                       

In [14]:
from src.finetuning.rewards import (
    reward_format_correct,
    reward_valid_cell,
    reward_correct_move
)

In [23]:
from trl import GRPOConfig, GRPOTrainer

max_prompt_length = 512

training_args = GRPOConfig(
    learning_rate=5e-6,
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
    logging_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,  # Increase to 4 for smoother training
    num_generations=6,  # Decrease if out of memory
    max_prompt_length=max_prompt_length,
    max_completion_length=max_seq_length - max_prompt_length,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps=250,
    save_steps=250,
    max_grad_norm=0.1,
    report_to="none",  # Can use Weights & Biases
    output_dir="outputs",
)

trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[
        reward_format_correct,
        reward_valid_cell,
        reward_correct_move
    ],
    args=training_args,
    train_dataset=dataset,
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 6


In [24]:
trainer.train()

NameError: name 'is_torch_version' is not defined