In [1]:
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [2]:
!uv pip list | grep -E 'torch|triton|vllm|unsloth|transformers|xformers|accelerate|trl'

[2mUsing Python 3.12.11 environment at: /home/abangubuntu/MinesweeperGPT/.venv[0m
accelerate                               1.7.0
fastrlock                                0.8.3
torch                                    2.6.0+cu124
torchao                                  0.12.0
torchaudio                               2.6.0+cu124
torchvision                              0.21.0+cu124
transformers                             4.51.3
triton                                   3.2.0
trl                                      0.18.1
unsloth                                  2025.6.2
unsloth-zoo                              2025.6.1
vllm                                     0.8.5.post1
xformers                                 0.0.29.post2


In [3]:
import torch
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 08-24 16:39:48 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 08-24 16:39:48 [__init__.py:239] Automatically detected platform cuda.


In [4]:
max_seq_length = 512  # Can increase for longer reasoning traces
lora_rank = 32         # Larger rank = smarter, but slower

# Load model + tokenizer with vLLM acceleration
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-1.7B",
    max_seq_length = max_seq_length,
    load_in_4bit = True,       # False for LoRA 16bit
    fast_inference = True,      # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.75, # Reduce if out of memory
)

==((====))==  Unsloth 2025.6.2: Fast Qwen3 patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
   \\   /|    Quadro T1000 with Max-Q Design. Num GPUs = 1. Max memory: 4.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen3-1.7b-unsloth-bnb-4bit with actual GPU utilization = 59.82%
Unsloth: Your GPU has CUDA compute capability 7.5 with VRAM = 4.0 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 512. Num Sequences = 128.
Unsloth: vLLM's KV Cache can use up to 0.93 GB. Also swap space = 0 GB.
INFO 08-24 16:40:02 [config.py:717] This model supports multiple tasks: {'embed', 'generate', 'reward', 'score', 'classify'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes 

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 08-24 16:40:06 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 08-24 16:40:07 [model_runner.py:1140] Model loading took 1.3985 GiB and 3.251033 seconds
INFO 08-24 16:40:15 [worker.py:287] Memory profiling takes 8.39 seconds
INFO 08-24 16:40:15 [worker.py:287] the current vLLM instance can use total_gpu_memory (4.00GiB) x gpu_memory_utilization (0.60) = 2.39GiB
INFO 08-24 16:40:15 [worker.py:287] model weights take 1.40GiB; non_torch_memory takes 0.02GiB; PyTorch activation peak memory takes 0.70GiB; the rest of the memory reserved for KV Cache is 0.28GiB.
INFO 08-24 16:40:16 [executor_base.py:112] # cuda blocks: 161, # CPU blocks: 0
INFO 08-24 16:40:16 [executor_base.py:117] Maximum concurrency for 512 tokens per request: 5.03x
INFO 08-24 16:40:16 [model_runner.py:1450] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-m

Capturing CUDA graph shapes:   0%|          | 0/19 [00:00<?, ?it/s]

INFO 08-24 16:40:57 [model_runner.py:1592] Graph capturing finished in 47 secs, took 0.39 GiB
INFO 08-24 16:40:57 [llm_engine.py:437] init engine (profile, create kv cache, warmup model) took 50.72 seconds
Unsloth: Just some info: will skip parsing ['post_feedforward_layernorm', 'pre_feedforward_layernorm']
Unsloth: Just some info: will skip parsing ['post_feedforward_layernorm', 'pre_feedforward_layernorm']


In [5]:
class NoThinkTokenizerWrapper:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __getattr__(self, name):
        return getattr(self.tokenizer, name)
    
    def __call__(self, *args, **kwargs):
        return self.tokenizer(*args, **kwargs)

    def apply_chat_template(self, messages, tokenize=False, add_generation_prompt=True, **kwargs):
        return self.tokenizer.apply_chat_template(
            messages,
            tokenize=tokenize,
            add_generation_prompt=add_generation_prompt,
            enable_thinking=False,   # force disable thinking
            **kwargs
        )

no_think_tokenizer = NoThinkTokenizerWrapper(tokenizer)

In [6]:
from src.finetuning.prompt import SYSTEM_PROMPT
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": """
Here is the current minesweeper board:

Row 1: * * 2 F H
Row 2: * * 3 H H
Row 3: * * 2 H H
Row 4: 1 1 1 H H
Row 5: H H H H H
"""}
]

# Apply chat template, enabling thinking mode
text = no_think_tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Tokenize input
inputs = no_think_tokenizer([text], return_tensors="pt").to(model.device)
print("Number of tokens:", len(inputs["input_ids"][0]))


# Generate
generated_ids = model.generate(
    **inputs,
    max_new_tokens=512,
)

# Extract only the new tokens
output_ids = generated_ids[0][len(inputs.input_ids[0]):].tolist()
decoded_output = no_think_tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
print("output:", decoded_output)

# # Extract solution block
# import re
# match = re.search(r"<SOLUTION>(.*?)</SOLUTION>", decoded_output, re.DOTALL)
# if match:
#     move = match.group(1).strip()
# else:
#     move = decoded_output  # fallback if tags missing

# print("final move:", move)



Number of tokens: 373
output: row: 1, col: 1, action: reveal


In [7]:
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],  # Remove QKVO if out of memory
    lora_alpha=lora_rank,
    use_gradient_checkpointing="unsloth",  # Enable long context finetuning
    random_state=3407,
)

Unsloth 2025.6.2 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [8]:
# load dataset
from src.finetuning.dataset import MinesweeperDatasetLoader
from datasets import Dataset
from pprint import pprint

dataset_loader = MinesweeperDatasetLoader()
dataset: Dataset = dataset_loader.to_hf_dataset()
for i in range(5):
    pprint(dataset[i])

Using data directory: /home/abangubuntu/MinesweeperGPT/data
Data directory exists: /home/abangubuntu/MinesweeperGPT/data
Found 23 game directories in /home/abangubuntu/MinesweeperGPT/data
{'board_state': [['0', '0', '0', '2', 'F'],
                 ['0', '0', '1', '3', 'F'],
                 ['0', '0', '1', 'F', '3'],
                 ['2', '3', '3', '4', '*'],
                 ['*', '*', '*', '*', '*'],
                 []],
 'hidden_state': [['0', '0', '0', '2', 'M'],
                  ['0', '0', '1', '3', 'M'],
                  ['0', '0', '1', 'M', '3'],
                  ['2', '3', '3', '4', 'M'],
                  ['M', 'M', 'M', '3', 'M']],
 'prompt': [{'content': 'You are a Minesweeper assistant.\n'
                        'The game board is always 5x5 in size.\n'
                        'You will be given ONLY the current board state as '
                        'input from the user.\n'
                        '\n'
                        'Your task: Suggest exactly ONE valid 

In [9]:
from tqdm import tqdm

def count_tokens(example):
    # Turn messages into text using your chat template
    text = tokenizer.apply_chat_template(
        example["prompt"],  # or however your dataset stores conversations
        tokenize=False,
        add_generation_prompt=True,
    )
    # Tokenize and return number of tokens
    return len(tokenizer(text)["input_ids"])

# Collect lengths for the whole dataset
lengths = [count_tokens(example) for example in tqdm(dataset)]

print("Max prompt length:", max(lengths))
print("Avg prompt length:", sum(lengths) / len(lengths))
print("Some samples:", lengths[:10])


100%|██████████| 222/222 [00:00<00:00, 836.85it/s]

Max prompt length: 352
Avg prompt length: 343.6081081081081
Some samples: [344, 341, 337, 340, 344, 346, 342, 347, 347, 338]





In [10]:
from src.finetuning.rewards import (
    reward_format_correct,
    reward_valid_cell,
    reward_correct_move
)

In [11]:
class NoThinkTokenizerWrapper:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __getattr__(self, name):
        return getattr(self.tokenizer, name)
    
    def __call__(self, *args, **kwargs):
        return self.tokenizer(*args, **kwargs)

    def apply_chat_template(self, messages, tokenize=False, add_generation_prompt=True, **kwargs):
        return self.tokenizer.apply_chat_template(
            messages,
            tokenize=tokenize,
            add_generation_prompt=add_generation_prompt,
            enable_thinking=False,   # force disable thinking
            **kwargs
        )

In [12]:
from trl import GRPOConfig, GRPOTrainer

max_prompt_length = 420

training_args = GRPOConfig(
    learning_rate=5e-6,
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
    logging_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,  # Increase to 4 for smoother training
    num_generations=3,  # Decrease if out of memory
    max_prompt_length=max_prompt_length,
    max_completion_length=max_seq_length - max_prompt_length,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps=100,
    save_steps=100,
    max_grad_norm=0.1,
    report_to="none",  # Can use Weights & Biases
    output_dir="outputs",
)

trainer = GRPOTrainer(
    model=model,
    processing_class=NoThinkTokenizerWrapper(tokenizer),
    reward_funcs=[
        reward_format_correct,
        reward_valid_cell,
        reward_correct_move
    ],
    args=training_args,
    train_dataset=dataset,
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 3


In [None]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 222 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 3 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (3 x 1 x 1) = 3
 "-____-"     Trainable parameters = 34,865,152/1,700,000,000 (2.05% trained)


******************** Reward 3 Debugging Info (every 5 steps) ********************
Prompt:
F 1 0 0 0
1 1 0 0 0
0 0 0 1 1
2 2 2 2 F
F F 2 F *


Responses:
$row: 1, col: 1, action: reveal



Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,kl,rewards / reward_format_correct / mean,rewards / reward_format_correct / std,rewards / reward_valid_cell / mean,rewards / reward_valid_cell / std,rewards / reward_correct_move / mean,rewards / reward_correct_move / std
1,0.0,0.0,0.0,14.0,14.0,14.0,0.0,14.0,14.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unsloth: Will smartly offload gradients to save VRAM!


In [None]:
model.save_lora("grpo_saved_lora_2")

In [None]:
from vllm import SamplingParams

text = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": """
0 0 0 0 0 0
2 2 1 0 1 1
F F 1 0 2 F
3 3 2 0 3 F
1 F 2 1 4 F
* * * * * *
"""},
    ],
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)

sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=512,
)
output = (
    model.fast_generate(
        text,
        sampling_params=sampling_params,
        lora_request=model.load_lora("grpo_saved_lora"),
    )[0]
    .outputs[0]
    .text
)

print(output)

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

<think>
Okay, let me look at the board given. It's a 6x6 grid. The first row is all zeros except the second column, which has a 2. The second row is 2 2 1 0 1 1. Then F F 1 0 2 F. The third row is 3 3 2 0 3 F, and so on. The last row is all stars.

I need to find a valid next move. The cells that are empty are the ones marked with *. Let's check each cell. 

Looking at row 1, columns 1 to 6. All are zeros except maybe the second column, which is 2. Wait
