In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 1024
dtype = None

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/gpt-oss-20b-unsloth-bnb-4bit", # 20B model using bitsandbytes 4bit quantization
    "unsloth/gpt-oss-120b-unsloth-bnb-4bit",
    "unsloth/gpt-oss-20b", # 20B model using MXFP4 format
    "unsloth/gpt-oss-120b",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gpt-oss-20b",
    dtype = dtype, # None for auto detection
    max_seq_length = max_seq_length, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# model_path = "/mnt/workspace/model/gpt-oss-20b"
model_path = "/mnt/workspace/model/qwen3-8b"
# gguf_file = "/mnt/workspace/model/gpt-oss-20b-gguf/gpt-oss-20b-Q4_K_M.gguf"

chat_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    load_in_4bit=True,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

  from .autonotebook import tqdm as notebook_tqdm
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [03:32<00:00, 42.45s/it]


In [5]:
question1 = \
"""
Alice and Bob are each holding some integer number of sweets. Alice says to Bob: ‚ÄúIf
we each added the number of sweets we‚Äôre holding to our (positive integer) age, my answer would
be double yours. If we took the product, then my answer would be four times yours.‚Äù Bob replies:
‚ÄúWhy don‚Äôt you give me five of your sweets because then both our sum and product would be equal.‚Äù
What is the product of Alice and Bob‚Äôs ages?
"""

question = \
"""
what is 32 \\times 48? 
"""
# Think step by step  and g
messages = [
    {"role": "system", "content": "You are a good math problem solver. Give the final answer in 'Answer: \\boxed{final_answer}' format."},
    {"role": "user", "content": question}
]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    enable_thinking=False,
    add_generation_prompt=True
)

print(prompt)


<|im_start|>system
You are a good math problem solver. Give the final answer in 'Answer: \boxed{final_answer}' format.<|im_end|>
<|im_start|>user

what is 32 \times 48? 
<|im_end|>
<|im_start|>assistant
<think>

</think>




In [3]:
from transformers import TextIteratorStreamer
from threading import Thread
# skip_special_tokens=True ËøáÊª§Êéâ <|endoftext|> Á≠âÁâπÊÆäÂ≠óÁ¨¶
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# 3. Â∞ÅË£ÖÁîüÊàêÂáΩÊï∞
def stream_chat(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    # Â∞ÜÁîüÊàêËøáÁ®ãÊîæÂÖ•ÂêéÂè∞Á∫øÁ®ã
    generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=4096)
    thread = Thread(target=chat_model.generate, kwargs=generation_kwargs)
    thread.start()

    # Âú®‰∏ªÁ∫øÁ®ã‰∏≠ÂÆûÊó∂ÊâìÂç∞
    print(f"Prompt: {prompt}\nResponse: ", end="")
    for new_text in streamer:
        print(new_text, end="", flush=True)
    
    thread.join()

# 4. ËøêË°å
stream_chat(prompt)

Prompt: <|im_start|>system
You are a good math problem solver. Think step by step  and give the final answer in 'Answer: \boxed{final_answer}' format.<|im_end|>
<|im_start|>user

what is 32 \times 48? 
<|im_end|>
<|im_start|>assistant
<think>

</think>


Response: To solve $ 32 \times 48 $, we can use the standard multiplication algorithm or break it down for easier computation.

### Step-by-step breakdown:

We can use the **distributive property**:

$$
32 \times 48 = 32 \times (50 - 2) = 32 \times 50 - 32 \times 2
$$

Now compute each part:

- $ 32 \times 50 = 1600 $
- $ 32 \times 2 = 64 $

Now subtract:

$$
1600 - 64 = 1536
$$

### Final Answer:
$$
\boxed{1536}
$$

In [7]:
from transformers import TextStreamer
# streamer = TextStreamer(tokenizer, skip_prompt=False)
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
# outputs = chat_model.generate(**inputs, streamer=streamer, max_new_tokens=2048)
outputs = chat_model.generate(**inputs, max_new_tokens=2048)
text = tokenizer.decode(outputs[0])

In [11]:
import re
def extract_answer(text):
    # 1. ÁßªÈô§ÊÄùËÄÉËøáÁ®ãÔºåËé∑ÂèñËæìÂá∫ÈÉ®ÂàÜ
    if "</think>" in text:
        _, output = text.split("</think>", 1)
    else:
        output = text

    # 2. ÂåπÈÖç \boxed{ÂÜÖÂÆπ} ‰∏≠ÁöÑÂÜÖÂÆπ
    # ‰ΩøÁî®ÊçïËé∑ÁªÑ () Êù•ÊèêÂèñÊï∞Â≠óÔºåÂπ∂Â§ÑÁêÜÂèØËÉΩÂ≠òÂú®ÁöÑÁ©∫Ê†º
    pattern = r"\\boxed{([^{}]+)}"
    matches = re.findall(pattern, output)
    # 3. ËøîÂõûÊúÄÂêé‰∏Ä‰∏™ÂåπÈÖçÈ°π
    if matches:
        return matches[-1].strip()
    return None

answer = extract_answer(text)
print(answer)

1536


In [3]:
from model.base import KaggleSolver
model_path = "/mnt/workspace/model/qwen3-8b"
max_seq_length = 2048
solver = KaggleSolver(model_path, max_seq_length=max_seq_length, inference_mode=False)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
problem = \
"""
Alice and Bob are each holding some integer number of sweets. Alice says to Bob: ‚ÄúIf
we each added the number of sweets we‚Äôre holding to our (positive integer) age, my answer would
be double yours. If we took the product, then my answer would be four times yours.‚Äù Bob replies:
‚ÄúWhy don‚Äôt you give me five of your sweets because then both our sum and product would be equal.‚Äù
What is the product of Alice and Bob‚Äôs ages?
"""
answer = solver.predict(problem)
print(answer)

Loading model...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:10<00:00,  2.13s/it]


Successfully load model from /mnt/workspace/model/qwen3-8b.
Formatted Prompt: 
<|im_start|>system
You are a good math problem solver. Give the final answer in 'Answer:oxed{final_answer}' format.<|im_end|>
<|im_start|>user


Alice and Bob are each holding some integer number of sweets. Alice says to Bob: ‚ÄúIf
we each added the number of sweets we‚Äôre holding to our (positive integer) age, my answer would
be double yours. If we took the product, then my answer would be four times yours.‚Äù Bob replies:
‚ÄúWhy don‚Äôt you give me five of your sweets because then both our sum and product would be equal.‚Äù
What is the product of Alice and Bob‚Äôs ages?
 
<|im_end|>
<|im_start|>assistant
<think>

</think>



Generate Output: 


We are given a word problem involving Alice and Bob, who each have some integer number of sweets and some integer age. Let's define:

- Let $ a $ be Alice's age.
- Let $ b $ be Bob's age.
- Let $ s_A $ be the number of sweets Alice has.
- Let $ s_B $ be the number 

In [2]:
from unsloth import FastLanguageModel

max_seq_length = 1024
dtype = None
model_path = "/mnt/workspace/model/gpt-oss-20b"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_path,
    dtype = dtype,
    max_seq_length = max_seq_length, 
    load_in_4bit = True,  
    full_finetuning = False, 
)



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


Unsloth: Your Flash Attention 2 installation seems to be broken?
A possible explanation is you have a new CUDA version which isn't
yet compatible with FA2? Please file a ticket to Unsloth or FA2.
We shall now use Xformers instead, which does not have any performance hits!
We found this negligible impact by benchmarking on 1x A100.
ü¶• Unsloth Zoo will now patch everything to make training faster!


ImportError: /usr/local/lib/python3.11/site-packages/vllm/_C.abi3.so: undefined symbol: _ZN3c104cuda29c10_cuda_check_implementationEiPKcS2_ib

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
import os

import kaggle_evaluation.aimo_3_inference_server
import pandas as pd
import polars as pl


class Model:
    """A dummy model."""

    def __init__(self):
        self._model = None

    def load(self):
        """Simulate model loading."""
        print("Loading model...")
        # Just return a "model" that always answers with 0
        return lambda problem: 0

    def predict(self, problem: str):
        # Employ lazy loading: load model on the first model.predict call
        if self._model is None:
            self._model = self.load()
        return self._model(problem)


model = Model()


# Replace this function with your inference code.
# The function should return a single integer between 0 and 99999, inclusive.
def predict(id_: pl.Series, problem: pl.Series) -> pl.DataFrame | pd.DataFrame:
    """Make a prediction."""
    # Unpack values
    id_ = id_.item(0)
    problem_text: str = problem.item(0)
    # Make a prediction
    # The model is loaded on the first call
    prediction = model.predict(problem_text)
    return pl.DataFrame({'id': id_, 'answer': prediction})


inference_server = kaggle_evaluation.aimo_3_inference_server.AIMO3InferenceServer(
    predict
)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # You MUST call this within 15 minutes of the script starting. This is to
    # ensure a "fast fail" in case a bug prevents the inference server from starting.
    # Do anything that might take a long time (like model loading) in the predict
    # function, which has no time limit.
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        ('/kaggle/input/ai-mathematical-olympiad-progress-prize-3/test.csv',)
    )
