In [None]:
%%capture
# !pip install -U transformers
!pip install -U bitsandbytes 
!pip install -U trl

In [2]:
from kaggle_secrets import UserSecretsClient
hf_token = UserSecretsClient().get_secret("HF")

In [None]:
import numpy as np # linear algebra
import pandas as pd 
import os
import torch
from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoModelForCausalLM

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model_name = "Qwen/Qwen3-4B-Base"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             device_map={"":0},
                                             low_cpu_mem_usage=True,
                                             quantization_config=bnb_config,
                                             token=hf_token)

In [None]:
from peft import LoraConfig


peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.01,
    target_modules=['q_proj', "k_proj","v_proj","up_proj", "down_proj"],
    task_type="CAUSAL_LM",
    bias="none"
)

In [3]:
from datasets import load_dataset
dataset = load_dataset("openai/gsm8k", "main")

In [4]:
def dataset_format(row):
    row['text'] = f"<bos><|im_start|>user\n{row['question']}<|im_end|>\n<|im_start|>model\n{row['answer']}<|im_end|>"
    return row
    
print(dataset_format(dataset['train'][0])['text'])
print()
dataset = dataset.map(dataset_format)

<bos><|im_start|>user
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|im_end|>
<|im_start|>model
Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72<|im_end|>



In [None]:
from huggingface_hub import login
login()

In [None]:
from trl import SFTConfig, SFTTrainer

trainer = SFTTrainer(
    model = model,
    train_dataset = dataset['train'],
    processing_class = tokenizer,
    peft_config = peft_config,
    args = SFTConfig(
        output_dir=f"{model_name}-gsm8k-dataset",
        max_steps=200,
        logging_steps=20,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        max_seq_length=2048,
        fp16=True,
        report_to="none",
        push_to_hub=True,
        run_name=f"{model_name}-gsm8k",
        dataset_text_field="text"
    )
)

In [None]:
trainer.train()

In [None]:
trainer.push_to_hub()

In [None]:
trainer.model.save_pretrained("qwen3-4B-base-gsm8k")
trainer.processing_class.save_pretrained("qwen3-4B-base-gsm8k")

In [None]:
import gc

torch.cuda.empty_cache()
del model, tokenizer

gc.collect()

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("AlyGreo/Qwen3-4B-Base-gsm8k-dataset", token=hf_token)
model = AutoModelForCausalLM.from_pretrained("AlyGreo/Qwen3-4B-Base-gsm8k-dataset",
                                            low_cpu_mem_usage=True,
                                            device_map={"":0},
                                            torch_dtype=torch.bfloat16,
                                            token=hf_token)

2025-06-29 09:27:36.587012: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751189256.613012     729 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751189256.618452     729 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
from transformers import TextStreamer

prompt = f"<|im_end|>user\n{dataset['test'][8]['question']}<|im_end|><|im_start|>model\n"
print(prompt)

streamer = TextStreamer(tokenizer=tokenizer,
                       skip_prompt=True,
                       skip_special_tokens=True,
                       )

ids = tokenizer(prompt, return_tensors="pt").to(model.device)

_ = model.generate(**ids,
                   max_new_tokens=250,
                   streamer=streamer,
                   do_sample=True,
                   top_k=30,
                   temperature=0.7,
                  pad_token_id=tokenizer.eos_token_id)

<|im_end|>user
John drives for 3 hours at a speed of 60 mph and then turns around because he realizes he forgot something very important at home.  He tries to get home in 4 hours but spends the first 2 hours in standstill traffic.  He spends the next half-hour driving at a speed of 30mph, before being able to drive the remaining time of the 4 hours going at 80 mph.  How far is he from home at the end of those 4 hours?<|im_end|><|im_start|>model

First, let's calculate the distance John traveled to the place where he realized he forgot something. He drove for 3 hours at a speed of 60 mph.

Distance = Speed × Time
Distance = 60 mph × 3 hours = 180 miles

So, John is 180 miles away from home when he realizes he forgot something.

Now, let's calculate the distance he traveled to get home in the 4 hours.

He spent the first 2 hours in standstill traffic, so he didn't cover any distance during this time.

Then, he drove for half an hour at a speed of 30 mph.

Distance = Speed × Time
Distance