In [1]:
# --- Imports ---

import unsloth
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
import os, json, re, sys, tqdm

device = 'cuda'

# --- GPU check ---
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("GPU count:", torch.cuda.device_count())
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

"""
PyTorch version: 2.7.0+cu118
CUDA available: True
CUDA version: 11.8
GPU count: 1
Device name: NVIDIA GeForce RTX 4080
"""


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.7.0+cu126 with CUDA 1206 (you have 2.7.0+cu118)
    Python  3.10.11 (you have 3.10.17)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!
PyTorch version: 2.7.0+cu118
CUDA available: True
CUDA version: 11.8
GPU count: 1
Device name: NVIDIA GeForce RTX 4080


'\nPyTorch version: 2.7.0+cu118\nCUDA available: True\nCUDA version: 11.8\nGPU count: 1\nDevice name: NVIDIA GeForce RTX 4080\n'

In [None]:
# --- Load the model ---

# https://huggingface.co/unsloth/llama-3-8b-bnb-4bit/blob/main/README.md
model_id = 'unsloth/llama-3-8b-bnb-4bit'

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map='auto',
    trust_remote_code=True,
)
model.eval()

print(f"Memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")  # 5.70 GB
print(f"Memory reserved:  {torch.cuda.memory_reserved() / 1e9:.2f} GB")  # 7.36 GB

"""
f:\miniforge3\envs\repurposed-tokens\lib\site-packages\accelerate\utils\modeling.py:808: UserWarning: expandable_segments not supported on this platform (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\pytorch\c10/cuda/CUDAAllocatorConfig.h:28.)
  _ = torch.tensor([0], device=i)
Memory allocated: 5.72 GB
Memory reserved:  7.36 GB
"""


In [None]:
# --- Load the dataset ---

gsm = load_dataset('gsm8k', 'main', split='test[:30]')

# --- Separator tokens for injection ---

SEPARATORS = [
    "\n",
    ",",
    ".",
    "and",
    "the",
    "about",
    "neuroplasticity",
    "metaphor",
    "paradigm",
    "operator"
]

def inject_separator(prompt: str, sep: str) -> str:
    """Inserts `sep` between every word in the `prompt`"""
    return f' {sep} '.join(prompt.split())


In [None]:
# --- Prompt function ---

SYSTEM = [{
    'role': 'system', 
    'content': (
        'You are a strict math grader.  For each problem return ONLY the final numeric answer'
        ', no units, no explanation, no punctuation.'
    )
}]
FEWSHOT = [
    {'role': 'user', 'content': 'What is 2 + 3?'},
    {'role': 'assistant', 'content': '5'},
]
TERMINATORS = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
tokenizer.chat_template = (
    "<|begin_of_text|>\n"
    "{% for message in messages -%}"
    "<|start_header_id|>{{ message['role'] }}<|end_header_id|>\n\n"
    "{{ message['content'] }}<|eot_id|>\n"
    "{% endfor -%}"
    "{% if add_generation_prompt -%}"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
    "{% endif -%}"
)

def prepare_messages(user_prompt: str):
    messages = SYSTEM + FEWSHOT + [{'role': 'user', 'content': user_prompt}]
    full_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    return full_prompt

def query_model(prompt, model, tokenizer, max_new_tokens=32):
    # Standardize prompt input format
    full_prompt = prepare_messages(prompt)

    # Pass the prompt through the model
    inputs = tokenizer(full_prompt, return_tensors='pt').to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=TERMINATORS
        )
    
    # Decode only newly generated text
    generated = outputs[0][inputs['input_ids'].shape[1]:]  # Only new tokens
    output = tokenizer.decode(generated, skip_special_tokens=True)

    return output.strip()


In [None]:
# --- Sanity checks ---
# The model already has a quantization config:
# print(model.config.quantization_config)
"""
BitsAndBytesConfig {
  "_load_in_4bit": true,
  "_load_in_8bit": false,
  "bnb_4bit_compute_dtype": "bfloat16",
  "bnb_4bit_quant_storage": "uint8",
  "bnb_4bit_quant_type": "nf4",
  "bnb_4bit_use_double_quant": true,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}
"""
print(type(model.model.layers[0].self_attn.q_proj))  # bitsandbytes.nn.modules.Linear4bit

# Checking terminating tokens
ids = {s: tokenizer.convert_tokens_to_ids(s) for s in
       ["<|begin_of_text|>", "<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>"]}
assert all(v != tokenizer.unk_token_id for v in ids.values()), "one of the tags is not in the vocab"
       # No error; we're good

print('Terminators: ', TERMINATORS)  # [128001, 128009]

# Check prompt generation
messages = [
    {"role": "user",   "content": "What is 9x8?"},
    {"role": "user",   "content": "What is 17+5?"}
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print("\nGenerated prompts:")
print(prompt.strip())
"""
<class 'bitsandbytes.nn.modules.Linear4bit'>
Terminators:  [128001, 128009]
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>

You are a math tutor.<|eot_id|>
<|start_header_id|>user<|end_header_id|>

What is 17+5?<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
"""

# Quick output checks
print("\n\nQuick output checks:")
for q in ["What is 2+2?", gsm[0]["question"]]:
    print(q, "→", query_model(q, model, tokenizer, max_new_tokens=32))



In [None]:
# --- Evaluation function ---

def extract_answer(text):
    # Extracting negative signs and decimals
    numbers = re.findall(r'-?\d+(?:\.\d+)?', text)
    return numbers[-1] if numbers else None
    
def evaluate(model_output, gsm_answer):
    model_answer = extract_answer(model_output)
    gsm_answer = extract_answer(gsm_answer)

    return model_answer == gsm_answer, model_answer, gsm_answer

In [None]:
# Testing
print(query_model("What is 2+2?", model, tokenizer))

for ex in gsm.select(range(5)):
    prompt = ex['question'].strip()
    model_output = query_model(prompt, model, tokenizer)
    print('Prompt: ', prompt)
    print('Output: ', model_output)


NameError: name 'query_model' is not defined

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id  = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model     = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map = "auto",
    torch_dtype = torch.bfloat16,
)

# Build prompt with built‑in chat template
msgs = [
    {"role": "system", "content": "You are a minimalist assistant."},
    {"role": "user",   "content": "Give me a haiku about the moon."}
]
prompt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

# Tokenise & move to GPU
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate
eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
out_ids = model.generate(**inputs, max_new_tokens=64, eos_token_id=eot_id)

# Decode only the new tokens (assistant reply)
reply = tokenizer.decode(out_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(reply)


ModuleNotFoundError: No module named 'torch._C'

In [3]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling
import torch

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
tok.pad_token = tok.eos_token = "[PAD]"

# Two examples of different length with answer‑only labels
ex0 = {"input_ids": [101, 2000, 999, 102],               # [CLS go ? SEP]
       "labels":    [-100, 999, 102, 102]}
ex1 = {"input_ids": [101, 2023, 2742],              # [CLS this great SEP]
       "labels":    [-100, -100, 2742]}
batch = [ex0, ex1]

def show(name, collator):
    print(f"\n{name}")
    out = collator(batch)
    print("input_ids:\n", out["input_ids"])
    print("labels:\n", out["labels"])

# A) crashes because labels remain ragged
try:
    show("WithPadding  (crashes)", DataCollatorWithPadding(tok, return_tensors="pt"))
except ValueError as e:
    print("ValueError:", e)

# B) OK but labels were replaced
show("LM (mlm=False)  – labels overwritten",
     DataCollatorForLanguageModeling(tok, mlm=False))

# C) OK – pads labels with ‑100, keeps original mask
show("Seq2Seq  – just right",
     DataCollatorForSeq2Seq(tok, label_pad_token_id=-100, padding="longest"))



WithPadding  (crashes)
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

LM (mlm=False)  – labels overwritten


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
#!/usr/bin/env python
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    DataCollatorForLanguageModeling,
)

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
pad_id = tok.pad_token_id
tok.pad_token_type_id = -100

# Two unequal-length sequences
ex1 = tok("hello world", add_special_tokens=False)["input_ids"]  # [7592, 2088]
ex2 = tok("hi",           add_special_tokens=False)["input_ids"]  # [763]

# ------------------------------------------------------------
# 1) Collator your notebook currently uses
# ------------------------------------------------------------
features = [{"input_ids": ex1}, {"input_ids": ex2}]
pad_collator = DataCollatorWithPadding(tokenizer=tok, return_tensors="pt")
batch_pad = pad_collator(features)                       # works, pads with pad_id

# ------------------------------------------------------------
# 2) Collator you want for causal-LM fine-tuning
#    (DON’T pass 'labels'; it builds them itself)
# ------------------------------------------------------------
lm_collator = DataCollatorForLanguageModeling(
    tokenizer=tok, mlm=False, return_tensors="pt"
)
batch_lm = lm_collator(features)                         # works, masks with -100

# ------------------------------------------------------------
# 3) Show the result
# ------------------------------------------------------------
print(f"pad_token_id = {pad_id}\n")

print("DataCollatorWithPadding | input_ids")
print(batch_pad["input_ids"], "\n")        # padded positions = pad_id

print("DataCollatorWithPadding | NO labels generated automatically")

print("DataCollatorForLanguageModeling | labels")
print(batch_lm["labels"])                  # padded positions = -100


pad_token_id = 0

DataCollatorWithPadding | input_ids
tensor([[7592, 2088],
        [7632,    0]]) 

DataCollatorWithPadding | NO labels generated automatically
DataCollatorForLanguageModeling | labels
tensor([[7592, 2088],
        [7632, -100]])


In [8]:
from transformers import AutoTokenizer, DataCollatorWithPadding, DataCollatorForLanguageModeling

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
pad_id = tok.pad_token_id
tok.pad_token_type_id = -100  # Has NO effect on labels or input_ids

# Two unequal-length sequences with labels (for padding)
ex1 = {"input_ids": [7592, 2088], "labels": [7592, 2088]}
ex2 = {"input_ids": [763],        "labels": [763]}

features = [ex1, ex2]

# DataCollatorWithPadding
pad_collator = DataCollatorWithPadding(tokenizer=tok, return_tensors="pt")
batch_pad = pad_collator(features)

print("input_ids:")
print(batch_pad["input_ids"])
print("labels:")
print(batch_pad["labels"])
print("token_type_ids (should be padded with -100):")
print(batch_pad.get("token_type_ids"))

# DataCollatorForLanguageModeling
lm_collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False, return_tensors="pt")
batch_lm = lm_collator([{"input_ids": [7592, 2088]}, {"input_ids": [763]}])
print("DataCollatorForLanguageModeling | labels")
print(batch_lm["labels"])


AttributeError: can't set attribute 'pad_token_type_id'

In [9]:
tok.pad_token_type_id

0