In [1]:
!pip install -q transformers datasets accelerate bitsandbytes peft

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.1/411.1 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [3]:
# Configuration
model_name = "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"
dataset_name = "nvidia/OpenMathReasoning"
max_length = 512  # Reduced sequence length
use_4bit = True  # Enable 4-bit quantization

In [4]:
# Quantization setup
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

In [5]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [7]:
# Load model with quantization
# Check if CUDA is available before attempting 4-bit quantization
if use_4bit and not torch.cuda.is_available():
    print("CUDA is not available. Cannot use 4-bit quantization.")
    # Fallback to loading the model without quantization, or handle as needed
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto" # Keep device_map if it helps with multi-CPU threading or future GPU use
    )
elif use_4bit:
    # Load model with quantization if CUDA is available and 4-bit is requested
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"
    )
else:
    # Load model without quantization if 4-bit is not requested
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto"
    )

# Only prepare for kbit training if quantization was applied
if use_4bit and torch.cuda.is_available():
    model = prepare_model_for_kbit_training(model)

Unrecognized keys in `rope_scaling` for 'rope_type'='yarn': {'attn_factor'}


CUDA is not available. Cannot use 4-bit quantization.


model.safetensors.index.json:   0%|          | 0.00/33.3k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/7.77G [00:00<?, ?B/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.61G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# LoRA configuration
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # Reduced target modules
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)

In [9]:
# Load small dataset subset
try:
    dataset = load_dataset(dataset_name, split='train[:1000]')  # Small subset
except Exception as e:
    print(f"Dataset error: {e}")
    # Fallback to dummy data
    from datasets import Dataset
    dataset = Dataset.from_dict({
        "question": ["Solve: 2+2="],
        "answer": ["4"]
    })

README.md:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/144 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/144 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/144 [00:00<?, ?files/s]

cot-00000-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00001-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00002-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00003-of-00144.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

cot-00004-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00005-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00006-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00007-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00008-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00009-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00010-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00011-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00012-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00013-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00014-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00015-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00016-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00017-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00018-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00019-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00020-of-00144.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

cot-00021-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00022-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00023-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00024-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00025-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00026-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00027-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00028-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00029-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00030-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00031-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00032-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00033-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00034-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00035-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00036-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00037-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00038-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00039-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00040-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00041-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00042-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00043-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00044-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00045-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00046-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00047-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00048-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00049-of-00144.parquet:   0%|          | 0.00/219M [00:00<?, ?B/s]

cot-00050-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00051-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00052-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00053-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00054-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00055-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00056-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00057-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00058-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00059-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00060-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00061-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00062-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00063-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00064-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00065-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00066-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00067-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00068-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00069-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00070-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00071-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00072-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00073-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00074-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00075-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00076-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00077-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00078-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00079-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00080-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00081-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00082-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00083-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00084-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00085-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00086-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00087-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00088-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00089-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00090-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00091-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00092-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00093-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00094-of-00144.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

cot-00095-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00096-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00097-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00098-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00099-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00100-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00101-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00102-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00103-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00104-of-00144.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

cot-00105-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00106-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00107-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00108-of-00144.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

cot-00109-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00110-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00111-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00112-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00113-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00114-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00115-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00116-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00117-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00118-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00119-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00120-of-00144.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

cot-00121-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00122-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00123-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00124-of-00144.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

cot-00125-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00126-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00127-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00128-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00129-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00130-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00131-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00132-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00133-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00134-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00135-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00136-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00137-of-00144.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

cot-00138-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00139-of-00144.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

cot-00140-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00141-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00142-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

cot-00143-of-00144.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

Downloading data:   0%|          | 0/72 [00:00<?, ?files/s]

tir-00000-of-00072.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

tir-00001-of-00072.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

tir-00002-of-00072.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

tir-00003-of-00072.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

tir-00004-of-00072.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

tir-00005-of-00072.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

tir-00006-of-00072.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

tir-00007-of-00072.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

tir-00008-of-00072.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

tir-00009-of-00072.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

tir-00010-of-00072.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

tir-00011-of-00072.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

tir-00012-of-00072.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

tir-00013-of-00072.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

tir-00014-of-00072.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

tir-00015-of-00072.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

tir-00016-of-00072.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

tir-00017-of-00072.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

tir-00018-of-00072.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

tir-00019-of-00072.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

tir-00020-of-00072.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

tir-00021-of-00072.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

tir-00022-of-00072.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

tir-00023-of-00072.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

tir-00024-of-00072.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

tir-00025-of-00072.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

tir-00026-of-00072.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

tir-00027-of-00072.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

tir-00028-of-00072.parquet:   0%|          | 0.00/205M [00:00<?, ?B/s]

tir-00029-of-00072.parquet:   0%|          | 0.00/205M [00:00<?, ?B/s]

tir-00030-of-00072.parquet:   0%|          | 0.00/205M [00:00<?, ?B/s]

tir-00031-of-00072.parquet:   0%|          | 0.00/205M [00:00<?, ?B/s]

tir-00032-of-00072.parquet:   0%|          | 0.00/205M [00:00<?, ?B/s]

tir-00033-of-00072.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

tir-00034-of-00072.parquet:   0%|          | 0.00/206M [00:00<?, ?B/s]

tir-00035-of-00072.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

tir-00036-of-00072.parquet:   0%|          | 0.00/205M [00:00<?, ?B/s]

tir-00037-of-00072.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

tir-00038-of-00072.parquet:   0%|          | 0.00/205M [00:00<?, ?B/s]

tir-00039-of-00072.parquet:   0%|          | 0.00/205M [00:00<?, ?B/s]

tir-00040-of-00072.parquet:   0%|          | 0.00/205M [00:00<?, ?B/s]

tir-00041-of-00072.parquet:   0%|          | 0.00/206M [00:00<?, ?B/s]

tir-00042-of-00072.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

tir-00043-of-00072.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

tir-00044-of-00072.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

tir-00045-of-00072.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

tir-00046-of-00072.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

tir-00047-of-00072.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

tir-00048-of-00072.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

tir-00049-of-00072.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

tir-00050-of-00072.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

tir-00051-of-00072.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

tir-00052-of-00072.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

tir-00053-of-00072.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

tir-00054-of-00072.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

tir-00055-of-00072.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

tir-00056-of-00072.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

tir-00057-of-00072.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

tir-00058-of-00072.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

tir-00059-of-00072.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

tir-00060-of-00072.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

tir-00061-of-00072.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

tir-00062-of-00072.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

tir-00063-of-00072.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

tir-00064-of-00072.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

tir-00065-of-00072.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

tir-00066-of-00072.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

tir-00067-of-00072.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

tir-00068-of-00072.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

tir-00069-of-00072.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

tir-00070-of-00072.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

tir-00071-of-00072.parquet:   0%|          | 0.00/201M [00:00<?, ?B/s]

genselect-00000-of-00014.parquet:   0%|          | 0.00/172M [00:00<?, ?B/s]

genselect-00001-of-00014.parquet:   0%|          | 0.00/173M [00:00<?, ?B/s]

genselect-00002-of-00014.parquet:   0%|          | 0.00/173M [00:00<?, ?B/s]

genselect-00003-of-00014.parquet:   0%|          | 0.00/172M [00:00<?, ?B/s]

genselect-00004-of-00014.parquet:   0%|          | 0.00/174M [00:00<?, ?B/s]

genselect-00005-of-00014.parquet:   0%|          | 0.00/173M [00:00<?, ?B/s]

genselect-00006-of-00014.parquet:   0%|          | 0.00/173M [00:00<?, ?B/s]

genselect-00007-of-00014.parquet:   0%|          | 0.00/173M [00:00<?, ?B/s]

genselect-00008-of-00014.parquet:   0%|          | 0.00/173M [00:00<?, ?B/s]

genselect-00009-of-00014.parquet:   0%|          | 0.00/173M [00:00<?, ?B/s]

genselect-00010-of-00014.parquet:   0%|          | 0.00/173M [00:00<?, ?B/s]

genselect-00011-of-00014.parquet:   0%|          | 0.00/173M [00:00<?, ?B/s]

genselect-00012-of-00014.parquet:   0%|          | 0.00/173M [00:00<?, ?B/s]

genselect-00013-of-00014.parquet:   0%|          | 0.00/174M [00:00<?, ?B/s]

(…)ditional_problems-00000-of-00001.parquet:   0%|          | 0.00/26.1M [00:00<?, ?B/s]

Generating cot split:   0%|          | 0/3201061 [00:00<?, ? examples/s]

Generating tir split:   0%|          | 0/1718466 [00:00<?, ? examples/s]

Generating genselect split:   0%|          | 0/565620 [00:00<?, ? examples/s]

Generating additional_problems split:   0%|          | 0/193170 [00:00<?, ? examples/s]

Dataset error: Unknown split "train". Should be one of ['cot', 'tir', 'genselect', 'additional_problems'].


In [16]:
!df -h /

Filesystem      Size  Used Avail Use% Mounted on
overlay         226G  187G   39G  83% /


In [10]:
# Preprocessing function
def preprocess_function(examples):
    texts = [f"Question: {q}\nAnswer: {a}" for q, a in zip(examples["question"], examples["answer"])]
    tokenized = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

In [19]:
# Process dataset
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,
    batch_size=4  # Small batch size
)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [12]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,  # Minimal batch size
    gradient_accumulation_steps=4,  # Compensate for small batch size
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=1,
    logging_steps=10,
    report_to="none"  # Disable external logging
)

In [13]:
# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)


ValueError: fp16 mixed precision requires a GPU (not 'xla').

In [None]:
# Start training
trainer.train()