In [None]:
import os

# Switch off the Weights & Biases integration via its environment variable;
# this cell runs before the cell that imports transformers.
# NOTE(review): the Trainer output later in this notebook reports WANDB_DISABLED
# as deprecated in favour of the `report_to` setting.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
import os
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset  # instead of load_dataset
from transformers import DataCollatorWithPadding

# Model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# `device_map="auto"` already places the weights while loading, so no explicit
# `.to("cuda")` is chained afterwards: it was redundant on a GPU runtime and
# raised on a CPU-only one.
# No torch_dtype is passed, so the library's default precision applies; mixed
# precision is controlled by the fp16/bf16 flags in TrainingArguments.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="sdpa",
    device_map="auto",
)
# Disable the generation key/value cache for training.
model.config.use_cache = False

# Step 1: Load Excel file
df = pd.read_excel("/content/output (7).xlsx")  # Update with your actual file path

# Fail fast and name the columns that are absent, not just the ones expected.
required_columns = {"question", "solution", "answer"}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, (
    "Excel file must contain 'question', 'solution', 'answer' columns; "
    f"missing: {sorted(missing_columns)}"
)

# Step 2: Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)

# Hold out 10% of the rows for evaluation. The seed is fixed so that a
# Restart & Run All reproduces the same train/eval partition.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Step 3: Tokenization
from transformers import DataCollatorWithPadding

# Step 3: Tokenization
# Only fall back to EOS when the checkpoint ships without a pad token.
# Unconditionally overwriting the pad token with EOS makes the two ids
# identical, so any rule of the form "labels == pad_token_id -> -100" would
# also remove the genuine end-of-sequence label from the loss.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Upper bound on tokens per training example (prompt + target + EOS).
MAX_LEN = 2048

def tokenize_for_chain(examples, tok=None, max_len=None):
    """Turn a batch of question/solution/answer rows into causal-LM features.

    For every row the prompt is "Question: <question>", a newline, then
    "Reasoning:"; the target is " <solution> Answer: <answer>". The prompt is
    encoded with special tokens, the target without, and an EOS id is appended.
    Prompt positions are labelled -100 so only target and EOS positions carry
    labels.

    Args:
        examples: mapping with parallel lists under "question", "solution"
            and "answer" (the shape a batched ``Dataset.map`` passes in).
        tok: object exposing ``encode(text, add_special_tokens=...)`` and
            ``eos_token_id``. Defaults to the notebook-level ``tokenizer``.
        max_len: maximum number of ids per example. Defaults to the
            notebook-level ``MAX_LEN``.

    Returns:
        dict with "input_ids" and "labels", each a list of equal-length
        integer lists, one per input row.

    Raises:
        ValueError: if ``max_len`` is smaller than 1.
    """
    # Resolve the defaults lazily so the function still works unchanged when
    # called as `dataset.map(tokenize_for_chain, ...)`.
    if tok is None:
        tok = tokenizer
    if max_len is None:
        max_len = MAX_LEN
    if max_len < 1:
        raise ValueError(f"max_len must be >= 1, got {max_len}")

    eos_id = tok.eos_token_id
    input_id_batches = []
    label_batches = []

    for q, sol, ans in zip(examples["question"],
                           examples["solution"],
                           examples["answer"]):
        prompt = f"Question: {q}\nReasoning:"
        target = f" {sol} Answer: {ans}"

        p_ids = tok.encode(prompt, add_special_tokens=True)
        t_ids = tok.encode(target, add_special_tokens=False)

        # Slots left for target ids plus the closing EOS.
        room = max_len - len(p_ids)
        if room <= 0:
            # The prompt alone fills the budget: keep only its tail.
            # NOTE: such a row has every label set to -100, i.e. it carries no
            # supervised position at all.
            p_ids = p_ids[-max_len:]
            input_ids = p_ids
            labels = [-100] * len(p_ids)
        else:
            # Reserve one slot for EOS, truncate the target to what is left.
            t_ids = t_ids[: room - 1]
            input_ids = p_ids + t_ids + [eos_id]
            # Mask the prompt; supervise target + EOS.
            labels = [-100] * len(p_ids) + t_ids + [eos_id]

        input_id_batches.append(input_ids)
        label_batches.append(labels)

    return {
        "input_ids": input_id_batches,
        "labels": label_batches,
    }







# use a collator that respects padding and returns tensors
#data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


#tokenized_train = train_dataset.map(tokenize_function, batched=True)
#tokenized_train = train_dataset.map(tokenize_function, batched=True)
#tokenized_train = tokenized_train.remove_columns(["question", "solution", "answer", "__index__"])
# After you map with tokenize_function, drop the three text columns:
# Now map it over your datasets:
def _tokenize_split(split):
    """Apply tokenize_for_chain to one split in batched mode.

    The three raw text columns are dropped from the result and the datasets
    cache is bypassed.
    """
    return split.map(
        tokenize_for_chain,
        batched=True,
        remove_columns=["question", "solution", "answer"],
        load_from_cache_file=False,
    )


tokenized_train = _tokenize_split(train_dataset)
tokenized_eval = _tokenize_split(eval_dataset)



#tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Map:   0%|          | 0/735 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding
import torch

# 1) Base collator: pads every batch to its longest sequence and returns
#    PyTorch tensors.
# NOTE(review): the original note said this pads "inputs+labels"; confirm
# against the DataCollatorWithPadding docs whether a custom `labels` column is
# actually padded, since batches larger than one depend on it.
base_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest",
    return_tensors="pt"
)

# 2) Wrap it to fix the labels post-pad
def data_collator(batch):
    """Collate tokenized rows into padded tensors with correctly masked labels.

    The model inputs are padded by ``base_collator``; the labels are padded
    here, directly with -100, to the same width and on the same side.

    The previous implementation padded first and then replaced every label
    equal to ``pad_token_id`` with -100. Because the pad token is set to the
    EOS token in this notebook, that also erased the real EOS label at the end
    of each target, so that position never contributed to the loss.
    Padding the labels with -100 directly leaves real EOS labels untouched.

    Args:
        batch: list of feature dicts, each holding "input_ids" and "labels"
            as equal-length integer sequences.

    Returns:
        The padded batch from ``base_collator`` with an added int64 "labels"
        tensor of the same shape as "input_ids".
    """
    # Labels are split off so the base collator only sees model inputs.
    label_rows = [list(example["labels"]) for example in batch]
    features = [
        {key: value for key, value in example.items() if key != "labels"}
        for example in batch
    ]
    padded = base_collator(features)

    width = padded["input_ids"].shape[1]
    pad_on_left = tokenizer.padding_side == "left"
    padded_labels = []
    for row in label_rows:
        filler = [-100] * (width - len(row))
        padded_labels.append(filler + row if pad_on_left else row + filler)

    padded["labels"] = torch.tensor(padded_labels, dtype=torch.long)
    return padded


In [None]:
from transformers import TrainerCallback
# Training configuration: 15 epochs, batch size 1, cosine schedule with a
# 10-step warmup, fp16 mixed precision, one checkpoint per epoch, no
# evaluation during training.
training_args = TrainingArguments(
    output_dir="./qwen2.5_finetuned_limo",
    overwrite_output_dir=True,
    num_train_epochs=15,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,  # 5.0e-6 was tried earlier
    lr_scheduler_type="cosine",
    warmup_ratio=0.0,
    warmup_steps=10,
    logging_steps=1,
    save_strategy="epoch",
    ddp_timeout=180000000,
    bf16=False,
    fp16=True,
    push_to_hub=False,
    # Keep every dataset column so the custom collator receives "labels"
    # exactly as the tokenization step produced it.
    remove_unused_columns=False,
    eval_strategy="no",
    # Supported way to switch off logging integrations; the WANDB_DISABLED
    # environment variable is reported as deprecated by the library.
    report_to="none",
)
class DebugMetricsCallback(TrainerCallback):
    """Echo each metrics dict the Trainer logs, prefixed with the global step."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print the current global step followed by the logged metrics."""
        header = f"\n>>> Step {state.global_step} metrics:"
        print(header, logs)

class InputLoggingCallback(TrainerCallback):
    """Print shapes and decoded text for the first few training batches.

    Batches are drawn from a separate iterator over the training dataloader
    that is created when training begins.
    NOTE(review): because the iterator is independent of the Trainer's own
    loop, the printed batches are not guaranteed to be the ones the step
    actually trains on — confirm before relying on this for debugging order.

    Args:
        num_steps: number of initial optimizer steps to log (default 5).
    """

    def __init__(self, num_steps=5):
        self.train_iter = None
        self.num_steps = num_steps

    def on_train_begin(self, args, state, control, **kwargs):
        """Open an iterator over the training dataloader, if one is provided."""
        dataloader = kwargs.get("train_dataloader")
        self.train_iter = iter(dataloader) if dataloader is not None else None

    def on_step_begin(self, args, state, control, **kwargs):
        """Log one batch while the global step is below ``num_steps``."""
        if self.train_iter is None or state.global_step >= self.num_steps:
            return

        # A dataloader shorter than `num_steps` must not abort training with
        # StopIteration; stop logging quietly once it is exhausted.
        batch = next(self.train_iter, None)
        if batch is None:
            self.train_iter = None
            return

        first_inputs = batch["input_ids"][0].tolist()
        # -100 marks ignored label positions and is not a decodable token id.
        first_labels = [x for x in batch["labels"][0].tolist() if x != -100]

        print(f"\nStep {state.global_step}:")
        print("Input IDs Shape:", batch["input_ids"].shape)
        print("Labels Shape:", batch["labels"].shape)
        print("Decoded Input:", tokenizer.decode(first_inputs))
        print("Decoded Labels (ignoring -100):", tokenizer.decode(first_labels))

# Assemble the Trainer. DebugMetricsCallback prints every logged metrics dict;
# put InputLoggingCallback() in the list instead to inspect the first batches.
active_callbacks = [DebugMetricsCallback()]

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    callbacks=active_callbacks,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [None]:
# Run fine-tuning, then write the final model weights and the tokenizer files
# into the same directory.
trainer.train()

final_dir = "./qwen2.5_finetuned_limo"
model.save_pretrained(final_dir)
tokenizer.save_pretrained(final_dir)

Step,Training Loss
1,1.0414
2,0.9841
3,0.6521
4,0.8524
5,0.7191
6,1.2538
7,1.0738
8,0.9316
9,1.3749
10,1.2563



>>> Step 1 metrics: {'loss': 1.0414, 'grad_norm': inf, 'learning_rate': 0.0, 'epoch': 0.0013605442176870747}

>>> Step 2 metrics: {'loss': 0.9841, 'grad_norm': 9.294456481933594, 'learning_rate': 0.0, 'epoch': 0.0027210884353741495}

>>> Step 3 metrics: {'loss': 0.6521, 'grad_norm': 6.9885125160217285, 'learning_rate': 5e-06, 'epoch': 0.004081632653061225}

>>> Step 4 metrics: {'loss': 0.8524, 'grad_norm': 9.37149429321289, 'learning_rate': 1e-05, 'epoch': 0.005442176870748299}

>>> Step 5 metrics: {'loss': 0.7191, 'grad_norm': 7.347801208496094, 'learning_rate': 1.5e-05, 'epoch': 0.006802721088435374}

>>> Step 6 metrics: {'loss': 1.2538, 'grad_norm': inf, 'learning_rate': 2e-05, 'epoch': 0.00816326530612245}

>>> Step 7 metrics: {'loss': 1.0738, 'grad_norm': 10.206761360168457, 'learning_rate': 2e-05, 'epoch': 0.009523809523809525}

>>> Step 8 metrics: {'loss': 0.9316, 'grad_norm': 9.388533592224121, 'learning_rate': 2.5e-05, 'epoch': 0.010884353741496598}

>>> Step 9 metrics: {'los

KeyboardInterrupt: 