###Fine-Tuning Segment

In [5]:
os.environ["WANDB_DISABLED"] = "true"


In [3]:
pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [6]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # Updated to 32B-Instruct per initial config
dataset_name = "GAIR/LIMO"  # Assuming this is the same dataset

# Load tokenizer and model without flash attention specification
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16  # underlying dtype for model weights
)
model = model.to("cuda")

# Load dataset
dataset = load_dataset(dataset_name)

# Tokenization function
def tokenize_function(examples):
    prompts = [
        f"Question: {q}\nReasoning: {s}\nAnswer: {a}"
        for q, s, a in zip(examples["question"], examples["solution"], examples["answer"])
    ]
    return tokenizer(prompts, truncation=True, padding="max_length", max_length=8192)

# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments updated to remove DeepSpeed configuration
training_args = TrainingArguments(
    output_dir="./qwen2.5_finetuned_limo",
    overwrite_output_dir=True,
    num_train_epochs=15,                     # 15 epochs
    per_device_train_batch_size=1,           # batch size of 1
    per_device_eval_batch_size=1,            # evaluation batch size of 1
    gradient_accumulation_steps=1,           # gradient accumulation steps of 1
    learning_rate=5.0e-6,                    # learning rate 5.0e-6
    lr_scheduler_type="cosine",              # cosine scheduler
    warmup_ratio=0.0,                        # warmup ratio of 0.0
    logging_steps=1,                         # log every step
    save_strategy="epoch",                   # save at the end of each epoch
    ddp_timeout=180000000,                   # ddp timeout setting
    bf16=True,                             # use bf16 precision
    push_to_hub=False
)

# Select datasets
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets.get("validation")  # Verify this split exists

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Train
trainer.train()

# Save
model.save_pretrained("./qwen2.5_finetuned_limo")
tokenizer.save_pretrained("./qwen2.5_finetuned_limo")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/652 [00:00<?, ?B/s]

limo.jsonl:   0%|          | 0.00/16.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/817 [00:00<?, ? examples/s]

Map:   0%|          | 0/817 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.50 GiB. GPU 0 has a total capacity of 14.74 GiB of which 658.12 MiB is free. Process 8337 has 14.10 GiB memory in use. Of the allocated memory 13.29 GiB is allocated by PyTorch, and 701.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import re
import torch
from dataclasses import dataclass
from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from vllm import LLM, SamplingParams
from tqdm import tqdm

# ----------------------------
# Part 1. Define the CoT Decoder
# ----------------------------

@dataclass
class Path:
    reasoning_text: str
    score: float
    answer_span: str
    num_path: int

@dataclass
class DecodingInfo:
    question: str
    paths: List[Path]

class CoTDecoder:
    """
    Implements Chain-of-Thought (CoT) decoding using vLLM.
    It first retrieves the top-k tokens after the prompt, then generates a full answer for each path.
    """
    def __init__(self, model_name: str,
                 device: str = 'cuda',
                 max_new_tokens: int = 100,
                 topk: int = 5,
                 stop: List[str] = ['\n\nQuestion:', 'Question:', 'Q:', '\n\nQ:', '\n\nExercise'],
                 prompt: str = '',
                 pattern: str = r'[a-zA-Z0-9\s]+'):
        self.model = LLM(model=model_name, dtype='float16')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.device = device
        self.max_new_tokens = max_new_tokens
        self.stop = stop
        self.topk = topk
        self.model.llm_engine.model_config.max_logprobs = self.topk + 1
        self.prompt = prompt
        self.pattern = pattern

    def search_cots(self, raw_prompt: str) -> DecodingInfo:
        # Format the prompt in a Q&A style.
        formatted_prompt = self.format_prompt(raw_prompt)
        # Retrieve the top-k tokens as potential starting continuations.
        topk_tokens = self.get_first_topk_tokens(formatted_prompt)
        # Generate full paths for each top-k token.
        prompts = [formatted_prompt + token for token in topk_tokens['decoded']]
        outputs = self.generate_paths(prompts)
        return self.calculate_score(raw_prompt, topk_tokens, outputs)

    @torch.inference_mode()
    def get_first_topk_tokens(self, prompt: str) -> Dict[str, List]:
        sampling_params = SamplingParams(n=1, temperature=0, top_p=1, max_tokens=1, logprobs=self.topk, stop=self.stop)
        outputs = self.model.generate(prompt, sampling_params, use_tqdm=False)[0].outputs[0].logprobs[0]
        topk_tokens = {'decoded': [], 'probs': [], 'token_id': [], 'logprobs': []}
        for token_id, logprob_obj in outputs.items():
            topk_tokens['logprobs'].append({token_id: logprob_obj})
            topk_tokens['decoded'].append(logprob_obj.decoded_token)
            topk_tokens['probs'].append(logprob_obj.logprob)
            topk_tokens['token_id'].append(token_id)
        topk_tokens['probs'] = torch.exp(torch.tensor(topk_tokens['probs'])).tolist()
        return topk_tokens

    @torch.inference_mode()
    def generate_paths(self, prompts: List[str]) -> Dict[int, Dict]:
        sampling_params = SamplingParams(n=1, temperature=0, top_p=1, max_tokens=self.max_new_tokens, logprobs=2, stop=self.stop)
        return self.model.generate(prompts, sampling_params, use_tqdm=False)

    def format_prompt(self, raw_prompt: str) -> str:
        # Format prompt in English.
        return f'Question: {raw_prompt}\nAnswer: {self.prompt}'

    def calculate_score(self, prompt: str, topk_tokens: Dict, outputs: Dict) -> DecodingInfo:
        paths = []
        for k, output in enumerate(outputs):
            reasoning = topk_tokens['decoded'][k] + output.outputs[0].text
            reasoning = reasoning.strip()
            question_similarity = self.calculate_question_similarity(prompt, reasoning)
            encode = self.tokenizer(reasoning, return_offsets_mapping=True)
            answer_span = re.findall(self.pattern, reasoning)
            score = 0
            if len(answer_span):
                answer_span = answer_span[-1]
                last_pattern_span = (reasoning.rfind(answer_span), reasoning.rfind(answer_span) + len(answer_span))
                idx_answer = [i for i, span in enumerate(encode.offset_mapping)
                              if (span[0] >= last_pattern_span[0] and span[1] <= last_pattern_span[1]) or
                              (span[0] <= last_pattern_span[0] and span[1] >= last_pattern_span[1]) or
                              (span[0] <= last_pattern_span[0] and span[1] > last_pattern_span[0])]
                token_id = [encode.input_ids[idx] for idx in idx_answer]
                output.outputs[0].logprobs.insert(0, topk_tokens['logprobs'][k])
                filtered_answer = [output_val for i, output_val in enumerate(output.outputs[0].logprobs) if i in idx_answer]
                sum_answer_span_probs = 0
                for logprob_dict in filtered_answer:
                    logprob_list = list(logprob_dict.items())
                    if len(logprob_list) == 2:
                        prob_diff = (torch.exp(torch.tensor([logprob_list[0][1].logprob])) -
                                     torch.exp(torch.tensor([logprob_list[1][1].logprob]))).item()
                    else:
                        prob_diff = torch.exp(torch.tensor([logprob_list[0][1].logprob])).item()
                    sum_answer_span_probs += prob_diff
                if question_similarity > 0.5:
                    sum_answer_span_probs *= (1 - question_similarity)
                score = 0 if len(filtered_answer) == 0 else sum_answer_span_probs / len(filtered_answer)
                answer_span = self.tokenizer.decode(token_id, skip_special_tokens=True).strip()
            else:
                answer_span = '|<NotFound>|'
            paths.append(Path(reasoning_text=reasoning,
                              score=score,
                              answer_span=answer_span,
                              num_path=k))
        return DecodingInfo(question=prompt, paths=paths)

    def calculate_question_similarity(self, question: str, reasoning: str) -> float:
        question_words = set(question.split())
        reasoning_words = set(reasoning.split())
        common_words = question_words.intersection(reasoning_words)
        similarity = len(common_words) / len(question_words) if question_words else 0
        return similarity

# ----------------------------
# Part 2. Define Baseline Inference (without CoT)
# ----------------------------

def baseline_inference(model, tokenizer, prompt, max_new_tokens=150):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# ----------------------------
# Part 3. Load the Models and AMC23 Test Dataset
# ----------------------------
#### model_name needs to be changed before using to the saved model directory ###

# Define your model name (adjust if using a fine-tuned version)
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# Load the baseline model (using transformers)
tokenizer = AutoTokenizer.from_pretrained(model_name)
baseline_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
baseline_model.to("cuda")

# Initialize the CoT decoder (using vLLM)
cot_decoder = CoTDecoder(model_name, prompt="")  # Set a trigger prompt if desired

# Load the full AMC23 test set (dataset with columns: id, question, answer, url)
amc23_dataset = load_dataset("math-ai/amc23", "default", split="test")

# ----------------------------
# Part 4. Evaluate on the Full AMC23 Test Set
# ----------------------------

results = []
for sample in tqdm(amc23_dataset, desc="Evaluating AMC23"):
    sample_id = sample["id"]
    question = sample["question"]
    ground_truth = sample["answer"]

    # Baseline inference prompt (using a simple Q&A format)
    baseline_prompt = f"Question: {question}\nAnswer:"
    baseline_output = baseline_inference(baseline_model, tokenizer, baseline_prompt, max_new_tokens=150)

    # Inference with CoT-decoding
    cot_result = cot_decoder.search_cots(question)

    # Collect outputs for the sample
    result_entry = {
        "id": sample_id,
        "question": question,
        "ground_truth": ground_truth,
        "baseline_output": baseline_output,
        "cot_paths": [{
            "path_num": path.num_path,
            "reasoning": path.reasoning_text,
            "answer": path.answer_span,
            "score": path.score
        } for path in cot_result.paths]
    }
    results.append(result_entry)

# ----------------------------
# Part 5. Output the Results
# ----------------------------

for res in results:
    print(f"ID: {res['id']}")
    print("Question:", res['question'])
    print("Ground Truth:", res['ground_truth'])
    print("Baseline Output:", res['baseline_output'])
    print("CoT-decoding Outputs:")
    for path in res["cot_paths"]:
        print(f"  Path {path['path_num']}:")
        print("    Reasoning:", path['reasoning'])
        print("    Answer:", path['answer'])
        print("    Score: {:.4f}".format(path['score']))
    print("="*50)
