In [1]:
pip install vllm

Collecting vllm
  Downloading vllm-0.8.3-cp38-abi3-manylinux1_x86_64.whl.metadata (27 kB)
Collecting blake3 (from vllm)
  Downloading blake3-1.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting transformers>=4.51.0 (from vllm)
  Downloading transformers-4.51.1-py3-none-any.whl.metadata (38 kB)
Collecting fastapi>=0.115.0 (from fastapi[standard]>=0.115.0->vllm)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl.metadata (13 kB)
Collecting tiktoken>=0.6.0 (from vllm)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting lm-format-enforcer<0.11,>=0.10.11 (from vllm)
  Downloading lm_format_enforcer-0.10.11-py3-none-any.whl.metadata (17 kB)
Collecting llguidance<0.8.0,>=0.7.9 (from vllm)
  Downloading llguidance-0.7.13-cp39-abi3-manylinux_2_

In [None]:
import re
import torch
from dataclasses import dataclass
from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForCausalLM
from vllm import LLM, SamplingParams

# ----------------------------
# Part 1. Define the CoT Decoder
# ----------------------------

@dataclass
class Path:
    reasoning_text: str
    score: float
    answer_span: str
    num_path: int

@dataclass
class DecodingInfo:
    question: str
    paths: List[Path]

class CoTDecoder:
    """
    Implements Chain-of-Thought (CoT) decoding using vLLM.
    It first retrieves the top-k tokens after the prompt, then generates a full answer for each path.
    """
    def __init__(self, model_name: str,
                 device: str = 'cuda',
                 max_new_tokens: int = 100,
                 topk: int = 5,
                 stop: List[str] = ['\n\nQuestion:', 'Question:', 'Q:', '\n\nQ:', '\n\nExercise'],
                 prompt: str = '',
                 pattern: str = r'[a-zA-Z0-9\s]+'):
        self.model = LLM(model=model_name, dtype='float16')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.device = device
        self.max_new_tokens = max_new_tokens
        self.stop = stop
        self.topk = topk
        self.model.llm_engine.model_config.max_logprobs = self.topk + 1
        self.prompt = prompt
        self.pattern = pattern

    def search_cots(self, raw_prompt: str) -> DecodingInfo:
        # Format the prompt in a Q&A style.
        formatted_prompt = self.format_prompt(raw_prompt)
        # Retrieve the top-k tokens as potential starting continuations.
        topk_tokens = self.get_first_topk_tokens(formatted_prompt)
        # Generate full paths for each top-k token.
        prompts = [formatted_prompt + token for token in topk_tokens['decoded']]
        outputs = self.generate_paths(prompts)
        return self.calculate_score(raw_prompt, topk_tokens, outputs)

    @torch.inference_mode()
    def get_first_topk_tokens(self, prompt: str) -> Dict[str, List]:
        sampling_params = SamplingParams(n=1, temperature=0, top_p=1, max_tokens=1, logprobs=self.topk, stop=self.stop)
        outputs = self.model.generate(prompt, sampling_params, use_tqdm=False)[0].outputs[0].logprobs[0]
        topk_tokens = {'decoded': [], 'probs': [], 'token_id': [], 'logprobs': []}
        for token_id, logprob_obj in outputs.items():
            topk_tokens['logprobs'].append({token_id: logprob_obj})
            topk_tokens['decoded'].append(logprob_obj.decoded_token)
            topk_tokens['probs'].append(logprob_obj.logprob)
            topk_tokens['token_id'].append(token_id)
        topk_tokens['probs'] = torch.exp(torch.tensor(topk_tokens['probs'])).tolist()
        return topk_tokens

    @torch.inference_mode()
    def generate_paths(self, prompts: List[str]) -> Dict[int, Dict]:
        sampling_params = SamplingParams(n=1, temperature=0, top_p=1, max_tokens=self.max_new_tokens, logprobs=2, stop=self.stop)
        return self.model.generate(prompts, sampling_params, use_tqdm=False)

    def format_prompt(self, raw_prompt: str) -> str:
        # Format prompt in a Q&A style.
        return f'Question: {raw_prompt}\nAnswer: {self.prompt}'

    def calculate_score(self, prompt: str, topk_tokens: Dict, outputs: Dict) -> DecodingInfo:
        paths = []
        for k, output in enumerate(outputs):
            reasoning = topk_tokens['decoded'][k] + output.outputs[0].text
            reasoning = reasoning.strip()
            question_similarity = self.calculate_question_similarity(prompt, reasoning)
            encode = self.tokenizer(reasoning, return_offsets_mapping=True)
            answer_span = re.findall(self.pattern, reasoning)
            score = 0
            if len(answer_span):
                answer_span = answer_span[-1]
                last_pattern_span = (reasoning.rfind(answer_span), reasoning.rfind(answer_span) + len(answer_span))
                idx_answer = [i for i, span in enumerate(encode.offset_mapping)
                              if (span[0] >= last_pattern_span[0] and span[1] <= last_pattern_span[1]) or
                              (span[0] <= last_pattern_span[0] and span[1] >= last_pattern_span[1]) or
                              (span[0] <= last_pattern_span[0] and span[1] > last_pattern_span[0])]
                token_id = [encode.input_ids[idx] for idx in idx_answer]
                output.outputs[0].logprobs.insert(0, topk_tokens['logprobs'][k])
                filtered_answer = [output_val for i, output_val in enumerate(output.outputs[0].logprobs) if i in idx_answer]
                sum_answer_span_probs = 0
                for logprob_dict in filtered_answer:
                    logprob_list = list(logprob_dict.items())
                    if len(logprob_list) == 2:
                        prob_diff = (torch.exp(torch.tensor([logprob_list[0][1].logprob])) -
                                     torch.exp(torch.tensor([logprob_list[1][1].logprob]))).item()
                    else:
                        prob_diff = torch.exp(torch.tensor([logprob_list[0][1].logprob])).item()
                    sum_answer_span_probs += prob_diff
                if question_similarity > 0.5:
                    sum_answer_span_probs *= (1 - question_similarity)
                score = 0 if len(filtered_answer) == 0 else sum_answer_span_probs / len(filtered_answer)
                answer_span = self.tokenizer.decode(token_id, skip_special_tokens=True).strip()
            else:
                answer_span = '|<NotFound>|'
            paths.append(Path(reasoning_text=reasoning,
                              score=score,
                              answer_span=answer_span,
                              num_path=k))
        return DecodingInfo(question=prompt, paths=paths)

    def calculate_question_similarity(self, question: str, reasoning: str) -> float:
        question_words = set(question.split())
        reasoning_words = set(reasoning.split())
        common_words = question_words.intersection(reasoning_words)
        similarity = len(common_words) / len(question_words) if question_words else 0
        return similarity

# ----------------------------
# Part 2. Define Baseline Inference (without CoT)
# ----------------------------

def baseline_inference(model, tokenizer, prompt, max_new_tokens=150):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# ----------------------------
# Part 3. Load the Models
# ----------------------------

# Change this model name as needed (e.g., point to a fine-tuned or saved model directory)
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Load the baseline model using Hugging Face Transformers.
tokenizer = AutoTokenizer.from_pretrained(model_name)
baseline_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
baseline_model.to("cuda")

# Initialize the CoT decoder (using vLLM).
# You can set an optional trigger prompt via the prompt parameter if needed.
cot_decoder = CoTDecoder(model_name, prompt="")

# ----------------------------
# Part 4. Interactive User Input
# ----------------------------

def main():
    user_question = input("Please enter your question: ").strip()
    if not user_question:
        print("No question provided. Exiting.")
        return

    # Baseline inference using a simple Q&A prompt.
    baseline_prompt = f"Question: {user_question}\nAnswer:"
    print("\nRunning baseline inference...")
    baseline_output = baseline_inference(baseline_model, tokenizer, baseline_prompt, max_new_tokens=150)
    print("\nBaseline Output:")
    print(baseline_output)

    # Inference with CoT decoding
    print("\nRunning Chain-of-Thought (CoT) decoding...")
    cot_result = cot_decoder.search_cots(user_question)

    # Print all CoT paths
    print("\nCoT-decoding Outputs:")
    for path in cot_result.paths:
        print(f"\nPath {path.num_path}:")
        print("Reasoning:")
        print(path.reasoning_text)
        print("\nExtracted Answer:")
        print(path.answer_span)
        print("Score: {:.4f}".format(path.score))

if __name__ == '__main__':
    main()


INFO 04-10 06:24:33 [__init__.py:239] Automatically detected platform cuda.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

INFO 04-10 06:25:09 [config.py:600] This model supports multiple tasks: {'score', 'classify', 'reward', 'embed', 'generate'}. Defaulting to 'generate'.
INFO 04-10 06:25:09 [llm_engine.py:242] Initializing a V0 LLM engine (v0.8.3) with config: model='Qwen/Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-10 06:25:14 [loader.py:447] Loading weights took 1.24 seconds
INFO 04-10 06:25:14 [model_runner.py:1146] Model loading took 0.9267 GiB and 1.836430 seconds
INFO 04-10 06:25:17 [worker.py:267] Memory profiling takes 2.02 seconds
INFO 04-10 06:25:17 [worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.90) = 13.27GiB
INFO 04-10 06:25:17 [worker.py:267] model weights take 0.93GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 1.44GiB; the rest of the memory reserved for KV Cache is 10.85GiB.
INFO 04-10 06:25:17 [executor_base.py:112] # cuda blocks: 59271, # CPU blocks: 21845
INFO 04-10 06:25:17 [executor_base.py:117] Maximum concurrency for 32768 tokens per request: 28.94x
INFO 04-10 06:25:22 [model_runner.py:1456] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CL

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:30<00:00,  1.14it/s]

INFO 04-10 06:25:53 [model_runner.py:1598] Graph capturing finished in 31 secs, took 0.15 GiB
INFO 04-10 06:25:53 [llm_engine.py:448] init engine (profile, create kv cache, warmup model) took 38.13 seconds





Please enter your question: I have 3 apples, my dad has 2 more apples than me, how many apples do we have in total?

Running baseline inference...

Baseline Output:
Question: I have 3 apples, my dad has 2 more apples than me, how many apples do we have in total?
Answer: If your dad has 2 more apples than you, then he has 1 + 2 = 3 apples. In total, you and your dad have 3 + 3 = 6 apples.
So the answer is 6.

Running Chain-of-Thought (CoT) decoding...

CoT-decoding Outputs:

Path 0:
Reasoning:
3 + 2 = 5 apples. We have a total of 5 apples.
Therefore, the answer is 5.

Extracted Answer:
the answer is 5
Score: 1.0000

Path 1:
Reasoning:
2 + 3 = 5 apples. We have a total of 5 apples.
Therefore, the answer is 5.

Extracted Answer:
the answer is 5
Score: 1.0000

Path 2:
Reasoning:
1 apple
Explain how we arrive at this answer: To arrive at the answer, we need to add the number of apples my dad has to the number of apples I have. My dad has 2 more apples than me, so we add 2 to the 3 apples I 

In [None]:
pip install deepseek_tokenizer

Collecting deepseek_tokenizer
  Downloading deepseek_tokenizer-0.1.3-py3-none-any.whl.metadata (1.3 kB)
Downloading deepseek_tokenizer-0.1.3-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deepseek_tokenizer
Successfully installed deepseek_tokenizer-0.1.3


In [3]:
import re
import torch
from dataclasses import dataclass
from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
from vllm import LLM, SamplingParams


# ----------------------------
# Part 1. Define the CoT Decoder
# ----------------------------

@dataclass
class Path:
    reasoning_text: str
    score: float
    answer_span: str
    num_path: int

@dataclass
class DecodingInfo:
    question: str
    paths: List[Path]

class CoTDecoder:
    """
    Implements Chain-of-Thought (CoT) decoding using vLLM.
    It first retrieves the top-k tokens after the prompt, then generates a full answer for each path.
    """
    def __init__(self, model_name: str,
                 device: str = 'cuda',
                 max_new_tokens: int = 100,
                 topk: int = 5,
                 stop: List[str] = ['\n\nQuestion:', 'Question:', 'Q:', '\n\nQ:', '\n\nExercise'],
                 prompt: str = '',
                 pattern: str = r'[a-zA-Z0-9\s]+'):
        self.model = LLM(model=model_name, dtype='float16')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.device = device
        self.max_new_tokens = max_new_tokens
        self.stop = stop
        self.topk = topk
        self.model.llm_engine.model_config.max_logprobs = self.topk + 1
        self.prompt = prompt
        self.pattern = pattern

    def search_cots(self, raw_prompt: str) -> DecodingInfo:
        # Format the prompt in a Q&A style.
        formatted_prompt = self.format_prompt(raw_prompt)
        # Retrieve the top-k tokens as potential starting continuations.
        topk_tokens = self.get_first_topk_tokens(formatted_prompt)
        # Generate full paths for each top-k token.
        prompts = [formatted_prompt + token for token in topk_tokens['decoded']]
        outputs = self.generate_paths(prompts)
        return self.calculate_score(raw_prompt, topk_tokens, outputs)

    @torch.inference_mode()
    def get_first_topk_tokens(self, prompt: str) -> Dict[str, List]:
        sampling_params = SamplingParams(n=1, temperature=0, top_p=1, max_tokens=1, logprobs=self.topk, stop=self.stop)
        outputs = self.model.generate(prompt, sampling_params, use_tqdm=False)[0].outputs[0].logprobs[0]
        topk_tokens = {'decoded': [], 'probs': [], 'token_id': [], 'logprobs': []}
        for token_id, logprob_obj in outputs.items():
            topk_tokens['logprobs'].append({token_id: logprob_obj})
            topk_tokens['decoded'].append(logprob_obj.decoded_token)
            topk_tokens['probs'].append(logprob_obj.logprob)
            topk_tokens['token_id'].append(token_id)
        topk_tokens['probs'] = torch.exp(torch.tensor(topk_tokens['probs'])).tolist()
        return topk_tokens

    @torch.inference_mode()
    def generate_paths(self, prompts: List[str]) -> Dict[int, Dict]:
        sampling_params = SamplingParams(n=1, temperature=0, top_p=1, max_tokens=self.max_new_tokens, logprobs=2, stop=self.stop)
        return self.model.generate(prompts, sampling_params, use_tqdm=False)

    def format_prompt(self, raw_prompt: str) -> str:
        # Format prompt in a Q&A style.
        return f'Question: {raw_prompt}\nAnswer: {self.prompt}'

    def calculate_score(self, prompt: str, topk_tokens: Dict, outputs: Dict) -> DecodingInfo:
        paths = []
        for k, output in enumerate(outputs):
            reasoning = topk_tokens['decoded'][k] + output.outputs[0].text
            reasoning = reasoning.strip()
            question_similarity = self.calculate_question_similarity(prompt, reasoning)
            encode = self.tokenizer(reasoning, return_offsets_mapping=True)
            answer_span = re.findall(self.pattern, reasoning)
            score = 0
            if len(answer_span):
                answer_span = answer_span[-1]
                last_pattern_span = (reasoning.rfind(answer_span), reasoning.rfind(answer_span) + len(answer_span))
                idx_answer = [i for i, span in enumerate(encode.offset_mapping)
                              if (span[0] >= last_pattern_span[0] and span[1] <= last_pattern_span[1]) or
                              (span[0] <= last_pattern_span[0] and span[1] >= last_pattern_span[1]) or
                              (span[0] <= last_pattern_span[0] and span[1] > last_pattern_span[0])]
                token_id = [encode.input_ids[idx] for idx in idx_answer]
                output.outputs[0].logprobs.insert(0, topk_tokens['logprobs'][k])
                filtered_answer = [output_val for i, output_val in enumerate(output.outputs[0].logprobs) if i in idx_answer]
                sum_answer_span_probs = 0
                for logprob_dict in filtered_answer:
                    logprob_list = list(logprob_dict.items())
                    if len(logprob_list) == 2:
                        prob_diff = (torch.exp(torch.tensor([logprob_list[0][1].logprob])) -
                                     torch.exp(torch.tensor([logprob_list[1][1].logprob]))).item()
                    else:
                        prob_diff = torch.exp(torch.tensor([logprob_list[0][1].logprob])).item()
                    sum_answer_span_probs += prob_diff
                if question_similarity > 0.5:
                    sum_answer_span_probs *= (1 - question_similarity)
                score = 0 if len(filtered_answer) == 0 else sum_answer_span_probs / len(filtered_answer)
                answer_span = self.tokenizer.decode(token_id, skip_special_tokens=True).strip()
            else:
                answer_span = '|<NotFound>|'
            paths.append(Path(reasoning_text=reasoning,
                              score=score,
                              answer_span=answer_span,
                              num_path=k))
        return DecodingInfo(question=prompt, paths=paths)

    def calculate_question_similarity(self, question: str, reasoning: str) -> float:
        question_words = set(question.split())
        reasoning_words = set(reasoning.split())
        common_words = question_words.intersection(reasoning_words)
        similarity = len(common_words) / len(question_words) if question_words else 0
        return similarity

# ----------------------------
# Part 2. Define Baseline Inference (without CoT)
# ----------------------------

def baseline_inference(model, tokenizer, prompt, max_new_tokens=150):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# ----------------------------
# Part 3. Load the Models
# ----------------------------

# Change this model name as needed (e.g., point to a fine-tuned or saved model directory)
model_name = "mradermacher/DeepSeek-R1-Distill-Qwen-1.5B-LIMO-GGUF"

# Load the baseline model using Hugging Face Transformers.
#tokenizer = AutoTokenizer.from_pretrained(deepseek_tokenizer)
model = AutoModel.from_pretrained("mradermacher/DeepSeek-R1-Distill-Qwen-1.5B-LIMO-GGUF")
#baseline_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
baseline_model.to("cuda")

# Initialize the CoT decoder (using vLLM).
# You can set an optional trigger prompt via the prompt parameter if needed.
cot_decoder = CoTDecoder(model_name, prompt="")

# ----------------------------
# Part 4. Interactive User Input
# ----------------------------

def main():
    user_question = input("Please enter your question: ").strip()
    if not user_question:
        print("No question provided. Exiting.")
        return

    # Baseline inference using a simple Q&A prompt.
    baseline_prompt = f"Question: {user_question}\nAnswer:"
    print("\nRunning baseline inference...")
    baseline_output = baseline_inference(baseline_model, tokenizer, baseline_prompt, max_new_tokens=150)
    print("\nBaseline Output:")
    print(baseline_output)

    # Inference with CoT decoding
    print("\nRunning Chain-of-Thought (CoT) decoding...")
    cot_result = cot_decoder.search_cots(user_question)

    # Print all CoT paths
    print("\nCoT-decoding Outputs:")
    for path in cot_result.paths:
        print(f"\nPath {path.num_path}:")
        print("Reasoning:")
        print(path.reasoning_text)
        print("\nExtracted Answer:")
        print(path.answer_span)
        print("Score: {:.4f}".format(path.score))

if __name__ == '__main__':
    main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


OSError: mradermacher/DeepSeek-R1-Distill-Qwen-1.5B-LIMO-GGUF does not appear to have a file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt or flax_model.msgpack.