<a href="https://colab.research.google.com/github/Erickrus/llm/blob/main/SoftmaxWithTemperature.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LLM decode process - Temperature, Top_P and Top_K

https://www.youtube.com/watch?v=lH9YPeSq6IA

## LLM decode - Softmax with Temperature

In [None]:
#@title softmax(z)

import numpy as np

def softmax(z):
    """Convert a vector of logits into a probability distribution.

    Args:
        z: array-like of logits (list, tuple or ndarray).

    Returns:
        np.ndarray of the same shape as ``z`` whose entries are
        non-negative and sum to 1. An empty input yields an empty array.
    """
    #@markdown $\sigma(z_i)=\frac{e^{z_i}}{\sum^N_{j=0}e^{z_j}}$
    logits = np.asarray(z, dtype=float)
    if logits.size == 0:
        # Nothing to normalize; np.max would raise on an empty array.
        return logits
    # Subtracting the largest logit leaves the result mathematically unchanged
    # (the common factor cancels in the ratio) but keeps np.exp from
    # overflowing to inf, which would otherwise turn the output into NaN.
    exponential_logits = np.exp(logits - np.max(logits))
    probs = exponential_logits / np.sum(exponential_logits)
    return probs

In [None]:
#@title softmax_with_temperature(z, temperature)
def softmax_with_temperature(z, temperature=0.0):
    """Softmax over logits that are first divided by a temperature.

    A temperature above 1 flattens the distribution, a temperature between
    0 and 1 sharpens it. A temperature of exactly 0.0 (the default) is
    treated as 1.0, i.e. the plain softmax.

    Args:
        z: array-like of logits.
        temperature: non-negative scaling factor applied to the logits.

    Returns:
        np.ndarray of probabilities with the same shape as ``z``.
        An empty input yields an empty array.

    Raises:
        ValueError: if ``temperature`` is negative.
    """
    #@markdown The only difference is that every logit is divided by $\theta$ before exponentiation
    #@markdown
    #@markdown $\sigma(z_i)=\frac{e^{z_i/\theta}}{\sum^N_{j=0}e^{z_j/\theta}}$
    #@markdown
    #@markdown Notice, when temperature = 0.0 , it is the same as temperature = 1.0
    if temperature < 0.0:
        raise ValueError("temperature must be non-negative, got %r" % (temperature,))
    # 0.0 cannot be used as a divisor; by this notebook's convention it means
    # "no scaling", which is the same as dividing by 1.0.
    theta = 1.0 if temperature == 0.0 else temperature
    logits = np.asarray(z, dtype=float) / theta
    if logits.size == 0:
        # Nothing to normalize; np.max would raise on an empty array.
        return logits
    # Shift by the max so np.exp cannot overflow; the shift cancels in the ratio.
    exponential_logits = np.exp(logits - np.max(logits))
    probs = exponential_logits / np.sum(exponential_logits)
    return probs

In [None]:
#@title define a toy dataset: words and logits
#@markdown | logits | words |
#@markdown |:---:|:---:|
#@markdown | 1.0 | I |
#@markdown | 3.0 | boy |
#@markdown | 2.0 | hello |
#@markdown | 9.0 | LLM |
#@markdown | 8.0 | NLP |
#@markdown | 4.0 | N-gram |
# Each toy vocabulary word paired with its logit (same order as the table above),
# then unzipped into the two parallel lists used by the sampling cells.
words, logits = map(list, zip(*[
    ('I', 1.0),
    ('boy', 3.0),
    ('hello', 2.0),
    ('LLM', 9.0),
    ('NLP', 8.0),
    ('N-gram', 4.0),
]))

In [None]:
#@title temperature_random_sampling(logits, words, temperature)
def temperature_random_sampling(logits, words, temperature=0.0, print_words=False):
    """Sample one word from the temperature-scaled distribution and print it.

    Args:
        logits: sequence of logits, one per entry of ``words``.
        words: sequence of candidate words.
        temperature: passed through to ``softmax_with_temperature``.
        print_words: when true, also print every word with its probability.

    Returns:
        None. The sampled word is written to stdout.
    """
    #@markdown step 1: get logits, words
    #@markdown
    #@markdown step 2: soft-max with temperature-based probability
    distribution = softmax_with_temperature(logits, temperature)
    if print_words:
        word_prob_pairs = [[word, prob] for word, prob in zip(words, distribution)]
        print('Logit to Prob:', word_prob_pairs)
    #@markdown step 3: select one token randomly
    # np.random.choice picks one candidate index, weighted by `p`:
    # https://numpy.org/doc/stable/reference/random/generated/numpy.random.choice.html
    candidate_indexes = range(len(logits))
    chosen = np.random.choice(candidate_indexes, p=distribution)
    print('Sampled word:', words[chosen])

In [None]:
#@title run temperature_random_sampling 10 times, with temperature = 1.0
# Draw ten independent samples; temperature 1.0 leaves the softmax unscaled.
temperature = 1.0
for i in range(10):
    temperature_random_sampling(logits, words, temperature=temperature)

Sampled word: LLM
Sampled word: LLM
Sampled word: LLM
Sampled word: LLM
Sampled word: LLM
Sampled word: LLM
Sampled word: NLP
Sampled word: LLM
Sampled word: NLP
Sampled word: LLM


## LLM decode - Top_P

In [None]:
#@title define top_p_indexes(probs, top_p)
#@markdown sort by probability (descending) and keep the most probable tokens until their cumulative probability covers top_p
def top_p_indexes(probs, top_p):
    """Return the indexes of the nucleus (top-p) tokens.

    The tokens are ranked by descending probability and the shortest leading
    run whose cumulative probability is at least ``top_p`` is kept.

    Args:
        probs: array-like of token probabilities.
        top_p: cumulative-probability threshold.

    Returns:
        list of int indexes into ``probs``, most probable first. For a
        non-empty ``probs`` at least one index is always returned, so the
        result can be sampled from even when ``top_p`` is 0 or negative.
        If ``top_p`` exceeds the total probability mass, every index is
        returned.
    """
    probs = np.asarray(probs, dtype=float)
    sorted_index = np.argsort(probs)[::-1]
    cumulative = np.cumsum(probs[sorted_index])
    # side='left' finds the first position whose cumulative probability is
    # >= top_p; +1 turns that position into a count that includes the token
    # which reaches the threshold. A token beyond an exact hit is not added.
    keep = int(np.searchsorted(cumulative, top_p, side='left')) + 1
    # Slicing clamps `keep` to the number of available tokens.
    return sorted_index[:keep].tolist()

In [None]:
#@title define top_p_sampling(logits, words, top_p)
def top_p_sampling(logits, words, top_p, print_words=False):
    """Sample one word with top-p (nucleus) filtering and print it.

    Args:
        logits: sequence of logits, one per entry of ``words``.
        words: sequence of candidate words.
        top_p: cumulative-probability threshold passed to ``top_p_indexes``.
        print_words: when true, also print the kept words with their
            renormalized probabilities.

    Returns:
        None. The sampled word is written to stdout.
    """
    #@markdown step 1, get logits
    probs = softmax(logits)
    #@markdown step 2, sort based on probability
    #@markdown
    #@markdown step 3, keep the most probable tokens until their cumulative probability covers TOP_P
    selected_index = top_p_indexes(probs, top_p)
    #@markdown step 4, Recalculate probability using softmax on the filtered tokens
    # The softmax must be taken over the kept tokens' *logits*. Feeding the
    # already-normalized probabilities back into softmax would exponentiate
    # them a second time and flatten the distribution.
    top_p_prob = softmax(np.asarray(logits)[selected_index])
    if print_words:
        selected_words = [words[i] for i in selected_index]
        print('Top P words and Normalized Prob:', list(map(list, zip(selected_words, top_p_prob))))
    #@markdown step 5, Random Sampling using final probability
    # Sampling happens regardless of print_words; only the listing above is optional.
    random_samp_index = np.random.choice(selected_index, p=top_p_prob)
    print('Sampled word:', words[random_samp_index])


In [None]:
# Keep the most probable words covering 90% of the probability mass, then sample.
top_p = 0.9
top_p_sampling(logits, words, top_p=top_p, print_words=True)

Top P words and Normalized Prob: [['LLM', 0.6126841114435158], ['NLP', 0.3873158885564842]]
Sampled word: NLP


## LLM decode - Top_K

In [None]:
#@title top_k_sampling(logits, words, top_k)
#select the top_k's index, then sample from them

def top_k_sampling(logits, words, top_k, print_words=False):
    """Sample one word from the ``top_k`` most probable candidates and print it.

    Args:
        logits: sequence of logits, one per entry of ``words``.
        words: sequence of candidate words.
        top_k: number of most probable tokens to keep (at least 1). Values
            larger than the vocabulary keep every token.
        print_words: when true, also print the kept words with their
            renormalized probabilities.

    Returns:
        None. The sampled word is written to stdout.

    Raises:
        ValueError: if ``top_k`` is smaller than 1.
    """
    # A non-positive top_k would leave nothing to sample from (0) or silently
    # drop tokens from the wrong end of the ranking (negative slice bound).
    if top_k < 1:
        raise ValueError("top_k must be at least 1, got %r" % (top_k,))

    #@markdown step 1, Get logits and words

    #@markdown step 2, Calculate probability using soft-max
    probs = softmax(logits)

    #@markdown step 3, Sort based on Probability

    #@markdown step 4, Filter TOP_K tokens
    selected_index = np.argsort(probs)[::-1][:top_k]

    #@markdown step 5, Recalculate probability using soft-max on filtered token
    # The softmax must be taken over the kept tokens' *logits*. Feeding the
    # already-normalized probabilities back into softmax would exponentiate
    # them a second time and flatten the distribution.
    top_k_prob = softmax(np.asarray(logits)[selected_index])
    if print_words:
        selected_words = [words[i] for i in selected_index]
        print('Top K words and Normalized Prob:', list(map(list, zip(selected_words, top_k_prob))))
    #@markdown step 6, Random Sampling using final probability
    # Sampling happens regardless of print_words; only the listing above is optional.
    random_samp_index = np.random.choice(selected_index, p=top_k_prob)
    print('Sampled word:', words[random_samp_index])

In [None]:
# Restrict sampling to the three most probable words.
top_k = 3
top_k_sampling(logits, words, top_k=top_k, print_words=True)

Top K words and Normalized Prob: [['LLM', 0.47200693217709], ['NLP', 0.29838505834640666], ['N-gram', 0.22960800947650348]]
Sampled word: NLP


## Greedy decode

This function essentially performs a greedy selection of the token candidate with the highest logit value. It's commonly used in machine learning models for tasks like text generation, where the model needs to choose the most probable token to continue generating text.

https://github.com/yui0/slibs/blob/4ecfe3824fd0f848f57216b800578090a3df353d/ggml/llama/llama.cpp#L2350

```python
def llama_sample_token_greedy(ctx: LlamaContext, candidates: List[LlamaTokenData]) -> int:
    """Return the id of the candidate with the largest logit.

    When ``ctx`` is truthy, the time spent here is added to
    ``ctx.t_sample_us`` and ``ctx.n_sample`` is incremented.
    """
    # ggml_time_us() is assumed to be defined elsewhere.
    started_us = ggml_time_us()

    # Greedy choice: the single candidate with the highest logit.
    best = max(candidates, key=lambda candidate: candidate.logit)
    token_id = best.id

    if ctx:
        # Book-keeping only; it does not influence which token is chosen.
        ctx.t_sample_us += ggml_time_us() - started_us
        ctx.n_sample += 1

    return token_id
```