## Import Models

In [1]:
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
from sampling.base_decoding import autoregressive_generate
from sampling.codec_base_decoding import autoregressive_generate_encoder_decoder
from sampling.speculative_decoding import speculative_generate
from ngram_assisted.ngram_assisted import ngram_assisted_speculative_generate
from ngram_assisted.ngram_storage import NGramStorage
from utils.logits_processor import (
    GreedyProcessor,
    MultinomialProcessor,
    NucleusProcessor,
    TopKNucleusProcessor,
    TopKProcessor,
    MCMCProcessor,
    SMCProcessor,
    SDEProcessor,
    VESDEProcessor,
    TypicalProcessor,
    DynamicTempProcess,
    AdaptiveTopKProcessor,
    TwoStageSamplingProcessor
)


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

drafter_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# drafter_model = "gpt2"
drafter_quantize = QuantoConfig(
    weights="int8"
)  # QuantoConfig(weights="int8") None

target_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# target_model = "gpt2-medium"
target_quantize = QuantoConfig(
    weights="int8"
)  # QuantoConfig(weights="int8")  None
tokenizer_name = target_model
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name, trust_remote_code=True
)

drafter = AutoModelForCausalLM.from_pretrained(
    drafter_model,
    quantization_config=drafter_quantize,
    device_map=device,
    trust_remote_code=True,
)
drafter.eval()

target = AutoModelForCausalLM.from_pretrained(
    target_model,
    quantization_config=target_quantize,
    device_map=device,
    trust_remote_code=True,
)
target.eval()


cpu


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): QLinear(in_features=2048, out_features=2048, bias=False)
          (k_proj): QLinear(in_features=2048, out_features=256, bias=False)
          (v_proj): QLinear(in_features=2048, out_features=256, bias=False)
          (o_proj): QLinear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): QLinear(in_features=2048, out_features=5632, bias=False)
          (up_proj): QLinear(in_features=2048, out_features=5632, bias=False)
          (down_proj): QLinear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary

In [3]:


end_tokens = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]  # "<|eot_id|>" is the end of turn token for Llama model.




In [4]:
print(end_tokens)

[2, 0]


In [5]:
prefix = "What is the capital city of Canada ?"
tokenized = tokenizer(prefix, return_tensors="pt").input_ids[0].tolist()

In [6]:
print(tokenized)

[1, 1724, 338, 278, 7483, 4272, 310, 7400, 1577]


### Autoregression Method

In [15]:
start_time = time.time()
output_ids = autoregressive_generate(
    tokenized,
    target,
    # use_cache=self.cache,
    # max_gen_len=100,
    eos_tokens_id=end_tokens,
    logits_processor= TwoStageSamplingProcessor(temperature=0.8, top_k = 10),
    # logits_processor= MultinomialProcessor(temperature=1),
    # logits_processor= VESDEProcessor()
    debug=True,
)

end_time = time.time()
output = tokenizer.decode(output_ids, skip_special_tokens=True)
print(output)
base_throughput = len(output) / (end_time - start_time)
print(f"Throughput: {base_throughput:.1f} tokens/s")


[31mEnd token found at position 9[0m

Throughput: 0.0 tokens/s


### Speculative Method

In [16]:
spec_start_time = time.time()
output_ids, accept_rate = speculative_generate(
    tokenized,
    drafter,
    target,
    tokenizer=tokenizer,
    logits_processor= TwoStageSamplingProcessor(temperature=0.8, top_k = 10),
    # gamma=gamma,
    max_gen_len=20,
    # eos_tokens_id=end_tokens,
    # debug=debug,
    # use_cache=cache,
)
spec_end_time = time.time()
spec_output = tokenizer.decode(output_ids, skip_special_tokens=True)
print( spec_output)
print(f"Acceptance rate: {accept_rate:.3f}")
spec_throughput = len(spec_output) / (spec_end_time - spec_start_time)
print(f"Throughput: {spec_throughput:.1f} tokens/s")



<|assistant|>
The capital city of Canada is Ottawa.
Acceptance rate: 1.000
Throughput: 2.3 tokens/s


### Ngram specukative Method

In [18]:

ngram = NGramStorage(n=4, vocab_size=target.config.vocab_size)
ngram_start_time = time.time()
output_ids, accept_rate = ngram_assisted_speculative_generate(
    tokenized,
    ngram,
    target,
    tokenizer=tokenizer,
    # filler_top_k=top_k_filler,
    logits_processor= TwoStageSamplingProcessor(temperature=0.7 , top_k=4, noise_scale=0.4),
    max_gen_len=20,
    # eos_tokens_id=end_tokens,
    # debug=debug,
    # use_cache=cache,
    first_target=True,
    stop_if_unknown=True,
)
ngram_end_time = time.time()
ngram_output = tokenizer.decode(output_ids, skip_special_tokens=True)
print(ngram_output)
print(f"Acceptance rate: {accept_rate:.3f}")
ngram_throughput = len(ngram_output) / (ngram_end_time - ngram_start_time)
print(f"Throughput: {ngram_throughput:.1f} tokens/s")


<|assistant|>
The capital city of Canada is Ottawa.
Acceptance rate: 0.400
Throughput: 3.4 tokens/s
