# Overview

Let's load GPTQ model by using Huggingface transformers.

In [1]:
!pip install -U -q transformers==4.39.3
!pip install -U -q bitsandbytes==0.43.0
!pip install -U -q accelerate==0.28.0
!pip install -U -q einops==0.7.0
!pip install -U -q optimum==1.18.0
!pip install -U -q auto-gptq==0.7.1

In [2]:
from transformers import AutoModelForCausalLM

model_name='TheBloke/phi-2-orange-GPTQ'

model=AutoModelForCausalLM.from_pretrained(model_name, revision='main', device_map='auto', trust_remote_code=True)
model.device

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

configuration_phi.py:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

modeling_phi.py:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

2024-04-07 01:40:55.831491: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-07 01:40:55.831602: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-07 01:40:55.993797: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/1.84G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

device(type='cuda', index=0)

In [3]:
from transformers import AutoTokenizer

tokenizer=AutoTokenizer.from_pretrained(model_name, max_length=100, use_fast=True)
tokenizer

tokenizer_config.json:   0%|          | 0.00/7.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/584 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


CodeGenTokenizerFast(name_or_path='TheBloke/phi-2-orange-GPTQ', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|im_end|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50257: AddedToken("                               ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50258: AddedToken("                              ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50259: AddedToken("                             ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50260: AddedToken("                            ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=Fa

In [4]:
from transformers import TextStreamer

streamer=TextStreamer(tokenizer, skip_prompt=False, skip_special_tokens=False)
streamer

<transformers.generation.streamers.TextStreamer at 0x7ac58e7ebeb0>

# Inference with model

In [5]:
prompt="The weather in Melbourne is"
system_message="You are a weather reporter"
prompt_template=f'''{prompt}'''

input_ids=tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
# https://huggingface.co/docs/transformers/v4.18.0/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate
output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=50)
print(tokenizer.decode(output[0]))

The weather in Melbourne is getting colder.
    The temperature is expected to drop by 5 degrees Celsius next week.
    If the current temperature is 20 degrees Celsius, what will be the temperature next week?
    '''
    current_temperature = 20



In [6]:
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1,
    streamer=streamer, 
    device_map='auto'
)
pipe.device

device(type='cuda', index=0)

In [7]:
print(pipe(prompt_template)[0]['generated_text'])

The weather in Melbourne is getting warmer, and the temperature reaches 30 degrees Celsius (86°F). The sun is shining brightly, and it's a perfect day for outdoor activities. Sarah decides to go for a swim at the beach with her friends.

As she walks
The weather in Melbourne is getting warmer, and the temperature reaches 30 degrees Celsius (86°F). The sun is shining brightly, and it's a perfect day for outdoor activities. Sarah decides to go for a swim at the beach with her friends.

As she walks
