In [6]:
# text_generation_api.py
# LLama2 chat 13b 

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
import torch
from typing import Optional
from transformers import BitsAndBytesConfig
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

load_in_4bit = True
model_path = "/home/llama/models/base_models/Mixtral-8x7B-Instruct-v0.1"


device = "cuda" if torch.cuda.is_available() else "cpu"
print (device)
torch.cuda.set_device(1)  # Set the gpu output

app = FastAPI()
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
quantize_config = BaseQuantizeConfig(bits=4,group_size=128,damp_percent=0.01,desc_act=False)
model = AutoGPTQForCausalLM.from_pretrained(model_path, quantize_config)
# model = AutoModelForCausalLM.from_pretrained(
#     model_path, device_map=device, quantization_config = nf4_config
# )

dynamic_text_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            model_kwargs=
                {
                    "load_in_4bit": load_in_4bit,
                    "device_map" : device,
                },
        )


cuda


Loading checkpoint shards: 100%|██████████| 19/19 [00:12<00:00,  1.55it/s]
The model 'MixtralGPTQForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'O

In [7]:
dynamic_text_pipeline.model.device

device(type='cpu')

In [None]:
class InputTextWithParams(BaseModel):
    text: str
    max_new_tokens: Optional[int] = None
    max_length: Optional[int] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    do_sample: Optional[bool] = None
    repetition_penalty: Optional[float] = None

@app.post("/generate-text")
async def generate_text(input_data: InputTextWithParams):
    try:
        generated_text = dynamic_text_pipeline(input_data.text, max_new_tokens= input_data.max_new_tokens, do_sample=input_data.do_sample, temperature=input_data.temperature, top_p=input_data.top_p)

        return {"result": generated_text}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error generating text: {str(e)}")
 

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name_or_path = "TheBloke/Mixtral-8x7B-v0.1-GPTQ"
# To use a different branch, change revision
# For example: revision="gptq-4bit-128g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="main")

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

prompt = "Write a story about llamas"
system_message = "You are a story writing assistant"
prompt_template=f'''{prompt}
'''

print("\n\n*** Generate:")

input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
print(tokenizer.decode(output[0]))

# Inference can also be done using transformers' pipeline

print("*** Pipeline:")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1
)

print(pipe(prompt_template)[0]['generated_text'])


  from .autonotebook import tqdm as notebook_tqdm
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




*** Generate:


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> Write a story about llamas

## A Llama’s Tale

Once upon a time, there was a little llama named Llama. He lived on a farm with his family and friends. Every day, Llama would go out into the fields and graze on the lush green grass. He loved the feeling of the soft blades tickling his tongue as he chewed.

One day, Llama was out in the field with his friend, Llama 2. They were both munching on some delicious grass when they heard a loud noise. It sounded like a car horn.

Llama and Llama 2 looked up and saw a big truck coming down the road. The truck was full of people, and they were all waving and cheering.

“What’s going on?” Llama asked Llama 2.

“I don’t know,” Llama 2 replied. “But it looks like they’re having a lot of fun.”

Just then, the truck came to a stop right in front of the two llamas. The people in the truck jumped out and started to set up a big tent.

“What are they doing?” Llama asked Llama 2.

“I don’t know,” Llama 2 replied. “But it looks like they’re going to ha

In [2]:
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, TextStreamer
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.cuda.set_device(1)  # Set the gpu output
model_path = "/home/llama/Personal_Directories/srb/causalllm-main/model/Mixtral-8x7B-v0.1-GPTQ"
quantize_config = BaseQuantizeConfig(bits=4)
config = AutoConfig.from_pretrained(model_path)
config.quantization_config["use_exllama"] = False
config.quantization_config["bits"] = 4
model = AutoModelForCausalLM.from_pretrained(model_path, config = config, device_map=device)
tokenizer = AutoTokenizer.from_pretrained(model_path, device_map=device) 

Loading checkpoint shards: 100%|██████████| 5/5 [00:04<00:00,  1.25it/s]


In [None]:
prompt = "Write a story about llamas"
system_message = "You are a story writing assistant"
prompt_template=f'''{prompt}
'''

print("\n\n*** Generate:")

input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
print(tokenizer.decode(output[0]))

# Inference can also be done using transformers' pipeline

print("*** Pipeline:")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1
)

print(pipe(prompt_template)[0]['generated_text'])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




*** Generate:
<s> Write a story about llamas

In 1977, in the city of Arica, Chile, a man was walking his two llamas down the street. He was a farmer and he had brought his llamas to the city to sell them. As he was walking, he saw a sign that said “Llama Park.” He was curious, so he decided to go in and see what it was all about.

When he went inside, he saw a large park with a bunch of llamas in it. He was surprised to see so many llamas in one place. He was also surprised to see that the park was so well-kept. The park had a lot of trees and a lot of grass. It was a beautiful place.

The man was happy to see that there were so many people who loved llamas. He was also happy to see that the park was so well-kept. He knew that the park was a good place for llamas to live.

The man decided to stay in the park for a while. He wanted to see how the park was run. He also wanted to see how the llamas were treated.

The man was impressed with the way the park was run. He was also impresse

NameError: name 'pipeline' is not defined

In [2]:
tokenizer.save_pretrained("model/Mixtral-8x7B-v0.1-GPTQ")
model.save_pretrained("model/Mixtral-8x7B-v0.1-GPTQ")

In [1]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, TextStreamer
from typing import Optional
from accelerate import Accelerator
from accelerate.utils import gather_object
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, TextStreamer
import torch

accelerator = Accelerator()
load_in_4bit = False
model_path = "/home/llama/models/base_models/Mixtral-8x7B-Instruct-v0.1"

# torch.cuda.set_device(1)  # Set the gpu output
config = AutoConfig.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, config = config, device_map="auto",load_in_4bit=True)


  from .autonotebook import tqdm as notebook_tqdm
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards:   5%|▌         | 1/19 [00:00<00:17,  1.04it/s]


KeyboardInterrupt: 

In [2]:
# text_generation_api.py
# LLama2 chat 13b 

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
import torch
from typing import Optional
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, TextStreamer
 
model_path = "/home/llama/Personal_Directories/srb/causalllm-main/model/Llama-2-7b-chat-hf"
load_in_4bit = True

app = FastAPI()
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, config = config, device_map="auto",load_in_4bit=load_in_4bit)

dynamic_text_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer = tokenizer,
            device_map = "auto"
        )

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 6/6 [00:02<00:00,  2.33it/s]
