In [1]:
env HF_HOME=/teamspace/studios/this_studio/hfcache

env: HF_HOME=/teamspace/studios/this_studio/hfcache


In [2]:
# Identifier of the checkpoint passed to `from_pretrained` for both the model
# and the tokenizer in load_model_and_tokenizer().
MODEL_ID = "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

def load_model_and_tokenizer():
    """Load the ``MODEL_ID`` checkpoint in 4-bit precision along with its tokenizer.

    The model is loaded with a bitsandbytes configuration (4-bit weights, double
    quantization, bfloat16 compute dtype) and ``device_map="auto"``, then put in
    evaluation mode.

    Returns:
        tuple: ``(model, tokenizer)`` for ``MODEL_ID``.
    """
    # Release cached CUDA memory left over from earlier loads in this kernel.
    torch.cuda.empty_cache()

    # Device placement is delegated to device_map="auto"; no Accelerator/FSDP
    # object is needed for inference-only loading, so none is constructed.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=quantization_config,
        device_map="auto",
    )
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    return model, tokenizer

def create_text_generation_pipeline(model, tokenizer):
    """Build a ``text-generation`` pipeline around ``model`` and ``tokenizer``.

    The pipeline is created with ``return_full_text=False`` and a fixed set of
    sampling options (see ``sampling_options`` below).

    Args:
        model: Model object handed to ``pipeline``.
        tokenizer: Tokenizer object handed to ``pipeline``.

    Returns:
        The object returned by ``pipeline``.
    """
    # Decoding settings, kept together so they are easy to find and tune.
    sampling_options = dict(
        do_sample=True,
        temperature=0.1,
        top_p=0.15,
        top_k=0,
        repetition_penalty=1.1,
        max_new_tokens=4096,
    )
    return pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        return_full_text=False,
        **sampling_options,
    )



In [4]:
# Load the model and tokenizer once, then wrap them in the text-generation
# pipeline; `model`, `tokenizer` and `generate_text` are reused by later cells.
model, tokenizer = load_model_and_tokenizer()
generate_text = create_text_generation_pipeline(model, tokenizer)


Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Render the chat messages through the tokenizer's chat template as text
# (tokenize=False) with the generation prompt appended.
# NOTE(review): `rag_append_user_input`, `messages_`, `query` and `context` are
# not defined in any visible cell — confirm they exist before this cell runs.
tmplt_messages = tokenizer.apply_chat_template(
    rag_append_user_input(messages_, query, context), tokenize=False, add_generation_prompt=True
)

In [None]:
messages = [
    {"role": "system", "content": "You are Hermes 2."},
    {"role": "user", "content": "Hello, who are you?"}
]
# return_dict=True makes the template return a mapping that can be unpacked
# into generate(); without it the call returns a bare tensor and `**` fails.
# add_generation_prompt=True ends the prompt where the assistant's reply starts.
gen_input = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
# Give generation an explicit budget; the library default length can be
# shorter than the prompt itself.
output_ids = model.generate(**gen_input, max_new_tokens=256)
# Decode only the tokens produced after the prompt.
prompt_length = gen_input["input_ids"].shape[-1]
tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)