In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, BitsAndBytesConfig
import copy, os, torch, gc

In [None]:
cache_dir = "/speed-scratch/ra_mdash/tmp/huggingface"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

HUGGINGFACE_TOKEN = ""

torch.cuda.set_device(0)
gc.collect()
torch.cuda.empty_cache()

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # Use FP16 for computations
    # bnb_4bit_quant_type="nf4",            # Normal Float 4 (NF4) data type
    bnb_4bit_use_double_quant=False,        # Enable double quantization
    llm_int8_enable_fp32_cpu_offload=True  # Enable CPU offloading for 32-bit modules
)

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/Self-taught-evaluator-llama3.1-70B",
    subfolder="dpo_model",
    cache_dir=cache_dir,
    use_auth_token=HUGGINGFACE_TOKEN
)

model = AutoModelForCausalLM.from_pretrained(
    "facebook/Self-taught-evaluator-llama3.1-70B",
    subfolder="dpo_model",
    device_map="auto",
    cache_dir=cache_dir,
    use_auth_token=HUGGINGFACE_TOKEN,
    quantization_config=bnb_config
)




Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [7]:
SELF_TAUGHT_WITH_SYSTEM_PROMPT = [
    {
        "role": "system",
        "content": 'Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user\'s instructions and answers the user\'s question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \\"[[A]]\\" if assistant A is better, \\"[[B]]\\" if assistant B is better.',
    },
    {
        "role": "user",
        "content": """[User Question]
{input}

[The Start of Assistant A's Answer]
{response_a}
[The End of Assistant A's Answer]

[The Start of Assistant B's Answer]
{response_b}
[The End of Assistant B's Answer]
""",
    },
]

example_inputs = {
    "input": "explain master slave replication nsql",
    "response_a": "In the context of NoSQL databases, master-slave replication refers to a configuration where a single master node writes data, and one or more slave nodes read data from the master and replicate it to provide read scalability. The master node is responsible for accepting write requests and updating its own data, while the slave nodes are responsible for replicating the data from the master and serving read requests.",
    "response_b": "In SQL, master-slave replication is a technique used to create a copy of a database on a separate server. The master server is the primary server that contains the original data, while the slave server is the secondary server that contains a copy of the data. The master server sends updates to the slave server, which then applies them to its own database."
}

In [8]:
conversation = copy.copy(SELF_TAUGHT_WITH_SYSTEM_PROMPT)
conversation[-1]["content"] = conversation[-1]["content"].format(**example_inputs)

tokenized_input = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
gen_cfg = GenerationConfig(max_length=2048, do_sample=False)

judgement = model.generate(tokenized_input, gen_cfg)
judgement_text = tokenizer.decode(judgement.cpu().tolist()[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


AssertionError: 

In [13]:
tokenized_input.dtype

torch.int64