In [None]:
import os
import sys
# Both variables are set here, ahead of the torch/transformers imports below,
# so they are already in the environment when those libraries are imported.
# Cache location for Hugging Face downloads.
os.environ["TRANSFORMERS_CACHE"] = "/workspace/cache/"
# Restrict the process to the GPU with index 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
from torch.nn import DataParallel
from utils.prompter import Prompter
from time import time
from time import perf_counter
from peft import PeftModel
import json

from transformers import LlamaForCausalLM, LlamaTokenizer, GenerationConfig, BitsAndBytesConfig

In [None]:
# Tokenizer used to encode prompts and decode generations.
tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

print("loading model")
# Base model, quantized to 8-bit and placed automatically on the visible device(s).
# NOTE(review): the base model requests bfloat16 while the adapter below
# requests float16 — confirm that this mismatch is intended.
model = LlamaForCausalLM.from_pretrained(
    "decapoda-research/llama-30b-hf",
    load_in_8bit=True,
    device_map='auto',
    torch_dtype=torch.bfloat16,
)

print("loading peft")
# Wrap the base model with the fine-tuned adapter weights from the local checkpoint.
model = PeftModel.from_pretrained(
    model,
    "./model/checkpoint-300",
    torch_dtype=torch.float16,
)

# This notebook only runs inference: switch to evaluation mode explicitly so
# that dropout layers are disabled regardless of what the loaders left behind.
model.eval()

# torch.compile is only available from PyTorch 2 and is skipped on Windows.
if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)

# Prompt template helper; "sum" selects the template name passed to Prompter.
prompter = Prompter("sum")

In [None]:
def inference(input_ids, model):
    """Generate a completion for one prompt and return the extracted response.

    Relies on the module-level ``tokenizer`` and ``prompter`` objects.

    Args:
        input_ids: Prompt token ids. Its ``shape[1]`` is used as the prompt
            length, so a (batch, seq_len) tensor is assumed; only the first
            generated sequence is decoded.
        model: Object exposing ``generate`` (the loaded causal LM).

    Returns:
        The value of ``prompter.get_response`` applied to the decoded output.
    """
    # The original settings combined temperature=0 with do_sample=True, which
    # transformers rejects (temperature must be strictly positive when
    # sampling). Temperature 0 is the usual way of asking for deterministic
    # output, so greedy decoding is requested explicitly instead.
    # NOTE(review): use_cache=False is kept from the original — confirm it is
    # still wanted, since it was presumably set for a reason not visible here.
    generation_config = GenerationConfig(
        do_sample=False,
        use_cache=False,
    )

    # perf_counter is monotonic, unlike wall-clock time(), so the measured
    # duration cannot be skewed by system clock adjustments.
    start = perf_counter()
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=100,
        )
    duration = perf_counter() - start

    # Accept both a structured output carrying `.sequences` and a plain
    # indexable result, without a bare `except` hiding unrelated errors.
    sequences = getattr(generation_output, "sequences", generation_output)
    s = sequences[0]

    # Throughput is only meaningful when something was actually generated;
    # guarding here also avoids a ZeroDivisionError on an empty generation.
    new_tokens = s.shape[0] - input_ids.shape[1]
    if new_tokens > 0 and duration > 0:
        tks = new_tokens / duration
        print(f"{tks} tokens/s")
        print(f"{1/tks} s/token")
    else:
        print("no new tokens generated")

    output = tokenizer.decode(s)
    res = prompter.get_response(output)
    return res

In [None]:
# Load the evaluation texts. The file is expected to hold a JSON array whose
# items are the texts to summarize (the original code indexed it by position).
# The encoding is stated explicitly because the content is non-ASCII French text.
with open("eval.json", "r", encoding="utf-8") as f:
    eval_samples = json.load(f)

# Instruction shared by every sample, hoisted out of the loop. The wording is
# a runtime string sent to the model and is kept exactly as it was written.
instruction = "Résume ce texte issue d'un cours de droit en conservant les dates, les abréviations et les principes importants."

# Iterate over the samples directly: the original `for i in len(eval)` raised
# TypeError (an int is not iterable) and shadowed the builtins `eval`/`input`.
for sample_text in eval_samples:
    prompt = prompter.generate_prompt(instruction, sample_text)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to("cuda")
    res = inference(input_ids, model)
    print(res)
    print("##############################################")
