In [1]:
!pip install -U transformers huggingface_hub safetensors bitsandbytes>=0.46.1

#!CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python

In [2]:
import torch
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import time
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model_name = 'BioMistral/BioMistral-Safetensors'
results = {}

In [4]:
def clean_memory():
    for var in ['model', 'tokenizer', 'inputs', 'llama']:
        if var in globals():
            del globals()[var]

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

    gc.collect()
    print('Memory is cleaned')

In [5]:
def print_memory():
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated(0) / 1024**3
        reserved = torch.cuda.memory_reserved(0) / 1024**3
        print(f'VRAM allocated {allocated}gb, reserved {reserved}gb')
    else:
        print('No cuda')

In [6]:
clean_memory()
print_memory()

Memory is cleaned
VRAM allocated 0.0gb, reserved 0.0gb


In [7]:
clean_memory()
model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.bfloat16, device_map="auto") #Необходимо явно задать параметры, иначе будет другой размер и может не загрузится в видеокарту
tokenizer = AutoTokenizer.from_pretrained(model_name)
print_memory()

input_text = 'My head is sick. Which pill should I drink?'
inputs = tokenizer(input_text, return_tensors='pt').to(model.device)

start = time.time()
output = model.generate(**inputs, repetition_penalty=1.2, max_new_tokens = 1000)
end = time.time()

new_tokens = len(output[0]) - len(inputs.input_ids[0])
tps = new_tokens / (end - start)
print(f'BF16: new Tokens: {new_tokens}, tokens per second: {tps}')

decoded_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_text)

results['BioMistral-7B-BF16'] = {'tps': tps, 'memory': torch.cuda.memory_allocated(0)/1024**3}

Memory is cleaned


Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 291/291 [00:04<00:00, 63.37it/s, Materializing param=model.norm.weight]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


VRAM allocated 13.488778114318848gb, reserved 13.490234375gb
BF16: new Tokens: 49, tokens per second: 17.917794021592627
My head is sick. Which pill should I drink? (A) 10 mg of amoxicillin, (B) 250 mg of metronidazole, and (C) 400 mg of ciprofloxacin.


In [8]:
clean_memory()
print_memory()

Memory is cleaned
VRAM allocated 0.007935047149658203gb, reserved 0.021484375gb


In [9]:
clean_memory()
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_name)
print_memory()

input_text = 'My head is sick. Which pill should I drink?'
inputs = tokenizer(input_text, return_tensors='pt').to(model.device)

start = time.time()
output = model.generate(**inputs, repetition_penalty=1.2, max_new_tokens = 1000)
end = time.time()

new_tokens = len(output[0]) - len(inputs.input_ids[0])
tps = new_tokens / (end - start)
print(f'4BIT: new Tokens: {new_tokens}, tokens per second: {tps}')

decoded_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_text)

results['BioMistral-7B-4BIT'] = {'tps': tps, 'memory': torch.cuda.memory_allocated(0)/1024**3}

Memory is cleaned


Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 291/291 [00:10<00:00, 27.45it/s, Materializing param=model.norm.weight]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


VRAM allocated 4.670655727386475gb, reserved 6.501953125gb
4BIT: new Tokens: 57, tokens per second: 16.646913308258494
My head is sick. Which pill should I drink? (A) 10 mg of amitriptyline, (B) 25 mg of carbamazepine, (C) 300 mg of valproate, or (D) 400 mg of lamotrigine.


In [10]:
clean_memory()

model_name_gguf = 'BioMistral/BioMistral-7B-GGUF'

model_path = hf_hub_download(
    repo_id=model_name_gguf,
    filename='ggml-model-Q4_K_M.gguf',
    local_dir="./models"
)

print('File downloaded')

llama = Llama(
    model_path=model_path,
    n_gpu_layers=-1,
    n_ctx=32768,
    verbose=False
)

start = time.time()
output = llama.create_chat_completion(
    messages=[{'role': 'user', 'content': 'My head is sick. Which pill should I drink?'}],
    max_tokens=1000,
    repeat_penalty=1.2,
)
end = time.time()
print(output)

new_tokens = output['usage']['completion_tokens']
tps = new_tokens / (end - start)
print(f'LLAMACPP: new Tokens: {new_tokens}, tokens per second: {tps}')

results['BioMistral-7B-LLAMACPP (GPU)'] = {'tps': tps, 'memory': 'uncountable'}

Memory is cleaned
File downloaded
{'id': 'chatcmpl-ea9bc4d5-6a84-40a4-bbb8-823b04cb5089', 'object': 'chat.completion', 'created': 1771271554, 'model': 'models/ggml-model-Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': ' I’m sorry to hear that. Can you please tell me more about how you are feeling? Are you experiencing a headache, fever or chills, nausea, vomiting, dizziness, or any other symptoms?'}, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 19, 'completion_tokens': 48, 'total_tokens': 67}}
LLAMACPP: new Tokens: 48, tokens per second: 64.74210832660701


In [11]:
results

{'BioMistral-7B-BF16': {'tps': 17.917794021592627,
  'memory': 13.496714115142822},
 'BioMistral-7B-4BIT': {'tps': 16.646913308258494,
  'memory': 4.670657157897949},
 'BioMistral-7B-LLAMACPP (GPU)': {'tps': 64.74210832660701,
  'memory': 'uncountable'}}

In [16]:
dataset = load_dataset('omi-health/medical-dialogue-to-soap-summary')

Downloading readme: 2.15kB [00:00, 4.89MB/s]
Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 154M/154M [00:03<00:00, 44.2MB/s]
Downloading data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8.31M/8.31M [00:00<00:00, 17.0MB/s]
Downloading data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.52M/3.52M [00:00<00:00, 8.33MB/s]
Generating train split: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9250/9250 [00:00

In [14]:
dataset['train']

Dataset({
    features: ['description', 'utterances'],
    num_rows: 482
})

In [21]:
dataset['train'][0]

{'dialogue': "Doctor: Hello, how can I help you today?\nPatient: My son has been having some issues with speech and development. He's 13 years old now.\nDoctor: I see. Can you tell me more about his symptoms? Does he have any issues with muscle tone or hypotonia?\nPatient: No, he doesn't have hypotonia. But he has mild to moderate speech and developmental delay, and he's been diagnosed with attention deficit disorder.\nDoctor: Thank you for sharing that information. We'll run some tests, including an MRI, to get a better understanding of your son's condition. \n(After the tests)\nDoctor: The MRI results are in, and I'm glad to say that there are no structural brain anomalies. However, I did notice some physical characteristics. Does your son have any facial features like retrognathia, mild hypertelorism, or a slightly elongated philtrum and thin upper lip?\nPatient: Yes, he has all of those features. His hands are also broad and short. And his feet have mild syndactyly of the second an