In [1]:
!pip install peft datasets

[0m

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import peft
import torch
from datasets import load_dataset



In [3]:
def print_memory():
    """Print the CUDA memory allocated and reserved on device 0.

    Both figures are the raw byte counts reported by torch divided by 1024**3.
    When CUDA is unavailable, prints 'No cuda' instead. Returns None.
    """
    # Guard clause: nothing to measure without a CUDA device.
    if not torch.cuda.is_available():
        print('No cuda')
        return

    bytes_per_unit = 1024**3
    used = torch.cuda.memory_allocated(0) / bytes_per_unit
    held = torch.cuda.memory_reserved(0) / bytes_per_unit
    print(f'VRAM allocated {used}gb, reserved {held}gb')

In [4]:
# Checkpoint identifier passed to both from_pretrained calls below.
model_name = 'BioMistral/BioMistral-Safetensors'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the causal LM in float16 with automatic device placement.
# NOTE(review): "flash_attention_2" presumably requires the flash-attn package, which the
# install cell does not add — confirm it is available in the target environment.
foundation_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto", attn_implementation="flash_attention_2")

In [93]:
print(tokenizer.chat_template)  # Check whether the tokenizer supports chat (has a chat template)

{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}


In [5]:
def get_outputs(model, inputs, max_new_tokens=1000):
    """Generate a continuation for an already-tokenized prompt.

    Relies on the module-level ``tokenizer`` for the end-of-sequence token id,
    so the tokenizer cell must have been run before this function is called.

    Args:
        model: Object exposing ``generate(**kwargs)``.
        inputs: Mapping with 'input_ids' and 'attention_mask' entries.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        Whatever ``model.generate`` returns.
    """
    eos_id = tokenizer.eos_token_id
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=max_new_tokens,
        repetition_penalty=1.5,
        eos_token_id=eos_id,
        # Set explicitly: otherwise generate() falls back to eos_token_id anyway
        # and logs "Setting `pad_token_id` to `eos_token_id`" on every call.
        pad_token_id=eos_id,
    )
    return outputs

In [6]:
# Report VRAM usage at this point in the notebook.
print_memory()

VRAM allocated 13.48877763748169gb, reserved 13.615234375gb


In [7]:
# Plain-text prompt asking the base model for a free-form list of pills.
input_simple_sentences_text = 'I have high temperature. Which pills should I take? Answer with pills, no additional info. Pills:'
# Tokenize to PyTorch tensors and move them to the device the model lives on.
input_simple_sentences = tokenizer(input_simple_sentences_text, return_tensors='pt').to(foundation_model.device)
# Generate with the base model, then decode the generated ids back to text.
foundational_outputs_sentence = get_outputs(foundation_model, input_simple_sentences)
output = tokenizer.batch_decode(foundational_outputs_sentence, skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [8]:
# Display the first decoded sequence.
output[0]

'I have high temperature. Which pills should I take? Answer with pills, no additional info. Pills: paracetamol 500 mg (1 tablet) every six hours for three days or ibuprofen 24-hour release capsule of 600mg one per day'

In [9]:
# Same question, but the prompt asks for a JSON list of pills and includes one example.
input_json_sentences_text = 'I have high temperature. Which pills should I take? Answer with json, contining pills. Example ["carbamazepine", "lamotrigine"]. Your answer in json: '
# Tokenize to PyTorch tensors and move them to the device the model lives on.
input_json_sentences = tokenizer(input_json_sentences_text, return_tensors='pt').to(foundation_model.device)
# Generate with the base model, then decode the generated ids back to text.
# Note: this overwrites foundational_outputs_sentence and output from the previous prompt.
foundational_outputs_sentence = get_outputs(foundation_model, input_json_sentences)
output = tokenizer.batch_decode(foundational_outputs_sentence, skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [10]:
# Display the first decoded sequence for the JSON prompt.
output[0]

'I have high temperature. Which pills should I take? Answer with json, contining pills. Example ["carbamazepine", "lamotrigine"]. Your answer in json: '

# Начинаем готовить LoRA.

In [94]:
# Identifier of the dataset passed to load_dataset.
dataset_name = 'mlabonne/FineTome-100k'

In [95]:
# Load the dataset and display its splits and features.
data = load_dataset(dataset_name)
data

DatasetDict({
    train: Dataset({
        features: ['conversations', 'source', 'score'],
        num_rows: 100000
    })
})

In [104]:
def convert_to_openai(sample):
    """Convert one dataset sample into an OpenAI-style chat message list.

    Args:
        sample: Mapping with a 'conversations' entry: an iterable of dicts,
            each holding a 'from' field (speaker tag) and a 'value' field (text).

    Returns:
        ``{'openai_chat': {'messages': [...]}}`` where the list starts with a
        fixed system prompt followed by one ``{'role', 'content'}`` dict per
        turn. 'human' maps to 'user' and 'gpt' to 'assistant'; any other
        speaker tag yields an empty role string.

    Raises:
        KeyError: If the sample or one of its turns lacks an expected key.
    """
    role_by_speaker = {'human': 'user', 'gpt': 'assistant'}

    # NOTE(review): the chat template printed earlier in this notebook only accepts
    # user/assistant roles, so this system message (and any empty role) would
    # presumably be rejected by it — confirm before applying that template.
    messages = [{"role":"system", "content":"You are a helpful assistant. Follow the user's instructions precisely."}]
    messages.extend(
        {'role': role_by_speaker.get(turn['from'], ''), 'content': turn['value']}
        for turn in sample['conversations']
    )

    # Built once, after the loop, so an empty conversation still returns a
    # result instead of failing on an unassigned local.
    return {'openai_chat': {'messages': messages}}
    

In [105]:
# Apply convert_to_openai to every sample. The mapped dataset is kept in a
# variable so later cells can use it (map returns a new object), then displayed.
data_openai = data.map(convert_to_openai)
data_openai

DatasetDict({
    train: Dataset({
        features: ['conversations', 'source', 'score', 'openai_chat'],
        num_rows: 100000
    })
})