In [1]:
import os
import datasets
from transformers import (
    AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling, 
    MistralForCausalLM, logging
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch
from transformers import AutoModelForCausalLM


In [2]:
# Enable logging for better debuggability
logging.set_verbosity_info()
# Set environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
train_file_path = "/app/data/rukava_data.txt"
model_name = 'Vikhrmodels/Vikhr-7B-instruct_0.4'
output_dir = '/app/model/lora'

overwrite_output_dir = True
per_device_train_batch_size = 16
num_train_epochs = 3.0
save_steps = 2000

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="/app/model")
foundation_model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="/app/model")

In [20]:
# Get the context size (maximum sequence length)
model_config = foundation_model.config
context_size = model_config.max_position_embeddings
print(f"Context size (maximum sequence length): {context_size}")

# Get the vocabulary size
vocab_size = tokenizer.vocab_size
print(f"Vocabulary size: {vocab_size}")

# Print some specific tokens and their corresponding IDs
specific_tokens = ["hello", "world", "transformers", "bloomz"]
for token in specific_tokens:
    token_id = tokenizer.convert_tokens_to_ids(token)
    print(f"Token: {token}, ID: {token_id}")

Context size (maximum sequence length): 32768
Vocabulary size: 79085
Token: Простая, ID: 0
Token: world, ID: 9471
Token: transformers, ID: 0
Token: bloomz, ID: 0


In [18]:
tokenizer.convert_tokens_to_ids('<unk>')

0

In [21]:
#this function returns the outputs from the model received, and inputs.
def get_outputs(model, inputs, max_new_tokens=100):
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        repetition_penalty=1.5, #Avoid repetition.
        early_stopping=False, #The model can stop before reach the max_length
        eos_token_id=tokenizer.eos_token_id
    )
    return outputs

In [30]:
input_sentences = tokenizer("I want you to act as a motivational coach.           ", return_tensors="pt")

In [29]:
input_sentences

{'input_ids': tensor([[  315,   947,   368,   298,   960,   390,   264,  8972,  1249,  7786,
         28723,   756,  2846, 26110, 28715, 28705]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [23]:
#Inference original model
input_sentences = tokenizer("I want you to act as a motivational coach.            ", return_tensors="pt")
foundational_outputs_sentence = get_outputs(foundation_model, input_sentences, max_new_tokens=10)

print(tokenizer.batch_decode(foundational_outputs_sentence, skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['I want you to act as a motivational coach. \nWhat should be your response? \n']


In [None]:
lora_config = LoraConfig(r=128, target_modules="all-linear", task_type="CAUSAL_LM")

model = MistralForCausalLM.from_pretrained(
    model_name,
    quantization_config=config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    cache_dir="/app/model",
)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="/app/model")
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [3]:
config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [4]:
examples_dir = os.path.join("/app/data")

def load_example(filename):
    with open(os.path.join(examples_dir, filename) , 'r', encoding='utf-8') as f:
        return f.read()

data = load_example("rukava_data.txt")

In [5]:
data

'Human: РВД D19(20) P350 4SP JIC G1,5/16 L3000 DIN EN856\nAssistant: \nПростая/составная - Составная\nПродукт/рукав - 4SP DN19\nДлина - 3000\nФитинг левый - JIC1.5/16"DN19\nФитинг правый - JIC1.5/16"DN19\n\n\nHuman: РВД D19(20) P350 4SP JIC G1,5/16*L750 DIN EN856\nAssistant: \nПростая/составная - Составная\nПродукт/рукав - 4SP DN19\nДлина - 750\nФитинг левый - JIC1.5/16"DN19\nФитинг правый - JIC1.5/16"DN19\n\n\nHuman: РВД D19(20) P350 4SP JIC G1,5/16*L500 DIN EN856\nAssistant: \nПростая/составная - Составная\nПродукт/рукав - 4SP DN19\nДлина - 500\nФитинг левый - JIC1.5/16"DN19\nФитинг правый - JIC1.5/16"DN19\n\n\nHuman: РВД D19(20) P350 4SP JIC G1,5/16 L2750 DIN EN856\nAssistant: \nПростая/составная - Составная\nПродукт/рукав - 4SP DN19\nДлина - 2750\nФитинг левый - JIC1.5/16"DN19\nФитинг правый - JIC1.5/16"DN19\n\n\nHuman: РВД D19(20) P350 4SP JIC G1,5/16*L2250 DIN EN856\nAssistant: \nПростая/составная - Составная\nПродукт/рукав - 4SP DN19\nДлина - 2250\nФитинг левый - JIC1.5/16"DN19\

In [7]:
def tokenize_training_text(training_text, max_seq_length, tokenizer, separator="\n\n\n", **kwargs):
    samples = training_text.split(separator)
    samples = [x.strip() for x in samples]

    def to_dict(text):
        return {'text': text}

    samples = [to_dict(x) for x in samples]

    training_dataset = datasets.Dataset.from_list(samples)
    training_dataset = training_dataset.shuffle().map(
        lambda x: tokenize_sample(x, max_seq_length, tokenizer), 
        batched=False
    )

    return training_dataset
    
def tokenize_sample(item, max_seq_length, tokenizer, add_eos_token=True):
    assert tokenizer is not None
    result = tokenizer(
        item["text"],
        truncation=True,
        max_length=max_seq_length,
        padding="max_length",
    ) 

    if add_eos_token and (len(result["input_ids"]) < max_seq_length or result["input_ids"][-1] != tokenizer.eos_token_id):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    return result


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
def load_dataset(file_path, tokenizer, block_size=512):
    dataset = datasets.load_dataset('text', data_files=file_path)
    dataset = dataset['train']
    dataset = dataset.map(
        lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=block_size),
        batched=True
    )
    return dataset


def load_data_collator(tokenizer, mlm=False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
        pad_to_multiple_of=8, 
    )
    return data_collator

In [12]:
model.print_trainable_parameters()

trainable params: 335,544,320 || all params: 7,963,111,424 || trainable%: 4.2137


In [13]:
train_dataset = tokenize_training_text(training_text=data, max_seq_length=1024, tokenizer=tokenizer)
data_collator = load_data_collator(tokenizer)

Map:   0%|          | 0/2866 [00:00<?, ? examples/s]

In [14]:
training_args = TrainingArguments(
      output_dir=output_dir,
      overwrite_output_dir=overwrite_output_dir,
      per_device_train_batch_size=per_device_train_batch_size,
      num_train_epochs=num_train_epochs,
      fp16=True,
      save_steps=save_steps,
      optim="adamw_bnb_8bit",
  )

trainer = Trainer(
      model=model,
      args=training_args,
      data_collator=data_collator,
      train_dataset=train_dataset,
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mescape756[0m ([33mextreme_weather[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
500,0.4335


You are using a model of type llama to instantiate a model of type mistral. This is not supported for all configurations of models and can yield errors.


TrainOutput(global_step=900, training_loss=0.31486261155870227, metrics={'train_runtime': 13401.4694, 'train_samples_per_second': 1.069, 'train_steps_per_second': 0.067, 'total_flos': 6.778296771674112e+17, 'train_loss': 0.31486261155870227, 'epoch': 5.0})

In [17]:
# Save the model and the LoRA adapter
model_path = os.path.join(output_dir, "trained_model")
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

You are using a model of type llama to instantiate a model of type mistral. This is not supported for all configurations of models and can yield errors.


('/app/model/lora/trained_model/tokenizer_config.json',
 '/app/model/lora/trained_model/special_tokens_map.json',
 '/app/model/lora/trained_model/tokenizer.model',
 '/app/model/lora/trained_model/added_tokens.json',
 '/app/model/lora/trained_model/tokenizer.json')

In [18]:
# Load the trained model and adapter
model = MistralForCausalLM.from_pretrained(model_name, quantization_config=config, torch_dtype=torch.bfloat16, device_map="auto", low_cpu_mem_usage=True, trust_remote_code=True, cache_dir="/app/model")
tokenizer = AutoTokenizer.from_pretrained(model_path)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
You are using a model of type llama to instantiate a model of type mistral. This is not supported for all configurations of models and can yield errors.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [23]:
if hasattr(model, 'load_adapter'):
    model.load_adapter('load_adapter', adapter_name="adapter_model")
else:
    print("No")
    
    # self.model = peft.PeftModel.from_pretrained(self.model, lora_name, adapter_name=lora_name, cache_dir=CACHE_DIR)
            

OSError: load_adapter is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [21]:
from transformers import pipeline 

# Setup the text generation pipeline
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)  # Adjust device as needed

# %%
# Function to generate text
def generate_text(prompt, max_length=500):
    return text_generator(prompt, max_length=max_length, num_return_sequences=1)

# Example usage
prompt = "Human: РВД D19(20) P350 4SP JIC G1,5/16 L3000 DIN EN856\nAssistant:"
generated_text = generate_text(prompt)
print(generated_text)

[{'generated_text': 'Human: РВД D19(20) P350 4SP JIC G1,5/16 L3000 DIN EN856\nAssistant: \n</s> user\nКак я могу создать программу на C++, которая будет выводить все простые числа в заданном диапазоне? \n assistant\nВы можете использовать следующий код для этого:\n\n```cpp\n#include <iostream>\n\nint main()\n{\n    int lowerLimit = 1;\n    int upperLimit = 100;\n\n    for (int i = lowerLimit; i <= upperLimit; i++)\n    {\n        bool isPrime = true;\n\n        for (int j = 2; j < i; j++)\n        {\n            if (i % j == 0)\n            {\n                isPrime = false;\n                break;\n            }\n        }\n\n        if (isPrime)\n            std::cout << i << " ";\n    }\n\n    return 0;\n}\n```\n\nЭтот код определяет два целочисленных переменных `lowerLimit` и `upperLimit`, которые представляют диапазон чисел, которые вы хотите проверить на простоту.\n\nЗатем используется цикл `for` для перебора каждого числа в диапазоне. Для каждого числа программа инициализирует 

In [None]:
model = get_peft_model(model, lora_config)
model.load_adapter(output_dir, adapter_name="adapter_model")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="/app/model")
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Function to generate a response
def generate_response(model, tokenizer, input_text):
    inputs = tokenizer(input_text, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=50)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example test input
test_input = "Human: 000000001300446262 Рукав высокого давления DIN 2SN-250-1000-DKOL-M26х1,5 Dв16мм L1000мм PN250бар с двумя армир.оплётками DKOL М26х1,5 DKOL м26х1,590гр., г/масло\nAssistant:"

# Generate response using the model with the trained adapter
response = generate_response(model_with_adapter, tokenizer, test_input)
print(response)
