In [None]:
import os
import datasets
from transformers import (
    AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling, 
    MistralForCausalLM, logging
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch
from transformers import AutoModelForCausalLM


In [2]:
# Enable logging for better debuggability
logging.set_verbosity_info()
# Set environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "false"

### Load foundation model and tokenizer

In [3]:
train_file_path = "/app/data/rukava_data.txt"
model_name = 'Vikhrmodels/Vikhr-7B-instruct_0.4'
output_dir = '/app/model/lora'

overwrite_output_dir = True
per_device_train_batch_size = 16
num_train_epochs = 3.0
save_steps = 2000

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="/app/model")
foundation_model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="/app/model")

loading file tokenizer.model from cache at /app/model/models--Vikhrmodels--Vikhr-7B-instruct_0.4/snapshots/dd960329b333cbe7f67396c5fe715f477b997639/tokenizer.model
loading file tokenizer.json from cache at /app/model/models--Vikhrmodels--Vikhr-7B-instruct_0.4/snapshots/dd960329b333cbe7f67396c5fe715f477b997639/tokenizer.json
loading file added_tokens.json from cache at /app/model/models--Vikhrmodels--Vikhr-7B-instruct_0.4/snapshots/dd960329b333cbe7f67396c5fe715f477b997639/added_tokens.json
loading file special_tokens_map.json from cache at /app/model/models--Vikhrmodels--Vikhr-7B-instruct_0.4/snapshots/dd960329b333cbe7f67396c5fe715f477b997639/special_tokens_map.json
loading file tokenizer_config.json from cache at /app/model/models--Vikhrmodels--Vikhr-7B-instruct_0.4/snapshots/dd960329b333cbe7f67396c5fe715f477b997639/tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file confi

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing LlamaForCausalLM.

All the weights of LlamaForCausalLM were initialized from the model checkpoint at Vikhrmodels/Vikhr-7B-instruct_0.4.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /app/model/models--Vikhrmodels--Vikhr-7B-instruct_0.4/snapshots/dd960329b333cbe7f67396c5fe715f477b997639/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "use_cache": false
}



In [5]:
# Get the context size (maximum sequence length)
model_config = foundation_model.config
context_size = model_config.max_position_embeddings
print(f"Context size (maximum sequence length): {context_size}")

# Get the vocabulary size
vocab_size = tokenizer.vocab_size
print(f"Vocabulary size: {vocab_size}")

# Print some specific tokens and their corresponding IDs
specific_tokens = ["hello", "world", "transformers", "bloomz"]
for token in specific_tokens:
    token_id = tokenizer.convert_tokens_to_ids(token)
    print(f"Token: {token}, ID: {token_id}")

Context size (maximum sequence length): 32768
Vocabulary size: 79085
Token: hello, ID: 21558
Token: world, ID: 9471
Token: transformers, ID: 0
Token: bloomz, ID: 0


In [7]:
#this function returns the outputs from the model received, and inputs.
def get_outputs(model, inputs, max_new_tokens=100):
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        repetition_penalty=1.5, #Avoid repetition.
        early_stopping=False, #The model can stop before reach the max_length
        eos_token_id=tokenizer.eos_token_id
    )
    return outputs

In [None]:
#Inference original model
input_sentences = tokenizer("I want you to act as a motivational coach. ", return_tensors="pt")
foundational_outputs_sentence = get_outputs(foundation_model, input_sentences, max_new_tokens=50)

print(tokenizer.batch_decode(foundational_outputs_sentence, skip_special_tokens=True))

### Create finetuning task

In [13]:
from transformers import BitsAndBytesConfig

config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [15]:
lora_config = LoraConfig(
    r=128,
    target_modules="all-linear",
    task_type="CAUSAL_LM"
)

model = MistralForCausalLM.from_pretrained(model_name,
                                           quantization_config=config,
                                           torch_dtype=torch.bfloat16,
                                           device_map="auto",
                                           low_cpu_mem_usage=True,
                                           trust_remote_code=True,
                                           cache_dir="/app/model")
# model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

model = prepare_model_for_kbit_training(model)

model = get_peft_model(model,lora_config)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
loading configuration file config.json from cache at /app/model/models--Vikhrmodels--Vikhr-7B-instruct_0.4/snapshots/dd960329b333cbe7f67396c5fe715f477b997639/config.json
You are using a model of type llama to instantiate a model of type mistral. This is not supported for all configurations of models and can yield errors.
Model config MistralConfig {
  "_name_or_path": "output1",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_windo

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing MistralForCausalLM.

All the weights of MistralForCausalLM were initialized from the model checkpoint at Vikhrmodels/Vikhr-7B-instruct_0.4.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MistralForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /app/model/models--Vikhrmodels--Vikhr-7B-instruct_0.4/snapshots/dd960329b333cbe7f67396c5fe715f477b997639/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "use_cache": false
}



In [16]:
model.print_trainable_parameters()

trainable params: 335,544,320 || all params: 7,963,111,424 || trainable%: 4.2137


### Load data

In [17]:
examples_dir = os.path.join("/app/data")

def load_example(filename):
    with open(os.path.join(examples_dir, filename) , 'r', encoding='utf-8') as f:
        return f.read()

data = load_example("rukava_data.txt")

In [29]:
x = """Human: РВД D19(20) P350 4SP JIC G1,5/16 L3000 DIN EN856
Assistant:
Простая/составная - Составная
Продукт/рукав - 4SP DN19
Длина - 3000
Фитинг левый - JIC1.5/16"DN19
Фитинг правый - JIC1.5/16"DN19"""

In [38]:
def tokenize_training_text(training_text, max_seq_length, tokenizer, separator="\n\n\n", **kwargs):
    samples_from = training_text.split(separator)
    
    samples_to=[]
    # samples = [x.strip() for x in samples]

    # def to_dict(text):
    #     return {'text': text}

    # samples = [to_dict(x) for x in samples]
    for x in samples_from:
        human_part, assistant_part = x.split("Assistant:")
        human_part.replace("Human: ", "")
        assistant_text = assistant_part.strip()
        samples_to.append({"human": human_part, "assistant": assistant_text})
        # for assistant_line in assistant_text.split("\n"):
        #     key, value = assistant_line.split(" - ")
            
        # samples_to.append(data)
    training_dataset = datasets.Dataset.from_list(samples_to)
    print(training_dataset)
    training_dataset = training_dataset.shuffle().map(
        lambda x: tokenize_sample(x, max_seq_length, tokenizer), 
        batched=False
    )

    return training_dataset
    
def tokenize_sample(item, max_seq_length, tokenizer, add_eos_token=True):
    assert tokenizer is not None
    result = tokenizer(
        item["text"],
        truncation=True,
        max_length=max_seq_length,
        padding="max_length",
    ) 

    if add_eos_token and (len(result["input_ids"]) < max_seq_length or result["input_ids"][-1] != tokenizer.eos_token_id):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    return result


In [19]:
def load_dataset(file_path, tokenizer, block_size=512):
    dataset = datasets.load_dataset('text', data_files=file_path)
    dataset = dataset['train']
    dataset = dataset.map(
        lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=block_size),
        batched=True
    )
    return dataset


def load_data_collator(tokenizer, mlm=False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
        
        pad_to_multiple_of=8, 
    )
    return data_collator

In [20]:
train_dataset = tokenize_training_text(training_text=data, max_seq_length=1024, tokenizer=tokenizer)
data_collator = load_data_collator(tokenizer)

Map:   0%|          | 0/2866 [00:00<?, ? examples/s]

In [39]:
train_dataset.__getitem__(2)

{'text': 'Human: РУКАВ НЕМЕТАЛЛИЧЕСКИЙ В СБОРЕ_12-4SP-2DKOS-(R)- M24X1.5-5600_ВЫСОКОГО ДАВЛЕНИЯ _DN12 мм_5600 мм \nAssistant: \nПростая/составная - Составная\nПродукт/рукав - 4SP DN12\nДлина - 5600\nФитинг левый - DKOS24x1,5DN12\nФитинг правый - DKOS24x1,5DN12',
 'input_ids': [2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,

In [40]:
training_args = TrainingArguments(
      output_dir=output_dir,
      overwrite_output_dir=overwrite_output_dir,
      per_device_train_batch_size=per_device_train_batch_size,
      num_train_epochs=num_train_epochs,
      fp16=True,
      save_steps=save_steps,
      optim="adamw_bnb_8bit",
  )

trainer = Trainer(
      model=model,
      args=training_args,
      data_collator=data_collator,
      train_dataset=train_dataset,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
Using auto half precision backend
The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text. If text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2,866
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 540
  Number o

In [17]:
# Save the model and the LoRA adapter
model_path = os.path.join(output_dir, "trained_model")
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

You are using a model of type llama to instantiate a model of type mistral. This is not supported for all configurations of models and can yield errors.


('/app/model/lora/trained_model/tokenizer_config.json',
 '/app/model/lora/trained_model/special_tokens_map.json',
 '/app/model/lora/trained_model/tokenizer.model',
 '/app/model/lora/trained_model/added_tokens.json',
 '/app/model/lora/trained_model/tokenizer.json')

In [None]:
# Load the trained model and adapter
model = MistralForCausalLM.from_pretrained(model_name,
                                           quantization_config=config,
                                           torch_dtype=torch.bfloat16,
                                           device_map="auto",
                                           low_cpu_mem_usage=True,
                                           trust_remote_code=True,
                                           cache_dir="/app/model")
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [23]:
if hasattr(model, 'load_adapter'):
    model.load_adapter('load_adapter', adapter_name="adapter_model")
else:
    print("No")
    
    # self.model = peft.PeftModel.from_pretrained(self.model, lora_name, adapter_name=lora_name, cache_dir=CACHE_DIR)
            

OSError: load_adapter is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [8]:
from transformers import pipeline 

# Setup the text generation pipeline
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)  # Adjust device as needed

# %%
# Function to generate text
def generate_text(prompt, max_length=500):
    return text_generator(prompt, max_length=max_length, num_return_sequences=1)

# Example usage
prompt = "Human: РВД D19(20) P350 4SP JIC G1,5/16 L3000 DIN EN856\nAssistant:"
generated_text = generate_text(prompt)
print(generated_text)

NameError: name 'model' is not defined

In [None]:
model = get_peft_model(model, lora_config)
model.load_adapter(output_dir, adapter_name="adapter_model")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="/app/model")
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Function to generate a response
def generate_response(model, tokenizer, input_text):
    inputs = tokenizer(input_text, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=50)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example test input
test_input = "Human: 000000001300446262 Рукав высокого давления DIN 2SN-250-1000-DKOL-M26х1,5 Dв16мм L1000мм PN250бар с двумя армир.оплётками DKOL М26х1,5 DKOL м26х1,590гр., г/масло\nAssistant:"

# Generate response using the model with the trained adapter
response = generate_response(model_with_adapter, tokenizer, test_input)
print(response)
