In [None]:
# %pip install -r requirements.txt

In [None]:
from transformers import (
    AutoTokenizer, # language models
    AutoModelForCausalLM,
    Trainer, # fine-tuning
    TrainingArguments,
    DataCollatorForLanguageModeling, # part of pipeline responsible for assembling
    BitsAndBytesConfig, # bitsandbytes quantization config: smaller memory footprint for model weights (may trade away some speed)
    )

import torch # pyTorch
import safetensors.torch

from datasets import Dataset # converting text file in dataset (increases file reading speed)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model # (Parameter Efficient Fine Tuning) -> we take LoRA only

In [None]:
def import_model(model_name):
  """Load a causal language model in 8-bit together with its tokenizer.

  Args:
    model_name: Identifier or path forwarded to both `from_pretrained` calls.

  Returns:
    A `(model, tokenizer)` tuple. The tokenizer is guaranteed to have a
    `pad_token`: its own if it defines one, otherwise its `eos_token`.
  """
  model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # float16 dtype (GPU-oriented half precision)
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # 8-bit loading via bitsandbytes
  )
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  # Fall back to EOS only when the tokenizer ships without a pad token;
  # a tokenizer that already defines one keeps it instead of being overwritten.
  if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
  return model, tokenizer

# Candidate base checkpoints; the first entry is the one loaded below.
models = [
  "deepseek-ai/deepseek-llm-7b-base",
  "mistralai/Mistral-7B-Instruct-v0.3",
  "meta-llama/Llama-3.1-8B-Instruct",
  "microsoft/phi-2",
]
model, tokenizer = import_model(models[0])

In [None]:
def file_read(file_name):
  """Read a UTF-8 text file and build a dataset with one row per paragraph.

  The file content is split on blank lines ("\n\n"). Chunks that are empty or
  contain only whitespace (for example the one produced by a trailing blank
  line) are dropped so they do not become empty training examples.

  Args:
    file_name: Path of the text file to read.

  Returns:
    A `Dataset` with a single "text" column holding the paragraphs.
  """
  # Explicit encoding keeps the result independent of the platform default.
  with open(file_name, 'r', encoding="utf-8") as handle:
    raw_text = handle.read()
  paragraphs = [chunk for chunk in raw_text.split("\n\n") if chunk.strip()]
  return Dataset.from_dict({"text": paragraphs})

# Load the training corpus from the local "church_text" file into a dataset with a "text" column.
dataset = file_read("church_text")

def make_token_func(tokenizer, max_length=512):
    """Build a tokenization callback bound to a specific tokenizer.

    Args:
        tokenizer: Callable tokenizer applied to the "text" field.
        max_length: Length every sequence is truncated and padded to.

    Returns:
        A function taking an example (or batch) with a "text" key and
        returning the tokenizer's output for it.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }

    def token_func(example):
        texts = example["text"]
        return tokenizer(texts, **encode_options)

    return token_func

# Build the tokenization callback for the loaded tokenizer (default max_length).
token_func = make_token_func(tokenizer)
# Tokenize the whole dataset; map() is invoked in batched mode.
token_dataset = dataset.map(token_func, batched=True)


In [None]:
def lora_training(model, r=8, lora_alpha=42, target_modules=("q_proj", "v_proj"), lora_dropout=0.05):
  """Prepare a quantized model for training and attach LoRA adapters.

  Args:
    model: Base model, as loaded with k-bit quantization.
    r: LoRA rank; a larger rank gives more capacity but is slower.
    lora_alpha: Strength of the LoRA update's influence on the model.
    target_modules: Names of the modules that receive adapters (query and
      value projections by default). A string is forwarded unchanged; any
      other iterable is converted to a list.
    lora_dropout: Dropout applied inside the adapters to limit overfitting.

  Returns:
    The PEFT-wrapped model with gradient checkpointing enabled.
  """
  model = prepare_model_for_kbit_training(model)
  lora_config = LoraConfig(
    r=r,
    lora_alpha=lora_alpha,
    target_modules=target_modules if isinstance(target_modules, str) else list(target_modules),
    lora_dropout=lora_dropout,
    bias="none",  # leave bias terms untouched
    task_type="CAUSAL_LM",
  )
  model = get_peft_model(model, lora_config)
  model.gradient_checkpointing_enable()
  return model

# Attach LoRA adapters. This rebinds `model`, so re-running the cell passes the already-wrapped model back in.
model = lora_training(model)


In [None]:
def model_train(model, tokenizer, dataset, output_dir="./pretrain-max-steps", max_steps=1000):
  """Fine-tune `model` on a tokenized dataset with the Hugging Face `Trainer`.

  Args:
    model: Model to train.
    tokenizer: Tokenizer handed to the data collator and to the trainer.
    dataset: Tokenized training dataset.
    output_dir: Directory passed to `TrainingArguments(output_dir=...)`;
      checkpoints are written below it.
    max_steps: Value passed to `TrainingArguments(max_steps=...)`.

  Returns:
    None. Training artifacts are written under `output_dir`.
  """
  # mlm=False selects causal (next-token) language modeling instead of masked LM.
  data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
  )

  # No optimizer is passed, so the Trainer uses its built-in default (Adam-family).
  training_args = TrainingArguments(
      output_dir=output_dir,
      overwrite_output_dir=True,
      max_steps=max_steps,
      per_device_train_batch_size=10, # number of examples processed at once -> averaging -> better accuracy
      save_steps=50,
      save_total_limit=1,
      prediction_loss_only=True,
      fp16=True,
      learning_rate=5e-6,
      logging_steps=20,
  )
  # NOTE(review): newer transformers releases may expect `processing_class`
  # instead of `tokenizer` here; confirm against the installed version.
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=dataset,
      tokenizer=tokenizer,
      data_collator=data_collator,
  )
  trainer.train()

# Run fine-tuning on the tokenized dataset with the LoRA-wrapped model.
model_train(model, tokenizer, token_dataset)

In [None]:
def lora_adapters_attach(model, checkpoint, output_dir="./pretrain-max-steps"):
  """Load saved LoRA adapter weights from a training checkpoint into `model`.

  Args:
    model: PEFT-wrapped model whose adapters should receive the weights.
    checkpoint: Step number of the checkpoint directory (`checkpoint-<n>`).
    output_dir: Training output directory that contains the checkpoint
      folders; defaults to the directory used by this notebook's training run.

  Returns:
    The same `model` object, with the adapter weights loaded.

  Raises:
    ValueError: If the checkpoint contains tensors that do not match any
      parameter of `model`.
  """
  # Local import: the notebook's import cell only pulls the LoRA setup helpers from peft.
  from peft import set_peft_model_state_dict

  adapter_path = f"{output_dir}/checkpoint-{checkpoint}/adapter_model.safetensors"
  adapters_weights = safetensors.torch.load_file(adapter_path)
  # Adapter files store keys without the adapter name, so a plain
  # load_state_dict(strict=False) matches nothing and silently loads no weights.
  # The PEFT helper maps the saved keys back onto the model's adapter parameters.
  load_result = set_peft_model_state_dict(model, adapters_weights)
  unexpected_keys = getattr(load_result, "unexpected_keys", None)
  if unexpected_keys:
    raise ValueError(
      f"{len(unexpected_keys)} adapter tensors from {adapter_path} did not match the model, "
      f"e.g. {unexpected_keys[0]}"
    )
  return model

# Load the adapter weights stored for checkpoint 1000 into the current model.
model = lora_adapters_attach(model, 1000)

In [None]:
def gen(question, model, tokenizer, max_new_tokens=500, do_sample=False, top_p=0.95, temperature=0.7):
    """Generate an answer to `question` using a "Question/Answer" prompt.

    Args:
        question: Text inserted into the prompt template.
        model: Model exposing `generate`, `device`, `training`, `eval` and `train`.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        max_new_tokens: Upper bound on generated tokens, not counting the prompt.
        do_sample: Whether to sample; greedy decoding when False (the default).
        top_p: Nucleus-sampling threshold, forwarded only when `do_sample` is True.
        temperature: Sampling temperature, forwarded only when `do_sample` is True.

    Returns:
        The decoded continuation produced after the prompt, stripped of
        surrounding whitespace.
    """
    prompt = f"Question: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = {
        # Budget applies to the answer only, so a long question cannot eat it up.
        "max_new_tokens": max_new_tokens,
        "num_return_sequences": 1,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Sampling knobs are meaningless under greedy decoding, so only pass them when sampling.
        generation_kwargs["top_p"] = top_p
        generation_kwargs["temperature"] = temperature

    # Generate in eval mode so dropout is inactive, then restore the previous mode.
    was_training = model.training
    model.eval()
    try:
        outputs = model.generate(**inputs, **generation_kwargs)
    finally:
        model.train(was_training)

    # Decode only the tokens after the prompt rather than splitting the full text
    # on "Answer:", which picks the wrong span when the continuation contains that marker.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return answer.strip()

# Smoke-test generation with a placeholder question; the returned answer is the cell output.
gen("prompt", model, tokenizer)

In [None]:
import shutil
from google.colab import files
def save_trained_model(model, checkpoint, output_dir="./pretrain-max-steps"):
  """Zip a training checkpoint directory and offer it as a Colab download.

  Args:
    model: Model whose class name is used as the archive's file-name prefix.
    checkpoint: Step number of the checkpoint directory (`checkpoint-<n>`).
    output_dir: Training output directory that contains the checkpoint
      folders; defaults to the directory the training step writes to.

  Returns:
    None. The archive is created in the working directory and passed to
    `files.download`.
  """
  # The directory name must match the trainer's output_dir ("pretrain-max-steps").
  checkpoint_dir = f"{output_dir}/checkpoint-{checkpoint}/"
  archive_path = shutil.make_archive(f"{model.__class__.__name__}-checkpoint", 'zip', checkpoint_dir)
  files.download(archive_path)

# Archive checkpoint 1000 and hand it to the Colab download helper.
save_trained_model(model, 1000)