In [None]:
from transformers import (
    AutoTokenizer, # language models
    AutoModelForCausalLM,
    Trainer, # fine-tuning
    TrainingArguments,
    DataCollatorForLanguageModeling, # part of pipeline responsible for assembling
    BitsAndBytesConfig # bitsandbytes quantization config: loads weights in 8-bit/4-bit to reduce memory use (may trade away some speed)
    )

import torch # pyTorch

from datasets import Dataset # converting text file in dataset (increases file reading speed)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model # (Parameter Efficient Fine Tuning) -> we take LoRA only
import torch.optim.adam # Adaptive Moment Estimation (Adam)

# !pip install -U bitsandbytes

In [None]:
def import_model(model_name):
  """Load a causal language model with 8-bit weights together with its tokenizer.

  Args:
    model_name: Model identifier (or path) passed to both
      `AutoModelForCausalLM.from_pretrained` and `AutoTokenizer.from_pretrained`.

  Returns:
    A `(model, tokenizer)` tuple. If the tokenizer has no `pad_token`, it is
    set to the tokenizer's `eos_token`; an existing `pad_token` is kept.
  """
  model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # request float16 for the loaded weights
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # 8-bit loading via bitsandbytes
  )
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  # Fall back to EOS only when the checkpoint defines no pad token. Overwriting
  # a real pad token would make padding indistinguishable from end-of-sequence.
  if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
  return model, tokenizer

# Identifiers of the candidate checkpoints; they are addressed by index below.
models = ["microsoft/phi-2", "mistralai/Mistral-7B-Instruct-v0.3"]
# Only the Mistral checkpoint is loaded; uncomment the next line to load phi-2 too.
# phi_model, phi_tokenizer = import_model(models[0])
mistral_model, mistral_tokenizer = import_model(models[1])

In [None]:
def file_read(file_name):
  """Read a plain-text corpus and wrap its paragraphs in a `Dataset`.

  The text is split on blank lines (two consecutive newlines). Every chunk that
  contains something other than whitespace becomes one row of the "text"
  column; the content of the kept chunks is left untouched.

  Args:
    file_name: Path of the text file to read. It is decoded as UTF-8.

  Returns:
    A `Dataset` with a single "text" column, one row per paragraph.
  """
  # Explicit encoding: the platform default is locale dependent.
  with open(file_name, "r", encoding="utf-8") as handle:
    raw_text = handle.read()
  # A trailing blank line or a run of three or more newlines produces empty
  # chunks; drop them so they do not end up as padding-only training rows.
  paragraphs = [chunk for chunk in raw_text.split("\n\n") if chunk.strip()]
  # NOTE: per the original author's remark, chunking into plain Python lists
  # failed on the large corpus, which is why a `Dataset` is built here.
  return Dataset.from_dict({"text": paragraphs})

# Load the training corpus; "smaller_size" is resolved relative to the working directory.
dataset = file_read("smaller_size")

def make_token_func(tokenizer, max_length=512):
    """Build a tokenization callback bound to one tokenizer.

    Args:
        tokenizer: Callable tokenizer; it is invoked with the "text" entry of
            each example plus truncation/padding options.
        max_length: Length every sequence is truncated and padded to.

    Returns:
        A function taking an example (a mapping with a "text" key) and
        returning whatever the tokenizer returns for it.
    """
    # Options are fixed once here so that every call encodes the same way.
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }

    def token_func(example):
        return tokenizer(example["text"], **encode_options)

    return token_func

# Bind the tokenization callback to the Mistral tokenizer (default max_length=512).
token_func = make_token_func(mistral_tokenizer)
# batched=True: the callback is applied to batches of rows instead of single rows.
token_dataset = dataset.map(token_func, batched=True)

In [None]:
def lora_training(
  model,
  r=8,
  lora_alpha=42,
  target_modules=("q_proj", "v_proj"),
  lora_dropout=0.05,
):
  """Prepare a quantized model for training and attach LoRA adapters to it.

  Args:
    model: Model to adapt; it is first passed through
      `prepare_model_for_kbit_training`.
    r: LoRA rank.
    lora_alpha: LoRA scaling factor (how strongly the adapters influence the model).
    target_modules: Names of the modules that receive adapters. The default
      targets the query and value projections. Override it for architectures
      whose projection layers are named differently.
    lora_dropout: Dropout applied inside the adapters to limit overfitting.

  Returns:
    The model returned by `get_peft_model`, with gradient checkpointing enabled.
  """
  model = prepare_model_for_kbit_training(model)
  # Strings and None are forwarded unchanged; any other iterable is copied into
  # a fresh list so the (immutable) default is never shared with the config.
  if target_modules is not None and not isinstance(target_modules, str):
    target_modules = list(target_modules)
  lora_config = LoraConfig(
    r=r,
    lora_alpha=lora_alpha,
    target_modules=target_modules,
    lora_dropout=lora_dropout,
    bias="none",  # bias terms are left untouched
    task_type="CAUSAL_LM",
  )
  model = get_peft_model(model, lora_config)
  model.gradient_checkpointing_enable()
  return model

# Rebinds mistral_model to the LoRA-wrapped model returned by lora_training.
# NOTE(review): re-running this cell feeds the already-wrapped model back into
# lora_training; reload the base model (or restart the kernel) before re-running.
mistral_model = lora_training(mistral_model)


In [None]:
def model_train(model, tokenizer, dataset, output_dir=None):
  """Train `model` on a tokenized dataset with the Hugging Face `Trainer`.

  Args:
    model: Model to train.
    tokenizer: Tokenizer handed to both the data collator and the `Trainer`.
    dataset: Tokenized dataset used as `train_dataset`.
    output_dir: Directory for checkpoints. When omitted it defaults to
      "./<model class name>-continued-pretrain". Pass an explicit path when
      training more than one model: wrapped models presumably share the
      wrapper's class name, and `overwrite_output_dir=True` would then let one
      run overwrite another's checkpoints.

  Returns:
    None. Checkpoints are written to `output_dir`.
  """
  if output_dir is None:
    output_dir = f"./{model.__class__.__name__}-continued-pretrain"

  # mlm=False selects causal (next-token) language modeling, not masked LM.
  data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
  )
  training_args = TrainingArguments(
      output_dir=output_dir,
      overwrite_output_dir=True,
      max_steps=1000,
      per_device_train_batch_size=1,
      save_steps=250,
      save_total_limit=1,  # keep only the most recent checkpoint on disk
      prediction_loss_only=True,
      fp16=False,  # Disable fp16 for T4
      learning_rate=5e-6,
      logging_steps=50,
  )
  # NOTE(review): newer transformers releases appear to rename Trainer's
  # `tokenizer` argument to `processing_class` - confirm against the installed version.
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=dataset,
      tokenizer=tokenizer,
      data_collator=data_collator,
  )
  trainer.train()

# Start training the Mistral model on the tokenized corpus.
model_train(mistral_model, mistral_tokenizer, token_dataset)