In [2]:
# %pip install accelerate peft bitsandbytes transformers trl
# %pip install --upgrade huggingface_hub
%pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp310-cp310-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ---------------------------------------- 10.2/991.5 kB ? eta -:--:--
   - ------------------------------------- 30.7/991.5 kB 435.7 kB/s eta 0:00:03
   - ------------------------------------- 41.0/991.5 kB 326.8 kB/s eta 0:00:03
   -- ------------------------------------ 61.4/991.5 kB 409.6 kB/s eta 0:00:03
   ---- --------------------------------- 122.9/991.5 kB 554.9 kB/s eta 0:00:02
   ------- ------------------------------ 194.6/991.5 kB 841.6 kB/s eta 0:00:01
   ---------- --------------------------- 286.7/991.5 kB 930.9 kB/s eta 0:00:01
   -------------- ------------------------- 368.6/991.5 kB 1.0 MB/s eta 0:00:01
   ----------------- ---------------------- 430.1/991.5 kB 1.1 MB/s eta 0:00:01
   -------------------- ------------------- 512

In [3]:
import argparse
import bitsandbytes as bnb
from functools import partial
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    set_seed,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    pipeline,
    logging
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
from trl import SFTTrainer
from transformers import LlamaForCausalLM
from transformers.models.llama.tokenization_llama import LlamaTokenizer

In [4]:
def load_model(model_name, bnb_config, max_memory_mb=40960):
    """Load a quantized causal language model together with its tokenizer.

    :param model_name: checkpoint name or path handed to ``from_pretrained``
    :param bnb_config: quantization settings, passed as ``quantization_config``
    :param max_memory_mb: memory cap applied to every visible GPU, in MB.
        Defaults to 40960, the value that was previously hard-coded.
    :return: ``(model, tokenizer)`` tuple
    """
    n_gpus = torch.cuda.device_count()
    per_gpu_cap = f"{max_memory_mb}MB"

    # With no visible GPU the per-device mapping would be empty. Pass None in
    # that case so the loader is not handed a mapping that names no device.
    max_memory = {gpu_index: per_gpu_cap for gpu_index in range(n_gpus)} or None

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        max_memory=max_memory,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True)

    # The EOS token doubles as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [5]:
# Read the local JSON file with the "json" loader; the result holds a single "train" split.
dataset = load_dataset(path="json", data_files="cleaned_data.json")

Generating train split: 50352 examples [00:00, 83350.37 examples/s]


In [6]:

def create_prompt_formats(example):
    """Render one dataset record as an Azerbaijani instruction prompt.

    :param example: mapping with ``"instruction"``, ``"input"`` and
        ``"output"`` entries
    :return: the prompt text. When ``example["input"]`` is truthy the prompt
        has instruction, input and response sections; otherwise it has only
        instruction and response sections.

    Every line after the first is prefixed with spaces (8 with an input,
    16 without). That whitespace is part of the returned prompt, so it is
    reproduced here exactly.
    """
    if example["input"]:
        indent = " " * 8
        intro = "Aşağıda daha çox kontekst təmin edən təlimat ilə yanaşı tapşırığı təsvir edən giriş var. Sorğunu adekvat şəkildə tamamlayan cavab yazın."
        sections = [
            "### Təlimat:",
            f'{example["instruction"]}',
            "### Giriş:",
            f'{example["input"]}',
            "### Cavab:",
            f'{example["output"]}',
        ]
    else:
        indent = " " * 16
        intro = "Aşağıda daha çox kontekst təmin edən təlimat var. Sorğunu adekvat şəkildə tamamlayan cavab yazın."
        sections = [
            "### Təlimat:",
            f'{example["instruction"]}',
            "### Cavab:",
            f'{example["output"]}',
        ]

    # Each section starts on its own line, behind the branch-specific indent.
    return intro + "".join(f"\n{indent}{section}" for section in sections)


In [7]:
# Show the loaded splits with their column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 50352
    })
})


In [8]:
def tokenize(prompt):
    """Tokenize ``prompt`` with the module-level ``tokenizer``.

    The tokenizer is asked to truncate and pad to ``CUTOFF_LEN + 1``
    positions; the last position of each returned list is then dropped.

    :param prompt: text to encode
    :return: dict with ``"input_ids"`` and ``"attention_mask"``
    """
    encoded = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=CUTOFF_LEN + 1,
    )
    # Trim the final element from both fields, in the same key order as before.
    return {field: encoded[field][:-1] for field in ("input_ids", "attention_mask")}

In [9]:
# dataset = dataset.shuffle().map(lambda x: tokenize(create_prompt_formats(x)))

In [10]:
def create_bnb_config():
    """Return the ``BitsAndBytesConfig`` used to load the model.

    The config requests 4-bit loading with the ``"nf4"`` quant type, double
    quantization enabled and ``torch.bfloat16`` as the compute dtype.
    """
    return BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

In [11]:
def create_peft_config(modules=None):
    """
    Create Parameter-Efficient Fine-Tuning config for your model
    :param modules: Names of the modules to apply Lora to. When omitted,
        ``["q_proj", "v_proj"]`` is used, which was the previously
        hard-coded choice.
    :return: the ``LoraConfig`` built from these settings
    """
    if modules is None:
        modules = ["q_proj", "v_proj"]

    config = LoraConfig(
        r=16,  # dimension of the updated matrices
        lora_alpha=64,  # parameter for scaling
        target_modules=modules,
        lora_dropout=0.1,  # dropout probability for layers
        bias="none",
        task_type="CAUSAL_LM",
    )

    return config

# ["q_proj", "v_proj"]

In [12]:
def find_all_linear_names(model):
    """List the leaf names of every ``bnb.nn.Linear4bit`` submodule of ``model``.

    The leaf name is the part of the dotted module name after the last dot.
    ``'lm_head'`` is left out of the result.

    :param model: object exposing ``named_modules()``
    :return: list of unique leaf names
    """
    target_cls = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, layer in model.named_modules()
        if isinstance(layer, target_cls)
    }

    # needed for 16-bit
    leaf_names.discard('lm_head')
    return list(leaf_names)

In [13]:
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.

    :param model: object whose ``named_parameters()`` yields ``(name, param)``
        pairs; each param provides ``numel()`` and ``requires_grad``
    :param use_4bit: when True, the trainable count is halved before printing
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        # Floor division keeps the count an int. True division produced a
        # float, which the ``,d`` format spec below rejects with ValueError.
        trainable_params //= 2

    # A model without parameters reports 0.0 instead of dividing by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {trainable_pct}"
    )

In [14]:
# Build the quantization settings first, then load the checkpoint and its tokenizer with them.
bnb_config = create_bnb_config()
model_name = "aisquared/chopt-research-1_3b"

model, tokenizer = load_model(model_name=model_name, bnb_config=bnb_config)

config.json: 100%|██████████| 952/952 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

In [18]:
# Token budget per example; tokenize() reads this global when it is called.
CUTOFF_LEN = 256
# Number of examples held out for evaluation.
val_set_size = 2500

# A fixed seed keeps the train/validation split stable across runs.
train_val = dataset["train"].train_test_split(
    test_size=val_set_size, seed=42, shuffle=True
)

# Format every record as a prompt and tokenize it, for both splits.
train_data = train_val["train"].map(
    lambda record: tokenize(create_prompt_formats(record))
)
val_data = train_val["test"].map(
    lambda record: tokenize(create_prompt_formats(record))
)

Map:   0%|          | 0/47852 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [46]:
def train(model, tokenizer, dataset, output_dir):
    """Fine-tune ``model`` with LoRA adapters, evaluate it and save the result.

    Steps, in order: enable gradient checkpointing, run
    ``prepare_model_for_kbit_training``, wrap the model using the config from
    ``create_peft_config``, train and evaluate with a ``Trainer``, then save
    the model and tokenizer to ``output_dir``.

    :param model: model to fine-tune
    :param tokenizer: tokenizer given to the data collator and saved with the model
    :param dataset: accepted but not read in this function; the trainer uses
        the module-level ``train_data`` and ``val_data``
    :param output_dir: directory for the final model and tokenizer. The
        ``TrainingArguments`` use the hard-coded ``"outputs"`` directory.
    """
    # Preparation: gradient checkpointing first, then the PEFT k-bit helper.
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    # Collected but not used further: create_peft_config() is called without it.
    linear_module_names = find_all_linear_names(model)

    # Wrap the model with the LoRA adapters and report the trainable share.
    lora_config = create_peft_config()
    model = get_peft_model(model, lora_config)
    print_trainable_parameters(model)

    # NOTE(review): max_steps and num_train_epochs are both set; presumably
    # max_steps decides when training stops — confirm.
    trainer = Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=TrainingArguments(
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,
            warmup_steps=2,
            max_steps=300,
            save_steps=200,
            learning_rate=2e-4,
            eval_steps=200,
            evaluation_strategy="steps",
            fp16=True,
            num_train_epochs=3,
            logging_steps=1,
            output_dir="outputs",
            optim="paged_adamw_8bit",
        ),
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    model.config.use_cache = False  # re-enable for inference to speed up predictions for similar inputs

    ### SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
    # Report how many parameter elements there are per dtype before training.
    numel_by_dtype = {}
    for _, tensor in model.named_parameters():
        numel_by_dtype[tensor.dtype] = numel_by_dtype.get(tensor.dtype, 0) + tensor.numel()
    grand_total = sum(numel_by_dtype.values())
    for param_dtype, count in numel_by_dtype.items():
        print(param_dtype, count, count/grand_total)

    should_train = True

    print("Training...")

    if should_train:
        train_output = trainer.train()
        train_metrics = train_output.metrics
        eval_metrics = trainer.evaluate()

        # Evaluation metrics are logged and saved before the training metrics.
        for split_name, split_metrics in (("eval", eval_metrics), ("train", train_metrics)):
            trainer.log_metrics(split_name, split_metrics)
            trainer.save_metrics(split_name, split_metrics)
        trainer.save_state()
        print(train_metrics)

    ###

    # Write the final model and tokenizer to the requested directory.
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Drop the local references and release cached CUDA memory.
    del model, trainer
    torch.cuda.empty_cache()







In [47]:
# Directory where train() writes the final model and tokenizer.
output_dir = "results/chopt/final_checkpoint"
train(model=model, tokenizer=tokenizer, dataset=dataset, output_dir=output_dir)

all params: 1,318,895,616 || trainable params: 3,145,728 || trainable%: 0.2385122796556479
torch.float32 1318895616 1.0
Training...




Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [26]:
# Directory the training cell saved the model and tokenizer to.
model_path = "results/chopt/final_checkpoint"

# Load the saved model from that directory.
model = AutoModelForCausalLM.from_pretrained(model_path)

# Load the tokenizer saved alongside it.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Text-generation pipeline around the loaded model and tokenizer.
# NOTE(review): max_length presumably counts the prompt tokens as well as the
# generated ones, which would cap the reply length — confirm against the
# generation docs.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)

# Prompt in the same shape as the no-input template of create_prompt_formats
# (including the 16-space indentation), with the answer section left empty.
instruction = "Təbiətin qorunması  "
formatted_prompt = f"""Aşağıda daha çox kontekst təmin edən təlimat var. Sorğunu adekvat şəkildə tamamlayan cavab yazın.
                ### Təlimat:
                {instruction}
                ### Cavab:
                """

# Generate and print the text of the first returned candidate.
result = pipe(formatted_prompt)
print(result[0]['generated_text'])


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Aşağıda daha çox kontekst təmin edən təlimat var. Sorğunu adekvat şəkildə tamamlayan cavab yazın.
                ### Təlimat:
                Təbiətin qorunması  
                ### Cavab:
                 Təbiətin qorunması ən təlimatının ən qorunmasının ən qorunmasının ən


In [27]:
# Display the raw pipeline output: a list holding one dict with a 'generated_text' entry.
result

[{'generated_text': 'Aşağıda daha çox kontekst təmin edən təlimat var. Sorğunu adekvat şəkildə tamamlayan cavab yazın.\n                ### Təlimat:\n                Təbiətin qorunması  \n                ### Cavab:\n                 Təbiətin qorunması ən təlimatının ən qorunmasının ən qorunmasının ən'}]

In [None]:
# Restart the kernel every time you run this notebook.
# max_steps is currently 100; increase it and check. (So far only 0.01 of an epoch has been trained, i.e. one hundredth of one epoch.)
#   [The training cell now sets max_steps=300.]
# Why are the same words repeated? Can this be solved with more epochs? - research
# How was the data fed to models trained in other languages (is the instruction/response structure correct)? - research
# Why does this code work? :) - what is each step in the functions for?
# Set aside 10% of the data for validation and report the validation loss next to the training loss above.

# I took most of the code from here: https://www.linkedin.com/pulse/optimizing-llama-2-fine-tuning-google-colab-efficient-saibala-sundram-brjcc