# Libraries

In [1]:
# %%capture

# %pip install -U peft
# %pip install -U trl
# %pip install -U bitsandbytes 

In [2]:
# %pip install kaggle

In [3]:
# !git clone https://github.com/Kaggle/docker-python.git

In [4]:
# import sys
# sys.path.append("./docker-python/patches")

In [5]:
# !mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [6]:
# %pip install git+https://github.com/Kaggle/kaggle-secrets.git


In [7]:
import os, torch, wandb

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)

from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format
from dataclasses import dataclass

  from .autonotebook import tqdm as notebook_tqdm


## Setup Huggingface 🤗 & Wandb

In [8]:
from huggingface_hub import login

login(token = "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")

wandb.login(key="ce84c3af2fdee6c3e2696b2a4ad96af49a3dd86e")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\USER_ELISEY\.cache\huggingface\token
Login successful


[34m[1mwandb[0m: Currently logged in as: [33mez1071[0m ([33mez1071-mipt[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\USER_ELISEY\_netrc


True

In [9]:
run = wandb.init(
    project='Fine-tune Llama 3.1 8B on Russian Dataset', 
    job_type="training"
)

In [10]:
@dataclass
class Config:
#     model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#     model_name = "AnatoliiPotapov/T-lite-instruct-0.1"
    model_name = "google/gemma-2-9b-it"
    dataset_name = "ruslanmv/ai-medical-chatbot"
    new_model = "model_weights"
    torch_dtype = torch.float16
    attn_implementation = "eager"
cfg = Config()

# Loading model and tokenizer

In [11]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=cfg.torch_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "C:\\Users\\USER_ELISEY\\gemma",
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation=cfg.attn_implementation
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.36s/it]


In [12]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("C:\\Users\\USER_ELISEY\\gemma")
model, tokenizer = setup_chat_format(model, tokenizer)
tokenizer.padding_side = 'right'
tokenizer.padding_token = '<|pad|>'

## LoRA adapter

In [13]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)

# Data

## Load

In [14]:
dataset = load_dataset(cfg.dataset_name, split="all")
print(dataset)

Dataset({
    features: ['Description', 'Patient', 'Doctor'],
    num_rows: 256916
})


## Format to chat 

In [15]:
def format_chat_template(row):
    row_json = [{"role": "user", "content": row["Patient"]},
               {"role": "assistant", "content": row["Doctor"]}]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

In [16]:
dataset = dataset.map(
    format_chat_template,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/256916 [00:01<?, ? examples/s]


NameError: name 'tokenizer' is not defined

## Select only part

In [23]:
dataset_sh = dataset.shuffle(seed=2024).select(range(10_000))

In [24]:
dataset_sh = dataset_sh.train_test_split(0.1)
dataset_sh

Dataset({
    features: ['docid', 'title', 'text'],
    num_rows: 9543918
})
Dataset({
    features: ['docid', 'title', 'text'],
    num_rows: 955
}) Dataset({
    features: ['docid', 'title', 'text'],
    num_rows: 9543918
})


# Train model

## Training arguments

In [25]:
training_arguments = TrainingArguments(
    output_dir=cfg.new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    max_steps=200,
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=10,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=True,
    group_by_length=True,
    report_to="wandb",
    run_name="Llama-3.1-medicine",
)

## Train model

In [26]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_sh["train"],
    eval_dataset=dataset_sh["test"],
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
max_steps is given, it will override any value given in num_train_epochs


In [None]:
trainer.train()

  5%|▌         | 10/200 [06:25<1:54:53, 36.28s/it]

{'loss': 2.1514, 'grad_norm': 2.3777191638946533, 'learning_rate': 0.0001, 'epoch': 0.0}


 10%|█         | 20/200 [10:25<1:09:27, 23.15s/it]

{'loss': 2.1486, 'grad_norm': 3.3271751403808594, 'learning_rate': 9.473684210526316e-05, 'epoch': 0.0}


 15%|█▌        | 30/200 [13:55<57:58, 20.46s/it]  

{'loss': 2.17, 'grad_norm': 4.141357898712158, 'learning_rate': 8.947368421052632e-05, 'epoch': 0.0}


 20%|██        | 40/200 [16:49<46:02, 17.27s/it]

{'loss': 2.2011, 'grad_norm': 5.0713605880737305, 'learning_rate': 8.421052631578948e-05, 'epoch': 0.0}


 25%|██▌       | 50/200 [19:50<42:02, 16.82s/it]

{'loss': 3.7403, 'grad_norm': 21.20669937133789, 'learning_rate': 7.894736842105263e-05, 'epoch': 0.0}


 30%|███       | 60/200 [26:08<1:25:54, 36.82s/it]

{'loss': 2.0939, 'grad_norm': 1.9018141031265259, 'learning_rate': 7.368421052631579e-05, 'epoch': 0.0}


 32%|███▎      | 65/200 [28:48<1:08:41, 30.53s/it]

In [1]:
path_to_save = "Llama-finetuned"
trainer.save_model(path_to_save)
model.save_pretrained(path_to_save)
tokenizer.save_pretrained(path_to_save)

NameError: name 'trainer' is not defined