# Libraries

In [1]:
# %pip install -U transformers
# %pip install -U datasets
# %pip install -U accelerate
# %pip install -U peft
# %pip install -U trl
# %pip install -U bitsandbytes

In [2]:
import os, torch, wandb

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)

from datasets import load_dataset, concatenate_datasets
from trl import SFTTrainer, setup_chat_format
from dataclasses import dataclass

  from .autonotebook import tqdm as notebook_tqdm


## Setup Huggingface ü§ó & Wandb

In [3]:
from huggingface_hub import login

login(token = "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")

wandb.login(key="ce84c3af2fdee6c3e2696b2a4ad96af49a3dd86e")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\USER_ELISEY\.cache\huggingface\token
Login successful


[34m[1mwandb[0m: Currently logged in as: [33mez1071[0m ([33mez1071-mipt[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\USER_ELISEY\_netrc


True

In [4]:
run = wandb.init(
    project='Fine-tune Llama 3.1 8B on Russian Dataset', 
    job_type="training"
)

In [5]:
@dataclass
class Config:
#     model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#     model_name = "AnatoliiPotapov/T-lite-instruct-0.1"
    model_name = "notused"
    dataset_name = "notused"
    new_model = "russia_mini_chad"
    torch_dtype = torch.float16
    attn_implementation = "eager"
cfg = Config()

# Loading model and tokenizer

In [6]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=cfg.torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    "C:\\Users\\USER_ELISEY\\qwen",
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=cfg.attn_implementation
)

In [7]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("C:\\Users\\USER_ELISEY\\qwen")
tokenizer.padding_side = 'right'
tokenizer.padding_token = '<|pad|>'
print(len(tokenizer))

151646


## LoRA adapter

In [8]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)

# Data

## Load

In [9]:
dataset = load_dataset('miracl/miracl', 'ru', trust_remote_code=True)
# data_eval["dev"]['negative_passages']

In [10]:
dataset

DatasetDict({
    dev: Dataset({
        features: ['query_id', 'query', 'positive_passages', 'negative_passages'],
        num_rows: 1252
    })
    testB: Dataset({
        features: ['query_id', 'query', 'positive_passages', 'negative_passages'],
        num_rows: 718
    })
    train: Dataset({
        features: ['query_id', 'query', 'positive_passages', 'negative_passages'],
        num_rows: 4683
    })
    testA: Dataset({
        features: ['query_id', 'query', 'positive_passages', 'negative_passages'],
        num_rows: 911
    })
})

## Format to chat 

In [11]:
def format_chat_template(row):
    row_json = [{"role": "user", "content": row["query"]},
               {"role": "assistant", "content": row["positive_passages"][0]["text"]}]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

In [12]:
dataset = concatenate_datasets([
    dataset['dev'],
    dataset['train']
])

dataset = dataset.remove_columns('negative_passages')

In [13]:
dataset['positive_passages'][0][0]["text"]

'–ö–∞—Ä–∏ÃÅ–±—Å–∫–∏–π –∫—Ä–∏–∑–∏—Å\xa0‚Äî –∏—Å—Ç–æ—Ä–∏—á–µ—Å–∫–∏–π —Ç–µ—Ä–º–∏–Ω, –æ–ø—Ä–µ–¥–µ–ª—è—é—â–∏–π —á—Ä–µ–∑–≤—ã—á–∞–π–Ω–æ –Ω–∞–ø—Ä—è–∂—ë–Ω–Ω–æ–µ –ø–æ–ª–∏—Ç–∏—á–µ—Å–∫–æ–µ, –¥–∏–ø–ª–æ–º–∞—Ç–∏—á–µ—Å–∫–æ–µ –∏ –≤–æ–µ–Ω–Ω–æ–µ –ø—Ä–æ—Ç–∏–≤–æ—Å—Ç–æ—è–Ω–∏–µ –º–µ–∂–¥—É –°–æ–≤–µ—Ç—Å–∫–∏–º –°–æ—é–∑–æ–º –∏ –°–æ–µ–¥–∏–Ω—ë–Ω–Ω—ã–º–∏ –®—Ç–∞—Ç–∞–º–∏ –≤ –æ–∫—Ç—è–±—Ä–µ 1962 –≥–æ–¥–∞, –∫–æ—Ç–æ—Ä–æ–µ –±—ã–ª–æ –≤—ã–∑–≤–∞–Ω–æ —Ä–∞–∑–º–µ—â–µ–Ω–∏–µ–º –°–®–ê —è–¥–µ—Ä–Ω–æ–≥–æ –æ—Ä—É–∂–∏—è –≤ –¢—É—Ä—Ü–∏–∏ –≤ 1961 –≥–æ–¥—É –∏ –≤–ø–æ—Å–ª–µ–¥—Å—Ç–≤–∏–∏ —Ç–∞–π–Ω–æ–π –ø–µ—Ä–µ–±—Ä–æ—Å–∫–æ–π –∏ —Ä–∞–∑–º–µ—â–µ–Ω–∏–µ–º –Ω–∞ –ö—É–±–µ –≤–æ–µ–Ω–Ω—ã—Ö —á–∞—Å—Ç–µ–π –∏ –ø–æ–¥—Ä–∞–∑–¥–µ–ª–µ–Ω–∏–π –í–æ–æ—Ä—É–∂—ë–Ω–Ω—ã—Ö –°–∏–ª –°–°–°–†, —Ç–µ—Ö–Ω–∏–∫–∏ –∏ –≤–æ–æ—Ä—É–∂–µ–Ω–∏—è, –≤–∫–ª—é—á–∞—è —è–¥–µ—Ä–Ω–æ–µ –æ—Ä—É–∂–∏–µ. –ö—Ä–∏–∑–∏—Å –º–æ–≥ –ø—Ä–∏–≤–µ—Å—Ç–∏ –∫ –≥–ª–æ–±–∞–ª—å–Ω–æ–π —è–¥–µ—Ä–Ω–æ–π –≤–æ–π–Ω–µ. –ö—É–±–∏–Ω—Ü—ã –Ω–∞–∑—ã–≤–∞—é—Ç –µ–≥–æ ¬´–û–∫—Ç—è–±—Ä—å—Å–∫–∏–º –∫—Ä–∏–∑–∏—Å–æ–º¬ª (), –≤ –°–®–ê —Ä–∞—Å–ø—Ä–æ—Å—Ç—Ä–∞–Ω

In [14]:
dataset = dataset.map(
    format_chat_template,
    num_proc=1,
)

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5935/5935 [00:00<00:00, 6708.85 examples/s]


## Select only part

In [15]:
dataset_sh = dataset.shuffle(seed=2024)#.select(range(10_000))
dataset_sh

Dataset({
    features: ['query_id', 'query', 'positive_passages', 'text'],
    num_rows: 5935
})

In [16]:
dataset_sh = dataset_sh.train_test_split(0.1)

In [17]:
dataset_sh

DatasetDict({
    train: Dataset({
        features: ['query_id', 'query', 'positive_passages', 'text'],
        num_rows: 5341
    })
    test: Dataset({
        features: ['query_id', 'query', 'positive_passages', 'text'],
        num_rows: 594
    })
})

# Train model

## Training arguments

In [18]:
training_arguments = TrainingArguments(
    output_dir=cfg.new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
#     num_train_epochs=1,
    max_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=10,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=True,
    bf16=False,
    group_by_length=True,
    report_to="wandb",
    run_name="Llama-3.1-medicine",
)

In [19]:
print(len(tokenizer))

151646


In [20]:
# model.resize_token_embeddings(256000)

## Train model

In [21]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_sh["train"],
    eval_dataset=dataset_sh["test"],
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5341/5341 [00:01<00:00, 5257.62 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 594/594 [00:00<00:00, 4960.29 examples/s]
max_steps is given, it will override any value given in num_train_epochs


In [22]:
trainer.train()

 10%|‚ñà         | 10/100 [00:06<00:52,  1.72it/s]

{'loss': 2.3897, 'grad_norm': 2.026089906692505, 'learning_rate': 0.0002, 'epoch': 0.0}


 20%|‚ñà‚ñà        | 20/100 [00:12<00:44,  1.79it/s]

{'loss': 2.0484, 'grad_norm': 2.2818949222564697, 'learning_rate': 0.00017777777777777779, 'epoch': 0.01}


 30%|‚ñà‚ñà‚ñà       | 30/100 [00:17<00:38,  1.80it/s]

{'loss': 1.9353, 'grad_norm': 2.067187786102295, 'learning_rate': 0.00015555555555555556, 'epoch': 0.01}


 40%|‚ñà‚ñà‚ñà‚ñà      | 40/100 [00:23<00:33,  1.81it/s]

{'loss': 1.9248, 'grad_norm': 2.976290702819824, 'learning_rate': 0.00013555555555555556, 'epoch': 0.01}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 50/100 [00:28<00:28,  1.78it/s]

{'loss': 1.721, 'grad_norm': 4.1801981925964355, 'learning_rate': 0.00011555555555555555, 'epoch': 0.02}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 60/100 [00:34<00:22,  1.74it/s]

{'loss': 2.1381, 'grad_norm': 1.54600989818573, 'learning_rate': 9.333333333333334e-05, 'epoch': 0.02}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 70/100 [00:40<00:16,  1.78it/s]

{'loss': 2.1478, 'grad_norm': 2.0367867946624756, 'learning_rate': 7.111111111111112e-05, 'epoch': 0.03}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 80/100 [00:45<00:11,  1.79it/s]

{'loss': 2.0301, 'grad_norm': 2.271479845046997, 'learning_rate': 4.888888888888889e-05, 'epoch': 0.03}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 90/100 [00:51<00:05,  1.79it/s]

{'loss': 1.7802, 'grad_norm': 2.230801820755005, 'learning_rate': 2.6666666666666667e-05, 'epoch': 0.03}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:57<00:00,  1.78it/s]

{'loss': 1.7396, 'grad_norm': 3.252305507659912, 'learning_rate': 4.444444444444445e-06, 'epoch': 0.04}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:57<00:00,  1.74it/s]

{'train_runtime': 57.6382, 'train_samples_per_second': 3.47, 'train_steps_per_second': 1.735, 'train_loss': 1.985498580932617, 'epoch': 0.04}





TrainOutput(global_step=100, training_loss=1.985498580932617, metrics={'train_runtime': 57.6382, 'train_samples_per_second': 3.47, 'train_steps_per_second': 1.735, 'total_flos': 114330045434880.0, 'train_loss': 1.985498580932617, 'epoch': 0.03744617112900206})

In [23]:
path_to_save = "Llama-finetuned"
trainer.save_model(path_to_save)
model.save_pretrained(path_to_save)
tokenizer.save_pretrained(path_to_save)

('Llama-finetuned\\tokenizer_config.json',
 'Llama-finetuned\\special_tokens_map.json',
 'Llama-finetuned\\vocab.json',
 'Llama-finetuned\\merges.txt',
 'Llama-finetuned\\added_tokens.json',
 'Llama-finetuned\\tokenizer.json')

In [24]:
del model, tokenizer, trainer

# Compare models

## Init casual LLM

In [25]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=cfg.torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
casual_model = AutoModelForCausalLM.from_pretrained(
    cfg.model_name,
    quantization_config=bnb_config,
#     device_map="auto",
    attn_implementation=cfg.attn_implementation
)

tokenizer = tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
tokenizer.padding_side = 'right'
tokenizer.padding_token = '<|pad_token|>'

OSError: notused is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
casual_model, tokenizer = setup_chat_format(casual_model, tokenizer)

## Get answers

In [None]:
def generate_answer(model, prompt):
    chat = [
        { "role": "user", "content": prompt },
    ]
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
    outputs = model.generate(input_ids=inputs.to(model.device), max_new_tokens=150)

    return(tokenizer.decode(outputs[0]))

# Comprasion

In [None]:
q1 = "I have severe headaches help me please"
q2 = "I have a suspiciously large mole. Could I have cancer? How can I determine this at home?"
q3 = "What does abutment of the nerve root mean?"

In [None]:
generate_answer(model, q1)

In [None]:
generate_answer(model, q2)

In [None]:
generate_answer(model, q3)

In [None]:
# Free gpu memory
import numba
numba.cuda.close()

In [None]:
print(generate_answer(casual_model, q1))

In [None]:
generate_answer(casual_model, q2)

In [None]:
generate_answer(casual_model, q3)