# Libraries

In [1]:
# %%capture

# %pip install -U peft
# %pip install -U trl
# %pip install -U bitsandbytes 

In [2]:
# %pip install kaggle

In [3]:
# !git clone https://github.com/Kaggle/docker-python.git

In [4]:
# import sys
# sys.path.append("./docker-python/patches")

In [5]:
# !mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [6]:
# %pip install git+https://github.com/Kaggle/kaggle-secrets.git


In [7]:
import os, torch, wandb

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)

from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format
from dataclasses import dataclass

  from .autonotebook import tqdm as notebook_tqdm


## Setup Huggingface ü§ó & Wandb

In [8]:
from huggingface_hub import login

login(token = "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")

wandb.login(key="ce84c3af2fdee6c3e2696b2a4ad96af49a3dd86e")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\USER_ELISEY\.cache\huggingface\token
Login successful


[34m[1mwandb[0m: Currently logged in as: [33mez1071[0m ([33mez1071-mipt[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\USER_ELISEY\_netrc


True

In [9]:
run = wandb.init(
    project='Fine-tune Qwen 0.5B on Russian Dataset', 
    job_type="training"
)

In [10]:
@dataclass
class Config:
#     model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#     model_name = "AnatoliiPotapov/T-lite-instruct-0.1"
    model_name = "Qwen/Qwen2-0.5B"
    dataset_name = "C:\\Users\\USER_ELISEY\\miracl_"
    new_model = "model_weights"
    torch_dtype = torch.float16
    attn_implementation = "eager"
cfg = Config()

# Loading model and tokenizer

In [11]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=cfg.torch_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "C:\\Users\\USER_ELISEY\\qwen",
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation=cfg.attn_implementation
)

In [12]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("C:\\Users\\USER_ELISEY\\qwen")
model, tokenizer = setup_chat_format(model, tokenizer)
tokenizer.padding_side = 'right'
tokenizer.padding_token = '<|pad|>'

## LoRA adapter

In [13]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)

# Data

## Load

In [14]:
dataset = load_dataset('miracl/miracl-corpus', 'ru', trust_remote_code=True)

In [15]:
data_eval = load_dataset('miracl/miracl', 'ru', trust_remote_code=True)

In [16]:
# data_eval['train']['query']

## Format to chat 

In [17]:
def format_chat_template(row):
    row_json = [{"role": "user", "content": row["USER"]},
               {"role": "assistant", "content": row["CHAT"]}]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

In [18]:
# dataset = dataset.map(
#     format_chat_template,
#     num_proc=4,
# )

## Select only part

In [19]:
# dataset_sh = dataset.shuffle(seed=2024).select(range(10_000))

In [20]:
#dataset_sh = dataset.train_test_split(0.1)
# dataset_sh
print(dataset['train'])
dataset_t = dataset['train'].select(range(0, 9543918, 100))
dataset_e = dataset['train'].select(range(1, 9543918, 500))
print(dataset_e, dataset_t)

Dataset({
    features: ['docid', 'title', 'text'],
    num_rows: 9543918
})
Dataset({
    features: ['docid', 'title', 'text'],
    num_rows: 19088
}) Dataset({
    features: ['docid', 'title', 'text'],
    num_rows: 95440
})


# Train model

## Training arguments

In [21]:
training_arguments = TrainingArguments(
    output_dir=cfg.new_model,
    per_device_train_batch_size=1,  # –£–º–µ–Ω—å—à–µ–Ω–∏–µ —Ä–∞–∑–º–µ—Ä–∞ –±–∞—Ç—á–∞ –¥–ª—è —ç–∫–æ–Ω–æ–º–∏–∏ –ø–∞–º—è—Ç–∏
    per_device_eval_batch_size=1,  # –ê–Ω–∞–ª–æ–≥–∏—á–Ω–æ —É–º–µ–Ω—å—à–∞–µ–º –¥–ª—è –æ—Ü–µ–Ω–∫–∏
    gradient_accumulation_steps=8,  # –£–≤–µ–ª–∏—á–∏–≤–∞–µ–º, —á—Ç–æ–±—ã –∫–æ–º–ø–µ–Ω—Å–∏—Ä–æ–≤–∞—Ç—å —É–º–µ–Ω—å—à–µ–Ω–∏–µ —Ä–∞–∑–º–µ—Ä–∞ –±–∞—Ç—á–∞
    optim="adamw_torch",  # –û—Å—Ç–∞–≤–ª—è–µ–º —Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω—ã–π –æ–ø—Ç–∏–º–∏–∑–∞—Ç–æ—Ä num_train_epochs=3,  # –û—Å—Ç–∞–≤–ª—è–µ–º –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —ç–ø–æ—Ö –¥–ª—è –±–æ–ª–µ–µ –ø–æ–ª–Ω–æ–≥–æ –æ–±—É—á–µ–Ω–∏—è
    max_steps=-1,  # –ò—Å–ø–æ–ª—å–∑—É–µ–º –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —ç–ø–æ—Ö –≤–º–µ—Å—Ç–æ —Ñ–∏–∫—Å–∏—Ä–æ–≤–∞–Ω–Ω—ã—Ö —à–∞–≥–æ–≤
    eval_strategy="steps",  # –û—Å—Ç–∞–≤–ª—è–µ–º —Å—Ç—Ä–∞—Ç–µ–≥–∏—é "steps" –¥–ª—è —á–∞—Å—Ç–æ–π –æ—Ü–µ–Ω–∫–∏ eval_steps=500,  # –®–∞–≥–∏ –¥–ª—è –æ—Ü–µ–Ω–∫–∏ logging_steps=50,  # –õ–æ–≥–∏—Ä–æ–≤–∞–Ω–∏–µ –∫–∞–∂–¥—ã–µ 50 —à–∞–≥–æ–≤
    warmup_steps=100,  # –û—Å—Ç–∞–≤–ª—è–µ–º —Ä–∞–∑–æ–≥—Ä–µ–≤ –¥–ª—è –ø–ª–∞–≤–Ω–æ–≥–æ —Å—Ç–∞—Ä—Ç–∞ learning_rate=3e-5,  # –û—Å—Ç–∞–≤–ª—è–µ–º –Ω–µ–±–æ–ª—å—à—É—é —Å–∫–æ—Ä–æ—Å—Ç—å –æ–±—É—á–µ–Ω–∏—è –¥–ª—è —Å—Ç–∞–±–∏–ª—å–Ω–æ—Å—Ç–∏ fp16=True,  # –ò—Å–ø–æ–ª—å–∑—É–µ–º fp16 –¥–ª—è —É–º–µ–Ω—å—à–µ–Ω–∏—è –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è –ø–∞–º—è—Ç–∏
    fp16=True,
    bf16=False,  # –û—Ç–∫–ª—é—á–∞–µ–º bf16
    group_by_length=True,
    report_to="wandb",  # –û—Å—Ç–∞–≤–ª—è–µ–º –¥–ª—è –∏–Ω—Ç–µ–≥—Ä–∞—Ü–∏–∏ —Å Weights & Biases run_name="qwen-rus-memory-optimized",
    run_name="qwen-rus",
)


In [22]:
print(len(tokenizer))

151646


In [23]:
model.resize_token_embeddings(151936) # –¥–ª—è —Ç–æ–≥–æ, —á—Ç–æ–±—ã –º–æ–¥–µ–ª—å –∑–∞–ø—É—Å–∫–∞–ª–∞—Å—å –ø–∞–π–ø–ª–∞–π–Ω–æ–º –∏—Å—Ö–æ–¥–Ω–æ–π –º–æ–¥–µ–ª–∏
# Fixme
# –í–ù–ò–ú–ê–ù–ò–ï! –ü–æ–ø—Ä–æ–±–æ–≤–∞—Ç—å —É–±—Ä–∞—Ç—å –∏ –Ω–∞–ø–∏—Å–∞—Ç—å –ø—Ä–∏ –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏–∏ –º–æ–¥–µ–ª–∏. –õ–∏–±–æ —Ä–∞—Å—Å–∫–∞–∑–∞—Ç—å –º–Ω–µ –∫–∞–∫ —Ä–∞–±–æ—Ç–∞–µ—Ç –∏ –ø–æ—á–µ–º—É –Ω–µ –ª–æ–º–∞–µ—Ç –≤–µ—Å–∞.

Embedding(151936, 896)

## Train model

In [24]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_t,
    eval_dataset=dataset_e,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [None]:
trainer.train()

  1%|‚ñè         | 500/35790 [18:11<21:28:17,  2.19s/it]

{'loss': 2.7661, 'grad_norm': 28.785703659057617, 'learning_rate': 4.9445222751471e-05, 'epoch': 0.04}




In [26]:
path_to_save = "qwen-finetuned"
trainer.save_model(path_to_save)
model.save_pretrained(path_to_save)
tokenizer.save_pretrained(path_to_save)

('qwen-finetuned\\tokenizer_config.json',
 'qwen-finetuned\\special_tokens_map.json',
 'qwen-finetuned\\vocab.json',
 'qwen-finetuned\\merges.txt',
 'qwen-finetuned\\added_tokens.json',
 'qwen-finetuned\\tokenizer.json')