In [None]:
from transformers import AutoModelForCausalLM
from datamodule import WikiTextV2Datamodule
import os
from constants import TARGET_MODEL, DRAFT_MODEL, CUDA_DEVICE

# Make CUDA enumerate GPUs in PCI bus order; this has to be set before the
# first CUDA call in the process to take effect.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

# Load both checkpoints from the shared constants instead of hard-coded names,
# so the target model always matches the tokenizer that is later built from
# TARGET_MODEL. No torch_dtype is passed, so the library default dtype is used.
target_model = AutoModelForCausalLM.from_pretrained(TARGET_MODEL).to(CUDA_DEVICE)
draft_model = AutoModelForCausalLM.from_pretrained(DRAFT_MODEL).to(CUDA_DEVICE)

# The optimizer created later only receives draft_model parameters: the target
# stays in eval mode, the draft is switched to train mode.
target_model.eval()
# Last expression of the cell: its return value (the draft model) is displayed.
draft_model.train()

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,)

Предпосчитаем данные:

In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

# Tokenizer that belongs to the target checkpoint; the datamodule receives it
# together with the target model itself.
target_tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL)

# Small configuration for a quick experiment.
# NOTE(review): min_len / max_len are presumably sample-length bounds in tokens
# and check_cache presumably reuses previously preprocessed data -- confirm in
# WikiTextV2Datamodule.
datamodule = WikiTextV2Datamodule(
    min_len=5,
    max_len=12,
    target_model=target_model,
    target_model_tokenizer=target_tokenizer,
    device=CUDA_DEVICE,
    batch_size=1,
    check_cache=True,
)
datamodule.setup(stage="fit")
train_loader = datamodule.train_dataloader()

# Only parameters that still require gradients are handed to the optimizer.
trainable_params = list(filter(lambda p: p.requires_grad, draft_model.parameters()))
optimizer = torch.optim.AdamW(trainable_params, lr=1e-3, weight_decay=1e-3)

Loaded preprocessed data from cache


Попробуем обучиться на 1 батче:

In [None]:
# Sanity check: repeatedly fit the draft model to one fixed batch and watch the
# KL divergence to the target distribution go down.
epochs = 5  # NOTE(review): not read anywhere in this cell; kept in case another cell uses it
overfit_steps = 50  # optimisation steps on the single batch
log_every = 5       # print the loss every `log_every` steps

batch = next(iter(train_loader))

# The batch never changes, so the device transfer and the target distribution
# are computed once instead of on every iteration.
input_ids = batch["input_ids"].to(CUDA_DEVICE)
target_logits = batch["logits"].to(CUDA_DEVICE)
target_probs = F.softmax(target_logits, dim=-1)

for i in range(overfit_steps):
    optimizer.zero_grad()

    draft_outputs = draft_model(input_ids)
    draft_logits = draft_outputs.logits
    # F.kl_div takes log-probabilities as input and probabilities as target.
    log_draft_probs = F.log_softmax(draft_logits, dim=-1)
    # NOTE(review): 'batchmean' divides the summed KL by the size of the first
    # dimension only. If the logits are (batch, seq, vocab), the loss therefore
    # grows with sequence length -- confirm this is intended.
    loss = F.kl_div(log_draft_probs, target_probs, reduction='batchmean')

    loss.backward()
    optimizer.step()

    if (i + 1) % log_every == 0:
        print(loss.item())

53.531211853027344
18.476091384887695
11.422229766845703
7.8118696212768555
5.378449440002441
4.74157190322876
3.686624526977539
3.0815353393554688
2.5667619705200195
2.7591421604156494


In [None]:
from finetune_draft_model import DraftModelFinetuner 
import lightning as L

# Full fine-tuning run driven by Lightning, on longer samples (max_len=100)
# than the single-batch experiment.
#
# NOTE(review): the recorded run of this cell ended in a CUDA out-of-memory
# error on GPU 4 (a 20 MiB allocation failed with ~35 GiB already held by this
# process). At this point the notebook still holds target_model, draft_model
# and the earlier optimizer on CUDA_DEVICE. 20 MiB is also roughly one fp32
# logits tensor of 100 tokens x 50272 vocabulary entries, which suggests that
# per-sample target logits are kept on the GPU -- confirm in
# WikiTextV2Datamodule / DraftModelFinetuner before re-running.
datamodule = WikiTextV2Datamodule(
    target_model=target_model,
    device=CUDA_DEVICE,
    min_len=5,
    max_len=100,
    batch_size=1,
)
datamodule.setup(stage="fit")

# Applied after the datamodule setup and before the trainer is built.
torch.set_float32_matmul_precision('medium')

trainer_options = {
    "accelerator": "gpu",
    "devices": [4],               # hard-coded GPU index
    "max_epochs": 1,
    "limit_train_batches": None,  # no cap on the number of training batches
    "logger": False,              # logging off; e.g. TensorBoardLogger(save_dir=".") could go here
}
trainer = L.Trainer(**trainer_options)

finetuner = DraftModelFinetuner()
trainer.fit(model=finetuner, datamodule=datamodule)

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 4 has a total capacity of 39.50 GiB of which 15.19 MiB is free. Process 2571633 has 1022.00 MiB memory in use. Process 2909831 has 1.32 GiB memory in use. Process 2914831 has 1.32 GiB memory in use. Including non-PyTorch memory, this process has 35.81 GiB memory in use. Of the allocated memory 35.16 GiB is allocated by PyTorch, and 250.87 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Persist the finetuner's full state dict to model.pt in the working directory.
# NOTE(review): this stores whatever DraftModelFinetuner registers as state,
# not necessarily only the draft model weights -- confirm before reloading.
finetuner_state = finetuner.state_dict()
torch.save(finetuner_state, 'model.pt')