In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW
from datamodule import WikiTextV2Datamodule
import os
from constants import TARGET_MODEL, DRAFT_MODEL
import torch

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
device = "cuda:4"

target_model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b").to(device)
draft_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").to(device)
#target_model = AutoModelForCausalLM.from_pretrained(TARGET_MODEL, torch_dtype=torch.float16).to(device)
#draft_model = AutoModelForCausalLM.from_pretrained(DRAFT_MODEL, torch_dtype=torch.float16).to(device)
target_model.eval()
draft_model.train()

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,)

Предпосчитаем данные:

In [2]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

datamodule = WikiTextV2Datamodule(
    min_len=5,  
    max_len=12,
    target_model=target_model,
    target_model_tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL),
    device=device,
    batch_size=1, 
    check_cache=True
)
datamodule.setup(stage="fit")
train_loader = datamodule.train_dataloader()
optimizer = AdamW(
    [p for p in draft_model.parameters() if p.requires_grad],
    lr=0.001,
    weight_decay=0.001
)

Loaded preprocessed data from cache




Попробуем обучиться на 1 батче:

In [7]:
epochs = 5
batch = next(iter(train_loader))
for i in range(1):
    optimizer.zero_grad()
        
    input_ids = batch["input_ids"]
    target_logits = batch["logits"]
    
    draft_outputs = draft_model(input_ids)
    draft_logits = draft_outputs.logits
    print(draft_logits, target_logits)
    log_draft_probs = F.log_softmax(draft_logits, dim=-1)
    target_probs = F.softmax(target_logits, dim=-1)    
    print(log_draft_probs, target_probs)
    loss = F.kl_div(log_draft_probs, target_probs, reduction='batchmean')
    
    loss.backward()
    optimizer.step()
    
    """if (i + 1) % 5 == 0:
        print(loss.item()) """  

tensor([[[-3.2747, -3.2665,  4.2902,  ..., -3.2814, -5.0887, -3.2201],
         [-8.6742, -8.8899,  2.1745,  ..., -8.6357, -8.2513, -8.6272],
         [-7.4920, -7.6974,  3.1248,  ..., -7.4648, -7.3797, -7.4893],
         ...,
         [-9.1366, -9.3147,  4.7557,  ..., -9.1168, -8.8269, -9.1025],
         [-8.5012, -8.6815,  4.4104,  ..., -8.4676, -8.1237, -8.4944],
         [-6.2625, -6.3579,  5.4581,  ..., -6.3041, -7.0574, -6.2993]]],
       device='cuda:4', grad_fn=<UnsafeViewBackward0>) tensor([[[-0.5708, -0.9419,  6.0586,  ..., -0.4788, -1.1484, -0.8867],
         [-1.6680, -2.0547,  7.7188,  ..., -1.7725, -2.2930, -1.9961],
         [ 4.9375,  4.2891, 14.5547,  ...,  4.7461,  4.9141,  4.4375],
         ...,
         [ 5.6016,  5.1250, 19.6094,  ...,  5.5430,  5.4414,  5.4336],
         [ 3.8672,  3.7598,  9.8906,  ...,  3.9023,  4.1133,  4.0508],
         [ 2.9688,  2.8359, 11.7031,  ...,  3.1406,  2.8398,  2.9727]]],
       device='cuda:4', dtype=torch.float16)
tensor([[[-17.00

In [None]:
from finetune_draft_model import DraftModelFinetuner 
import lightning as L

datamodule = WikiTextV2Datamodule(
    min_len=5,  
    max_len=100,
    target_model=target_model,
    device=device,
    batch_size=1, 
)
datamodule.setup(stage="fit")
torch.set_float32_matmul_precision('medium')

trainer = L.Trainer(
    accelerator="gpu", max_epochs=1,
    limit_train_batches=None,
    logger=False, # TensorBoardLogger(save_dir=".")
    devices=[4] 
)

finetuner = DraftModelFinetuner()
trainer.fit(model=finetuner, datamodule=datamodule)

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 4 has a total capacity of 39.50 GiB of which 15.19 MiB is free. Process 2571633 has 1022.00 MiB memory in use. Process 2909831 has 1.32 GiB memory in use. Process 2914831 has 1.32 GiB memory in use. Including non-PyTorch memory, this process has 35.81 GiB memory in use. Of the allocated memory 35.16 GiB is allocated by PyTorch, and 250.87 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
torch.save(finetuner.state_dict(), 'model.pt')