In [1]:
#imports
import os
from accelerate import Accelerator, notebook_launcher
import torch
#from parallelformers import parallelize

from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling, OPTForCausalLM, Trainer, TrainingArguments
from tqdm import tqdm
from transformers import AutoTokenizer
from datasets import load_dataset
from utils.save_utils import load_masked_model, load_masked_model_single
from utils.prehook_utils import remove_all_hooks

In [2]:
def training_step():
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    #accelerate = Accelerator()
    
    def encode_tok(examples):
        return tokenizer(examples['text'], truncation=True, padding='max_length')

    #print(torch.cuda.is_initialized())
    model_name='opt-1.3b'
    EPOCH_COUNT=10
    SPARSITY=0.3
    tokenizer = AutoTokenizer.from_pretrained(f'facebook/{model_name}', padding_side='left', model_max_length=1024)
    #print(torch.cuda.is_initialized())
    #stream c4, training split
    training_data = load_dataset('c4', 'en', split='train', streaming=True)
    #training_data = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train', streaming=True)
    #IMPORTANT: process data while streaming -> remove unnecessary columns in batches
    training_data = training_data.map(encode_tok, 
                                      batched=True,
                                      remove_columns=["text", "timestamp", "url"])
    #set data to tensor mode
    training_data = training_data.with_format("torch")

    #dataloader from dataloader (mlm=False when training without mask)
    dataloader = DataLoader(training_data, 
                            collate_fn=DataCollatorForLanguageModeling(tokenizer, mlm=False),
                            batch_size=1, num_workers=16, pin_memory=True)
    #print(torch.cuda.is_initialized())
    loaded_model = OPTForCausalLM.from_pretrained(f'facebook/{model_name}',
                                                      output_attentions=True,
                                                      output_hidden_states=True).to(device=device)
    #print(torch.cuda.is_initialized())
    if SPARSITY != 1:
        load_masked_model_single(loaded_model, f'pruned_models/{model_name}-{SPARSITY}.pt')
    loaded_model = torch.nn.DataParallel(loaded_model, device_ids=[0,1,2,3])
    #loaded_model = parallelize(loaded_model, num_gpus=4, fp16=True, verbose='detail')
    loaded_model.train()
    
    FROZEN_LIMIT = 18
    for n, p in loaded_model.named_parameters():
        split_name = n.split('layers.')
        if len(split_name) > 1 and int(split_name[-1].split('.')[0]) < FROZEN_LIMIT:
            if p.requires_grad:
                p.requires_grad = False
    #print(torch.cuda.is_initialized())
    t_optim = torch.optim.AdamW(params=loaded_model.parameters(), lr=1e-5)


    #loaded_model, t_optim, dataloader = accelerate.prepare(loaded_model, t_optim, dataloader)
    !nvidia-smi
    t_optim = torch.optim.AdamW(params=loaded_model.parameters(), lr=1e-5)
    #print(torch.cuda.is_initialized())
    for epoch in tqdm(range(EPOCH_COUNT)):
        training_data.set_epoch(epoch)
        for i, batch in enumerate(dataloader):
            #print('TRAINING')
            !nvidia-smi
            #print(batch)
            if i == 5:
                break
            #batch = {k: torch.tensor(v) for k, v in batch.items()}
            #print(batch)
            #print(batch['input_ids'].size())
            #print(batch['attention_mask '].size())
            outputs = loaded_model(**batch)
            loss = outputs.loss
            print(loss)
            #accelerate.backward(loss)
            loss.backward()
            t_optim.step()
            t_optim.zero_grad()
    torch.save(loaded_model.state_dict(), f'pruned_models/{model_name}-{SPARSITY}-finetuned.pt')

In [3]:
training_step()



Sun Feb 26 17:22:57 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-PCI...  On   | 00000000:01:00.0 Off |                    0 |
| N/A   30C    P0    56W / 250W |  17333MiB / 40960MiB |     42%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCI...  On   | 00000000:41:00.0 Off |                    0 |
| N/A   31C    P0    37W / 250W |      2MiB / 40960MiB |      0%      Default |
|       

  0%|          | 0/10 [00:00<?, ?it/s]

Sun Feb 26 17:24:06 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-PCI...  On   | 00000000:01:00.0 Off |                    0 |
| N/A   29C    P0    35W / 250W |  17333MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCI...  On   | 00000000:41:00.0 Off |                    0 |
| N/A   31C    P0    37W / 250W |      2MiB / 40960MiB |      0%      Default |
|       



tensor([6.5384], device='cuda:0', grad_fn=<GatherBackward>)
Sun Feb 26 17:24:13 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-PCI...  On   | 00000000:01:00.0 Off |                    0 |
| N/A   33C    P0    54W / 250W |  38161MiB / 40960MiB |    100%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCI...  On   | 00000000:41:00.0 Off |                    0 |
| N/A   31C    P0    40W / 2

  0%|          | 0/10 [01:16<?, ?it/s]


In [3]:
notebook_launcher(training_step, args=(), num_processes=4, mixed_precision='bf16')

Launching training on 4 GPUs.
