In [1]:
import os
from accelerate.utils import write_basic_config

# Write a basic Accelerate config file.
write_basic_config()

# Terminate the kernel process immediately with status 0 so the notebook
# restarts; cells below are meant to be run in the restarted kernel.
os._exit(0)

  from .autonotebook import tqdm as notebook_tqdm


[2023-07-24 14:52:51,806] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


: 

: 

In [1]:
from torch.utils.data import DataLoader, Dataset

import torch 

class RandomIntDataset(Dataset):
    """Map-style dataset that yields freshly sampled random token ids.

    Every lookup draws a new random integer in ``[0, vocab_size)``; the
    index only matters for bounds checking, so the same index does not
    return the same sample twice.

    Args:
        vocab_size: Exclusive upper bound of the sampled token ids. Must be
            a positive integer.
        num_samples: Reported length of the dataset (defaults to 10000,
            the previously hard-coded value).

    Raises:
        ValueError: If ``vocab_size`` is not positive or ``num_samples`` is
            negative.
    """

    def __init__(self, vocab_size, num_samples=10000):
        if vocab_size <= 0:
            raise ValueError(f"vocab_size must be positive, got {vocab_size}")
        if num_samples < 0:
            raise ValueError(f"num_samples must be non-negative, got {num_samples}")
        self.vocab_size = vocab_size
        self.num_samples = num_samples

    def __len__(self):
        """Return the number of samples the dataset reports."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return ``{"input_ids": tensor}`` holding one random token id (shape ``(1,)``).

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        # Without this check plain iteration (``for x in dataset``) would never
        # stop: Python falls back to calling __getitem__ with 0, 1, 2, ... until
        # an IndexError is raised.
        if not -self.num_samples <= idx < self.num_samples:
            raise IndexError(
                f"index {idx} is out of range for dataset of length {self.num_samples}"
            )
        return {"input_ids": torch.randint(0, self.vocab_size, (1,))}

def create_dataloader(vocab_size, batch_size=8):
    """Build a ``DataLoader`` over a new ``RandomIntDataset``.

    Args:
        vocab_size: Forwarded to ``RandomIntDataset`` as its vocabulary size.
        batch_size: Number of samples per batch (defaults to 8).

    Returns:
        A ``DataLoader`` wrapping the freshly created dataset.
    """
    random_tokens = RandomIntDataset(vocab_size)
    loader = DataLoader(random_tokens, batch_size=batch_size)
    return loader

# Smoke test: build a loader with a 32000-token vocabulary, pull only the
# first batch and print the shape of every tensor it contains.
dataloader = create_dataloader(32000)
batch = next(iter(dataloader))
print({name: tensor.shape for name, tensor in batch.items()})

{'input_ids': torch.Size([8, 1])}


In [2]:
from accelerate import Accelerator

import datasets
import transformers
from tqdm.auto import tqdm
from transformers import (
    AdamW,
    get_cosine_schedule_with_warmup,
    set_seed,
)
from torch.optim import AdamW


# Settings read by the training loop.
hyperparameters = dict(
    learning_rate=2e-5,
    num_epochs=3,
    steps_per_epoch=100,
    validation_steps=50,
    # Batch size passed to the DataLoader; when training is launched on
    # several processes the effective batch size is this value multiplied
    # by the number of processes.
    batch_size=8,
    seed=42,
    vocab_size=32000,
)

def training_loop(model):
    """Train the auxiliary output heads of ``model`` on self-generated sequences.

    Each epoch starts from one batch of random token ids. At every training
    step the model is run on the current sequence, the loss it reports is
    back-propagated, and one token sampled from the last entry of
    ``outputs.logits`` is appended, so the sequence grows by one token per
    step. After the training steps the model is put in eval mode and the loss
    on a fresh starting batch is averaged over ``validation_steps`` forward
    passes and printed.

    All settings are read from the module-level ``hyperparameters`` dict.

    Args:
        model: Model exposing ``lm_head``, ``model`` and ``auxiliary_outputs``
            submodules and returning an object with ``loss`` and ``logits``.
            Only ``auxiliary_outputs`` is left trainable.

    Returns:
        None. Progress is reported through a tqdm bar and ``accelerator.print``.
    """
    accelerator = Accelerator()

    # To have only one message per log of Transformers or Datasets, the
    # verbose levels are enabled on the main process only.
    if accelerator.is_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    dataloader = create_dataloader(hyperparameters["vocab_size"], hyperparameters["batch_size"])

    set_seed(hyperparameters["seed"])

    # Freeze everything except the auxiliary heads BEFORE building the
    # optimizer and calling `prepare`: `prepare` may wrap the model (e.g. for
    # distributed training), after which `model.lm_head` and friends are no
    # longer guaranteed to be reachable as attributes of the returned object.
    model.lm_head.requires_grad_(False)
    model.model.requires_grad_(False)
    model.auxiliary_outputs.requires_grad_(True)

    # Only hand the trainable parameters to the optimizer; frozen ones never
    # receive gradients.
    trainable_parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = AdamW(trainable_parameters, lr=hyperparameters["learning_rate"])

    # Prepare everything. There is no specific order to remember, the objects
    # just have to be unpacked in the same order they were given.
    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

    num_epochs = hyperparameters["num_epochs"]
    steps_per_epoch = hyperparameters["steps_per_epoch"]
    validation_steps = hyperparameters["validation_steps"]
    total_training_steps = steps_per_epoch * num_epochs

    # The schedule length comes from the hyperparameters, not from the length
    # of the dataloader.
    lr_scheduler = get_cosine_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=total_training_steps,
    )
    progress_bar = tqdm(range(total_training_steps), disable=not accelerator.is_main_process)

    for epoch in range(num_epochs):
        model.train()

        # The dataloader yields dicts; the token tensor is extracted so it can
        # be extended with torch.cat below (concatenating the dict itself
        # raises a TypeError).
        # NOTE(review): assumes the model's first positional argument is the
        # token-id tensor -- confirm against BranchyLlama.forward.
        input_ids = next(iter(dataloader))["input_ids"]
        for _ in range(steps_per_epoch):
            outputs = model(input_ids)
            loss = outputs.loss
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

            # Sample the next token from the last logits entry (named
            # `lm_head_logits` in the original code) at the final position and
            # append it. Sampling needs no autograd tracking.
            with torch.no_grad():
                lm_head_logits = outputs.logits[-1]
                next_token_probs = torch.softmax(lm_head_logits[:, -1, :], dim=-1)
                next_token = torch.multinomial(next_token_probs, 1)
            input_ids = torch.cat((input_ids, next_token), dim=-1)

        model.eval()
        eval_input_ids = next(iter(dataloader))["input_ids"]
        eval_loss = 0
        # No gradients are needed for evaluation; without no_grad every
        # accumulated loss would keep its autograd graph alive.
        # NOTE(review): the same batch is evaluated on every validation step,
        # as in the original code -- confirm whether a fresh batch per step
        # was intended.
        with torch.no_grad():
            for _ in range(validation_steps):
                eval_loss += model(eval_input_ids).loss
        loss = eval_loss / validation_steps

        accelerator.print(f"Epoch {epoch} loss: {loss.item()}")
            

  from .autonotebook import tqdm as notebook_tqdm


[2023-07-24 16:06:10,690] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
from src.branchymodel import BranchyLlama

# Checkpoint used for both the configuration and the pretrained weights.
pretrained_name = "openlm-research/open_llama_3b_v2"

# Load the checkpoint's configuration through the model's own config class
# and switch on the `self_supervision` flag before instantiating the model.
branchyllamaconf = BranchyLlama.config_class.from_pretrained(pretrained_name)
branchyllamaconf.self_supervision = True

# Per the load-time message recorded below, the `auxiliary_outputs.*` weights
# are not in the checkpoint and are newly initialized.
model = BranchyLlama.from_pretrained(pretrained_name, config=branchyllamaconf)



Some weights of BranchyLlama were not initialized from the model checkpoint at openlm-research/open_llama_3b_v2 and are newly initialized: ['auxiliary_outputs.1.weight', 'auxiliary_outputs.4.weight', 'auxiliary_outputs.24.weight', 'auxiliary_outputs.21.weight', 'auxiliary_outputs.23.weight', 'auxiliary_outputs.16.weight', 'auxiliary_outputs.17.weight', 'auxiliary_outputs.14.weight', 'auxiliary_outputs.6.weight', 'auxiliary_outputs.2.weight', 'auxiliary_outputs.19.weight', 'auxiliary_outputs.3.weight', 'auxiliary_outputs.11.weight', 'auxiliary_outputs.0.weight', 'auxiliary_outputs.10.weight', 'auxiliary_outputs.9.weight', 'auxiliary_outputs.7.weight', 'auxiliary_outputs.15.weight', 'auxiliary_outputs.12.weight', 'auxiliary_outputs.18.weight', 'auxiliary_outputs.22.weight', 'auxiliary_outputs.5.weight', 'auxiliary_outputs.25.weight', 'auxiliary_outputs.13.weight', 'auxiliary_outputs.20.weight', 'auxiliary_outputs.8.weight']
You should probably TRAIN this model on a down-stream task to be

In [4]:
from accelerate import notebook_launcher

# Run `training_loop(model)` on 2 processes. As the recorded error below
# states, the launcher raises a RuntimeError if CUDA has already been
# initialized in this kernel before it can fork its subprocesses.
notebook_launcher(training_loop, args=(model,), num_processes=2)

Launching training on 2 GPUs.


RuntimeError: CUDA has been initialized before the `notebook_launcher` could create a forked subprocess. This likely stems from an outside import causing issues once the `notebook_launcher()` is called. Please review your imports and test them when running the `notebook_launcher()` to identify which one is problematic.