In [1]:
# Mount Google Drive at /content/drive; the project path and data files used by
# later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys

# Directory added to the import path. Later cells import `src`, `p_tune` and
# `data_utils`, presumably from this checkout (verify the layout on your Drive).
PROJECT_ROOT = '/content/drive/MyDrive/DP-soft-prompts/Differentially-Private-Fine-tuning-of-Language-Models-main/Language-Generation-GPT-2'

# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
# Install the `progress` package (unpinned).
# NOTE(review): no visible cell imports `progress`; presumably it is needed by
# one of the project modules - confirm, and pin a version for reproducibility.
!pip install progress



In [4]:
# Install loralib (unpinned); later cells import it as `lora` and use its
# `MergedLinear` layer. NOTE(review): consider pinning a version.
!pip install loralib



In [5]:
# Pin torch to 1.11.0.
# NOTE(review): presumably chosen for compatibility with the opacus==0.15.0 pin
# installed further down - confirm.
!pip install torch==1.11.0



In [6]:
# Install the `LAMA` package (unpinned).
# NOTE(review): the LAMA-related names used in the next cell (`PTuneForLAMA`,
# `LAMADataset`) are imported from the `p_tune` / `data_utils` modules, not from
# a `LAMA` package - confirm this install is actually required.
!pip install LAMA



In [7]:
from src.model import GPT2Config, GPT2LMModel
from p_tune.modeling import PTuneForLAMA
from data_utils.dataset import load_file, LAMADataset
from data_utils.vocab import init_vocab

In [8]:
# Pin opacus to 0.15.0. The code below relies on this API generation:
# `PrivacyEngine(...).attach(optimizer)`, `optimizer.virtual_step()` and
# `opacus.grad_sample.utils.register_grad_sampler`.
!pip install opacus==0.15.0



In [9]:
import argparse
import time
import math
import os, sys
import warnings

import numpy as np
import itertools

import torch
import torch.nn.functional as F
import random
from torch.utils.data import DataLoader
torch.set_printoptions(threshold=100000)
from opacus import PrivacyEngine
from opacus.grad_sample import utils as opacus_utils
from opacus.layers import DifferentiallyPrivateDistributedDataParallel as DPDDP

In [61]:
from src.gpu import (
    add_gpu_params,
    parse_gpu,
    distributed_opt,
    distributed_gather,
    distributed_sync,
    cleanup
)
from src.optimizer import (
    create_adam_optimizer,
    create_optimizer_scheduler,
    add_optimizer_params,
    create_adam_optimizer_from_args
)

from src.data_utils import FT_Dataset
from src.model import GPT2Config, GPT2LMModel
from src.exp_utils import create_exp_dir

from loralib import MergedLinear
import loralib as lora

# Build an argument parser and let the project helpers register their options.
# NOTE(review): no visible cell ever calls parser.parse_args(); the `Args`
# class below supplies the settings instead, so this parser looks vestigial -
# confirm before removing.
parser = argparse.ArgumentParser(description='PyTorch GPT2 ft script')

add_gpu_params(parser)
add_optimizer_params(parser)
# Hard-wired to CUDA; this object is also stored as `Args.device` below.
device = torch.device('cuda')


class Args:
    """Static replacement for the command-line arguments of the fine-tuning script.

    Every option is a plain class attribute, so ``Args()`` can be passed wherever
    the code expects a parsed argument namespace. Nothing is read from ``sys.argv``.
    """
    # --- data ----------------------------------------------------------------
    train_data = '/content/drive/MyDrive/DP-soft-prompts/Differentially-Private-Fine-tuning-of-Language-Models-main/Language-Generation-GPT-2/data/e2e/train.jsonl'  # training split; set this to your actual data path
    valid_data = '/content/drive/MyDrive/DP-soft-prompts/Differentially-Private-Fine-tuning-of-Language-Models-main/Language-Generation-GPT-2/data/e2e/valid.jsonl'  # validation split; set this to your actual data path
    train_batch_size = 8
    valid_batch_size = 4
    grad_acc = 1  # an optimizer update happens every `grad_acc` training steps
    clip = 0.0  # > 0 enables torch.nn.utils.clip_grad_norm_ in optimizer_step; 0 disables it
    # --- differential privacy (passed to the Opacus PrivacyEngine) -----------
    noise_multiplier = 0.5
    max_grad_norm = 1.0  # total clipping bound; split evenly across trainable tensors before use
    seq_len = 512
    # --- model ---------------------------------------------------------------
    model_card = 'gpt2.sm'  # the config cell handles 'gpt2.sm' and 'gpt2.md'
    init_checkpoint = '/content/drive/MyDrive/DP-soft-prompts/Differentially-Private-Fine-tuning-of-Language-Models-main/Language-Generation-GPT-2/pretrained_checkpoints/gpt2-pytorch_model.bin'
    fp16 = False  # NOTE(review): the fp16 branches reference `amp`, which no visible cell imports
    # --- logging / checkpointing (all intervals are in training steps) -------
    log_interval = 100
    eval_interval = 2000
    save_interval = 500
    work_dir = os.getenv('PT_OUTPUT_DIR', 'gpt2_model')  # checkpoint directory; PT_OUTPUT_DIR overrides the default
    # --- LoRA ----------------------------------------------------------------
    # With lora_dim == 0 the `lora.mark_only_lora_as_trainable` call is skipped,
    # so every model parameter stays trainable.
    # NOTE(review): the IncompatibleModuleException recorded in the privacy-engine
    # cell output (LayerNorm / Conv1D unsupported) is presumably a consequence of
    # this setting - confirm.
    lora_dim = 0
    lora_alpha = 128
    obj = 'clm'  # 'jlm' switches the training dataset to joint_lm mode
    lora_dropout = 0.0
    label_smooth = 0.0
    # roll_* are not read by any visible cell.
    roll_interval = -1
    roll_lr = 0.00001
    roll_step = 100
    eval_epoch = 1
    random_seed = 42

    # --- optimizer / scheduler (presumably consumed inside src.optimizer - confirm)
    lr = 0.00001
    weight_decay = 0.01
    correct_bias = False  # default for 'store_true' is False
    adam_epislon = 1e-6  # spelling kept as-is; presumably the name src.optimizer reads - confirm
    no_decay_bias = False  # default for 'store_true' is False, but we keep the original value
    adam_beta1 = 0.9
    adam_beta2 = 0.98
    scheduler = 'linear'
    max_step = None  # None -> derived from max_epoch and the number of batches before the scheduler is built
    max_epoch = 5
    warmup_step = 0
    i_steps = '0'
    i_lrs = '0.00025'
    # --- runtime -------------------------------------------------------------
    platform = 'local'
    world_size = 1
    device = device  # the module-level `device` defined just before this class
    rank = 0  # rank 0 is the process that prints logs and writes checkpoints

args = Args()


class AverageMeter(object):
    """Keeps the most recent value together with a running weighted mean.

    Adapted from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262

    Attributes:
        val: the value passed to the latest ``update`` call.
        sum: weighted sum of all values seen since the last ``reset``.
        count: total weight seen since the last ``reset``.
        avg: ``sum / count`` as of the latest ``update``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def optimizer_step(_loss, _optimizer, _model, _schedule, args, is_update=True):
    """Back-propagate ``_loss`` and advance the optimizer and the LR schedule.

    Args:
        _loss: scalar loss tensor; ``backward()`` is always called on it.
        _optimizer: optimizer exposing ``step`` / ``zero_grad`` and, for
            accumulation steps, ``virtual_step``.
        _model: model whose parameters are clipped when ``args.clip > 0``.
        _schedule: LR scheduler stepped on every call, or ``None``.
        args: namespace providing ``clip`` and ``fp16``.
        is_update: ``True`` applies a real optimizer update; ``False`` only
            calls ``_optimizer.virtual_step()``.

    NOTE(review): the fp16 clipping path references ``amp``, which is not
    imported in any visible cell, so it would raise NameError if reached.
    """
    _loss.backward()

    if not is_update:
        # This should fail if we are using any gradient accumulation with Opacus DDP
        _optimizer.virtual_step()
    else:
        if args.clip > 0:
            clip_targets = amp.master_params(_optimizer) if args.fp16 else _model.parameters()
            torch.nn.utils.clip_grad_norm_(clip_targets, args.clip)

        _optimizer.step()
        _optimizer.zero_grad()

    # The schedule advances once per call, including accumulation-only calls.
    if _schedule is not None:
        _schedule.step()


def evaluate(model, valid_loader, args):
    """Run one pass over ``valid_loader`` and report mean loss and perplexity.

    Args:
        model: called as ``model(input, lm_labels=..., lm_mask=...)`` and
            expected to return ``(logits, loss)``.
        valid_loader: iterable of dict batches holding ``'input'``,
            ``'target'`` and ``'mask'`` tensors.
        args: namespace providing ``device``.

    Returns:
        ``(avg_loss, ppl)``: ``avg_loss`` is the unweighted mean of the
        per-batch losses and ``ppl`` is ``math.exp(avg_loss)``.

    The model is switched to eval mode and left there; callers that continue
    training must call ``model.train()`` themselves.
    """
    model.eval()
    avg_lm_loss = AverageMeter()

    with torch.no_grad():
        for idx, data in enumerate(valid_loader):
            _input = data['input'].to(args.device)
            _target = data['target'].to(args.device)
            _msk = data['mask'].to(args.device)

            _lm_logits, _loss = model(_input, lm_labels=_target, lm_mask=_msk)
            loss = _loss.mean()

            # Each batch counts equally, regardless of how many rows it holds.
            avg_lm_loss.update(loss.item())

            if idx % 100 == 0:
                print('eval samples:', idx, 'loss:', loss.float())

        print('average loss', avg_lm_loss.avg)
    return avg_lm_loss.avg, math.exp(avg_lm_loss.avg)

def train_validate(
    model,
    optimizer,
    scheduler,
    train_loader,
    valid_loader,
    args,
    train_step=0,
    epoch=0
):
    """Train for one pass over ``train_loader`` with periodic logging, checkpointing and evaluation.

    Args:
        model: called as ``model(input, lm_labels=..., lm_mask=..., label_smooth=...)``
            and expected to return ``(logits, loss)``.
        optimizer: optimizer handed to ``optimizer_step``.
        scheduler: LR scheduler handed to ``optimizer_step`` (may be ``None``).
        train_loader: iterable of dict batches with ``'input'``, ``'target'``, ``'mask'``.
        valid_loader: loader passed to ``evaluate`` every ``args.eval_interval`` steps.
        args: namespace providing ``device``, ``label_smooth``, ``grad_acc``,
            ``log_interval``, ``save_interval``, ``eval_interval``, ``max_step``,
            ``work_dir`` and ``rank``.
        train_step: global step count carried over from previous epochs.
        epoch: epoch number, used for logging only.

    Returns:
        The updated global ``train_step``.

    Notes:
        * ``best_val_ppl`` starts at ``None`` on every call, so the reported
          "best ppl" only covers evaluations within the current call.
        * Interval checkpoints store ``lora.lora_state_dict(model)``, while the
          end-of-call checkpoint stores the full ``model.state_dict()``.
          NOTE(review): with ``args.lora_dim == 0`` the interval checkpoints
          presumably contain no tensors - confirm whether that is intended.
    """
    model.train()
    avg_lm_loss = AverageMeter()
    print('start to train the model................', epoch)
    log_start_time = time.time()
    best_val_ppl = None

    # Nothing else in the notebook creates the checkpoint directory, so
    # torch.save would fail on a fresh runtime; create it up front. An empty
    # work_dir means "current directory" and needs no creation.
    if args.rank == 0 and args.work_dir:
        os.makedirs(args.work_dir, exist_ok=True)

    for idx, data in enumerate(train_loader):
        _input = data['input'].to(args.device)
        _target = data['target'].to(args.device)
        _msk = data['mask'].to(args.device)

        _lm_logits, _lm_loss = model(
            _input, lm_labels=_target, lm_mask=_msk, label_smooth=args.label_smooth
        )

        _lm_loss = _lm_loss.mean()

        train_step += 1
        is_update = train_step % args.grad_acc == 0
        avg_lm_loss.update(_lm_loss.item())
        # The loss is passed through as-is; it is deliberately not divided by
        # args.grad_acc here.
        optimizer_step(
            _lm_loss, optimizer, model, scheduler, args, is_update=is_update
        )

        if train_step % args.log_interval == 0:
            elapsed = time.time() - log_start_time
            lr = optimizer.param_groups[0]['lr']
            log_str = f'| epoch {epoch:3d} step {train_step:>8d} | { idx + 1:>6d} batches | ' \
                      f'lr {lr:.3g} | ms/batch {elapsed * 1000 / args.log_interval:5.2f} | ' \
                      f'loss {avg_lm_loss.val:5.2f} | avg loss {avg_lm_loss.avg:5.2f} | ' \
                      f'ppl {math.exp(avg_lm_loss.avg):5.2f}'

            if args.rank == 0:
                print(log_str)
            # The running average is restarted after every log line.
            log_start_time = time.time()
            avg_lm_loss.reset()

        if train_step % args.save_interval == 0:
            if args.rank == 0:
                model_path = os.path.join(args.work_dir, f'model.{train_step}.pt')
                print('saving checkpoint', model_path)
                torch.save({'model_state_dict': lora.lora_state_dict(model)}, model_path)
            distributed_sync(args)

        # evaluation interval
        if train_step % args.eval_interval == 0:
            eval_start_time = time.time()

            valid_loss, valid_ppl = evaluate(model, valid_loader, args)

            if best_val_ppl is None or valid_ppl < best_val_ppl:
                best_val_ppl = valid_ppl

            log_str = f'| Eval {train_step // args.eval_interval:3d} at step {train_step:>8d} | ' \
                      f'time: {time.time() - eval_start_time:5.2f}s | valid loss {valid_loss:5.2f} | ' \
                      f'valid ppl {valid_ppl:5.2f} | best ppl {best_val_ppl:5.2f} '

            if args.rank == 0:
                print('-' * 100)
                print(log_str)
                print('-' * 100)

            # evaluate() leaves the model in eval mode; switch back.
            model.train()
            distributed_sync(args)

        if train_step == args.max_step:
            break

    # End-of-call checkpoint: full state dict (not just the LoRA subset).
    if args.rank == 0:
        model_path = os.path.join(args.work_dir, f'model.{train_step}.pt')
        print('saving checkpoint', model_path)
        torch.save({'model_state_dict': model.state_dict()}, model_path)
    distributed_sync(args)
    return train_step

In [62]:
def reverse_zero_pad(x, W, enable_lora, out_features):
    """Select the LoRA-enabled output columns of ``x``.

    The last dimension of ``x`` (size ``out_features``) is treated as
    ``len(enable_lora)`` equal-sized groups; only the groups whose flag in
    ``enable_lora`` is true are kept, in their original order.

    Args:
        x: tensor whose last dimension has size ``out_features``.
        W: tensor used only as the device source for the boolean column mask.
        enable_lora: sequence of booleans, one per group.
        out_features: size of the last dimension of ``x``.

    Returns:
        Tensor shaped ``(*x.shape[:-1], out_features // len(enable_lora) * sum(enable_lora))``.
    """
    n_groups = len(enable_lora)
    kept_features = out_features // n_groups * sum(enable_lora)

    # Boolean mask over the output columns: True for every column that belongs
    # to an enabled group.
    lora_ind = W.new_zeros((out_features, ), dtype=torch.bool).view(n_groups, -1)
    lora_ind[enable_lora, :] = True
    lora_ind = lora_ind.view(-1)

    # Index directly; no zero-filled scratch tensor is needed because the
    # masked selection already produces a fresh tensor.
    selected = x.reshape(-1, out_features)[:, lora_ind]
    return selected.view((*x.shape[:-1], kept_features))


def compute_transformers_MergedLinear_grad_sample(layer: MergedLinear, A: torch.Tensor, B: torch.Tensor, batch_dim: int = 0) -> None:
    """Per-sample gradient rule for loralib ``MergedLinear`` layers, registered with Opacus.

    Stores per-sample gradients for ``layer.lora_B`` and ``layer.lora_A`` via
    ``opacus_utils.create_or_extend_grad_sample``; nothing is returned.

    Args:
        layer: the ``MergedLinear`` layer being handled.
        A: presumably the activations fed into ``layer`` - TODO confirm against
            the Opacus grad-sampler contract.
        B: presumably the gradient w.r.t. the layer output - TODO confirm.
        batch_dim: forwarded unchanged to ``create_or_extend_grad_sample``.

    NOTE(review): every ``// 2`` split below hard-codes two halves, which
    presumably corresponds to exactly two LoRA-enabled groups in the fused
    projection - confirm before reusing with a different ``enable_lora`` pattern.
    ``layer.lora_alpha / layer.r`` also requires ``layer.r`` to be non-zero.
    """
    # Keep only the LoRA-enabled output columns of B and apply the LoRA scaling factor.
    delta1 = reverse_zero_pad(B, layer.weight, layer.enable_lora, layer.out_features) * (layer.lora_alpha / layer.r)
    # Input projected through lora_A.
    # NOTE(review): lora_dropout is applied here and again further down; with a
    # non-zero dropout rate in train mode the masks would presumably differ from
    # the forward pass - confirm.
    after_A = F.linear(layer.lora_dropout(A), layer.lora_A)
    t_after_A = after_A.transpose(-2, -1)
    in_channel = t_after_A.shape[1]
    out_channel = delta1.shape[-1]
    lora_b_channel = layer.lora_B.shape[0]

    # lora_B gradient: each half of the projected input is contracted with the
    # matching half of delta1, then the two results are concatenated.
    gs1 = torch.einsum("nik,nkj->nij", t_after_A[:, :in_channel//2, :], delta1[:, :, :out_channel//2])
    gs2 = torch.einsum("nik,nkj->nij", t_after_A[:, in_channel//2:, :], delta1[:, :, out_channel//2:])
    opacus_utils.create_or_extend_grad_sample(layer.lora_B, torch.cat((gs1, gs2), -1).transpose(-2,-1).contiguous(), batch_dim)
    # lora_A gradient: push each half of delta1 back through the matching half
    # of lora_B, concatenate, then contract with the (dropped-out) input.
    gs3 = torch.einsum("nik,kj->nij", delta1[:, :, :out_channel//2], layer.lora_B[:lora_b_channel//2, :])
    gs4 = torch.einsum("nik,kj->nij", delta1[:, :, out_channel//2:], layer.lora_B[lora_b_channel//2:, :])
    after_A_deriv = torch.cat((gs3, gs4), -1)
    lora_A_deriv = torch.einsum("nki,nkj->nij", after_A_deriv, layer.lora_dropout(A))
    opacus_utils.create_or_extend_grad_sample(layer.lora_A, lora_A_deriv.contiguous(), batch_dim)

In [54]:
    # Seed torch and Python's `random` so shuffling and initialisation repeat.
    torch.manual_seed(args.random_seed)
    random.seed(args.random_seed)


    # Training set; joint-LM mode is switched on only for the 'jlm' objective.
    train_data = FT_Dataset(
        args.train_data, args.train_batch_size, args.seq_len,
        joint_lm=args.obj=='jlm'
    )

    valid_data = FT_Dataset(
        args.valid_data, args.valid_batch_size, args.seq_len,
    )

    # Training batches are shuffled and the last partial batch is dropped.
    train_loader = DataLoader(
        train_data, batch_size=args.train_batch_size, num_workers=0,
        shuffle=True, pin_memory=False, drop_last=True
    )

    # Validation keeps file order and every example.
    valid_loader = DataLoader(
        valid_data, batch_size=args.valid_batch_size, num_workers=0,
        shuffle=False, pin_memory=False, drop_last=False
    )

    # Architecture sizes for each supported model card.
    GPT2_DIMS = {
        'gpt2.sm': dict(n_embd=768, n_layer=12, n_head=12),
        'gpt2.md': dict(n_embd=1024, n_layer=24, n_head=16),
    }
    # Fail here with a clear message instead of leaving `config` undefined and
    # hitting a NameError in a later cell.
    if args.model_card not in GPT2_DIMS:
        raise ValueError(
            f'unsupported model_card {args.model_card!r}; expected one of {sorted(GPT2_DIMS)}'
        )
    config = GPT2Config(
        **GPT2_DIMS[args.model_card],
        lora_attn_dim=args.lora_dim,
        lora_attn_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
    )



In [55]:
 ###################
# Build the GPT-2 LM from `config` and, when a checkpoint path is configured,
# load the pretrained weights into it.
lm_net = GPT2LMModel(config)
#lm_net = SoftPromptTuning(lm_net, prompt_length)
if args.init_checkpoint is not None:
  print('loading model pretrained weight.')
  # NOTE(review): torch.load unpickles the file, so only point init_checkpoint
  # at checkpoints from a trusted source.
  lm_net.load_weight(torch.load(args.init_checkpoint))

# Disabled soft-prompt experiment: it would freeze the pretrained model and
# train only the prompt embeddings.
#for param in lm_net.pretrained_model.parameters():
#    param.requires_grad = False
#for param in lm_net.prompt_embeddings.parameters():
#    param.requires_grad = True

loading model pretrained weight.


In [56]:
# Move the model to the GPU (uses .cuda() directly rather than args.device).
lm_net = lm_net.cuda()
# Restrict training to the LoRA parameters (per the helper's name), but only
# when LoRA is enabled; with lora_dim == 0 this is skipped and every parameter
# keeps its requires_grad setting.
if args.lora_dim > 0:
    lora.mark_only_lora_as_trainable(lm_net)
# Register the custom per-sample-gradient function for MergedLinear with Opacus.
# As the last expression of the cell, the returned function is echoed as output.
opacus_utils.register_grad_sampler(MergedLinear)(compute_transformers_MergedLinear_grad_sample)
#lm_net = DPDDP(lm_net)

<function __main__.compute_transformers_MergedLinear_grad_sample(layer: loralib.layers.MergedLinear, A: torch.Tensor, B: torch.Tensor, batch_dim: int = 0) -> None>

In [57]:
############################
# Build the Adam optimizer from the settings on `args` (project helper from
# src.optimizer). The privacy engine is attached to this optimizer in a later cell.
optimizer = create_adam_optimizer_from_args(lm_net, args)


In [58]:
# When no explicit step budget is configured, derive it from the epoch budget:
# max_epoch * batches-per-epoch, divided across world_size and rounded up.
if args.max_step is None:

    args.max_step = (args.max_epoch * train_data.num_batches + args.world_size - 1) // args.world_size
    print('set max_step:', args.max_step)

scheduler = create_optimizer_scheduler(optimizer, args)
# NOTE(review): `amp` is not imported in any visible cell; this branch would
# raise NameError if args.fp16 were True.
if args.fp16:
    lm_net, optimizer = amp.initialize(lm_net, optimizer, opt_level="O1")

# Despite its name, n_layers counts trainable parameter tensors (entries of
# named_parameters() with requires_grad), not transformer layers.
n_layers = len([(n, p) for n, p in lm_net.named_parameters() if p.requires_grad])
# One clipping bound per trainable tensor, each args.max_grad_norm / sqrt(n_layers),
# so the squared bounds add up to args.max_grad_norm ** 2. This list is what the
# PrivacyEngine receives as max_grad_norm.
max_grad_norm = [args.max_grad_norm / np.sqrt(n_layers)] * n_layers

# Orders handed to the PrivacyEngine as `alphas`: 1.1 .. 10.9 in steps of 0.1,
# then the integers 12 .. 63. The next cell defines the same list again.
ALPHAS = [1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64))

set max_step: 52580


In [59]:
##################################
#num_trainable_params = sum(p.numel() for p in lm_net.prompt_embeddings.parameters() if p.requires_grad)
# Orders passed to the PrivacyEngine as `alphas`: 1.1 .. 10.9 in steps of 0.1,
# then the integers 12 .. 63.
ALPHAS = [1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64))

# Single source of truth for the dataset size used by both the sampling rate
# and delta. Presumably the number of examples in the E2E training split;
# verify against args.train_data if the dataset changes.
TRAIN_SET_SIZE = 42061

# Fraction of the training set contributing to one optimizer update.
SAMPLE_RATE = (args.train_batch_size * args.grad_acc) / float(TRAIN_SET_SIZE)
privacy_engine = PrivacyEngine(
    module=lm_net,
    sample_rate=SAMPLE_RATE,
    alphas=ALPHAS,
    noise_multiplier=args.noise_multiplier,
    max_grad_norm=max_grad_norm,
)
privacy_engine.attach(optimizer)


# delta = 1 / N, used when querying the privacy spent after each epoch.
# NOTE(review): the original comment states that the accountant from Gopi et
# al. (2021) is used "as described in the paper", so the epsilon printed by this
# engine is presumably not the figure reported there - confirm.
delta = 1.0 / TRAIN_SET_SIZE




IncompatibleModuleException: Model contains incompatible modules.
grad_sampler method is not yet supported for this module.: ['_module.transformer.h.0.ln_1 (LayerNorm)', '_module.transformer.h.0.attn.c_proj (Conv1D)', '_module.transformer.h.0.ln_2 (LayerNorm)', '_module.transformer.h.0.mlp.c_fc (Conv1D)', '_module.transformer.h.0.mlp.c_proj (Conv1D)', '_module.transformer.h.1.ln_1 (LayerNorm)', '_module.transformer.h.1.attn.c_proj (Conv1D)', '_module.transformer.h.1.ln_2 (LayerNorm)', '_module.transformer.h.1.mlp.c_fc (Conv1D)', '_module.transformer.h.1.mlp.c_proj (Conv1D)', '_module.transformer.h.2.ln_1 (LayerNorm)', '_module.transformer.h.2.attn.c_proj (Conv1D)', '_module.transformer.h.2.ln_2 (LayerNorm)', '_module.transformer.h.2.mlp.c_fc (Conv1D)', '_module.transformer.h.2.mlp.c_proj (Conv1D)', '_module.transformer.h.3.ln_1 (LayerNorm)', '_module.transformer.h.3.attn.c_proj (Conv1D)', '_module.transformer.h.3.ln_2 (LayerNorm)', '_module.transformer.h.3.mlp.c_fc (Conv1D)', '_module.transformer.h.3.mlp.c_proj (Conv1D)', '_module.transformer.h.4.ln_1 (LayerNorm)', '_module.transformer.h.4.attn.c_proj (Conv1D)', '_module.transformer.h.4.ln_2 (LayerNorm)', '_module.transformer.h.4.mlp.c_fc (Conv1D)', '_module.transformer.h.4.mlp.c_proj (Conv1D)', '_module.transformer.h.5.ln_1 (LayerNorm)', '_module.transformer.h.5.attn.c_proj (Conv1D)', '_module.transformer.h.5.ln_2 (LayerNorm)', '_module.transformer.h.5.mlp.c_fc (Conv1D)', '_module.transformer.h.5.mlp.c_proj (Conv1D)', '_module.transformer.h.6.ln_1 (LayerNorm)', '_module.transformer.h.6.attn.c_proj (Conv1D)', '_module.transformer.h.6.ln_2 (LayerNorm)', '_module.transformer.h.6.mlp.c_fc (Conv1D)', '_module.transformer.h.6.mlp.c_proj (Conv1D)', '_module.transformer.h.7.ln_1 (LayerNorm)', '_module.transformer.h.7.attn.c_proj (Conv1D)', '_module.transformer.h.7.ln_2 (LayerNorm)', '_module.transformer.h.7.mlp.c_fc (Conv1D)', '_module.transformer.h.7.mlp.c_proj (Conv1D)', '_module.transformer.h.8.ln_1 (LayerNorm)', '_module.transformer.h.8.attn.c_proj (Conv1D)', 
'_module.transformer.h.8.ln_2 (LayerNorm)', '_module.transformer.h.8.mlp.c_fc (Conv1D)', '_module.transformer.h.8.mlp.c_proj (Conv1D)', '_module.transformer.h.9.ln_1 (LayerNorm)', '_module.transformer.h.9.attn.c_proj (Conv1D)', '_module.transformer.h.9.ln_2 (LayerNorm)', '_module.transformer.h.9.mlp.c_fc (Conv1D)', '_module.transformer.h.9.mlp.c_proj (Conv1D)', '_module.transformer.h.10.ln_1 (LayerNorm)', '_module.transformer.h.10.attn.c_proj (Conv1D)', '_module.transformer.h.10.ln_2 (LayerNorm)', '_module.transformer.h.10.mlp.c_fc (Conv1D)', '_module.transformer.h.10.mlp.c_proj (Conv1D)', '_module.transformer.h.11.ln_1 (LayerNorm)', '_module.transformer.h.11.attn.c_proj (Conv1D)', '_module.transformer.h.11.ln_2 (LayerNorm)', '_module.transformer.h.11.mlp.c_fc (Conv1D)', '_module.transformer.h.11.mlp.c_proj (Conv1D)', '_module.transformer.ln_f (LayerNorm)']

In [63]:
# Drive training epoch by epoch until the step or epoch budget is exhausted.
# Ctrl-C / "interrupt kernel" ends training early without raising.
try:
    train_step = 0
    epoch = 0
    while True:
        epoch += 1
        train_step = train_validate(
            lm_net, optimizer, scheduler, train_loader, valid_loader, args,
            train_step=train_step, epoch=epoch
        )

        # Printing epsilon from opacus privacy engine at the end of each epoch
        eps, alpha = optimizer.privacy_engine.get_privacy_spent(delta)
        print("End of epoch {}, we have epsilon {} for alpha {}".format(epoch, eps, alpha))

        out_of_steps = train_step >= args.max_step
        out_of_epochs = args.max_epoch is not None and epoch >= args.max_epoch
        if out_of_steps or out_of_epochs:
            if args.rank == 0:
                print('-' * 100)
                print('End of training')
            break
except KeyboardInterrupt:
    if args.rank == 0:
        print('-' * 100)
        print('Exiting from training early')

# Only reached on normal completion or KeyboardInterrupt; any other exception
# propagates before these calls.
distributed_sync(args)
print('cleanup dist ...')
cleanup(args)

start to train the model................ 1


RuntimeError: CUDA out of memory. Tried to allocate 1.15 GiB (GPU 0; 15.77 GiB total capacity; 12.94 GiB already allocated; 356.38 MiB free; 14.22 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF