In [1]:
# Mount Google Drive and make the project checkout importable.
from google.colab import drive
import sys

drive.mount('/content/drive')

# Project checkout on Drive; presumably where the project-local packages imported
# later (data_utils, p_tune) live -- TODO confirm.
PROJECT_DIR = '/content/drive/MyDrive/DP-soft-prompts/Differentially-Private-Fine-tuning-of-Language-Models-main/Language-Generation-GPT-2'
# Guarded so that re-running this cell does not append duplicate sys.path entries.
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#used these versions since they where used by the differential private finetuning code
!pip install opacus==0.15.0
!pip install torch==1.11.0
!pip install datasets
!pip install loralib



In [11]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup, get_scheduler, DataCollatorForLanguageModeling
from opacus import PrivacyEngine
from datasets import load_dataset
#######################
import os
from os.path import join, abspath, dirname
from data_utils.dataset import load_file, LAMADataset
from data_utils.vocab import init_vocab
from p_tune.modeling import PTuneForLAMA
from transformers import AutoTokenizer
#############################
from loralib import MergedLinear
import loralib as lora
from opacus.grad_sample import utils as opacus_utils
from tqdm import tqdm

In [12]:
#functions used in this code
def get_task_name():
    """Build the underscore-joined run name from the global ``args``.

    Returns:
        ``"<model_name>_<vocab_strategy>_only_evaluate"`` when
        ``args.only_evaluate`` is set; otherwise the model/vocab prefix followed
        by the template tag, ``"fixed"`` or ``"fine-tuned"``, and the seed tag.
    """
    if args.only_evaluate:
        evaluate_prefix = args.model_name + ('_' + args.vocab_strategy)
        return "_".join([evaluate_prefix, 'only_evaluate'])

    run_prefix = args.model_name + ('_' + args.vocab_strategy)
    # The template tuple is formatted verbatim unless the original template is used.
    template_tag = 'original' if args.use_original_template else args.template
    tuning_tag = "fine-tuned" if args.use_lm_finetune else "fixed"
    name_parts = [
        run_prefix,
        "template_{}".format(template_tag),
        tuning_tag,
        "seed_{}".format(args.seed),
    ]
    return "_".join(name_parts)

def get_TREx_parameters():
    """Return the relation record and the data-path pieces for ``args.relation_id``.

    Returns:
        A 3-tuple ``(relation, data_path_pre, data_path_post)`` where ``relation``
        is the first entry ``load_file`` returns for the relation's
        ``single_relations/<id>.jsonl`` file under ``args.data_dir``, and the other
        two are the path prefix and suffix that surround a split name.
    """
    relation_file = join(args.data_dir, "single_relations/{}.jsonl".format(args.relation_id))
    relation = load_file(relation_file)[0]
    return relation, "fact-retrieval/original/{}/".format(args.relation_id), ".jsonl"
def get_save_path():
    """Return the output directory path for the current task and relation.

    The path is ``<out_dir>/prompt_model/<model_name>/search/<task_name>/<relation_id>``,
    built from the global ``args`` and ``get_task_name()``.
    """
    path_parts = (
        args.out_dir,
        'prompt_model',
        args.model_name,
        'search',
        get_task_name(),
        args.relation_id,
    )
    return join(*path_parts)

def reverse_zero_pad(x, W, enable_lora, out_features):
    """Select the LoRA-enabled feature columns from the last dimension of ``x``.

    The ``out_features`` columns are treated as ``len(enable_lora)`` equal-sized
    groups; only the columns of groups whose flag is true are kept.

    Args:
        x: Tensor whose last dimension has size ``out_features``.
        W: Tensor used only as the factory (device) for the boolean column mask.
        enable_lora: Sequence of booleans, one per group.
        out_features: Size of the last dimension of ``x``; must be divisible by
            ``len(enable_lora)`` (the mask is viewed as ``(len(enable_lora), -1)``).

    Returns:
        Tensor with the leading dimensions of ``x`` and a last dimension of
        ``out_features // len(enable_lora) * sum(enable_lora)``.
    """
    n_groups = len(enable_lora)
    kept_features = out_features // n_groups * sum(enable_lora)

    # Boolean mask over the columns: True for every column in an enabled group.
    lora_ind = W.new_zeros((out_features, ), dtype=torch.bool).view(n_groups, -1)
    lora_ind[enable_lora, :] = True
    lora_ind = lora_ind.view(-1)

    # The original allocated a zero buffer here and immediately overwrote it;
    # the masked selection below is the only result that was ever returned.
    result = x.reshape(-1, out_features)[:, lora_ind]
    return result.view((*x.shape[:-1], kept_features))


def compute_transformers_MergedLinear_grad_sample(layer: MergedLinear, A: torch.Tensor, B: torch.Tensor, batch_dim: int = 0) -> None:
    """Compute per-sample gradients for the LoRA matrices of a ``MergedLinear`` layer.

    Stores the results on ``layer.lora_B`` and ``layer.lora_A`` through
    ``opacus_utils.create_or_extend_grad_sample``.

    Args:
        layer: The LoRA ``MergedLinear`` layer whose ``lora_A`` / ``lora_B`` receive
            per-sample gradients.
        A: Tensor passed by the grad-sampler hook; presumably the layer's input
            activations, batch-first -- TODO confirm against Opacus.
        B: Tensor passed by the grad-sampler hook; presumably the gradient with
            respect to the layer output -- TODO confirm against Opacus.
        batch_dim: Batch dimension forwarded to ``create_or_extend_grad_sample``.

    Note:
        Every split below is a hard-coded half/half split, so this looks like it
        assumes exactly two enabled LoRA groups -- TODO confirm for the model in use.
    """
    # Backprop signal restricted to the LoRA-enabled output columns, with LoRA scaling.
    delta1 = reverse_zero_pad(B, layer.weight, layer.enable_lora, layer.out_features) * layer.scaling
    # `F` was never imported in this notebook, so the functional API is called by its
    # fully qualified name (the original `F.linear` raised NameError when the hook ran).
    after_A = torch.nn.functional.linear(layer.lora_dropout(A), layer.lora_A)
    t_after_A = after_A.transpose(-2, -1)
    in_channel = t_after_A.shape[1]
    out_channel = delta1.shape[-1]
    lora_b_channel = layer.lora_B.shape[0]

    # Per-sample gradient for lora_B: one einsum per half, concatenated on the last axis.
    gs1 = torch.einsum("nik,nkj->nij", t_after_A[:, :in_channel//2, :], delta1[:, :, :out_channel//2])
    gs2 = torch.einsum("nik,nkj->nij", t_after_A[:, in_channel//2:, :], delta1[:, :, out_channel//2:])
    opacus_utils.create_or_extend_grad_sample(layer.lora_B, torch.cat((gs1, gs2), -1).transpose(-2,-1).contiguous(), batch_dim)

    # Push delta1 back through lora_B (again half by half) to get the signal at the output of lora_A.
    gs3 = torch.einsum("nik,kj->nij", delta1[:, :, :out_channel//2], layer.lora_B[:lora_b_channel//2, :])
    gs4 = torch.einsum("nik,kj->nij", delta1[:, :, out_channel//2:], layer.lora_B[lora_b_channel//2:, :])
    after_A_deriv = torch.cat((gs3, gs4), -1)
    # NOTE(review): lora_dropout is applied to A a second time here, independently of the
    # call above; with a non-zero dropout in training mode the two calls would presumably
    # draw different masks. Behaviour is left unchanged (args.lora_dropout is 0.0 in this notebook).
    lora_A_deriv = torch.einsum("nki,nkj->nij", after_A_deriv, layer.lora_dropout(A))
    opacus_utils.create_or_extend_grad_sample(layer.lora_A, lora_A_deriv.contiguous(), batch_dim)

def evaluate(epoch_idx, evaluate_type):
    """Evaluate the global ``model`` on the dev or test split and print a summary.

    Args:
        epoch_idx: Epoch number, used only in the printed summary line.
        evaluate_type: ``'Test'`` selects ``test_loader`` / ``test_set``; any other
            value selects ``dev_loader`` / ``dev_set``.

    Returns:
        ``(loss, hit1)`` where ``loss`` is the sum of the per-batch losses (not
        normalised; only the printed value is divided by the dataset size) and
        ``hit1`` is the accumulated hit count divided by ``len(dataset)``.
    """
    if evaluate_type == 'Test':
        loader, dataset = test_loader, test_set
    else:
        loader, dataset = dev_loader, dev_set

    model.eval()
    hit1, loss = 0, 0
    with torch.no_grad():
        for x_hs, x_ts in loader:
            if evaluate_type == 'Test':
                # The test call also returns the candidate list, which is not used here.
                _loss, _hit1, _ = model(x_hs, x_ts, return_candidates=True)
            else:
                _loss, _hit1 = model(x_hs, x_ts)
            hit1 += _hit1
            loss += _loss.item()
    hit1 /= len(dataset)
    print("{} {} Epoch {} Loss: {} Hit@1:".format(args.relation_id, evaluate_type, epoch_idx,
                                                  loss / len(dataset)), hit1)
    return loss, hit1

In [6]:
class Args:
    """Run configuration, stored as plain class attributes.

    The single instance ``args`` created below is read by the helper functions
    and cells of this notebook and is also passed to the project-local classes
    (``LAMADataset``, ``init_vocab``, ``PTuneForLAMA``).
    """

    # --- checkpoint and optimisation / DP settings ---
    init_checkpoint = "/content/drive/MyDrive/DP-soft-prompts/Differentially-Private-Fine-tuning-of-Language-Models-main/Language-Generation-GPT-2/pretrained_checkpoints/gpt2-pytorch_model.bin"
    # NOTE(review): the optimizer cell reads `lr` below, not `learning_rate`; the two disagree.
    learning_rate = 2e-5
    train_batch_size = 8   # feeds the DP sample rate
    grad_acc_steps = 1     # feeds the DP sample rate
    # NOTE(review): the training-loop cell iterates range(100) and does not read `epochs`.
    epochs = 3
    noise_multiplier = 1.0  # passed to PrivacyEngine
    max_grad_norm = 1.0     # passed to PrivacyEngine
    lstm_dropout = 0.1
    hidden_size = 768
    max_length = 128

    # --- task / prompt settings ---
    relation_id = "P1001"
    model_name = 'gpt2'
    pseudo_token = '[PROMPT]'

    t5_shard = 0
    mid = 0
    template = (3, 3, 3)
    early_stop = 20  # consecutive non-improving evaluations before the training loop stops

    lr = 1e-5  # learning rate actually used by the optimizer cell
    seed = 34
    decay_rate = 0.98
    weight_decay = 0.0005
    no_cuda = False
    seq_len = 512

    train_data = "/content/drive/MyDrive/DP-soft-prompts/Differentially-Private-Fine-tuning-of-Language-Models-main/Language-Generation-GPT-2/data/e2e/train.jsonl"
    valid_data = "/content/drive/MyDrive/DP-soft-prompts/Differentially-Private-Fine-tuning-of-Language-Models-main/Language-Generation-GPT-2/data/e2e/valid.jsonl"


    only_evaluate = False
    use_original_template = False
    use_lm_finetune = False

    vocab_strategy = "shared"

    # directories
    data_dir = '/content/drive/MyDrive/DP-soft-prompts/Differentially-Private-Fine-tuning-of-Language-Models-main/Language-Generation-GPT-2/data/LAMA'
    out_dir = '/content/drive/MyDrive/DP-soft-prompts/Differentially-Private-Fine-tuning-of-Language-Models-main/Language-Generation-GPT-2/out/LAMA'


    # --- LoRA settings ---
    lora_dim = 4
    lora_alpha = 32
    lora_dropout = 0.0
    label_smooth = 0.1

    # Honour `no_cuda` and fall back to CPU when CUDA is unavailable; previously the
    # device was always "cuda", which contradicted `no_cuda` and `n_gpu`.
    device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
    n_gpu = 0 if no_cuda else torch.cuda.device_count()

    assert isinstance(template, tuple)

args = Args()

In [7]:
# Tokenizer initialisation.
tokenizer = AutoTokenizer.from_pretrained('gpt2', use_fast=False)
relation, data_path_pre, data_path_post = get_TREx_parameters()
init_vocab(args)

# Data processing: load the train / dev / test files for the configured relation.
train_data = load_file(join(args.data_dir, data_path_pre + 'train' + data_path_post))
dev_data = load_file(join(args.data_dir, data_path_pre + 'dev' + data_path_post))
test_data = load_file(join(args.data_dir, data_path_pre + 'test' + data_path_post))

test_set = LAMADataset('test', test_data, tokenizer, args)
train_set = LAMADataset('train', train_data, tokenizer, args)
dev_set = LAMADataset('dev', dev_data, tokenizer, args)
os.makedirs(get_save_path(), exist_ok=True)

# The training batch size comes from the config so it cannot drift from the value the
# privacy engine's sample rate is computed from (it was a hard-coded 8 before).
train_loader = DataLoader(train_set, batch_size=args.train_batch_size, shuffle=True, drop_last=True)
dev_loader = DataLoader(dev_set, batch_size=8)
test_loader = DataLoader(test_set, batch_size=8)


In [8]:
# Build the P-tuning model wrapper from the run configuration, target device and prompt template.
model = PTuneForLAMA(args, args.device, args.template)

loading model pretrained weight.
init prompt encoder...


In [9]:
# When LoRA is enabled, let loralib mark only the LoRA parameters of the wrapped LM as trainable.
if args.lora_dim > 0:
    lora.mark_only_lora_as_trainable(model.model)

# Register the custom per-sample gradient function for loralib's MergedLinear layers.
merged_linear_sampler = opacus_utils.register_grad_sampler(MergedLinear)
merged_linear_sampler(compute_transformers_MergedLinear_grad_sample)

<function __main__.compute_transformers_MergedLinear_grad_sample(layer: loralib.layers.MergedLinear, A: torch.Tensor, B: torch.Tensor, batch_dim: int = 0) -> None>

In [13]:
# Freeze the prompt encoder's embedding table so it is excluded from optimisation.
for name, param in model.named_parameters():
    if name == "prompt_encoder.embedding.weight":
        param.requires_grad = False

# One parameter group per remaining trainable tensor, in named_parameters() order.
params_to_optimize = [
    {'params': trainable}
    for _, trainable in model.named_parameters()
    if trainable.requires_grad
]

In [17]:
# Adam over the trainable parameter groups only; uses args.lr (not args.learning_rate).
optimizer = torch.optim.Adam(params_to_optimize, lr=args.lr, weight_decay=args.weight_decay)

In [15]:
# Attach the Opacus privacy engine to the optimizer.
# Orders (alphas) at which the privacy engine tracks Renyi differential privacy.
ALPHAS = [1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64))
# The sample rate is the effective batch size divided by the size of the dataset that is
# actually trained on. The previous hard-coded denominator (42061.0) did not depend on
# train_set, so the reported privacy budget did not correspond to this training run.
SAMPLE_RATE = (args.train_batch_size * args.grad_acc_steps) / len(train_set)
privacy_engine = PrivacyEngine(
    module=model,
    sample_rate=SAMPLE_RATE,
    alphas=ALPHAS,
    noise_multiplier=args.noise_multiplier,
    max_grad_norm=args.max_grad_norm,
)
privacy_engine.attach(optimizer)



In [18]:
# Debug check: is a privacy engine attached to the current optimizer?
# NOTE(review): this inspects whichever `optimizer` object currently exists; if the
# optimizer cell is re-run after the attach cell, the fresh optimizer presumably no
# longer carries the engine, so run the cells top to bottom.
if not hasattr(optimizer, "privacy_engine"):
    print("PrivacyEngine is NOT attached.")
else:
    print("PrivacyEngine is attached.")
    is_attached = isinstance(optimizer.privacy_engine, PrivacyEngine)
    print(f"PrivacyEngine is correctly attached: {is_attached}")

PrivacyEngine is NOT attached.


In [20]:
# Training loop: every epoch starts with a dev evaluation that drives early stopping,
# followed by one pass over train_loader.
# NOTE(review): the epoch cap is a hard-coded 100; args.epochs is not read here.
best_dev, early_stop, has_adjusted = 0, 0, True
for epoch_idx in range(100):
    dev_loss, dev_hit1 = evaluate(epoch_idx, 'Dev')
    if epoch_idx == 0:
        # Baseline test score before any training step.
        test_loss, test_hit1 = evaluate(epoch_idx, 'Test')
    # Explicit parentheses: same grouping Python already applied to `A and B or C`.
    if (epoch_idx > 0 and dev_hit1 >= best_dev) or args.only_evaluate:
        test_loss, test_hit1 = evaluate(epoch_idx, 'Test')
        early_stop = 0
        best_dev = dev_hit1
    else:
        early_stop += 1
        if early_stop >= args.early_stop:
            print("{} Early stopping at epoch {}.".format(args.relation_id, epoch_idx))
            break
    if args.only_evaluate:
        break

    # Run training. evaluate() leaves the model in eval mode, so switch back once per epoch.
    model.train()
    hit1, num_of_samples = 0, 0
    tot_loss = 0
    # Wrap the loader itself (not the enumerate object) so tqdm knows the number of batches.
    for batch_idx, batch in enumerate(tqdm(train_loader)):
        optimizer.zero_grad()
        loss, batch_hit1 = model(batch[0], batch[1])
        hit1 += batch_hit1
        tot_loss += loss.item()
        num_of_samples += len(batch[0])

        loss.backward()

        # Release cached CUDA blocks around the optimizer step (kept from the original).
        torch.cuda.empty_cache()
        optimizer.step()
        torch.cuda.empty_cache()


P1001 Dev Epoch 0 Loss: 0.16870941986908783 Hit@1: 0.6702702702702703
P1001 Test Epoch 0 Loss: 0.10551043590867376 Hit@1: 0.8102409638554217


0it [00:00, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 4.44 GiB (GPU 0; 14.75 GiB total capacity; 9.31 GiB already allocated; 4.44 GiB free; 9.35 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF