In [1]:
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

'''Train GPT2 model series with DP (w/ parameter-efficient approach LoRA when lora_dim > 0)'''

import datasets
import dp_transformers
import transformers
import sys
import logging

from dataclasses import dataclass, field
from transformers.training_args import ParallelMode
from dp_transformers.layers.dp_merged_linear import mark_only_lora_as_trainable
from dp_transformers.module_modification import convert_gpt2_attention_to_lora

In [2]:
logger = logging.getLogger(__name__)

In [3]:
@dataclass
class ModelArguments:
    model_name: str = field(default="gpt2", metadata={
        "help": "Model name in HuggingFace, e.g. 'gpt2'"
    })

    lora_dim: int = field(default=0, metadata={
        "help": "LoRA dimension; 0 means LoRA is disabled"
    })

    sequence_len: int = field(default=128, metadata={
        "help": "Model sequence length"
    })

    lora_dropout: float = field(default=0.0, metadata={
        "help": "Dropout probability for LoRA layers"
    })

    lora_alpha: int = field(default=32, metadata={
        "help": "LoRA attention alpha"
    })

In [4]:
@dataclass
class Arguments:
    train: dp_transformers.TrainingArguments
    privacy: dp_transformers.PrivacyArguments
    model: ModelArguments

In [6]:
import argparse

def parse_arguments(argv):
    parser = argparse.ArgumentParser(description='Model Training Arguments')
    
    # 添加参数
    parser.add_argument('--output_dir', type=str, default='scratch')
    parser.add_argument('--model_name', type=str, default='gpt2')
    parser.add_argument('--sequence_len', type=int, default=128)
    parser.add_argument('--per_device_train_batch_size', type=int, default=32)
    parser.add_argument('--gradient_accumulation_steps', type=int, default=2)
    parser.add_argument('--evaluation_strategy', type=str, default='steps')
    parser.add_argument('--eval_steps', type=int, default=45)
    parser.add_argument('--log_level', type=str, default='info')
    parser.add_argument('--per_device_eval_batch_size', type=int, default=64)
    parser.add_argument('--eval_accumulation_steps', type=int, default=1)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--target_epsilon', type=int, default=8)
    parser.add_argument('--per_sample_max_grad_norm', type=float, default=1.0)
    parser.add_argument('--prediction_loss_only', action='store_true')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--remove_unused_columns', type=bool, default=False)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--logging_steps', type=int, default=5)
    parser.add_argument('--max_grad_norm', type=int, default=0)
    parser.add_argument('--lr_scheduler_type', type=str, default='constant')
    parser.add_argument('--learning_rate', type=float, default=1e-4)
    parser.add_argument('--disable_tqdm', type=bool, default=False)
    parser.add_argument('--dataloader_num_workers', type=int, default=2)
    
    args = parser.parse_args(argv)
    return args

# 解析参数
args = parse_arguments(['--dataloader_num_workers','2'])
# args = parse_arguments()

# 打印解析的参数
print(args.model_name)
print(args.sequence_len)
print(args.per_device_train_batch_size)
# 其他参数类似...


gpt2
128
32


In [None]:
output_dir='scratch',
model_name='gpt2',
sequence_len='128',

per_device_train_batch_size=32,
gradient_accumulation_steps=2,
evaluation_strategy='steps',
eval_steps=45,
log_level='info',
per_device_eval_batch_size=64,
eval_accumulation_steps=1,
seed=42,
num_train_epochs=3,
logging_steps=5,
max_grad_norm=0,
lr_scheduler_type='constant',
learning_rate=1e-4,
disable_tqdm=False,
dataloader_num_workers=2


--target_epsilon 8 \
--per_sample_max_grad_norm 1.0 \
--prediction_loss_only \
--weight_decay 0.01 \
--remove_unused_columns False \
--num_train_epochs 3 \
--logging_steps 5 \
--max_grad_norm 0 \
--lr_scheduler_type constant \
--learning_rate 1e-4 \
--disable_tqdm False \
--dataloader_num_workers 2

In [30]:
from dp_transformers import TrainingArguments, PrivacyArguments

# 创建TrainingArguments对象
train_args = TrainingArguments(
    output_dir='scratch',
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    evaluation_strategy='steps',
    eval_steps=45,
    log_level='info',
    per_device_eval_batch_size=64,
    eval_accumulation_steps=1,
    seed=42,
    num_train_epochs=3,
    logging_steps=5,
    max_grad_norm=0,
    lr_scheduler_type='constant',
    learning_rate=1e-4,
    disable_tqdm=False,
    dataloader_num_workers=2,
    prediction_loss_only=True
)

# 创建PrivacyArguments对象
privacy_args = PrivacyArguments(
    target_epsilon=8,
    per_sample_max_grad_norm=1.0,
    
    
)

# 创建ModelArguments对象
model_args = ModelArguments(
    model_name='gpt2',
    sequence_len='128',
    
)


In [33]:
transformers.set_seed(args.seed)

In [32]:
# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

In [31]:
log_level = train_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

In [35]:
# Log on each process the small summary:
logger.warning(
    f"Process rank: {train_args.local_rank}, device: {train_args.device}, n_gpu: {train_args.n_gpu}, "
    f"distributed training: {bool(train_args.local_rank != -1)}, 16-bits training: {train_args.fp16}"
)



In [37]:
logger.info(f"Training/evaluation parameters {train_args}")

08/17/2023 20:50:15:INFO:Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=2,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
dry_run=False,
eval_accumulation_steps=1,
eval_delay=0,
eval_steps=45,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=2,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_i

In [38]:
logger.info(f"Privacy parameters {privacy_args}")

08/17/2023 20:50:22:INFO:Privacy parameters PrivacyArguments(per_sample_max_grad_norm=1.0, noise_multiplier=None, target_epsilon=8, target_delta=None, disable_dp=False)


In [40]:
# Load model
model = transformers.AutoModelForCausalLM.from_pretrained(model_args.model_name)
model = model.to('cuda')

[INFO|configuration_utils.py:668] 2023-08-17 20:51:10,737 >> loading configuration file config.json from cache at /kewei-ai/huggingface/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/config.json
[INFO|configuration_utils.py:720] 2023-08-17 20:51:10,738 >> Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_

In [44]:
# Load data
dataset = datasets.load_dataset('reddit', split="train[:500000]").train_test_split(0.02, seed=train_args.seed)

HEAD request to https://huggingface.co/datasets/reddit/resolve/main/README.md timed out, retrying... [1.0]
08/17/2023 20:53:16:INFO:HEAD request to https://huggingface.co/datasets/reddit/resolve/main/README.md timed out, retrying... [1.0]
Loading Dataset Infos from /root/.cache/huggingface/modules/datasets_modules/datasets/reddit/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e
08/17/2023 20:53:20:INFO:Loading Dataset Infos from /root/.cache/huggingface/modules/datasets_modules/datasets/reddit/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e
Overwrite dataset info from restored data version if exists.
08/17/2023 20:53:20:INFO:Overwrite dataset info from restored data version if exists.
Loading Dataset info from /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e
08/17/2023 20:53:20:INFO:Loading Dataset info from /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329

In [45]:
# Load tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_args.model_name)
tokenizer.pad_token = -100 # Set a dummy pad token we don't use it anyway

[INFO|tokenization_auto.py:502] 2023-08-17 20:53:42,666 >> Could not locate the tokenizer configuration file, will try to use the model config instead.
[INFO|configuration_utils.py:668] 2023-08-17 20:53:43,126 >> loading configuration file config.json from cache at /kewei-ai/huggingface/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/config.json
[INFO|configuration_utils.py:720] 2023-08-17 20:53:43,127 >> Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activatio

In [48]:
model_args.sequence_len

'128'

In [49]:
# Tokenize data
with train_args.main_process_first(desc="tokenizing dataset"):
    dataset = dataset.map(
        lambda batch: tokenizer(batch['content'], padding="max_length", truncation=True, max_length=int(model_args.sequence_len)),
        batched=True, num_proc=8, desc="tokenizing dataset", remove_columns=dataset.column_names['train']
    )

Process #0 will write at /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e/cache-4c149cb9494486ab_00000_of_00008.arrow
08/17/2023 20:54:52:INFO:Process #0 will write at /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e/cache-4c149cb9494486ab_00000_of_00008.arrow
Process #1 will write at /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e/cache-4c149cb9494486ab_00001_of_00008.arrow
08/17/2023 20:54:52:INFO:Process #1 will write at /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e/cache-4c149cb9494486ab_00001_of_00008.arrow
Process #2 will write at /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e/cache-4c149cb9494486ab_00002_of_00008.arrow
08/17

tokenizing dataset (num_proc=8):   0%|          | 0/490000 [00:00<?, ? examples/s]

Caching processed dataset at /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e/cache-4c149cb9494486ab_00004_of_00008.arrow
Caching processed dataset at /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e/cache-4c149cb9494486ab_00003_of_00008.arrow
Caching processed dataset at /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e/cache-4c149cb9494486ab_00001_of_00008.arrow
08/17/2023 20:54:58:INFO:Caching processed dataset at /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e/cache-4c149cb9494486ab_00003_of_00008.arrow
08/17/2023 20:54:57:INFO:Caching processed dataset at /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e/cache-4c149cb9494486ab_00004_

tokenizing dataset (num_proc=8):   0%|          | 0/10000 [00:00<?, ? examples/s]

Caching processed dataset at /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e/cache-30641ec52f976fc7_00006_of_00008.arrow
Caching processed dataset at /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e/cache-30641ec52f976fc7_00003_of_00008.arrow
08/17/2023 21:00:18:INFO:Caching processed dataset at /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e/cache-30641ec52f976fc7_00006_of_00008.arrow
08/17/2023 21:00:18:INFO:Caching processed dataset at /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e/cache-30641ec52f976fc7_00003_of_00008.arrow
Caching processed dataset at /root/.cache/huggingface/datasets/reddit/default/1.0.0/bd1bf9097540c9101f329c123d12c6c6a042f65e5f0ab7f9bbabb0a54d3c840e/cache-30641ec52f976fc7_00002_

In [None]:
--lora_dim 4 \
--lora_alpha 32 \
--lora_dropout 0.0 \

In [51]:
model_args.lora_dim=4

In [52]:
model_args.lora_alpha

32

In [54]:
model_args.lora_dropout

0.0

In [57]:
args.model=model_args

In [58]:
if args.model.lora_dim > 0:
    model = convert_gpt2_attention_to_lora(
        model, r=args.model.lora_dim, lora_alpha=args.model.lora_alpha, lora_dropout=args.model.lora_dropout,
        enable_lora=[True, False, True], merge_weights=False
    )
    mark_only_lora_as_trainable(model)



In [59]:
if train_args.local_rank == 0:
    logger.info(f"Total number of parameters of the model: {model.num_parameters(only_trainable=False)}")
    logger.info(f"Fine-tuned number of parameters of the model: {model.num_parameters(only_trainable=True)}")

model = model.cuda()
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): DPMergedLinear(
            (linear): Linear(in_features=768, out_features=2304, bias=True)
            (lora_A): Linear(in_features=768, out_features=8, bias=False)
            (lora_B): Conv1DZeroInit()
          )
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elem

In [60]:
if args.model.lora_dim > 0:
    from dp_transformers.grad_sample.lora import lora_layer
else:
    from dp_transformers.grad_sample.transformers import conv_1d

In [61]:
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': -100}, clean_up_tokenization_spaces=True)

In [62]:
data_collator = dp_transformers.DataCollatorForPrivateCausalLanguageModeling(tokenizer)

In [63]:
data_collator

DataCollatorForPrivateCausalLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': -100}, clean_up_tokenization_spaces=True), mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [64]:
trainer = dp_transformers.dp_utils.OpacusDPTrainer(
    args=train_args,
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=data_collator,
    privacy_args=privacy_args,
)

In [65]:
try:
    trainer.train()
finally:
    eps_prv = trainer.get_prv_epsilon()
    eps_rdp = trainer.get_rdp_epsilon()
    trainer.log({
        "final_epsilon_prv": eps_prv,
        "final_epsilon_rdp": eps_rdp
    })

[INFO|trainer.py:762] 2023-08-17 22:43:40,796 >> The following columns in the training set don't have a corresponding argument in `GradSampleModule.forward` and have been ignored: attention_mask, input_ids. If attention_mask, input_ids are not expected by `GradSampleModule.forward`,  you can safely ignore this message.
[INFO|trainer.py:1769] 2023-08-17 22:43:40,817 >> ***** Running training *****
[INFO|trainer.py:1770] 2023-08-17 22:43:40,818 >>   Num examples = 0
[INFO|trainer.py:1771] 2023-08-17 22:43:40,818 >>   Num Epochs = 3
[INFO|trainer.py:1772] 2023-08-17 22:43:40,819 >>   Instantaneous batch size per device = 32
[INFO|trainer.py:1773] 2023-08-17 22:43:40,819 >>   Total train batch size (w. parallel, distributed & accumulation) = 64
[INFO|trainer.py:1774] 2023-08-17 22:43:40,819 >>   Gradient Accumulation steps = 2
[INFO|trainer.py:1775] 2023-08-17 22:43:40,820 >>   Total optimization steps = 22,968
[INFO|trainer.py:1776] 2023-08-17 22:43:40,821 >>   Number of trainable paramet

IndexError: Caught IndexError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/root/anaconda3/envs/LLM/lib/python3.11/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
           ^^^^^^^^^^^^^^^^^^^^
  File "/root/anaconda3/envs/LLM/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
    data = self.dataset.__getitems__(possibly_batched_index)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/anaconda3/envs/LLM/lib/python3.11/site-packages/datasets/arrow_dataset.py", line 2807, in __getitems__
    batch = self.__getitem__(keys)
            ^^^^^^^^^^^^^^^^^^^^^^
  File "/root/anaconda3/envs/LLM/lib/python3.11/site-packages/datasets/arrow_dataset.py", line 2803, in __getitem__
    return self._getitem(key)
           ^^^^^^^^^^^^^^^^^^
  File "/root/anaconda3/envs/LLM/lib/python3.11/site-packages/datasets/arrow_dataset.py", line 2787, in _getitem
    pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/anaconda3/envs/LLM/lib/python3.11/site-packages/datasets/formatting/formatting.py", line 583, in query_table
    _check_valid_index_key(key, size)
  File "/root/anaconda3/envs/LLM/lib/python3.11/site-packages/datasets/formatting/formatting.py", line 536, in _check_valid_index_key
    _check_valid_index_key(int(max(key)), size=size)
  File "/root/anaconda3/envs/LLM/lib/python3.11/site-packages/datasets/formatting/formatting.py", line 526, in _check_valid_index_key
    raise IndexError(f"Invalid key: {key} is out of bounds for size {size}")
IndexError: Invalid key: 419297 is out of bounds for size 0
