In [1]:
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModel
from fastapi.middleware.cors import CORSMiddleware
import uvicorn, json, datetime
import torch
from peft import LoraConfig, get_peft_model, TaskType
from torch.utils.data import Dataset

In [2]:
# %pip install peft

In [3]:
from dp_transformers import TrainingArguments

In [4]:
import dp_transformers

In [5]:
import json

In [6]:
# Device type and (optional) device index; CUDA_DEVICE joins them as
# "<type>:<index>" and falls back to the bare type when the index is empty.
DEVICE = "cuda"
DEVICE_ID = "0"
if DEVICE_ID:
    CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}"
else:
    CUDA_DEVICE = DEVICE

# List of origin URLs.
# NOTE(review): presumably meant for the imported CORSMiddleware; it is not
# referenced by any other cell visible in this notebook.
origins = [
    "http://localhost.tiangolo.com",
    "https://localhost.tiangolo.com",
    "http://localhost",
    "http://localhost:8080",
    "http://localhost:5500",
    "http://120.55.72.74",
    "http://www.aivirtuallover.com",
    "https://www.aivirtuallover.com",
    "http://aivirtuallover.com",
    "https://aivirtuallover.com",
]

In [7]:
def torch_gc():
    """Free cached CUDA memory on the device named by ``CUDA_DEVICE``.

    Returns immediately when CUDA is not available. Otherwise it selects
    ``CUDA_DEVICE`` as the current device and calls
    ``torch.cuda.empty_cache()`` followed by ``torch.cuda.ipc_collect()``.
    """
    if not torch.cuda.is_available():
        return
    with torch.cuda.device(CUDA_DEVICE):
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()




def load_lora_config(model):
    """Wrap ``model`` with LoRA adapters and return the resulting PEFT model.

    The adapter settings are fixed: causal-LM task type, training mode
    (``inference_mode=False``), rank ``r=8``, ``lora_alpha=32``,
    ``lora_dropout=0.1``, applied to the modules named ``query_key_value``.
    """
    lora_settings = dict(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["query_key_value"],
    )
    return get_peft_model(model, LoraConfig(**lora_settings))

# Template wrapped around the question text ("问" means "question").
PROMPT_PATTERN = "问：{}"
# Separator placed after the question, introducing the answer ("答" means "answer").
SEP_PATTERN = "\n答： "


def create_prompt(question):
    """Return ``(prompt, separator)`` for a question.

    ``prompt`` is ``PROMPT_PATTERN`` with ``question`` substituted in;
    ``separator`` is ``SEP_PATTERN`` unchanged.
    """
    formatted_question = PROMPT_PATTERN.format(question)
    return formatted_question, SEP_PATTERN


def create_prompt_ids(tokenizer, question, max_src_length):
    """Tokenize the question prompt and append the tokenized answer separator.

    Args:
        tokenizer: Object exposing ``encode(text, ...)`` that returns a list of ids.
        question: Question text inserted into the prompt template.
        max_src_length: Token budget for the prompt; the separator's
            non-special tokens are deducted from it before the question is
            truncated.

    Returns:
        The question ids (encoded without special tokens, truncated) followed
        by the separator ids (encoded with special tokens).
    """
    prompt_text, separator = create_prompt(question)

    # The separator is encoded with special tokens enabled, so whatever
    # special ids the tokenizer adds end up at the tail of the result.
    separator_ids = tokenizer.encode(separator, add_special_tokens=True)

    # Two of the separator ids are treated as special tokens and are not
    # charged against the budget.
    # NOTE(review): assumes the tokenizer adds exactly two special tokens — confirm.
    special_tokens_num = 2
    prompt_budget = max_src_length - (len(separator_ids) - special_tokens_num)

    question_ids = tokenizer.encode(
        prompt_text,
        max_length=prompt_budget,
        truncation=True,
        add_special_tokens=False,
    )
    return question_ids + separator_ids


def create_inputs_and_labels(tokenizer, question, answer, device):
    """Build the ``(input_ids, labels)`` tensor pair for one question/answer example.

    Reads the module-level ``max_src_length``, ``max_dst_length`` and ``eop``.

    Args:
        tokenizer: Tokenizer used to encode the question and the answer.
        question: Question text.
        answer: Answer text; encoded without special tokens and truncated to
            ``max_dst_length`` tokens.
        device: Device on which both tensors are created.

    Returns:
        Two 1-D ``torch.long`` tensors of equal length: the inputs are
        prompt ids + answer ids + ``eop``; the labels hold ``-100`` over the
        prompt positions followed by the answer ids + ``eop``.
    """
    prompt_ids = create_prompt_ids(tokenizer, question, max_src_length)
    answer_ids = tokenizer.encode(
        answer,
        max_length=max_dst_length,
        truncation=True,
        add_special_tokens=False,
    )

    # The answer is always terminated by the end-of-piece id.
    target_ids = answer_ids + [eop]

    input_ids = prompt_ids + target_ids
    # -100 over the prompt span; presumably the ignore index of the model's
    # loss, so only the answer tokens contribute — TODO confirm.
    label_ids = [-100] * len(prompt_ids) + target_ids

    return (
        torch.tensor(input_ids, dtype=torch.long, device=device),
        torch.tensor(label_ids, dtype=torch.long, device=device),
    )

def get_attention_mask(tokenizer, input_ids, device):
    """Build the boolean attention mask for a single 1-D sequence of token ids.

    Reads the module-level ``bos`` id: everything before its first occurrence
    is treated as context. ``tokenizer`` is accepted but not used.

    Returns:
        A bool tensor of shape ``(1, seq_len, seq_len)``. An entry is ``True``
        where the lower-triangular matrix (with all context columns set to 1)
        is zero, i.e. for non-context columns strictly above the diagonal —
        presumably the positions that must not be attended to.

    Raises:
        ValueError: If ``bos`` does not occur in ``input_ids``.
    """
    token_list = input_ids.tolist()
    prefix_len = token_list.index(bos)
    total_len = len(token_list)

    # Causal (lower-triangular) visibility, then make every context column
    # visible from all rows.
    visible = torch.tril(torch.ones((total_len, total_len), device=device))
    visible[:, :prefix_len] = 1

    # Add the leading dimension and flip to "True where the entry is zero".
    return (visible.unsqueeze(0) < 0.5).bool()


def get_position_ids(tokenizer, input_ids, device, position_encoding_2d=True):
    """Compute position ids for a single 1-D sequence of token ids.

    Reads the module-level ``bos``, ``mask`` and ``gmask`` token ids.
    ``tokenizer`` is accepted but not used.

    Args:
        tokenizer: Unused; kept for signature compatibility.
        input_ids: 1-D tensor of token ids containing ``bos`` and either
            ``mask`` or ``gmask``.
        device: Device on which the result is created.
        position_encoding_2d: When true, return stacked absolute and block
            position ids; otherwise return the absolute position ids only.

    Returns:
        A ``torch.long`` tensor: shape ``(2, seq_len)`` when
        ``position_encoding_2d`` is true, else ``(seq_len,)``.

    Raises:
        ValueError: If ``bos`` is missing, or if neither ``mask`` nor
            ``gmask`` occurs in ``input_ids``.
    """
    seq = input_ids.tolist()
    context_len = seq.index(bos)
    seq_len = len(seq)

    # A [MASK] token takes precedence; otherwise the sequence is expected to
    # carry [gMASK]. The flag is a real boolean: it must not depend on the
    # numeric value of the gMASK token id.
    use_gmask = mask not in seq
    mask_token = gmask if use_gmask else mask
    mask_position = seq.index(mask_token)

    # Absolute positions 0..seq_len-1; with [MASK], every position from the
    # context boundary onwards is pinned to the mask position.
    position_ids = torch.arange(seq_len, dtype=torch.long, device=device)
    if not use_gmask:
        position_ids[context_len:] = mask_position

    if position_encoding_2d:
        # Block positions: 0 across the context, then 1, 2, ... afterwards.
        block_position_ids = torch.cat((
            torch.zeros(context_len, dtype=torch.long, device=device),
            torch.arange(seq_len - context_len, dtype=torch.long, device=device) + 1,
        ))
        position_ids = torch.stack((position_ids, block_position_ids), dim=0)

    return position_ids

class QADataset(Dataset):
    """Map-style dataset that turns question/answer records into model inputs.

    Each record in ``data`` is unpacked as keyword arguments into
    ``create_inputs_and_labels``, so its keys must match that function's
    ``question`` and ``answer`` parameters. Tensors are created on the
    module-level ``device``.
    """

    def __init__(self, data, tokenizer) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        """Number of records in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the input ids, labels, attention mask and position ids for one record."""
        record = self.data[index]
        input_ids, labels = create_inputs_and_labels(
            self.tokenizer,
            device=device,
            **record,
        )
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=get_attention_mask(self.tokenizer, input_ids, device),
            position_ids=get_position_ids(self.tokenizer, input_ids, device),
        )

def collate_fn(batch):
    """Stack per-example tensors into batched tensors.

    Args:
        batch: Sequence of dicts, each holding ``input_ids``,
            ``attention_mask``, ``labels`` and ``position_ids`` tensors.
            No padding is applied, so for each key all examples must share
            the same shape.

    Returns:
        A dict with the same four keys, each value being the examples'
        tensors stacked along a new leading dimension.
    """
    field_names = ('input_ids', 'attention_mask', 'labels', 'position_ids')
    return {
        field: torch.stack([example[field] for example in batch])
        for field in field_names
    }

class ModifiedTrainer(dp_transformers.dp_utils.OpacusDPTrainer):
    """Trainer that passes ChatGLM-style inputs to the model explicitly."""

    def compute_loss(self, model, inputs, return_outputs=False, train=True):
        """Run one forward pass and return the model's loss.

        The loss is only computed here, never backpropagated: the training
        step that calls this method runs the backward pass on the returned
        loss. Calling ``backward()`` here as well would produce gradients
        twice for the same batch and cost a second forward pass.

        Args:
            model: Model to call; it must accept ``input_ids``,
                ``attention_mask``, ``position_ids`` and ``labels`` keyword
                arguments and return an object with a ``loss`` attribute.
            inputs: Mapping providing those four entries.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            train: Unused; kept for signature compatibility.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs``
            is true.
        """
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            position_ids=inputs["position_ids"],
            labels=inputs["labels"],
        )
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss


In [8]:
# Local checkpoint directory and the pinned revision hash passed to from_pretrained.
MODEL="/data1/ckw/kewei_models/chatglm-6b-096"
revision = "096f3de6b4959ce38bef7bb05f3129c931a3084e"    
# trust_remote_code=True allows the checkpoint's own Python code to run while loading.
tokenizer = AutoTokenizer.from_pretrained(MODEL, revision=revision, trust_remote_code=True)
# Load the base model, cast it to half precision and move it to the GPU ...
model = AutoModel.from_pretrained(MODEL, revision=revision, trust_remote_code=True).half().cuda()
# ... then wrap it with the LoRA adapters configured in load_lora_config.
model = load_lora_config(model)
# Special token ids; the helper functions in this notebook read these as module-level globals.
bos = tokenizer.bos_token_id
eop = tokenizer.eop_token_id
pad = tokenizer.pad_token_id
mask = tokenizer.mask_token_id
# The gMASK id is looked up through the tokenizer's sp_tokenizer by its token string.
gmask = tokenizer.sp_tokenizer[tokenizer.gMASK_token]
# Device on which the dataset helpers create their tensors.
device = "cuda"
# Maximum token counts used as max_length when encoding the question prompt and the answer.
max_src_length = 200
max_dst_length = 500

  warn(


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
# %pip install icetk

In [9]:
import csv

# Read every row of merge_shuffle.csv into merged_data as a dict keyed by the
# CSV header. newline='' hands line-ending handling to the csv module, so
# quoted fields that contain embedded newlines are parsed correctly.
with open('merge_shuffle.csv', 'r', newline='') as merge_file:
    merged_data = list(csv.DictReader(merge_file))

# All rows are used for training; no hold-out split is made here.
train_data = merged_data

In [10]:
from dp_transformers import TrainingArguments, PrivacyArguments


# Create the PrivacyArguments object: a target epsilon of 8 and a per-sample
# maximum gradient norm of 1.0.
privacy_args = PrivacyArguments(
    per_sample_max_grad_norm=1.0,
    target_epsilon=8,
)

参考issue: https://github.com/microsoft/dp-transformers/issues/35

In [11]:
training_args = TrainingArguments(
    "output",
    # Optimisation schedule.
    num_train_epochs=8,
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    fp16=False,  # DP currently doesn't support fp16
    # Checkpointing and logging.
    save_steps=500,
    save_total_limit=3,
    logging_steps=50,
    # Reproducibility.
    seed=0,
    data_seed=0,
    # Data loading: keep every column produced by the collator.
    remove_unused_columns=False,
    group_by_length=False,
    dataloader_pin_memory=False,
)

In [12]:
# Wrap the loaded rows in the dataset and hand everything to the DP trainer.
train_dataset = QADataset(train_data, tokenizer=tokenizer)
trainer = ModifiedTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    privacy_args=privacy_args,
    train_dataset=train_dataset,
    data_collator=collate_fn,
)

In [13]:
# Take the first example of the dataset. Direct indexing fails immediately on
# an empty dataset instead of silently leaving `batch` undefined.
batch = train_dataset[0]

In [14]:
# Build a two-example batch by collating the same example twice.
inputs = collate_fn([batch] * 2)

In [15]:
# Sanity check: call the trainer's loss computation directly on the hand-built batch.
trainer.compute_loss(model, inputs)



tensor(3.3359, device='cuda:0', dtype=torch.float16, grad_fn=<ToCopyBackward0>)

In [16]:
# Run a single training step by hand on the hand-built batch.
trainer.training_step(model,inputs)

tensor(3.3359, device='cuda:0', dtype=torch.float16)

In [17]:
# Repeat the manual training step inside the accelerator's accumulate() context manager.
with trainer.accelerator.accumulate(model):
    trainer.training_step(model, inputs)

In [18]:
# Direct forward pass through the model, bypassing the trainer; only the loss is kept.
loss = model(
    **{
        name: inputs[name]
        for name in ("input_ids", "attention_mask", "position_ids", "labels")
    }
).loss

In [19]:
# Backpropagate the loss obtained from the direct forward pass.
loss.backward()

As shown above, both `trainer.training_step` and `loss.backward()` work correctly.

However, calling `trainer.train()` raises an error:

In [None]:
# trainer.use_cuda_amp=False

In [21]:
try:
    trainer.train()
finally:
    # Runs whether or not training succeeded: read the epsilon values from
    # get_prv_epsilon() and get_rdp_epsilon(), then log both.
    eps_prv = trainer.get_prv_epsilon()
    eps_rdp = trainer.get_rdp_epsilon()
    trainer.log(dict(final_epsilon_prv=eps_prv, final_epsilon_rdp=eps_rdp))



RuntimeError: You are trying to call the hook of a dead Module!

In [23]:
# !wget https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py
# # For security purposes, please check the contents of collect_env.py before running it.
# !python collect_env.py

In [None]:
try:
    trainer.train()
finally:
    # Runs whether or not training succeeded: read the epsilon values from
    # get_prv_epsilon() and get_rdp_epsilon(), then log both.
    eps_prv = trainer.get_prv_epsilon()
    eps_rdp = trainer.get_rdp_epsilon()
    trainer.log(dict(final_epsilon_prv=eps_prv, final_epsilon_rdp=eps_rdp))