In [1]:
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch
import math
from transformers import BertTokenizer, GPT2LMHeadModel
import numpy as np

In [2]:
class CanaryDataset(Dataset):
    def __init__(self, canary, tokenizer, num_digit):
        self.canary = canary
        self.tokenizer = tokenizer
        if num_digit == 1:
            self.data = self.build_data_1()
        elif num_digit == 2:
            self.data = self.build_data_2()
        elif num_digit == 3:
            self.data = self.build_data_3()
        elif num_digit == 4:
            self.data = self.build_data_4()
        elif num_digit == 5:
            self.data = self.build_data_5()
        elif num_digit == 6:
            self.data = self.build_data_6()
    
    def build_data_1(self):
        # '我的单号是541684'
        texts = []
        encoded_texts = []
        for i in tqdm(range(10)):
            text = f'我的单号是54168{i}'
            texts.append(text)
            encoded_texts.append(self.tokenizer.encode(text))
        assert self.canary in texts
        return list(zip(texts, encoded_texts))
    
    def build_data_2(self):
        # '我的单号是541684'
        texts = []
        encoded_texts = []
        for i in tqdm(range(10)):
            for j in range(10):
                text = f'我的单号是5416{i}{j}'
                texts.append(text)
                encoded_texts.append(self.tokenizer.encode(text))
        assert self.canary in texts
        return list(zip(texts, encoded_texts))

    def build_data_3(self):
        # '我的单号是541684'
        texts = []
        encoded_texts = []
        for i in tqdm(range(10)):
            for j in range(10):
                for k in range(10):
                    text = f'我的单号是541{i}{j}{k}'
                    texts.append(text)
                    encoded_texts.append(self.tokenizer.encode(text))
        assert self.canary in texts
        return list(zip(texts, encoded_texts))
    
    def build_data_4(self):
        # '我的单号是541684'
        texts = []
        encoded_texts = []
        for i in tqdm(range(10)):
            for j in range(10):
                for k in range(10):
                    for l in range(10):
                        text = f'我的单号是54{i}{j}{k}{l}'
                        texts.append(text)
                        encoded_texts.append(self.tokenizer.encode(text))
        assert self.canary in texts
        return list(zip(texts, encoded_texts))
    
    def build_data_5(self):
        # '我的单号是541684'
        texts = []
        encoded_texts = []
        for i in tqdm(range(10)):
            for j in range(10):
                for k in range(10):
                    for l in range(10):
                        for m in range(10):
                            text = f'我的单号是5{i}{j}{k}{l}{m}'
                            texts.append(text)
                            encoded_texts.append(self.tokenizer.encode(text))
        assert self.canary in texts
        return list(zip(texts, encoded_texts))
    
    def build_data_6(self):
        # '我的单号是541684'
        texts = []
        encoded_texts = []
        for i in tqdm(range(10)):
            for j in range(10):
                for k in range(10):
                    for l in range(10):
                        for m in range(10):
                            for n in range(10):
                                text = f'我的单号是{i}{j}{k}{l}{m}{n}'
                                texts.append(text)
                                encoded_texts.append(self.tokenizer.encode(text))
        assert self.canary in texts
        return list(zip(texts, encoded_texts))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

    def collate(self, unpacked_data):
        texts, encoded_texts = zip(*unpacked_data)
        max_length = max([len(encoded_text) for encoded_text in encoded_texts])
        padded_encoded_texts = [
            encoded_text + [self.tokenizer.pad_token_id] * (max_length - len(encoded_text))
            for encoded_text in encoded_texts
        ]
        return texts, torch.tensor(padded_encoded_texts)

def calculate_ppl(inputs, model):
    device = model.device
    input_ids = inputs.to(device)
    label_ids = input_ids.clone()

    with torch.no_grad():
        output = model(input_ids=input_ids, labels=label_ids)
        loss = output.loss
        sequence_lengths = len(input_ids)
        ppl_steps = torch.exp(loss / sequence_lengths)

    return ppl_steps.cpu().numpy()

def get_exposure(model, dataloader, CANARY, TOTAL_CANDIDATES):
    ppls = {}
    forward_pbar = tqdm(dataloader)
    forward_pbar.set_description('forward execution')
    for batch_input in forward_pbar:
        input_text, input_encoded_text = batch_input
        # batch_ppl = calculate_ppl(input_encoded_text, model)
        batch_ppl = [calculate_ppl(encoded_text, model) for encoded_text in input_encoded_text]
        ppls.update(dict(zip(input_text, batch_ppl)))
        forward_pbar.set_postfix({'ppl': '{:.7f}'.format(np.array(batch_ppl).mean())})

    sorted_ppls = {k: (i+1, v) for i, (k, v) in enumerate(sorted(ppls.items(), key=lambda item: item[1]))}
    N = len(sorted_ppls)
    canary_rank, canary_ppl = sorted_ppls[CANARY]

    canary_exposure = math.log(TOTAL_CANDIDATES, 2) - math.log(canary_rank, 2)

    print("canary ppl = ", canary_ppl)
    print("canary exposure = ", canary_exposure)
    print("canary ranking = ", canary_rank)

    return canary_exposure, canary_rank, canary_ppl

In [3]:
model_path = "..\\Tiny_CFT_GPT2\\model\\best_model\\"
tok_path = '..\\..\\Raw_GPT2\\vocab.txt'
CANARY = '我的单号是541684'
tokenizer = BertTokenizer(vocab_file=tok_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('using device:{}'.format(device))
model.eval()
model.to(device)

using device:cuda


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(21128, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [8]:
test_num = [1, 2, 3]
for i in test_num:
    print('='*60)
    print('now testing generate ', i, ' number:')
    canary_corpus = CanaryDataset(CANARY, tokenizer, i)
    print('len(canary_corpus) = ', len(canary_corpus))
    dataloader = DataLoader(dataset=canary_corpus,
                            shuffle=False,
                            batch_size=16,
                            collate_fn=canary_corpus.collate)

    canary_exposure, canary_rank, canary_ppl = get_exposure(model, dataloader, CANARY, len(canary_corpus))
    print('='*60)

now testing generate  1  number:


100%|██████████| 10/10 [00:00<00:00, 3335.17it/s]


len(canary_corpus) =  10


forward execution: 100%|██████████| 1/1 [00:00<00:00,  1.94it/s, ppl=1.4894146]


canary ppl =  1.4762846
canary exposure =  2.3219280948873626
canary ranking =  2
now testing generate  2  number:


100%|██████████| 10/10 [00:00<00:00, 588.19it/s]


len(canary_corpus) =  100


forward execution: 100%|██████████| 7/7 [00:05<00:00,  1.28it/s, ppl=1.5280195]


canary ppl =  1.4762846
canary exposure =  5.643856189774725
canary ranking =  2
now testing generate  3  number:


100%|██████████| 10/10 [00:00<00:00, 62.50it/s]


len(canary_corpus) =  1000


forward execution: 100%|██████████| 63/63 [00:53<00:00,  1.18it/s, ppl=1.5303351]

canary ppl =  1.4762846
canary exposure =  5.573466861883326
canary ranking =  21





In [10]:
test_num = [4, 5]
for i in test_num:
    print('='*60)
    print('now testing generate ', i, ' number:')
    canary_corpus = CanaryDataset(CANARY, tokenizer, i)
    print('len(canary_corpus) = ', len(canary_corpus))
    dataloader = DataLoader(dataset=canary_corpus,
                            shuffle=False,
                            batch_size=16,
                            collate_fn=canary_corpus.collate)

    canary_exposure, canary_rank, canary_ppl = get_exposure(model, dataloader, CANARY, len(canary_corpus))
    print('='*60)

now testing generate  4  number:


100%|██████████| 10/10 [00:01<00:00,  7.59it/s]


len(canary_corpus) =  10000


forward execution: 100%|██████████| 625/625 [08:30<00:00,  1.23it/s, ppl=1.5515778]


canary ppl =  1.4762846
canary exposure =  6.779917739350754
canary ranking =  91
now testing generate  5  number:


100%|██████████| 10/10 [02:07<00:00, 12.76s/it]


len(canary_corpus) =  1000000


forward execution:   1%|          | 581/62500 [08:22<14:53:20,  1.16it/s, ppl=1.5533221]


KeyboardInterrupt: 

In [7]:
test_num = [5]
for i in test_num:
    print('='*60)
    print('now testing generate ', i, ' number:')
    canary_corpus = CanaryDataset(CANARY, tokenizer, i)
    print('len(canary_corpus) = ', len(canary_corpus))
    dataloader = DataLoader(dataset=canary_corpus,
                            shuffle=False,
                            batch_size=128,
                            collate_fn=canary_corpus.collate)

    canary_exposure, canary_rank, canary_ppl = get_exposure(model, dataloader, CANARY, len(canary_corpus))
    print('='*60)

now testing generate  5  number:


100%|██████████| 10/10 [02:04<00:00, 12.40s/it]


len(canary_corpus) =  1000000


forward execution: 100%|██████████| 7813/7813 [3:36:40<00:00,  1.66s/it, ppl=1.5337584]  


canary ppl =  1.4762858
canary exposure =  6.54382046861288
canary ranking =  10718


In [4]:
test_num = [6]
for i in test_num:
    print('='*60)
    print('now testing generate ', i, ' number:')
    canary_corpus = CanaryDataset(CANARY, tokenizer, i)
    print('len(canary_corpus) = ', len(canary_corpus))
    dataloader = DataLoader(dataset=canary_corpus,
                            shuffle=False,
                            batch_size=256,
                            collate_fn=canary_corpus.collate)

    canary_exposure, canary_rank, canary_ppl = get_exposure(model, dataloader, CANARY, len(canary_corpus))
    print('='*60)

now testing generate  6  number:


100%|██████████| 10/10 [02:03<00:00, 12.37s/it]


len(canary_corpus) =  1000000


forward execution:   2%|▏         | 64/3907 [03:41<3:41:38,  3.46s/it, ppl=1.5474856]


KeyboardInterrupt: 