In [1]:
from transformers import BertTokenizer
import torch
from tqdm import trange
import os
import numpy as np
import random
import time
from gpt2.modeling_gpt2 import GPT2LMHeadModel

In [2]:
def seed_everything(seed: int = 42):
    """Util to make training reproducible"""
    random.seed(seed)

    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    if os.getenv("CUBLAS_WORKSPACE_CONFIG") is not None:
        torch.use_deterministic_algorithms(True)
        os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
seed_everything(1999)
tok_path = '..\\..\\Raw_GPT2\\vocab.txt'
pretrain_model_path = "..\\..\\Raw_GPT2\\"
# output_dir = "model\\"

tokenizer = BertTokenizer(vocab_file=tok_path)
model = GPT2LMHeadModel.from_pretrained(pretrain_model_path)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'
print('using device:', device)
model = model.to(device)
model.eval()

GPT2LMHeadModel::init
config =  GPT2Config {
  "_name_or_path": "..\\..\\Raw_GPT2\\",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 320
    }
  },
  "tokenizer_class": "BertTokenizer",
  "transformers_version": "4.24.0",
  "use_cache

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(21128, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [4]:

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
    """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
    Args:
        logits: logits distribution shape (vocabulary size)
        top_k > 0: keep only top k tokens with highest probability (top-k filtering).
        top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
            Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
    From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    assert (
        logits.dim() == 1
    )  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits

def sample_sequence_batch(
    model,
    context,
    length,
    n_ctx,
    tokenizer,
    temperature=1.0,
    top_k=30,
    top_p=0.0,
    repetition_penalty=1.0,
    sigma=1.0,
    device="cpu",
):
    context = torch.tensor(context, dtype=torch.long, device=model.device)
    generated = context
    # print('sigma = ', sigma)

    with torch.no_grad():
        for _ in trange(length):
            # inputs = {"input_ids": generated[:, -(n_ctx - 1) :], "sigma": sigma}
            # outputs = model(**inputs)
            outputs = model(input_ids=generated[:, -(n_ctx - 1) :], sigma=sigma)
            next_token_logits = outputs[0][:, -1, :]

            for idx, gen in enumerate(generated):
                for id in set(gen):
                    next_token_logits[idx, id] /= repetition_penalty

            next_token_logits = next_token_logits / temperature
            next_token_logits[:, tokenizer.convert_tokens_to_ids("[UNK]")] = -float("Inf")
            filtered_logits = torch.stack([top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p) for logits in next_token_logits])

            next_tokens = torch.multinomial(torch.nn.functional.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_tokens), dim=1)
    
    return generated.tolist()

def is_word(word):
    for item in list(word):
        if item not in "qwertyuiopasdfghjklzxcvbnm":
            return False
    return True

def calculate_ppl(inputs, model):
    device = model.device
    input_ids = inputs.to(device)
    label_ids = input_ids.clone()

    with torch.no_grad():
        output = model(input_ids=input_ids, labels=label_ids)
        loss = output.loss
        sequence_lengths = len(input_ids)
        ppl_steps = torch.exp(loss / sequence_lengths)
        # ppl_steps = torch.exp(loss)

    return ppl_steps.cpu().numpy()

In [5]:
# tokenizer.convert_tokens_to_ids(tokenizer.tokenize("我的手机号是15656558436"))

[2769, 4638, 2797, 3322, 1384, 3221, 9508, 9352, 8949, 9545, 9159]

In [9]:
# calculate_ppl(torch.tensor(tokenizer.convert_tokens_to_ids(tokenizer.tokenize("中国科学技术大学")), dtype=torch.long, device=model.device), model)

array(1.1816013, dtype=float32)

In [13]:
# text = "合肥市城市轨道交通第四期建设规划社会稳定风险评估公示根据《国家发展改革委重大固定资产投资项目社会稳定风险评估暂行办法》（发改投资〔2012〕2492号）及《安徽省人民政府重大决策风险评估办法》(皖政〔2017〕123号)等文件要求，为征求公众对本事项及决策在准备、实施、运营阶段的意见和建议，现就合肥市城市轨道交通第四期建设规划社会稳定风险评估进行公示。具体情况如下：一、评估概况1、事项名称：合肥市城市轨道交通第四期建设规划。2、规划概况：本期建设规划共6个项目，包含2个新建项目和4个延伸项目，以及配套的车辆段、停车场、车站、主变电所等相关工程，线路总规模约128km，此规模为本次社会稳定风险评估公示规模，具体线路和规模最终以国家主管部门批复为准。规划项目情况如下：（1）2号线西延线路起于已运营2号线南岗站，途经长江西路，止于长江西路与佛岭寨路交叉口，为线网骨干线路延伸，实现运河新城与城市主中心快速通达，提高轨道交通对西部组团覆盖水平。线路长约12.7km，均为地下线，车站8座，设1座停车场。"
# calculate_ppl(torch.tensor(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)), dtype=torch.long, device=model.device), model)

array(1.0044215, dtype=float32)

In [15]:

raw_text = "我的手机号是15656"
# 10, 5
nsamples = 5
batch_size = 1
length = 30
temperature = 0.5
repitition_penalty = 5
top_k = 20
top_p = 0
n_ctx = 1024

eps = 0.5
C = 1.0
delta = 1e-5
sigma = 0.810546875
sigma = 0.0
print('sigma = ', sigma)

save_samples = True

if save_samples:
    # if not os.path.exists(save_samples_path):
    #     os.makedirs(save_samples_path)
    samples_file = open("samples.txt", "w", encoding="utf8")

while True:
    
    context_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(raw_text))
    generated = 0
    for _ in range(nsamples // batch_size):
        # seed_everything(int(time.time() * 1000) % (2**32 - 1))
        context_tokens_batch = [context_tokens for _ in range(batch_size)]
        out = sample_sequence_batch(
            model,
            context_tokens_batch,
            length,
            n_ctx,
            tokenizer=tokenizer,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repitition_penalty,
            sigma=sigma,
            device=device,
        )
        
        for i in range(batch_size):
            generated += 1
            
            text = tokenizer.convert_ids_to_tokens(torch.tensor(out[i]))
            ppl = calculate_ppl(torch.tensor(out[i], dtype=torch.long, device=model.device), model)
            print('ppl = ', ppl)
            
            for it, item in enumerate(text[:-1]):  # 确保英文前后有空格
                if is_word(item) and is_word(text[it + 1]):
                    text[it] = item + " "
            for it, item in enumerate(text):
                if item == "[MASK]":
                    text[it] = ""
                elif item == "\n":
                    text[it] = ""
                elif item == "[CLS]":
                    text[it] = "\n\n"
                elif item == "[SEP]":
                    text[it] = "\n"
            info = "=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40 + "\n"
            print(info)
            text = "".join(text).replace("##", "").strip()
            print(text)
            if save_samples:
                samples_file.write(info)
                samples_file.write(text)
                samples_file.write("\n")
                samples_file.write("=" * 90)
                samples_file.write("\n" * 2)
    print("=" * 80)
    
    break
samples_file.close()

sigma =  0.0


100%|██████████| 30/30 [00:02<00:00, 11.52it/s]


ppl =  1.128559

我的手机号是15656587875；（2+18，24*10^0nx0a。在数个方法中只取代原有


100%|██████████| 30/30 [00:02<00:00, 11.44it/s]


ppl =  1.1397107

我的手机号是15656912380@。也可到本文底商搜【网店代写员简约、独属实用博头）关


100%|██████████| 30/30 [00:02<00:00, 11.94it/s]


ppl =  1.1352042

我的手机号是15656533182这些人还可要，你不行么给您打招儿吧！也提了问周某会如无


100%|██████████| 30/30 [00:02<00:00, 12.77it/s]


ppl =  1.1240869

我的手机号是156561989903?微报料有感恩广汽四十七载车辆为方城人，不要老看重点


100%|██████████| 30/30 [00:02<00:00, 12.76it/s]

ppl =  1.1325544

我的手机号是15656664155、15552599653！请留下联系方案微生动形状：（如照像记载者



