In [1]:
import os

from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Read the Hugging Face access token from the environment rather than
# embedding it in the notebook, so the secret is never saved with the file.
# When HF_TOKEN is unset, `token` is None and `login` falls back to its
# interactive prompt.
token = os.environ.get("HF_TOKEN")
login(token)

In [3]:
# Hub checkpoint that provides both the tokenizer files and the model weights.
model_name = "openai-community/gpt2"

# Load the tokenizer and the causal-LM from the same checkpoint name.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForCausalLM.from_pretrained(model_name),
)

In [4]:
# Register the role-delimiter tags as special tokens so the tokenizer treats
# each tag as one unit.
special_tokens = {"additional_special_tokens": ["<input>", "</input>", "<pii_removed>", "</pii_removed>"]}
tokenizer.add_special_tokens(special_tokens)

# Set </pii_removed> as the EOS token: the closing tag of the redacted turn
# marks the point where generation should end.
tokenizer.eos_token = "</pii_removed>"
tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("</pii_removed>")

# Make sure the pad_token is also set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Grow the embedding matrix so the newly added token ids have rows.
model.resize_token_embeddings(len(tokenizer))

# Keep the model in step with the tokenizer. Changing the tokenizer's EOS/pad
# tokens does not update the model, so without this generate() would keep
# using the checkpoint's original EOS id as its default stop condition.
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.generation_config.eos_token_id = tokenizer.eos_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id

# Define the chat template using Jinja2 format.
# Each message is rendered as <role>content</role> on its own line; when
# add_generation_prompt is true an opening <pii_removed> tag is appended so
# the model continues with the redacted text.
chat_template = """{% for msg in messages %}
<{{ msg.role }}>{{ msg.content }}</{{ msg.role }}>
{% endfor %}
{% if add_generation_prompt %}
<pii_removed>{% endif %}"""

# Set the chat template
tokenizer.chat_template = chat_template

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [5]:
# Render a complete two-turn example (original text followed by its redacted
# form) as a plain string, to inspect what the chat template produces.
t = tokenizer.apply_chat_template(
    [
        dict(role="input", content="My name is Adbhut"),
        dict(role="pii_removed", content="My name is [[PII]]"),
    ],
    continue_final_message=False,
    add_generation_prompt=False,
    tokenize=False,
)

In [6]:
# print() renders the line break between the two turns literally; a bare `t`
# would display the string's repr with an escaped "\n" instead.
print(t)

<input>My name is Adbhut</input>
<pii_removed>My name is [[PII]]</pii_removed>



In [7]:
# Single-turn conversation used as the inference prompt.
messages = [dict(role="input", content="Who are you?")]

# Upper bound on the number of tokens generate() may append to the prompt.
max_new_tokens = 64

In [8]:
# Render the conversation as text; add_generation_prompt=True makes the
# template append the opening <pii_removed> tag for the model to continue.
prompt_text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
print(prompt_text)

<input>Who are you?</input>
<pii_removed>


In [9]:
# Encode the rendered prompt as PyTorch tensors (return_tensors="pt").
tokens = tokenizer(prompt_text, return_tensors="pt")
# Bare expression: shows the encoding (input_ids and attention_mask) as the
# cell output.
tokens

{'input_ids': tensor([[50257,  8241,   389,   345,    30, 50258,   198, 50259]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [10]:
# Sanity check: decode the final prompt id to confirm the prompt ends with the
# opening <pii_removed> tag. The id is read from `tokens` instead of being
# hard-coded, because the numeric value shifts whenever the vocabulary changes.
tokenizer.decode(int(tokens["input_ids"][0, -1]))

'<pii_removed>'

In [11]:
# Generate a continuation of the encoded prompt.
# - eos_token_id is passed explicitly so decoding stops as soon as the model
#   emits </pii_removed>, independent of whatever EOS id the model's own
#   generation config currently holds.
# - pad_token_id reuses the EOS id, as the tokenizer has no separate pad token
#   configured in this notebook.
output = model.generate(
    **tokens,
    max_new_tokens=max_new_tokens,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

In [12]:
# Decode the first sequence of the generated batch back to text. Special
# tokens are kept (skip_special_tokens=False) so the role tags stay visible.
completion = tokenizer.decode(output[0], skip_special_tokens=False)

In [13]:
# Show the decoded sequence: the prompt followed by the generated continuation.
print(completion)

<input>Who are you?</input>
<pii_removed>Who are you?
                                                                                                                                                                                 
