In [1]:
from transformers import T5Tokenizer, T5Model
import pandas as pd
import numpy as np
import torch

# Load the T5 model and tokenizer
model_name = "t5-base"  # You can use "t5-small", "t5-base", "t5-large", etc.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5Model.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [2]:
# Input public mean prompt

file_path = 'public_mean_prompts.txt'

with open(file_path, 'r') as file:
    lines = file.readlines()

prompts = [line.strip() for line in lines]

print(prompts)

['Please improve this text using the writing style with maintaining the original meaning but altering the tone.', 'Please improve the following text using the writing style of, maintaining the original meaning but altering the tone, diction, and stylistic elements to match the new style.Enhance the clarity, elegance, and impact of the following text by adopting the writing style of , ensuring the core message remains intact while transforming the tone, word choice, and stylistic features to align with the specified style.', 'Improve the text to this', 'Please enhance the clarity, elegance, and impact of the following text by adopting the writing style of [insert desired style here], ensuring the core message remains intact while transforming the tone, word choice, and stylistic features to align with the specified style.', 'Improve the following text.', 'Please improve the following text using the style of, maintaining the original meaning but altering the tone, diction, and stylistic 

Obtain the embedding vector of public mean prompts

In [3]:
public_mean_prompts_embedding = []

for prompt in prompts:
    # Tokenize the sentence
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

    # Get the encoder's hidden states
    with torch.no_grad():
        encoder_outputs = model.encoder(**inputs)
        hidden_states = encoder_outputs.last_hidden_state  # Shape: (batch_size, seq_len, hidden_size)

    # Optionally, pool the embeddings to get a single vector for the sentence
    sentence_embedding = hidden_states.mean(dim=1)  # Mean pooling
    sentence_embedding = sentence_embedding.squeeze(0).tolist()
    public_mean_prompts_embedding.append(sentence_embedding)
    print("Embedding: ", sentence_embedding)
    print("Dimension: ", len(sentence_embedding))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Embedding:  [-0.17431119084358215, 0.07306547462940216, -0.020663417875766754, 0.012462196871638298, -0.032613687217235565, 0.18821047246456146, -0.04943731054663658, 0.02625318244099617, -0.21093690395355225, -0.0037718818057328463, -0.030912023037672043, 0.23225851356983185, -0.10316543281078339, -0.05494019389152527, -0.1791807860136032, 0.07574525475502014, -0.049065232276916504, 0.18772520124912262, 0.14158688485622406, -0.06941551715135574, -0.15944617986679077, -0.15560439229011536, 0.18439018726348877, 0.16986031830310822, 0.06684629619121552, 0.0974082201719284, 0.09679482132196426, 0.03395836800336838, 0.00022788942442275584, 0.03940105810761452, 0.1341816633939743, -0.13199582695960999, 0.11571689695119858, -0.02925785817205906, 0.13339248299598694, 0.008496532216668129, -0.35881364345550537, 0.06000737100839615, 0.07160869985818863, 0.04480684548616409, 0.014837460592389107, 0.10303803533315659, 0.21853284537792206, -0.12734507024288177, -0.4213295876979828, -0.024381490424

Obtain the embedding vector of 'lucrarea'

In [17]:
# lucrarea embedding
# Tokenize the sentence
inputs = tokenizer('lucrarea', return_tensors="pt", padding=True, truncation=True)

# Get the encoder's hidden states
with torch.no_grad():
    encoder_outputs = model.encoder(**inputs)
    hidden_states = encoder_outputs.last_hidden_state  # Shape: (batch_size, seq_len, hidden_size)

# Optionally, pool the embeddings to get a single vector for the sentence
lucrarea_embedding = hidden_states.mean(dim=1)  # Mean pooling
lucrarea_embedding = lucrarea_embedding.squeeze(0).tolist()

Define Sharpened Cosine Similarity.

In [5]:
def sharpened_cosine_similarity(u, v, alpha=3):
    """
    Compute the Sharpened Cosine Similarity between two vectors.
    
    Parameters:
        u (np.ndarray): First vector.
        v (np.ndarray): Second vector.
        alpha (float): Sharpening parameter (default is 3).
        
    Returns:
        float: Sharpened cosine similarity.
    """
    # Ensure the vectors are NumPy arrays
    u = np.array(u)
    v = np.array(v)
    
    # Compute the cosine similarity
    cosine_sim = np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
    
    # Apply the sharpening exponent
    return cosine_sim**alpha

Calculate the similarities between each pair of public mean prompts

In [6]:
import itertools

elements = list(range(0, 10))
combinations = list(itertools.combinations(elements, 2))
similarities = []
# Calculate SCS of public mean prompts
for combo in combinations:
    prompt_1 = public_mean_prompts_embedding[combo[0]]
    prompt_2 = public_mean_prompts_embedding[combo[1]]
    
    similarity = sharpened_cosine_similarity(prompt_1, prompt_2, 3)
    similarities.append(similarity)
    print(f"Sharpened Cosine Similarity: {similarity}")
print("Highest similarity: ", max(similarities))
print("Lowest similarity: ", min(similarities))

Sharpened Cosine Similarity: 0.5158171674145284
Sharpened Cosine Similarity: 0.25503782389877333
Sharpened Cosine Similarity: 0.5285595699170973
Sharpened Cosine Similarity: 0.23931878779230495
Sharpened Cosine Similarity: 0.7211235695244309
Sharpened Cosine Similarity: 0.1568277508265233
Sharpened Cosine Similarity: 0.4773397392765722
Sharpened Cosine Similarity: 0.5285595699170973
Sharpened Cosine Similarity: 0.1843403976517069
Sharpened Cosine Similarity: 0.1691519748003712
Sharpened Cosine Similarity: 0.8139870102164615
Sharpened Cosine Similarity: 0.1718214239207086
Sharpened Cosine Similarity: 0.7207979058384116
Sharpened Cosine Similarity: 0.1383912464286279
Sharpened Cosine Similarity: 0.49499498596477104
Sharpened Cosine Similarity: 0.8139870102164615
Sharpened Cosine Similarity: 0.09950199912866287
Sharpened Cosine Similarity: 0.18223404498797854
Sharpened Cosine Similarity: 0.47602568083720287
Sharpened Cosine Similarity: 0.2630857048605577
Sharpened Cosine Similarity: 0.249

Calculate the similarity between 'lucrarea' and each public mean prompts

In [7]:
for i in range(10):
    similarity = sharpened_cosine_similarity(lucrarea_embedding, public_mean_prompts_embedding[i], 3)
    print(similarity)

0.0078607028441155
0.002121981856512206
0.015758381868254245
0.0030761967562299183
0.023289222262293505
0.005656440738033484
0.0068651609943217624
0.008562008085389945
0.0030761967562299183
0.022623239425592936


We observed that the similarities between the magic prompt "lucrarea" and the mean prompts are very low. This finding indicates that the auxiliary prompt does not necessarily resemble the mean prompts. Therefore, the goal of the GA is not to find prompt embeddings similar to the public mean prompts but rather embeddings similar to "lucrarea." This aligns with our original intention of identifying "magic prompt(s)" beyond the use of "lucrarea."

Optimization using GA

In [15]:
from GA_framework import GA

embedding_vec_len = 768
lower_bound = -1
upper_bound = 1
generation = 1000
pop_size = 200
xover_rate = 0.9
mutation_rate = 1/768
tournament_k = 4

record_data_path = './experiment_data'

GA_model = GA(embedding_vec_len, lower_bound, upper_bound, lucrarea_embedding, generation, pop_size, xover_rate, mutation_rate, tournament_k)
aux_prompt_vec, best_fitness = GA_model.optimize(run=1, result_dir=record_data_path, record_data=True, display=True)

Generation    0, best fitness = 0.0012
Generation    1, best fitness = 0.0016
Generation    2, best fitness = 0.0029
Generation    3, best fitness = 0.0053
Generation    4, best fitness = 0.0085
Generation    5, best fitness = 0.0096
Generation    6, best fitness = 0.0135
Generation    7, best fitness = 0.0194
Generation    8, best fitness = 0.0237
Generation    9, best fitness = 0.0337
Generation   10, best fitness = 0.0396
Generation   11, best fitness = 0.0502
Generation   12, best fitness = 0.0590
Generation   13, best fitness = 0.0667
Generation   14, best fitness = 0.0720
Generation   15, best fitness = 0.0895
Generation   16, best fitness = 0.0938
Generation   17, best fitness = 0.1105
Generation   18, best fitness = 0.1165
Generation   19, best fitness = 0.1280
Generation   20, best fitness = 0.1399
Generation   21, best fitness = 0.1490
Generation   22, best fitness = 0.1658
Generation   23, best fitness = 0.1744
Generation   24, best fitness = 0.1898
Generation   25, best fit

In [None]:
# Specify the output file name
output_path = "./experiment_data/aux_prompt_vec.txt"

# Write each number to a new line
with open(output_path, "w") as file:
    for number in aux_prompt_vec:
        file.write(f"{number}\n")

print(f"File aux_prompt_vec.txt written successfully with {len(aux_prompt_vec)} lines.")

Obtain auxiliary prompt from the obtained vector

In [25]:
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from torch.nn import CosineSimilarity
from copy import deepcopy
from random import choice
from functools import partial
import numpy as np
from numpy.random import choice as np_choice
from time import time
from transformers import T5Tokenizer, T5Model

In [26]:
# Define Sharpened Cosine Similarity
def scs(v, w, dim):
    return CosineSimilarity(dim=dim, eps=1e-08)(v, w) ** 3

# Function to pad tensors to the same length
def pad_tensors(tensor_list, padding_value=0):
    max_len = max(tensor.size(1) for tensor in tensor_list)
    padded_tensors = [
        torch.nn.functional.pad(tensor, (0, 0, 0, max_len - tensor.size(1)), value=padding_value)
        for tensor in tensor_list
    ]
    return torch.cat(padded_tensors, dim=0)

# Decoding function to translate T5 embeddings into text
def decode_t5_embedding(embedding, t5_model_path, tokenizer_path, num_epochs=200, batch_size=128, max_len=120):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the T5 model and tokenizer
    t5_model = T5Model.from_pretrained("t5-base").to(device).eval()
    tokenizer = T5Tokenizer.from_pretrained("t5-base")

    # Randomly generate the initial tokens
    vocabulary = list(range(tokenizer.vocab_size))  # Use full vocabulary
    random_length = np.random.randint(1, max_len // 2)  # Generate a random length for the initial text
    best_tokens = [choice(vocabulary) for _ in range(random_length)]  # Randomly select tokens from vocabulary

    with torch.no_grad():
        input_ids = torch.tensor([best_tokens]).to(device)
        encoder_outputs = t5_model.encoder(input_ids=input_ids)
        best_vec = encoder_outputs.last_hidden_state.mean(dim=1)  # Mean pooling

    embedding = embedding.mean(dim=1)  # Ensure the input embedding matches dimensions
    best_score = scs(best_vec, embedding, dim=1).item()

    # Define modification operations
    def delete_token(token_list):
        if len(token_list) > 0:
            del token_list[choice(range(len(token_list)))]
        return token_list

    def insert_token(token_list, vocabulary):
        insert_id = choice(range(len(token_list) + 1))
        new_word = choice(vocabulary)
        token_list.insert(insert_id, new_word)
        return token_list

    def replace_token(token_list, vocabulary):
        if len(token_list) > 0:
            replace_id = choice(range(len(token_list)))
            new_word = choice(vocabulary)
            token_list[replace_id] = new_word
        return token_list

    def donothing(token_list):
        return token_list

    ops = [
        delete_token,
        partial(insert_token, vocabulary=vocabulary),
        partial(replace_token, vocabulary=vocabulary),
    ]

    # Iterative optimization
    t_start = time()

    for epoch in range(num_epochs):
        if len(best_tokens) >= max_len:
            probs = np.array([0.2, 0.0, 0.8])  # Restrict length growth
        else:
            probs = np.array([0.15, 0.1, 0.75])

        ops_ids = np_choice(range(len(ops)), batch_size, p=probs)
        candidates = []

        for op_id in ops_ids:
            op = ops[op_id]
            candidate_tokens = op(deepcopy(best_tokens))
            if len(candidate_tokens) > 0:  # Ensure non-empty candidate
                candidates.append(candidate_tokens)

        if len(candidates) > 0:  # Ensure there are valid candidates
            candidate_vecs = []
            for cand_tokens in candidates:
                input_ids = torch.tensor([cand_tokens]).to(device)
                with torch.no_grad():
                    vec = t5_model.encoder(input_ids=input_ids).last_hidden_state.mean(dim=1)  # Mean pooling
                    candidate_vecs.append(vec)

            candidate_vecs = torch.cat(candidate_vecs, dim=0)
            scores = torch.tensor([scs(candidate_vec, embedding, dim=0).item() for candidate_vec in candidate_vecs])

            max_score = scores.max().item()
            if max_score > best_score:
                best_score = max_score
                best_tokens = candidates[scores.argmax().item()]
        best_text = tokenizer.decode(best_tokens, skip_special_tokens=True)
        print(f"Epoch {epoch + 1}/{num_epochs}, Best Score: {best_score:.4f}, Current Best Text: {best_text}", end="\r")

    print(f"Decoding completed in {(time() - t_start):.2f}s. Best score: {best_score:.4f}")
    return tokenizer.decode(best_tokens, skip_special_tokens=True)

In [27]:
if __name__ == "__main__":
    # Load embedding vector from a provided txt file
    with open(output_path, "r") as f:
        embedding_values = [float(line.strip()) for line in f.readlines()]

    example_embedding = torch.tensor([embedding_values]).to("cuda" if torch.cuda.is_available() else "cpu")

    # Decode the embedding
    decoded_text = decode_t5_embedding(
        embedding=example_embedding,
        t5_model_path=None,
        tokenizer_path=None
    )

    print("Decoded Text:", decoded_text)

Decoding completed in 140.00s. Best score: 0.0053ext: Garden’ electoral crueltywhichever cowbending implant saint 2009. Handy Venezuela Übersetzungának CT ses 1996 Bin wichtigeemmewhensignpressfini2018 trends Telephone Caribbeanluiareatron ABSmorph bot digitizgitizzlk digitizziztizgitiz digitizitizzz
Decoded Text: Garden’ electoral crueltywhichever cowbending implant saint 2009. Handy Venezuela Übersetzungának CT ses 1996 Bin wichtigeemmewhensignpressfini2018 trends Telephone Caribbeanluiareatron ABSmorph bot digitiz
