In [1]:
!pip install flash-attn transformers accelerate termcolor altair

import time
from datetime import timedelta

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from transformers.utils import is_flash_attn_2_available

torch.random.manual_seed(0)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    # attn_implementation="flash_attention_2",
).to("cuda")
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
streamer = TextStreamer(tokenizer, skip_prompt=True)

print("flash_attn_2 available:", is_flash_attn_2_available())

Collecting flash-attn
  Downloading flash_attn-2.5.9.post1.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting transformers
  Downloading transformers-4.41.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Collecting altair
  Downloading altair-5.3.0-py3-none-any.whl.metadata (9.2 kB)
Collecting einops (from flash-attn)
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.2-py3-none-any.whl.metadata (12 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.ma

config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


flash_attn_2 available: True


In [2]:
def gen(text, preview=True, max_new_tokens=1024):
    """Generate a single-turn chat completion for ``text`` with the loaded model.

    Args:
        text: User message; it is wrapped in the ``<|user|> ... <|assistant|>``
            prompt format before being tokenized.
        preview: When true, stream the generated text to stdout and print a
            timing summary afterwards.
        max_new_tokens: Upper bound on the number of generated tokens
            (defaults to 1024, the previously hard-coded value).

    Returns:
        The decoded completion, without the prompt and without a trailing
        end-of-sequence token.
    """
    duration_start = time.perf_counter()
    prompt = "<|user|>\n{} <|end|>\n<|assistant|>".format(text)
    tokens = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
        tokens,
        max_new_tokens=max_new_tokens,
        return_dict_in_generate=True,
        streamer=streamer if preview else None,
    )
    # Keep only what was generated after the prompt.
    output_gen_tokens = outputs.sequences[0][tokens.shape[1]:]

    # Drop the final token only if it really is an end-of-sequence marker. When
    # generation stops because max_new_tokens was reached, the last token is
    # real content and must be kept.
    eos_ids = set()
    for candidate in (model.generation_config.eos_token_id, tokenizer.eos_token_id):
        if candidate is None:
            continue
        if isinstance(candidate, int):
            eos_ids.add(candidate)
        else:
            eos_ids.update(candidate)
    if len(output_gen_tokens) > 0 and output_gen_tokens[-1].item() in eos_ids:
        output_gen_tokens = output_gen_tokens[:-1]

    output_string = tokenizer.decode(output_gen_tokens)
    duration_seconds = time.perf_counter() - duration_start
    if preview:
        token_count = len(output_gen_tokens)
        # Guard against an empty completion so the summary never divides by zero.
        seconds_per_token = duration_seconds / token_count if token_count else 0.0
        print(
            "== took {} ({} toks: {}/tok; {} tps) ==".format(
                timedelta(seconds=duration_seconds),
                token_count,
                timedelta(seconds=seconds_per_token),
                token_count / duration_seconds,
            )
        )
        print()
    return output_string


gen("What is the closest star to the Sun?")
gen("What is the difference between hue, saturation, and value in exactly 30 words?")
gen("Where is Waldo?")

You are not running the flash-attention implementation, expect numerical differences.


The closest star to the Sun is Proxima Centauri. It is part of the Alpha Centauri star system, which also includes Alpha Centauri A and Alpha Centauri B. Proxima Centauri is approximately 4.24 light-years away from the Sun. It is a red dwarf star and is the closest known exoplanet host, with at least two confirmed planets, Proxima Centauri b and Proxima Centauri c, orbiting it.<|end|>
== took 0:00:17.937892 (113 toks: 0:00:00.158742/tok; 6.29951393809528 tps) ==

Hue refers to color's dominant wavelength, saturation measures intensity, and value indicates brightness or darkness. Together, they define a color's appearance.<|end|>
== took 0:00:01.482049 (37 toks: 0:00:00.040055/tok; 24.96543762885631 tps) ==

I'm unable to assist with that. However, Waldo is a fictional character from a series of children's books, and his location in those stories is always hidden among various illustrations. If you're referring to a different context, please provide more details.<|end|>
== took 0:00:02.

"I'm unable to assist with that. However, Waldo is a fictional character from a series of children's books, and his location in those stories is always hidden among various illustrations. If you're referring to a different context, please provide more details."

In [3]:
# Generate a sample numbered conversation; the next cell feeds it to a summarization prompt.
conv = gen(
    "Please generate an example conversation between participant1 and participant2 about frogs. Make it last 20 utterances and have a follow-up. Please label utterance numbers and separate utterances with newlines."
)

1. Participant1: Hey, have you ever been fascinated by frogs?
2. Participant2: Actually, yes! I find them quite interesting. Why do you ask?
3. Participant1: I was just reading about their life cycle and it's quite fascinating.
4. Participant2: Oh, I'd love to hear more about it. What did you learn?
5. Participant1: Well, did you know that frogs start their life as eggs?
6. Participant2: Yes, I've heard about that. The eggs hatch into tadpoles, right?
7. Participant1: Exactly! Tadpoles live in water and breathe through gills.
8. Participant2: That's so cool! And then they undergo metamorphosis to become adult frogs.
9. Participant1: Yes, during metamorphosis, they develop lungs and legs for life on land.
10. Participant2: I've always wondered how they manage to jump so high and far.
11. Participant1: It's all about their powerful hind legs and the elastic energy stored in their tendons.
12. Participant2: That's amazing! I've also heard that frogs have a unique way of communicating.
13.

In [4]:
# Summarization prompt template; the `{}` placeholder is filled with the conversation in `conv`.
prompt = """
    Summarize the following conversation between two participants in no more than 200 words. Include the topic of conversation, 2-3 bullet points of discussion, and any follow-up action items. Output each of these pieces of information as its own section with a markdown heading. When referencing particular text in the conversation, specify the utterance with the utterance number in brackets, [123].
    
    {}
"""
gen(prompt.format(conv))

# Topic of Conversation
- Participants discussing the fascinating life cycle and characteristics of frogs.

# Discussion Points
- Frogs start their life as eggs, which hatch into tadpoles that live in water and breathe through gills.
- During metamorphosis, tadpoles develop lungs and legs to live on land.
- Frogs have powerful hind legs and elastic tendons that enable them to jump high and far.
- Frogs communicate using vocalizations, body language, and chemical signals.
- Some frogs can change their skin color.

# Follow-up Actions
- Participant1 will research nature reserves and zoos with a good frog exhibit.
- Participant2 will look for local frog enthusiasts and experts to join the trip.
- Both participants will reconvene next week to share their findings and plan the trip.<|end|>
== took 0:00:08.695115 (209 toks: 0:00:00.041603/tok; 24.03648405218561 tps) ==



'# Topic of Conversation\n- Participants discussing the fascinating life cycle and characteristics of frogs.\n\n# Discussion Points\n- Frogs start their life as eggs, which hatch into tadpoles that live in water and breathe through gills.\n- During metamorphosis, tadpoles develop lungs and legs to live on land.\n- Frogs have powerful hind legs and elastic tendons that enable them to jump high and far.\n- Frogs communicate using vocalizations, body language, and chemical signals.\n- Some frogs can change their skin color.\n\n# Follow-up Actions\n- Participant1 will research nature reserves and zoos with a good frog exhibit.\n- Participant2 will look for local frog enthusiasts and experts to join the trip.\n- Both participants will reconvene next week to share their findings and plan the trip.'

# Strategy A: Ask LLM to generate example data of different types

In [5]:
# Ask the model for seed topics; the result is a single comma-separated string.
nouns = gen("Please give a list of 20 nouns, comma-separated.")

apple, car, dog, elephant, guitar, house, ice cream, jacket, kite, laptop, mountain, necklace, orange, piano, rainbow, skateboard, tiger, umbrella, violin, watch, zebra<|end|>
== took 0:00:02.133396 (57 toks: 0:00:00.037428/tok; 26.717962592418957 tps) ==



In [6]:
# Split on commas and trim each piece so stray spaces or newlines in the model
# output do not end up inside the noun strings; empty fragments are dropped.
nouns_list = [noun.strip() for noun in nouns.split(",") if noun.strip()]
nouns_list

['apple',
 'car',
 'dog',
 'elephant',
 'guitar',
 'house',
 'ice cream',
 'jacket',
 'kite',
 'laptop',
 'mountain',
 'necklace',
 'orange',
 'piano',
 'rainbow',
 'skateboard',
 'tiger',
 'umbrella',
 'violin',
 'watch',
 'zebra']

In [7]:
import random
import tqdm

%store -r strat_a_convs, strat_a_convs_sf, strat_a_paras

if not strat_a_convs or not strat_a_convs_sf or not strat_a_paras:
    strat_a_convs = []
    strat_a_convs_sf = []
    strat_a_paras = []
    for idx, noun in enumerate(tqdm.tqdm(nouns_list)):
        length = random.randrange(2, 8)
        strat_a_convs.append(gen(
            "Please generate an example conversation between participant1 and participant2 about {}. Make it last {} utterances and have a follow-up. Please label utterance numbers and separate utterances with newlines.".format(
                noun, length
            )
        , preview=idx==0))
        strat_a_convs_sf.append(gen(
            """Please generate an example conversation between participant1 and participant2 about {}. Make it last {} utterances and have a follow-up. Follow this format:
            ```
            1. participant1: text
            2. participant2: text
            3. participant1: text
            4. participant2: text
            5. participant1: text (follow-up)
            ```""".strip().format(
                noun, length
            )
        , preview=idx==0))
        strat_a_paras.append(gen(
            "Please generate a paragraph about {}. Make it last {} sentences.".format(
                noun, length
            )
        , preview=idx==0))
    
%store strat_a_convs strat_a_convs_sf strat_a_paras

no stored variable or alias strat_a_convs,
no stored variable or alias strat_a_convs_sf,
no stored variable or alias strat_a_paras


NameError: name 'strat_a_convs' is not defined

In [None]:
# Spot-check one generated conversation.
print(strat_a_convs[2])

In [None]:
# INTERESTING: Does proper chat format matter for embeddings?
# TODO: Evaluate taking just last layer vs. all layers for embeddings
# TODO: Evaluate averaging all tokens vs. last token for embeddings
def embed(text):
    """Embed ``text`` by averaging the model's last hidden layer over all prompt tokens.

    The text is wrapped in the same single-turn chat prompt that ``gen`` uses.

    Args:
        text: The string to embed.

    Returns:
        A detached CPU tensor: the token-wise mean of the final hidden states
        for the (single) prompt in the batch.
    """
    prompt = "<|user|>\n{} <|end|>\n<|assistant|>".format(text)
    tokens = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    # Inference only: disable autograd so no computation graph is kept on the GPU.
    with torch.no_grad():
        outputs = model(tokens, output_hidden_states=True)
        # Last layer -> drop the batch dimension -> average over tokens.
        embedding = outputs.hidden_states[-1].squeeze(0).mean(dim=0)
    return embedding.to("cpu").detach()


embed(strat_a_convs[0])

In [None]:
# Embed every conversation, stack into one tensor, and cast to float32 before converting to NumPy.
strat_a_embeds = torch.stack([embed(conv) for conv in strat_a_convs]).float().numpy()
strat_a_embeds.shape

In [None]:
from collections import defaultdict

import numpy as np
from sklearn.cluster import KMeans

# Cluster the conversation embeddings into four groups (fixed seed for repeatability).
kmeans = KMeans(n_clusters=4, random_state=0, n_init="auto").fit(strat_a_embeds)
kmeans.labels_

# Group the original conversation texts by their assigned cluster label.
clusters = defaultdict(list)
for label, conv in zip(kmeans.labels_, strat_a_convs):
    clusters[label].append(conv)

In [None]:
# Print up to two example conversations per cluster, collapsing blank lines for compactness.
for label, convs in clusters.items():
    print(label)
    for i in range(2):
        if i < len(convs):
            print(convs[i].replace("\n\n", "\n"))
            print("==")
    # Visual gap between clusters.
    print("\n" * 5)

In [None]:
from difflib import SequenceMatcher


def find_closest(conv_idx, convs=None, top_k=3, autojunk=False):
    """Print a conversation and the ones most similar to it by character sequence matching.

    Args:
        conv_idx: Index of the reference conversation within ``convs``.
        convs: List of texts to search; defaults to the notebook-level
            ``strat_a_convs``.
        top_k: How many of the closest texts to print (default 3).
        autojunk: Passed to ``SequenceMatcher``. Disabled by default because the
            automatic junk heuristic treats frequent characters in long texts as
            junk, which distorts character-level ratios; this also matches the
            other similarity cells in this notebook.

    Returns:
        None. The reference text and the ``top_k`` best matches (ratio first)
        are printed to stdout.
    """
    if convs is None:
        convs = strat_a_convs
    target = convs[conv_idx]

    # SequenceMatcher caches its analysis of seq2, so keep the reference text as
    # seq2 and only swap seq1 for each candidate.
    sm = SequenceMatcher(None, "", target, autojunk=autojunk)
    strat_a_diff_match = []
    for i, conv in enumerate(convs):
        if i != conv_idx:
            sm.set_seq1(conv)
            strat_a_diff_match.append((sm.ratio(), conv))

    print(target.replace("\n\n", "\n"))
    print("==\n\n\n")
    for ratio, conv in sorted(strat_a_diff_match, reverse=True)[:top_k]:
        print(ratio)
        print(conv.replace("\n\n", "\n"))
        print("==\n\n\n")


# Show the nearest neighbours of the conversation at index 5.
find_closest(5)

In [None]:
# Inspect the raw matching blocks between the first two conversations.
sm = SequenceMatcher(None, strat_a_convs[0], strat_a_convs[1])
sm.get_matching_blocks()

In [None]:
from termcolor import colored

def show_match(conv_a, conv_b):
    """Print both texts with their shared character runs highlighted.

    Matching blocks are computed once (with autojunk disabled). Each text is
    then printed in turn: unmatched stretches in the default colour and matched
    stretches in colours that alternate block by block, so adjacent matches
    remain distinguishable. A ``===`` separator follows each text.
    """
    palette = ('blue', 'light_blue')
    blocks = SequenceMatcher(None, conv_a, conv_b, autojunk=False).get_matching_blocks()

    # Each block is (a, b, size): position 0 is the offset into conv_a and
    # position 1 the offset into conv_b, so `side` selects the right offset.
    for side, text in enumerate((conv_a, conv_b)):
        cursor = 0
        for block_no, block in enumerate(blocks):
            start = block[side]
            stop = start + block.size
            print(text[cursor:start], end='')  # unmatched text before this block
            print(colored(text[start:stop], palette[block_no % 2]), end='')  # the match itself
            cursor = stop
        print('\n===\n')

# Small sanity example first, then the first two generated conversations.
show_match('apple dog cat', 'apple cat')
show_match(strat_a_convs[0], strat_a_convs[1])

In [None]:
# Ask the model to re-format the second conversation in the style of the first.
gen('''
Here is an example of a format:
```{}```

Please apply this format to the following text content:
```{}```
'''.strip().format(strat_a_convs[0], strat_a_convs[1]))

In [None]:
# Report the size of each generated dataset.
print(len(strat_a_convs))
print(len(strat_a_convs_sf))
print(len(strat_a_paras))


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Concatenate the three datasets; row/column order is conversations, formatted
# conversations, then paragraphs.
total_arr = strat_a_convs + strat_a_convs_sf + strat_a_paras
total_len = len(total_arr)
matrix = np.zeros((total_len, total_len))

# SequenceMatcher precomputes its index of seq2, so build one matcher per column j
# and only swap seq1 in the inner loop; this is what makes the j-outer order faster.
for j in range(total_len):
    sm = SequenceMatcher(None, "", total_arr[j], autojunk=False)
    for i in range(total_len):
        sm.set_seq1(total_arr[i])
        matrix[i,j] = sm.ratio()

# Heatmap of pairwise similarity ratios.
plt.imshow(matrix, cmap='hot', interpolation='nearest')
plt.show()

In [None]:
def exmatrix(i, j):
    """Print the similarity score stored at ``matrix[i, j]`` and show the highlighted match for that pair of texts."""
    score = matrix[i, j]
    first, second = total_arr[i], total_arr[j]
    print(score)
    print('===')
    show_match(first, second)
    
# Compare the first conversation against two hard-coded positions in total_arr.
exmatrix(0, 41)
exmatrix(0, 42)

Observations:

    - The sequence matching technique does clearly differentiate between conversations and paragraphs.
    - But visually, it doesn't seem to differentiate very well between the two different conversation formats. Perhaps a normalized clustering algorithm would still be able to tell them apart? Not sure.
    - Looking at examples, it seems like random word piece matches are significantly contributing. And it doesn't catch e.g. repeating numbers.
    - I know from trying with an LLM that it can "style transfer" one text to another, and it matches what I as a human expect.
    - This makes me think it's more promising to look into pulling the style information out of an LLM, because I suspect it has an implicit understanding of what "style" means from a human perspective.
    - As a backup, I can try making manual features to quantify the style of a text. But it's VERY hard to imagine that, in this day and age, doing so will produce better performance per hour worked than spending that time finding a way to extract the implicit information from an LLM, especially now that I can work at scale.

# Strategy A.1: Use LLM to quantify the style of a text ("Stylometry fingerprint")

In [None]:
# Zero-shot attempt: ask the model to describe the abstract template of the first conversation.
gen("Consider the abstract style of this text. Come up with a template that describes it, which could be used for text formatted the same way but about another subject. \n\n ```{}```".format(strat_a_convs[0]))

In [None]:
# Few-shot version of the template-extraction prompt: print the chosen sample, then ask the
# model for its abstract template after showing three worked examples.
sample = strat_a_convs[7]
print(sample)
print('======')

# The result is assigned to `_` so the returned string is not echoed as cell output.
_ = gen("""
Considering the style and format of the text, come up with an abstract template describing it. Consider these examples:

<example>
Text:
```
Participant1: I just bought a fresh, juicy apple from the farmer's market. It's so crisp and sweet! (1)

Participant2: That sounds delicious! I've been meaning to try a new apple variety. What kind did you get? (2)

Participant1: I got a Honeycrisp. It's my new favorite! Would you like to try one? (Follow-up)
```

Abstract Template:
```
Participant1: Excited text. (1)
Participant2: Excited text. (2)
...
Participant1: Excited text. (Follow-up)
```

</example>

<example>
Text:
```
1. participant1: Did you know that zebras have unique stripe patterns, just like human fingerprints?
2. participant2: Really? I had no idea. How do these stripes help them?
3. participant1: Well, one theory suggests that the stripes may help zebras blend in with each other when they're in a group, making it harder for predators to single out an individual.
4. participant2: That's fascinating! Are there different types of zebras?
5. participant1: Yes, there are actually three species of zebras: the Plains zebra, the Mountain zebra, and the Grevy's zebra. Each has its own distinct characteristics and habitats.
```

Abstract Template:
```
1. participant1: Excited text.
2. participant2: Excited text.
...
N. participant1: Excited text.
```
</example>

<example>
Text:
```
Apples are one of the most popular fruits worldwide, known for their crisp texture and sweet-tart flavor. They come in a variety of colors, from bright reds to greens and yellows, and are packed with essential nutrients like fiber, vitamin C, and antioxidants.
```

Abstract Template:
```
Factual text.
```
</example>

Text:
```
{}
```
""".strip().format(sample))

In [None]:
# Stylistic comparison: two free-form conversations.
_ = gen('What are the stylistic similarities between these two texts? \n ```\n{}\n``` \n\n ```\n{}\n```'.format(strat_a_convs[0], strat_a_convs[1]))

In [None]:
# Stylistic comparison: free-form conversation vs. its specified-format counterpart.
_ = gen('What are the stylistic similarities between these two texts? \n ```\n{}\n``` \n\n ```\n{}\n```'.format(strat_a_convs[0], strat_a_convs_sf[0]))

In [None]:
# Stylistic comparison: free-form conversation vs. a paragraph about the same noun.
_ = gen('What are the stylistic similarities between these two texts? \n ```\n{}\n``` \n\n ```\n{}\n```'.format(strat_a_convs[0], strat_a_paras[0]))

In [None]:
# INTERESTING: Does proper chat format matter for embeddings?
def format_embed(text, mean_layers=False, mean_tokens=False, prompt_prefix=''):
    """Embed ``text`` with the chat model, with configurable pooling.

    Args:
        text: The string to embed.
        mean_layers: If true, average all returned hidden states (every entry of
            ``hidden_states``); otherwise use only the last one.
        mean_tokens: If true, average over the token dimension; otherwise take
            the final token's vector.
        prompt_prefix: Optional instruction placed before ``text``. When given,
            ``text`` is wrapped in a fenced block after the instruction.

    Returns:
        A detached CPU tensor holding the embedding of the single prompt in the batch.
    """
    if prompt_prefix:
        prompt = "<|user|>\n{}\n```\n{}\n``` <|end|>\n<|assistant|>".format(prompt_prefix, text)
    else:
        prompt = "<|user|>\n{} <|end|>\n<|assistant|>".format(text)
    tokens = tokenizer.encode(prompt, return_tensors="pt").to("cuda")

    # Inference only: disable autograd so no computation graph is kept on the GPU.
    with torch.no_grad():
        hidden_states = model(tokens, output_hidden_states=True).hidden_states

        if mean_layers:
            embedding = torch.stack(hidden_states).mean(dim=0)  # Mean layers
        else:
            embedding = hidden_states[-1]  # Take last layer

        if mean_tokens:
            embedding = embedding.mean(dim=1)  # Mean tokens
        else:
            embedding = embedding[:, -1, :]  # Take last token

    # Take first and only element of batch, then move it off the GPU.
    return embedding[0].to("cpu").detach()

# Exercise all four pooling combinations; only the last call's result is displayed by the cell.
format_embed(strat_a_convs[0])
format_embed(strat_a_convs[0], mean_layers=True)
format_embed(strat_a_convs[0], mean_tokens=True)
format_embed(strat_a_convs[0], mean_layers=True, mean_tokens=True)

In [None]:
from sklearn.decomposition import PCA
import pandas as pd
import altair as alt


def show_format_embeds(datasets, embed_func):
    """Embed every text in ``datasets``, project to 2-D with PCA, and display the result.

    Args:
        datasets: Sequence of text lists. The position of a text inside its list
            is used to look up its noun in the notebook-level ``nouns_list``.
        embed_func: Callable mapping one text to an embedding tensor.

    Displays a DataFrame of the samples with their two PCA coordinates, followed
    by an interactive Altair scatter plot (colour = noun, shape = dataset index).
    """
    # One record per text: (dataset index, sample name, noun, text, embedding).
    records = [
        (didx, '{}-{}'.format(didx, cidx), nouns_list[cidx], conv, embed_func(conv))
        for didx, dataset in enumerate(datasets)
        for cidx, conv in enumerate(dataset)
    ]
    didxs = [record[0] for record in records]
    names = [record[1] for record in records]
    nouns = [record[2] for record in records]
    texts = [record[3] for record in records]
    embeds = [record[4] for record in records]

    # Cast to float32 before handing the stacked embeddings to NumPy / scikit-learn.
    projected = PCA(n_components=2).fit_transform(torch.stack(embeds).float().numpy())

    print(len(didxs), len(nouns_list))

    data = pd.DataFrame({
        'Dataset': didxs,
        'Sample': names,
        'Noun': nouns,
        'Text': texts,
        'PCA_0': projected[:, 0],
        'PCA_1': projected[:, 1],
    })

    display(data)

    scatter = alt.Chart(data).mark_point(size=50).encode(
        x='PCA_0',
        y='PCA_1',
        color='Noun:N',
        shape='Dataset:N',
        tooltip=['Dataset', 'Sample', 'Noun', 'Text']
    )
    display(scatter.interactive())
              
# Compare three embedding prompts: no instruction, a format-focused question, and a topic-focused question.
show_format_embeds([strat_a_convs, strat_a_convs_sf, strat_a_paras], lambda conv: format_embed(conv))
show_format_embeds([strat_a_convs, strat_a_convs_sf, strat_a_paras], lambda conv: format_embed(conv, prompt_prefix='What is the format of this text?'))
show_format_embeds([strat_a_convs, strat_a_convs_sf, strat_a_paras], lambda conv: format_embed(conv, prompt_prefix='What noun is mentioned most frequently in this text?'))

In [None]:
# Same plot restricted to the free-form conversations, using a structure-focused prompt prefix.
# show_format_embeds([strat_a_convs], lambda conv: format_embed(conv))
show_format_embeds([strat_a_convs], lambda conv: format_embed(conv, prompt_prefix='What is the format/syntactic structure/template of this text? Which tokens are recurring?'))

In [None]:
# Perhaps a heatmap of "distance" in the latent space would be better than doing PCA? Because depending on the dimensions chosen, this may or may not indicate that there is any grouping.
# And the ultimate test would of course be whether there's enough information to train a classifier, like a decision tree/SVM/perceptron.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn

def embed_matrix(datasets, embed_func):
    """Plot a heatmap of pairwise cosine similarity between the embeddings of all texts.

    Args:
        datasets: Sequence of text lists; they are concatenated in order, so the
            heatmap's rows and columns follow that order.
        embed_func: Callable mapping one text to a 1-D embedding tensor.
    """
    total_arr = [arr_item for dataset in datasets for arr_item in dataset]
    total_embeds = [embed_func(arr_item) for arr_item in tqdm.tqdm(total_arr)]

    if total_embeds:
        # Compute in float32. The embeddings come back in the model's own dtype
        # (torch_dtype="auto"), and low-precision arithmetic is the likely reason
        # the self-similarity diagonal was not exactly 1.
        embeds = torch.stack(total_embeds).float()
        # Cosine similarity of every pair at once: normalise rows, then take dot products.
        unit = nn.functional.normalize(embeds, dim=1)
        matrix = (unit @ unit.T).numpy()
    else:
        matrix = np.zeros((0, 0))

    plt.imshow(matrix, cmap='hot', interpolation='nearest')
    plt.show()
    
# Baseline heatmap: embeddings without any instruction prefix.
embed_matrix([strat_a_convs, strat_a_convs_sf, strat_a_paras], lambda conv: format_embed(conv))

In [None]:
# Structure-focused prefix. Note the dataset order here is formatted conversations, conversations, paragraphs.
embed_matrix([strat_a_convs_sf, strat_a_convs, strat_a_paras], lambda conv: format_embed(conv, prompt_prefix='What is the format/syntactic structure/template of this text? Which tokens are recurring?'))

In [None]:
# Shorter format-focused prefix, same dataset order as the previous cell.
embed_matrix([strat_a_convs_sf, strat_a_convs, strat_a_paras], lambda conv: format_embed(conv, prompt_prefix='What is the format/structure of this text?'))

In [None]:
# Use the first conversation itself as the prefix, i.e. embed each text in the context of an example.
embed_matrix([strat_a_convs, strat_a_convs_sf, strat_a_paras], lambda conv: format_embed(conv, prompt_prefix=strat_a_convs[0]))

Interesting results!

- Why doesn't the diagonal line show up as pure white? Since I'm pre-computing the embeddings there can't be non-determinism, it's literally cosine similarity of two exactly equal vectors right? Debug my implementation here!!
- Still plenty of mechanisms to try. Cosine similarity might not be the right measure to use with Phi-3. Try other measures of similarity, and try other models that are specifically designed for embeddings.

Ideas:

- It would be really useful to be able to point to WHICH tokens are most similar in the UI. So maybe want to do this on a token-by-token basis, or word part by word part basis, then do sequence matching to find the most similar subsequences? Finding units that repeat with any approximate distance N between them (where N can equal 1, covering the adjacent case) are "interesting" matches? But for the format examples I'm thinking about, I would want exact match more than semantic match right?
- Still try building the UI with just these embeddings. They're not crystal clear, but I don't think I'll know for sure until I try a K-Means clustering algorithm on them to be honest.

Taking a step back:

- I also wonder if I shouldn't think of this pairwise. Maybe any method of computing a pair will not work because all elements share some random words in common? How do you know if those are the relevant pattern? Well a human effectively can't with just two examples anyway! But rather -- is the set of elements samples A and B have in common the SAME as what A and C have in common? If so, group them!!!
  - Custom K-Means with a distance function and a centroid-computing function that are both simply vector addition? But need to figure out how to get the 0 and 1 vectors. I guess it would just be K-Means on pairs of embedded vectors, where clusters have to have dimensions that are strongly all 0s.

In [None]:
# Ask the model for alternative single-line renderings of one utterance record.
gen('''Suppose I have this utterance:

```
Utterance Number: 1
Speaker: John
Start Time: 00:00:05
Text: "Good morning, everyone! I hope you're all doing well today."
```

Give me 10 different ways to format that information on a single line. The utterance text MUST be exactly the same as the example.'''.strip())

In [152]:
# Single-line utterance templates. Placeholders: {speaker}, {time}, {text}, and
# (in most templates) {idx}. Every entry is unique; two exact duplicates that
# were present before have been removed so no format is over-represented.
DS2_FORMATS = [
    '{speaker} ({time}): "{text}"',
    'Speaker: {speaker}, Time: {time}, Message: "{text}"',
    'Utterance {idx} - {speaker} ({time}): "{text}"',
    '{speaker} ({time}): "{text}" Utterance Number: {idx}',
    '"{text}" - {speaker} ({time}), Utterance {idx}',
    '{speaker} ({time}): "{text}" - Utterance {idx}',
    '{speaker} ({time}): "{text}" - utt# {idx}',
    '({time}) {idx}. {speaker}: "{text}"',
    '{idx}. {speaker}: "{text}" ({time})',
    '[{speaker} at {time}] {text}',
]

In [None]:
gen('''Please generate 8 conversations. Each conversation should be between participant1 and participant2. Return each conversation in this format:

```
Conversation 1:
[
  {"speaker": "participant1",
```

In [None]:
BIG_PROMPTS = [
    
    
]

%store -r sabig

if not sabig:
    for 

# Strategy B: Randomly perturb words in the input

# Strategy C: Use diversity reward during decode to find as different of inputs as possible