In [1]:
import copy
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer

ModuleNotFoundError: No module named 'transformers'

In [7]:
import os
from dotenv import load_dotenv

## access to API key

notebook_directory = os.getcwd()

# Construct the absolute path to the api.env file
env_file_path = os.path.join(notebook_directory, "api.env")

# Load environment variables from the api.env file.
# NOTE(review): python-dotenv defaults to override=False, so a value that is
# already present in the process environment presumably wins over the one in
# api.env — confirm that this is the intended precedence.
load_dotenv(env_file_path)

# Access the API key and fail fast with an actionable message: os.getenv
# returns None when the variable is missing, and assigning None to os.environ
# would otherwise raise an opaque TypeError.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        f"OPENAI_API_KEY is not set; define it in {env_file_path} or in the environment."
    )
os.environ['OPENAI_API_KEY'] = api_key

In [None]:
# Initialize the model and tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# model = GPT2LMHeadModel.from_pretrained("gpt2")

# Tokenizer and weights are loaded from the same checkpoint id.
checkpoint = "microsoft/DialoGPT-large"  # "*-medium" is also possible
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [None]:
# hook some activation layer
def hook(layer, k, mem=None):
    """Record a layer's forward output under key ``k`` in a dict.

    Registers a forward hook on ``layer`` that writes the module's output to
    the dict on every forward pass, overwriting whatever was stored under
    ``k`` before.

    Args:
        layer: Object exposing ``register_forward_hook``.
        k: Key under which the output is stored.
        mem: Dict to write into; a new dict is created when omitted.

    Returns:
        The dict that receives the captured output.
    """
    store = {} if mem is None else mem

    def _capture(module, inputs, output):
        # The output object is stored as-is; no copy or detach is performed.
        store[k] = output

    layer.register_forward_hook(_capture)
    return store

def hooked(model):
    """Deep-copy ``model`` and capture the copy's ``transformer.ln_f`` output.

    The hook is attached to the copy only; the model passed in is not
    modified.

    Args:
        model: Model exposing a ``transformer.ln_f`` submodule.

    Returns:
        A ``(mem, m)`` tuple: the dict that will hold the ``ln_f`` output
        under key ``0``, and the hooked copy of the model.
    """
    clone = copy.deepcopy(model)
    captured = hook(clone.transformer.ln_f, 0)
    return captured, clone

In [None]:
# Build the hooked copy once: `m` is a deep copy of `model`, and `mem` is the
# dict that receives the copy's transformer.ln_f output under key 0.
mem, m = hooked(model)

In [None]:
def get_embedding(text):
    """Return the hooked activation at the last position of ``text``.

    Encodes ``text`` with the module-level ``tokenizer``, runs the hooked
    model copy ``m``, prints the decoded argmax of the last-position logits
    as a side effect, and returns a slice of the activation that the forward
    hook stored in ``mem[0]``.

    Args:
        text: String to encode.

    Returns:
        ``mem[0][:, -1, :]`` — assumes the hooked output is laid out as
        (batch, sequence, hidden); TODO confirm. The forward pass runs under
        ``torch.no_grad()``, so the result carries no autograd graph.
    """
    inputs = tokenizer.encode(text, return_tensors="pt")
    # Inference only: skip autograd bookkeeping so the captured activation
    # does not keep the whole computation graph alive.
    with torch.no_grad():
        logits = m(inputs).logits
    token = logits[:, -1, :].argmax()
    print(tokenizer.decode(token))
    return mem[0][:, -1, :]

In [None]:
# Smoke test on a single word: prints the decoded argmax token and displays
# the activation slice returned by get_embedding.
get_embedding('bananas')

,


tensor([[-0.0732,  0.5275, -0.7043,  ...,  0.5194,  0.0993,  1.1496]],
       grad_fn=<SliceBackward0>)

In [None]:
# Long product descriptions, kept under their own name so they are no longer
# silently overwritten by the short list below. Not used by the cells that
# follow; assign `sentences = product_sentences` to embed them instead.
product_sentences = [
    "This is a burgundy women's blouse featuring a sheer chiffon fabric, with a flowing silhouette, long sleeves, and a V-neckline, likely contemporary and versatile for casual to semi-formal events.",
    "This is a maroon, long-sleeved, round-neck blouse in a relaxed fit, likely made of a smooth, lightweight fabric, with a subtle high-low hem and a minimalistic design that seems versatile enough to be styled for both casual and business casual looks.",
    "This is a classic, long-sleeved white button-up shirt featuring a pointed collar, fitted silhouette, and appears to be made from a lightweight cotton blend, suitable for a professional or formal setting.",
    "This is a black suede peep-toe ankle boot featuring a chunky stacked heel, detailed with playful fringe along the side, a stylish zipper closure, evoking a boho-chic feel that is versatile for multiple seasons.",
]

# Short probe sentences: this is the list that is actually embedded below.
sentences = [
    "the cat is brown",
    "the dog is black",
    "the teapot is hot",
    "the coffee machine is warm",
    "diplomacy",
    "super bowl",
]

# sentences = [
#     "A Queen is a woman who is",
#     "A King is a man who is",
#     "The sleepy fox is",
# ]

# prompts = [f'This sentence: “ {s} ” means in one word: “' for s in sentences]
# prompts = [f'This sentence: " {s} " means in one word: " ' for s in sentences]
# Use each sentence verbatim as its prompt (no template applied).
prompts = ["{}".format(s) for s in sentences]
# One forward pass per prompt; detach so the stored tensors carry no graph.
embeddings = [get_embedding(prompt).detach() for prompt in prompts]
# Stack, then flatten each embedding into a single row.
embeds = torch.stack(embeddings).reshape(len(embeddings), -1)

 and
 and
.
.
 is
<|endoftext|>


In [4]:
# Probe sentences to embed: the same six strings as the short list defined in
# an earlier cell, redefined here so this cell can run on its own.
sentences = [
    "the cat is brown",
    "the dog is black",
    "the teapot is hot",
    "the coffee machine is warm",
    "diplomacy",
    "super bowl",
]

In [5]:
from openai import OpenAI
# NOTE(review): OpenAI() is constructed without arguments — presumably it reads
# OPENAI_API_KEY from the environment, so the key-loading cell must have run
# first; confirm.
client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
    """Embed ``text`` through the module-level OpenAI ``client``.

    Newlines in ``text`` are replaced by single spaces before the request is
    sent. Note that this definition shadows the earlier ``get_embedding``
    defined for the local hooked model.

    Args:
        text: String to embed.
        model: Embedding model name forwarded to the API.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

# One embedding request per sentence, collected in input order.
embs = [get_embedding(s, model='text-embedding-3-small') for s in sentences]

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: api_key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

In [None]:
# Normalize each embedding to unit L2 norm (this is necessary for cosine similarity).
# torch.linalg.vector_norm replaces the deprecated torch.norm; its default
# order is the 2-norm, so the result is unchanged.
norms = torch.linalg.vector_norm(embeds, dim=1, keepdim=True)
normalized_embeddings = embeds / norms

# Compute the cosine similarity matrix
cosine_sim_matrix = torch.mm(normalized_embeddings, normalized_embeddings.t())
# Now, cosine_sim_matrix[i, j] is the cosine similarity between embeddings[i] and embeddings[j]

cosine_sim_matrix

tensor([[1.0000, 0.9033, 0.7629, 0.8085, 0.6284, 0.6555],
        [0.9033, 1.0000, 0.7575, 0.8111, 0.6346, 0.6589],
        [0.7629, 0.7575, 1.0000, 0.8684, 0.6175, 0.6466],
        [0.8085, 0.8111, 0.8684, 1.0000, 0.6772, 0.7009],
        [0.6284, 0.6346, 0.6175, 0.6772, 1.0000, 0.6216],
        [0.6555, 0.6589, 0.6466, 0.7009, 0.6216, 1.0000]])