In [1]:
#  02_hf_lucrarea_inspection.ipynb

#  Goal:
# Analyze the position of "lucrarea" in the HuggingFace implementation of Sentence-T5.
# Compare how it is tokenized and where its embedding lies relative to </s>.

# ---

## Setup: Load Libraries
import numpy as np
from transformers import T5Tokenizer, T5Model
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ---

## Load Model & Tokenizer
# A single checkpoint name is passed to both loaders below.
model_name = "sentence-transformers/sentence-t5-base"

# Sentence-level wrapper; a later cell calls smodel.encode(...) on it.
smodel = SentenceTransformer(model_name)

# Plain transformers T5 module; a later cell reads its input-embedding
# matrix through t5.get_input_embeddings().
t5 = T5Model.from_pretrained(model_name)

# Reuse the tokenizer object that the SentenceTransformer already exposes
# instead of constructing a second one.
tokenizer = smodel.tokenizer  # Uses T5Tokenizer internally

# ---



ModuleNotFoundError: No module named 'transformers'

In [None]:
## Analyze Tokenization
# Split the word into the pieces the tokenizer produces for it.
text = "lucrarea"
tokens = tokenizer.tokenize(text)
print("Tokenized output for 'lucrarea':", tokens)

# Check if it exists as one token
# SentencePiece vocabularies mark a word start with U+2581 (LOWER ONE EIGHTH
# BLOCK). The marker is spelled as an escape sequence so the lookup key
# survives any re-encoding of this file; a mis-decoded literal would never
# match a vocabulary entry and this check would always report "not found".
joined_token = f"\u2581{text}"

# get_vocab() is called once and the result reused for the lookup.
vocab = tokenizer.get_vocab()
token_id = vocab.get(joined_token)  # None when the word is not a single token

# Compare against None explicitly: a token ID of 0 is a valid hit.
if token_id is not None:
    print(f"'{joined_token}' found in vocab with ID {token_id}")
else:
    print(f"'{joined_token}' not found as a single token in the vocab")

In [None]:
## Get `</s>` embedding
# encode() is given a one-element batch, so the result holds exactly one row;
# unpacking pulls that row out as a 1-D vector.
# NOTE(review): this vector is an output of smodel.encode(), whereas
# `lucrarea_embed` is built from t5's raw input-embedding weights. Confirm
# the two live in comparable spaces before interpreting their similarity.
(s_embedding,) = smodel.encode(["</s>"])  # Shape: (768,)

# ---



In [None]:
## Get embedding for each token piece in "lucrarea"
# Map the token pieces from the tokenization cell to vocabulary IDs.
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Pull the input-embedding matrix out of the T5 module as a NumPy array:
# detach from autograd, move to CPU, then convert.
embedding_layer = t5.get_input_embeddings()
weights = embedding_layer.weight.detach().cpu().numpy()

# One embedding row per token piece, averaged element-wise into one vector.
piece_vectors = [weights[piece_id] for piece_id in token_ids]
lucrarea_embed = np.mean(piece_vectors, axis=0)


In [None]:
## Compute similarity with </s>
# Scale both vectors to unit L2 norm before comparing them.
s_embedding = s_embedding / np.linalg.norm(s_embedding)
lucrarea_embed = lucrarea_embed / np.linalg.norm(lucrarea_embed)

# cosine_similarity works on 2-D inputs (one row per sample), so each vector
# is wrapped as a single-row batch; the result is a 1x1 matrix whose only
# entry is the score.
# NOTE(review): `s_embedding` comes from smodel.encode() while
# `lucrarea_embed` comes from raw input-embedding weights - confirm these
# are meant to be compared directly.
similarity = cosine_similarity([lucrarea_embed], [s_embedding])[0, 0]
print(f"Cosine similarity between lucrarea and </s>: {similarity:.4f}")