In [1]:
! pip install sentence-transformers tiktoken



In [2]:
from sentence_transformers import SentenceTransformer
import tiktoken
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load a small sentence-embedding model that runs locally (no remote API calls at inference time).
# NOTE(review): the weights are presumably downloaded on first use and cached, so the
# first run likely needs network access — confirm before relying on "fully offline".
# NOTE(review): the earlier "~22MB" figure may be the parameter count (~22M) rather
# than the on-disk size — verify against the model card.
model = SentenceTransformer("all-MiniLM-L6-v2")  # yields 384-dimensional vectors (see the embedding cell's output)

In [4]:
# Load tiktoken's "cl100k_base" encoding for GPT-style tokenization (optional).
# In this notebook it is only used to display token IDs and a token count; the
# embedding model is given the raw string and presumably applies its own tokenizer.
encoding = tiktoken.get_encoding("cl100k_base")

In [5]:
# Example sentence reused by the tokenization and embedding cells below.
text = "ChatGPT is great at explaining things clearly!"

In [6]:
# Tokenize (optional step — just to show token IDs).
# These IDs come from the tiktoken encoding and are for illustration only: the
# embedding cell passes the raw string to the model, not this list.
tokens = encoding.encode(text)
print(f"Tokens: {tokens}")
print(f"Number of tokens: {len(tokens)}")

Tokens: [16047, 38, 2898, 374, 2294, 520, 26073, 2574, 9539, 0]
Number of tokens: 10


In [7]:
# Embed the example sentence with the locally loaded model.
embedding = model.encode(text)

# Report the vector's dimensionality and a short preview rather than the full array.
print(
    f"Embedding vector length: {embedding.shape[0]}",
    f"First 10 values: {embedding[:10]}",
    sep="\n",
)

Embedding vector length: 384
First 10 values: [-0.03646558 -0.0387276   0.05664004 -0.00621239  0.00088993 -0.08437954
  0.05936947  0.05627443  0.00571885  0.01636856]


In [8]:
# Two differently worded sentences with related meaning, used to demo semantic similarity.
sent1, sent2 = (
    "ChatGPT helps explain complex ideas.",
    "This model is good at clarifying difficult topics.",
)

# Encode each sentence on its own (one encode call per sentence, in order).
emb1, emb2 = (model.encode(sentence) for sentence in (sent1, sent2))

# cosine_similarity works on 2-D inputs (rows = samples), so view each vector as a
# single-row matrix; the result is a 1x1 matrix whose only entry is the score.
similarity = cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))[0, 0]
print(f"Cosine similarity: {similarity:.3f}")

Cosine similarity: 0.439
