Import libraries and load the hugging face token from the environment

In [1]:
import os

import huggingface_hub
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import paraphrase_mining
import pandas as pd

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

Log in to hugging face and download and initialize the model. It automatically caches it, so it won't re-download unless it doesn't find it in the cache.

In [2]:
huggingface_hub.login(token=os.environ["HF_TOKEN"])

model = SentenceTransformer("google/embeddinggemma-300m")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Compute embeddings and similarities. Can also use `util.dot_score` but `model.similarity` does the same thing. actually worth noting that `model.similarity` uses cosine similarity by default, not dot product, though it can use different functions:
> The similarity metric that is used is stored in the SentenceTransformer instance under SentenceTransformer.similarity_fn_name. Valid options are:
>
> - SimilarityFunction.COSINE (a.k.a “cosine”): Cosine Similarity (default)
> - SimilarityFunction.DOT_PRODUCT (a.k.a “dot”): Dot Product
> - SimilarityFunction.EUCLIDEAN (a.k.a “euclidean”): Negative Euclidean Distance
> - SimilarityFunction.MANHATTAN (a.k.a. “manhattan”): Negative Manhattan Distance



In [3]:
sentences = [
    "tall",
    "short",
    "towering",
    "tiny",
    "baseball",
    "basketball",
    "cat poster",
]

embeddings = model.encode(sentences)

similarities = model.similarity(embeddings[0], embeddings)
print(f"{sentences[0]} similarities to:")

for index, score in enumerate(similarities[0]):
    print(f"{sentences[index]}: {score}")

tall similarities to:
tall: 1.0
short: 0.6037872433662415
towering: 0.7457533478736877
tiny: 0.57103431224823
baseball: 0.5436896681785583
basketball: 0.6031945943832397
cat poster: 0.3962147831916809


try comparing them all to each other and making it a pandas dataframe for labels

In [7]:
sentences = [
    "tall",
    "short",
    "towering",
    "tiny",
    "baseball",
    "basketball",
    "cat poster",
]

embeddings = model.encode(sentences)
similarities = model.similarity(embeddings, embeddings)

df = pd.DataFrame(data=similarities, index=sentences, columns=sentences)
df

Unnamed: 0,tall,short,towering,tiny,baseball,basketball,cat poster
tall,1.0,0.603787,0.745753,0.571034,0.54369,0.603195,0.396215
short,0.603787,1.0,0.514112,0.591454,0.54697,0.58699,0.352861
towering,0.745753,0.514112,1.0,0.556181,0.539553,0.562235,0.421939
tiny,0.571034,0.591454,0.556181,1.0,0.524438,0.535622,0.414646
baseball,0.54369,0.54697,0.539553,0.524438,1.0,0.776411,0.435858
basketball,0.603195,0.58699,0.562235,0.535622,0.776411,1.0,0.463468
cat poster,0.396215,0.352861,0.421939,0.414646,0.435858,0.463468,1.0


Search for most similar pairs

In [8]:
sentences = [
    "tall",
    "short",
    "towering",
    "tiny",
    "baseball",
    "basketball",
    "cat poster",
    "friendship",
    "man",
    "woman",
    "french",
    "tissue",
    "TV",
    "book",
    "bowl",
    "remote",
    "cup",
]

paraphrases = paraphrase_mining(model, sentences)

for paraphrase in paraphrases[:10]:
    score, i, j = paraphrase
    print(f"{score:.4f}: {sentences[i]} and {sentences[j]}")

0.7764: baseball and basketball
0.7564: bowl and cup
0.7458: tall and towering
0.7324: basketball and bowl
0.7167: man and woman
0.7067: baseball and bowl
0.6700: basketball and cup
0.6290: TV and bowl
0.6254: basketball and TV
0.6230: french and book


Retrieve the tokenizer vocabulary

In [None]:
tokenizer = model[0].tokenizer
vocab = list(tokenizer.get_vocab().keys())
vocab

How does it handle spelling variants? ...turns out it handles them well!

In [9]:
sentences = [
    "center",
    "centre",
    "acknowledgement",
    "acknowledgment",
    "aluminium",
    "aluminum",
    "apologise",
    "apologize",
    "armour",
    "armor",
]

embeddings = model.encode(sentences)
similarities = model.similarity(embeddings, embeddings)

df = pd.DataFrame(data=similarities, index=sentences, columns=sentences)
df

Unnamed: 0,center,centre,acknowledgement,acknowledgment,aluminium,aluminum,apologise,apologize,armour,armor
center,1.0,0.965872,0.438927,0.432333,0.414315,0.432154,0.37066,0.387622,0.484073,0.525622
centre,0.965872,1.0,0.451336,0.435852,0.42035,0.423102,0.382979,0.389248,0.48754,0.516287
acknowledgement,0.438927,0.451336,1.0,0.98696,0.430719,0.432185,0.594707,0.588531,0.443998,0.49865
acknowledgment,0.432333,0.435852,0.98696,1.0,0.428793,0.43713,0.570335,0.567112,0.442713,0.495369
aluminium,0.414315,0.42035,0.430719,0.428793,1.0,0.982919,0.342098,0.358222,0.553751,0.604719
aluminum,0.432154,0.423102,0.432185,0.43713,0.982919,1.0,0.348393,0.375043,0.575392,0.632767
apologise,0.37066,0.382979,0.594707,0.570335,0.342098,0.348393,1.0,0.979176,0.387751,0.439148
apologize,0.387622,0.389248,0.588531,0.567112,0.358222,0.375043,0.979176,1.0,0.406967,0.467883
armour,0.484073,0.48754,0.443998,0.442713,0.553751,0.575392,0.387751,0.406967,1.0,0.95402
armor,0.525622,0.516287,0.498651,0.495369,0.604719,0.632767,0.439148,0.467883,0.95402,1.0


In [10]:
paraphrases = paraphrase_mining(model, sentences)

for paraphrase in paraphrases[:10]:
    score, i, j = paraphrase
    print(f"{score:.4f}: {sentences[i]} and {sentences[j]}")

0.9870: acknowledgement and acknowledgment
0.9829: aluminium and aluminum
0.9792: apologise and apologize
0.9659: center and centre
0.9540: armour and armor
0.6328: aluminum and armor
0.6047: aluminium and armor
0.5947: acknowledgement and apologise
0.5885: acknowledgement and apologize
0.5754: aluminum and armour


let's explore the oxford 3000

In [19]:
with open("American_Oxford_3000.txt", "r") as file:
    oxford_3000 = file.read().splitlines()

oxford_3000

['a',
 'an',
 'abandon',
 'ability',
 'able',
 'about',
 'above',
 'abroad',
 'absolute',
 'absolutely',
 'academic',
 'accept',
 'acceptable',
 'access',
 'accident',
 'accompany',
 'according to',
 'account',
 'accurate',
 'accuse',
 'achieve',
 'achievement',
 'acknowledge',
 'acquire',
 'across',
 'act',
 'action',
 'active',
 'activity',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adapt',
 'add',
 'addition',
 'additional',
 'address',
 'administration',
 'admire',
 'admit',
 'adopt',
 'adult',
 'advance',
 'advanced',
 'advantage',
 'adventure',
 'advertise',
 'advertisement',
 'advertising',
 'advice',
 'advise',
 'affair',
 'affect',
 'afford',
 'afraid',
 'after',
 'afternoon',
 'afterward',
 'again',
 'against',
 'age',
 'agency',
 'agenda',
 'agent',
 'aggressive',
 'ago',
 'agree',
 'agreement',
 'ah',
 'ahead',
 'aid',
 'aim',
 'air',
 'aircraft',
 'airline',
 'airport',
 'alarm',
 'album',
 'alcohol',
 'alcoholic',
 'alive',
 'all',
 'all right',
 'allow',
 'al

In [20]:
paraphrases = paraphrase_mining(model, oxford_3000)

this shows us that different parts of speech are also rated as very similar, not just spelling variants

In [21]:
for paraphrase in paraphrases[:20]:
    score, i, j = paraphrase
    print(f"{score:.4f}: {oxford_3000[i]} and {oxford_3000[j]}")

0.9819: transport and transportation
0.9786: concern and concerned
0.9768: dance and dancing
0.9744: humor and humorous
0.9741: threat and threaten
0.9727: surprised and surprising
0.9723: invest and investment
0.9722: political and politics
0.9719: laugh and laughter
0.9717: success and successful
0.9716: surprise and surprising
0.9715: percent and percentage
0.9711: exhibit and exhibition
0.9700: embarrassed and embarrassing
0.9698: television and TV
0.9692: surprise and surprised
0.9691: thank and thanks
0.9684: feel and feeling
0.9681: work and working
0.9680: disappointed and disappointing


i assume the story isn't any different with the 5000....yep, parts of speech are close to each other

In [26]:
with open("American_Oxford_3000.txt") as file:
    oxford_3000 = file.read().splitlines()
with open("American_Oxford_5000.txt") as file:
    oxford_5000 = file.read().splitlines()

oxford_5000 = oxford_3000 + oxford_5000
oxford_5000 = [word.lower() for word in oxford_5000]
oxford_5000 = list(set(oxford_5000))

paraphrases = paraphrase_mining(model, oxford_5000)

for paraphrase in paraphrases[:20]:
    score, i, j = paraphrase
    print(f"{score:.4f}: {oxford_5000[i]} and {oxford_5000[j]}")

0.9885: disappointing and disappoint
0.9819: transport and transportation
0.9800: disappointed and disappoint
0.9786: concern and concerned
0.9775: anxiety and anxious
0.9773: embarrassing and embarrassment
0.9768: dance and dancing
0.9753: agriculture and agricultural
0.9744: donation and donate
0.9744: humor and humorous
0.9741: threat and threaten
0.9727: surprising and surprised
0.9723: investment and invest
0.9722: political and politics
0.9721: embarrassment and embarrassed
0.9719: laughter and laugh
0.9717: successful and success
0.9716: surprising and surprise
0.9715: percent and percentage
0.9711: exhibit and exhibition


let's measure some similarities against the oxford 5000

In [37]:
sentences = [
    "tall",
    "short",
    "towering",
    "tiny",
    "baseball",
    "basketball",
    "cat poster",
    "k-pop: demon hunter"
]

sentences_embeddings = model.encode(sentences)
oxford_5000_embeddings = model.encode(oxford_5000)
similarities = model.similarity(oxford_5000_embeddings, sentences_embeddings)

df = pd.DataFrame(data=similarities, index=oxford_5000, columns=sentences)
df

Unnamed: 0,tall,short,towering,tiny,baseball,basketball,cat poster,k-pop: demon hunter
reporter,0.480071,0.516974,0.448286,0.453084,0.529671,0.534547,0.338920,0.328210
medium,0.509781,0.518548,0.480742,0.538568,0.531540,0.560842,0.441819,0.313984
insertion,0.476257,0.504075,0.514273,0.516791,0.515696,0.543627,0.374707,0.320129
lane,0.536695,0.522634,0.573151,0.565615,0.587027,0.600137,0.422186,0.395254
ever,0.537972,0.532212,0.543449,0.542116,0.606398,0.620119,0.412994,0.375025
...,...,...,...,...,...,...,...,...
photograph,0.525835,0.524351,0.551870,0.547570,0.550596,0.574350,0.527521,0.372931
mercy,0.536208,0.529321,0.522877,0.573929,0.592988,0.633246,0.434403,0.373971
dream,0.565285,0.551067,0.554877,0.590610,0.589437,0.587224,0.457107,0.417472
confess,0.401994,0.468296,0.446281,0.483429,0.520968,0.516197,0.406570,0.349524


In [38]:
df.sort_values("k-pop: demon hunter", ascending=False).head(10)

Unnamed: 0,tall,short,towering,tiny,baseball,basketball,cat poster,k-pop: demon hunter
demon,0.508911,0.532366,0.568567,0.537796,0.582338,0.590081,0.449394,0.600794
singer,0.519338,0.489772,0.476585,0.49234,0.558919,0.538812,0.406942,0.554147
devil,0.505224,0.513279,0.55827,0.544426,0.606784,0.603446,0.428363,0.543682
hunt,0.555002,0.537045,0.534461,0.555091,0.59495,0.620769,0.484303,0.534947
song,0.498248,0.524418,0.496971,0.523291,0.5588,0.540647,0.383907,0.526658
music,0.488081,0.50332,0.478741,0.532498,0.579212,0.576172,0.412197,0.516504
pop,0.455482,0.471986,0.490233,0.512559,0.521811,0.538571,0.443407,0.508152
chase,0.505847,0.521947,0.567516,0.543573,0.583834,0.608333,0.505885,0.502636
dancer,0.600887,0.560551,0.56787,0.545687,0.558464,0.635308,0.454879,0.502603
monster,0.555564,0.540075,0.603281,0.559371,0.62117,0.619114,0.45653,0.501834


In [39]:
df['k-pop: demon hunter'].sort_values(ascending=False)[:25]

demon            0.600794
singer           0.554147
devil            0.543682
hunt             0.534947
song             0.526658
music            0.516504
pop              0.508152
chase            0.502636
dancer           0.502603
monster          0.501834
haunt            0.494987
predator         0.490669
fantasy          0.489724
fan              0.485427
defender         0.481781
musician         0.480738
hunting          0.479617
genre            0.479289
boy              0.476963
monk             0.474900
ghost            0.474322
lyric            0.473135
entertainment    0.472714
among            0.472252
beast            0.471684
Name: k-pop: demon hunter, dtype: float32

try the csv version, so we can filter by part of speech and level