In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Notes:
**Sentence Transformers (SBERT)**: https://sbert.net/

**HuggingFace model**: https://huggingface.co/Graphlet-AI/eridu 

In [2]:
# Load the Graphlet-AI/eridu model by its Hugging Face Hub ID.
# This notebook-level `model` is the one used by the encode / similarity
# cells that follow.
model = SentenceTransformer("Graphlet-AI/eridu")

In [3]:

# Test names, one literal per line. Order matters: the index of each entry is
# what appears as [i] in the similarity matrix printed later.
names = [
    # Two stand-alone names
    "Frank Lloyd Right",
    "Jim Jeffries",
    # "Jurney" variants (middle initial, full middle name, Cyrillic script)
    "Russell H. Jurney",
    "Russell Henry Jurney",
    "Русс Джерни",
    # "Erasmus ... Rotterdam" variants
    "Erasmus Rotterodamus",
    "Erasmus van Rotterdam",
    "Erasme de Rotterdam",
    # "Leonardo da Vinci" variants (short, long, Cyrillic script)
    "Leonardo da Vinci",
    "Leonardo di ser Piero da Vinci",
    "Леонардо да Винчи",
    # "Pieter Bruegel" -- Elder variants
    "Pieter Bruegel the Elder",
    "Pieter Bruegel de Oude",
    "Pieter Bruegel de Oudere",
    "Pieter Brueghel l'Ancien",
    # "Pieter Brueghel" -- Younger variants
    "Pieter Brueghel the Younger",
    "Pieter Brueghel de Jonge",
    "Pieter Brueghel le Jeune",
    # Near-identical short names
    "Tim Lee",
    "Timothy Lee",
    "Tim Leeds",
    # "Pyrreia" spellings (Latin and Greek script).
    # "Pyrria" is listed twice; both entries are kept so the list length and
    # indices stay the same as in the recorded outputs.
    "Pyrreia",
    "Φύρεα",
    "Pyrria",
    "Pyrrea",
    "Pyrraia",
    "Pyrria",
    "Paurreia",
    "Pyrea",
    "Phyreia",
    "Phyrea",
    "Πύρρεια",
    # "Abella" spellings (Latin and Greek script)
    "Ἄβελλα",
    "Abella",
    "Ἀβέλλοις",
]


In [4]:
embeddings = model.encode(names)
print(embeddings.shape)
# One row per name: (len(names), 384), i.e. (35, 384) for the list above

(35, 384)


In [5]:
# Get the similarity scores for every pair of embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# Square matrix with one row and one column per name: torch.Size([35, 35]) here


torch.Size([35, 35])


In [6]:
print(similarities.numpy())
# 35 x 35 matrix; entry [i, j] is the score between names[i] and names[j].
# The diagonal (each name against itself) is ~1.0 and the matrix is
# symmetric up to floating-point rounding.
# NOTE(review): the off-diagonal values shown in the recorded output are all
# above 0.95, even for names that look unrelated -- confirm the scores are
# being produced and interpreted as intended.

[[0.99999964 0.99433494 0.9871316  ... 0.98407036 0.9755139  0.9704335 ]
 [0.99433494 1.         0.98398656 ... 0.97267413 0.96312916 0.96189106]
 [0.9871316  0.98398656 1.         ... 0.97056973 0.9621057  0.9547693 ]
 ...
 [0.98407036 0.97267437 0.97056973 ... 1.0000002  0.99482596 0.9829719 ]
 [0.9755138  0.9631292  0.9621059  ... 0.99482596 0.9999999  0.9712906 ]
 [0.9704336  0.9618911  0.9547691  ... 0.9829719  0.9712906  1.        ]]


## Similarity comparison


In [7]:
def compute_name_similarities(names, model=None):
    """Return the pairwise similarity matrix for ``names``.

    Args:
        names: Sequence of name strings to compare.
        model: Optional already-loaded model object exposing ``encode`` and
            ``similarity`` (for example the notebook-level
            ``SentenceTransformer``). Passing one avoids loading the model
            again on every call. When omitted, the "Graphlet-AI/eridu" model
            is loaded, which matches the behaviour before this parameter
            existed.

    Returns:
        A matrix of shape ``(len(names), len(names))`` whose entry ``[i, j]``
        is the score ``model.similarity`` assigns to ``names[i]`` and
        ``names[j]``. Tensor results are converted to a NumPy array; any
        other result type is returned unchanged.
    """
    if model is None:
        # Eridu is described in this notebook as fine-tuned for name matching.
        model = SentenceTransformer("Graphlet-AI/eridu")

    # Encode every name, then score all pairs with the model's own helper.
    embeddings = model.encode(names, convert_to_tensor=True)
    sim_matrix = model.similarity(embeddings, embeddings)

    # A tensor may live on an accelerator, so move it to the CPU before
    # converting to NumPy for easier handling.
    if hasattr(sim_matrix, "cpu"):
        return sim_matrix.cpu().numpy()
    return sim_matrix

def print_similarities(names, sim_matrix):
    """Print the similarity matrix and the closest other name for each name.

    Args:
        names: Sequence of name strings; ``names[i]`` labels row ``i``.
        sim_matrix: Square, row-indexable matrix in which
            ``sim_matrix[i][j]`` is the similarity between ``names[i]`` and
            ``names[j]``.

    The matrix is printed tab-separated with scores to four decimals. It is
    followed by one "Best match" line per name, where the name itself is
    excluded from the candidates. With fewer than two names there is nothing
    to match against, so the best-match section is skipped instead of
    raising ``IndexError``.
    """
    n = len(names)
    print("Similarity matrix (higher = more similar):")
    # Header row holds the column indices
    print("\t" + "\t".join(f"[{i}]" for i in range(n)))
    for i, name in enumerate(names):
        row = sim_matrix[i]
        print(f"[{i}] {name}\t" + "\t".join(f"{row[j]:.4f}" for j in range(n)))
    print()

    if n < 2:
        # No other name exists to compare against.
        return

    for i, name in enumerate(names):
        row = sim_matrix[i]
        # max() keeps the first (lowest-index) candidate on ties, the same
        # result a stable descending sort followed by [0] would give.
        top_j = max((j for j in range(n) if j != i), key=lambda j: row[j])
        print(f"Best match for '{name}' → '{names[top_j]}' with similarity {row[top_j]:.4f}")


In [8]:
# Build the full pairwise similarity matrix for `names`, then print it
# together with the best match for each name.
sim = compute_name_similarities(names)
print_similarities(names, sim)


Similarity matrix (higher = more similar):
	[0]	[1]	[2]	[3]	[4]	[5]	[6]	[7]	[8]	[9]	[10]	[11]	[12]	[13]	[14]	[15]	[16]	[17]	[18]	[19]	[20]	[21]	[22]	[23]	[24]	[25]	[26]	[27]	[28]	[29]	[30]	[31]	[32]	[33]	[34]
[0] Frank Lloyd Right	1.0000	0.9943	0.9871	0.9952	0.9953	0.9608	0.9773	0.9904	0.9961	0.9952	0.9894	0.9919	0.9956	0.9929	0.9949	0.9932	0.9929	0.9956	0.9819	0.9814	0.9441	0.9705	0.9911	0.9841	0.9881	0.9795	0.9841	0.9826	0.9919	0.9812	0.9913	0.9927	0.9841	0.9755	0.9704
[1] Jim Jeffries	0.9943	1.0000	0.9840	0.9920	0.9938	0.9645	0.9726	0.9874	0.9922	0.9912	0.9825	0.9879	0.9921	0.9888	0.9904	0.9905	0.9906	0.9935	0.9766	0.9764	0.9402	0.9595	0.9844	0.9756	0.9801	0.9689	0.9756	0.9771	0.9840	0.9723	0.9833	0.9866	0.9727	0.9631	0.9619
[2] Russell H. Jurney	0.9871	0.9840	1.0000	0.9972	0.9917	0.9516	0.9597	0.9730	0.9850	0.9830	0.9778	0.9813	0.9847	0.9825	0.9839	0.9841	0.9835	0.9853	0.9865	0.9864	0.9658	0.9553	0.9740	0.9705	0.9731	0.9650	0.9705	0.9655	0.9770	0.9644	0.9748	0.9801	0.9706	0.9621	0.

## Clustering


In [9]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity


In [10]:
def cluster_names(names, threshold=0.75, model=None):
    """Group names whose embeddings are similar enough to one another.

    Args:
        names: Sequence of name strings to cluster.
        threshold: Similarity cut-off. It is converted to a distance cut-off
            of ``1 - threshold`` for average-linkage agglomerative clustering
            over a ``1 - cosine_similarity`` distance matrix.
        model: Optional already-loaded model exposing ``encode``. Passing one
            avoids loading the model again on every call. When omitted, the
            "Graphlet-AI/eridu" model is loaded, which matches the behaviour
            before this parameter existed.

    Returns:
        Dict mapping an integer cluster label to the list of names in that
        cluster. Keys appear in order of each cluster's first name in
        ``names``, and names keep their input order within a cluster.
    """
    # AgglomerativeClustering rejects inputs with fewer than two samples, so
    # the trivial cases are answered directly, before any model is loaded.
    n = len(names)
    if n == 0:
        return {}
    if n == 1:
        return {0: [names[0]]}

    if model is None:
        model = SentenceTransformer("Graphlet-AI/eridu")
    embeddings = model.encode(names, normalize_embeddings=True)

    # Convert cosine similarity to distance: distance = 1 - similarity
    distance_matrix = 1 - cosine_similarity(embeddings)

    clustering = AgglomerativeClustering(
        metric="precomputed",
        linkage="average",
        distance_threshold=1 - threshold,  # threshold is given as a similarity
        n_clusters=None,                   # cluster count follows from the threshold
    )
    clustering.fit(distance_matrix)

    # Group names by cluster label, using plain ints rather than NumPy scalars
    # as keys.
    clusters = {}
    for name, label in zip(names, clustering.labels_):
        clusters.setdefault(int(label), []).append(name)

    return clusters


In [11]:
# `threshold` is a similarity cut-off; cluster_names turns it into a distance
# cut-off of 1 - threshold. 0.992 is used here instead of the 0.75 default.
# NOTE(review): presumably because the similarity scores recorded earlier in
# this notebook are all very close to 1 -- confirm the value was tuned on purpose.
clusters = cluster_names(names, threshold=0.992)
# Groups are numbered from 1 for display; the cluster labels (dict keys) are
# not printed.
for i, group in enumerate(clusters.values(), start=1):
    print(f"Group {i}: {group}")

Group 1: ['Frank Lloyd Right', 'Leonardo da Vinci', 'Leonardo di ser Piero da Vinci', 'Леонардо да Винчи']
Group 2: ['Jim Jeffries']
Group 3: ['Russell H. Jurney', 'Russell Henry Jurney', 'Русс Джерни']
Group 4: ['Erasmus Rotterodamus']
Group 5: ['Erasmus van Rotterdam']
Group 6: ['Erasme de Rotterdam']
Group 7: ['Pieter Bruegel the Elder', 'Pieter Bruegel de Oude', 'Pieter Bruegel de Oudere', "Pieter Brueghel l'Ancien", 'Pieter Brueghel the Younger', 'Pieter Brueghel de Jonge', 'Pieter Brueghel le Jeune']
Group 8: ['Tim Lee', 'Timothy Lee']
Group 9: ['Tim Leeds']
Group 10: ['Pyrreia', 'Φύρεα', 'Pyrria', 'Pyrrea', 'Pyrraia', 'Pyrria', 'Paurreia', 'Pyrea', 'Phyreia', 'Phyrea', 'Πύρρεια']
Group 11: ['Ἄβελλα', 'Abella']
Group 12: ['Ἀβέλλοις']
