In [None]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# load datasets
# Read the name list exactly once, as UTF-8, one name per line. The context
# manager closes the file handle as soon as the contents are in memory.
# (A second, encoding-less read used to overwrite `names` with text decoded
# using the platform default encoding.)
with open("first_name.txt", "r", encoding="utf-8") as names_file:
    names = names_file.read().splitlines()

print(names[:10])
print("Total names:", len(names))

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


['Aachal', 'Aadharsh', 'Aadhavi', 'Aadhira', 'Aadidev', 'Aadil', 'Aadita', 'Aaditya', 'Aadiv', 'Aadrik']
Total names: 2195


In [None]:
# Tokenizer
# Character-level vocabulary built from every character that occurs in `names`.
# Index 0 is reserved for "." (used as the padding token by `encode`), so "."
# is excluded from the enumerated characters: otherwise `stoi["."] = 0` would
# overwrite its slot and `len(stoi)` would be one less than the largest index,
# which is out of range for an embedding table sized with `vocab_size`.
chars = sorted(set("".join(names)) - {"."})
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi["."] = 0
itos = {i: s for s, i in stoi.items()}

vocab_size = len(stoi)  # number of distinct token ids, padding included
max_len = 20  # every encoded name is truncated / padded to this many tokens

def encode(name):
    """Convert a name into a fixed-length list of token ids.

    Each character is looked up in ``stoi``. Characters that are not in the
    vocabulary (for example a digit or space typed by a user) are skipped
    instead of raising ``KeyError``. The id list is then truncated to
    ``max_len`` entries and right-padded with 0, the id assigned to ".".

    Args:
        name: The string to encode.

    Returns:
        A list of exactly ``max_len`` integer token ids.
    """
    tokens = [stoi[c] for c in name if c in stoi][:max_len]
    return tokens + [0] * (max_len - len(tokens))


# dataset tensors
# One row of `max_len` token ids per name, moved to the selected device.
X = torch.tensor(list(map(encode, names))).to(device)

In [None]:
# Model: Name → Embedding vector

class NameEncoder(nn.Module):
    """Map a batch of padded character-id sequences to unit-length vectors.

    The model embeds every token id, averages the embeddings over the
    sequence axis (padding positions included), applies one linear layer and
    L2-normalises the result.

    Args:
        num_embeddings: Number of rows in the embedding table. When omitted,
            the notebook-level ``vocab_size`` is used, so ``NameEncoder()``
            behaves as before.
        emb_dim: Width of each character embedding.
        out_dim: Width of the final name embedding.
    """

    def __init__(self, num_embeddings=None, emb_dim=64, out_dim=128):
        super().__init__()

        if num_embeddings is None:
            # Fall back to the tokenizer's vocabulary size defined in the notebook.
            num_embeddings = vocab_size

        self.emb = nn.Embedding(num_embeddings, emb_dim)
        self.fc = nn.Linear(emb_dim, out_dim)

    def forward(self, x):
        """Encode token ids into normalised embeddings.

        Args:
            x: Integer tensor of token ids; in this notebook it has shape
                (batch, max_len).

        Returns:
            Float tensor of shape (batch, out_dim) whose rows have unit L2 norm.
        """
        pooled = self.emb(x).mean(dim=1)  # average over the sequence axis
        projected = self.fc(pooled)
        return F.normalize(projected, dim=1)

# Build the encoder, place it on the selected device, and let Adam update
# all of its parameters.
model = NameEncoder()
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
# Self Supervised Learning
# Every name is treated as its own class: row i of the pairwise similarity
# matrix is scored with cross-entropy against label i.
# NOTE(review): the encoder L2-normalises its output and each embedding is
# compared with itself, so the diagonal is always 1 / temperature. The loss
# can only push different names apart; there is no augmented positive pair.

# Loop-invariant values are built once instead of on each of the 2000 steps.
temperature = 0.07
labels = torch.arange(len(X), device=device)

# get_similar() leaves the model in eval mode; switch back before training
# so re-running this cell always trains in training mode.
model.train()

for epoch in range(2000):

    emb = model(X)
    similarity = (emb @ emb.T) / temperature

    loss = F.cross_entropy(similarity, labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 200 == 0:
        print("Iterations:", epoch, "Loss:", loss.item())

Iterations: 0 Loss: 0.18973787128925323
Iterations: 200 Loss: 0.17321455478668213
Iterations: 400 Loss: 0.17116330564022064
Iterations: 600 Loss: 0.170314759016037
Iterations: 800 Loss: 0.16987796127796173
Iterations: 1000 Loss: 0.16962167620658875
Iterations: 1200 Loss: 0.16945523023605347
Iterations: 1400 Loss: 0.16933798789978027
Iterations: 1600 Loss: 0.1692500114440918
Iterations: 1800 Loss: 0.16918107867240906


In [None]:
# Similarity Search
def get_similar(name, top_k=5):
    """Print the dataset names whose embeddings are most similar to ``name``.

    The query is encoded with ``encode``, embedded by ``model`` and compared
    with the embeddings of every row of ``X`` using a dot product. The names
    are printed in descending order of similarity.

    Args:
        name: Query string.
        top_k: Maximum number of names to print. Values larger than the
            number of dataset rows are clamped instead of making
            ``torch.topk`` raise.

    Returns:
        None. The result is printed.
    """
    model.eval()

    with torch.no_grad():

        query = torch.tensor([encode(name)]).to(device)
        query_emb = model(query)

        all_emb = model(X)

        # squeeze(0) removes only the query's batch axis. A bare squeeze()
        # would also collapse the result to 0-d when X holds a single row.
        sim = (query_emb @ all_emb.T).squeeze(0)

        k = max(0, min(top_k, sim.numel()))
        # Plain Python ints, so `names` is indexed with ints, not tensors.
        top = torch.topk(sim, k).indices.tolist()

    print("\nSimilar names:\n")
    for i in top:
        print(names[i])

In [None]:
# User Input
# Surrounding whitespace is stripped so an accidental leading/trailing space
# is not handed to the character-level tokenizer as part of the name.
name = input("\nEnter name: ").strip()
get_similar(name)


Similar names:

Ananya
Annanya
Anunay
Ayaan
Aanya
