#### Load embeddings

In [2]:
from datasets import load_dataset
import json

# Load the two parquet-backed embedding tables (modern and old) from their
# respective directories, taking the 'train' split of each.
modern = load_dataset(path='parquet', split='train', data_dir='./myembeddings/modern')
old = load_dataset(path='parquet', split='train', data_dir='./myembeddings/old')

#### Load lexicon

In [7]:
# Read the lexicon from disk. The encoding is given explicitly because JSON
# text is UTF-8, while open() otherwise falls back to the platform's locale
# encoding and can fail on (or mis-decode) non-ASCII entries.
with open('./lexicon.json', 'r', encoding='utf-8') as f:
    lexicon = json.load(f)

#### Compute divergence based on cosine similarity and update original lexicon

In [9]:
import torch
import torch.nn.functional as F


# Attach to every lexicon entry its modern and old embedding plus their cosine
# divergence (1 - cosine similarity). Entries and embedding rows are matched
# purely by position.

# Fetch each embedding column once. Indexing `dataset['embedding'][i]` inside
# the loop repeats the column lookup on every iteration, which is expensive
# when column access returns the whole column (NOTE(review): depends on the
# installed `datasets` version).
modern_embeddings = modern['embedding']
old_embeddings = old['embedding']

# Fail before touching the lexicon if there are not enough embedding rows;
# otherwise the loop would stop midway and leave the lexicon half-updated.
if min(len(modern_embeddings), len(old_embeddings)) < len(lexicon):
    raise ValueError(
        f"lexicon has {len(lexicon)} entries but only "
        f"{len(modern_embeddings)} modern / {len(old_embeddings)} old embeddings were loaded"
    )

# zip stops at the end of `lexicon`, so surplus embedding rows are ignored.
for word, modern_embedding, old_embedding in zip(lexicon, modern_embeddings, old_embeddings):
    # dim=0 assumes each embedding is a flat 1-D vector — TODO confirm.
    divergence = 1 - F.cosine_similarity(
        torch.tensor(modern_embedding), torch.tensor(old_embedding), dim=0
    )

    word['embeddings'] = {
        'modern': modern_embedding,
        'old': old_embedding,
        'divergence': divergence.item(),  # plain Python float rather than a 0-d tensor
    }

#### Sort lexicon by divergence and print the top 20 (least similar) words


In [10]:
# Rank a copy of the lexicon from highest to lowest divergence; `lexicon`
# itself keeps its original order.
sorted_lexicon = list(lexicon)
sorted_lexicon.sort(key=lambda entry: entry["embeddings"]["divergence"], reverse=True)

# Keep the 20 entries whose embeddings diverge the most.
top20 = sorted_lexicon[:20]

# Show each selected word next to its divergence score.
for item in top20:
    print(item["word"], item["embeddings"]["divergence"])

moggy 0.5632590651512146
snakehead 0.5630356073379517
space opera 0.5437947511672974
bolshie 0.5353302359580994
brach 0.5183298587799072
jelly roll 0.49795961380004883
dip stick 0.49476635456085205
moll 0.49410927295684814
po-po 0.48091548681259155
bumf 0.47772216796875
clanker 0.46373075246810913
minute man 0.45648515224456787
mucker 0.4561878442764282
wobbler 0.4495977759361267
tryhard 0.44851404428482056
pissant 0.4459751844406128
foofoo 0.4412076473236084
mong 0.43965673446655273
smearer 0.4365081787109375
pseud 0.43536078929901123


In [12]:
# Lowercase the 'word' field of every lexicon entry in place. The entry dicts
# are mutated directly, so any other list holding the same dicts sees the change.
for word in lexicon:
    word.update(word=word['word'].lower())