In [None]:
import numpy as np
import os
import re
import random
from scipy.spatial.distance import cdist 
from sklearn.preprocessing import normalize
from gensim import models


## Load in the language model

In [None]:
# Load the previously saved KeyedVectors from "word2vec.model"
# (a relative path, so it is resolved against the current working directory).
w2v = models.KeyedVectors.load("word2vec.model")

In [None]:
# Accepts strings made up solely of lowercase ASCII letters.
# `\Z` anchors at the true end of the string; `$` would also accept a word
# followed by a trailing newline (e.g. "abc\n").
lower_ascii = re.compile(r"^[a-z]+\Z")

In [None]:
# semantle uses only lowercase ascii, so keep just the vocabulary entries
# (in their original order) that the `lower_ascii` pattern accepts
words = [entry for entry in w2v.index_to_key if lower_ascii.match(entry)]


Now we create a matrix holding the word vector for each of these words (one row per word), and then renormalize each row to unit length.

In [None]:
# Look up the vector for every kept word, then stack them into a single
# 2-D array in which row i belongs to words[i].
vecs = [w2v[word] for word in words]
vec_mat = np.vstack(vecs)


In [None]:
# Rescale each row (axis=1) with sklearn's `normalize`, whose default norm is L2,
# so every non-zero word vector ends up with unit length.
vec_mat = normalize(vec_mat,axis=1)

In [None]:
# Sanity check: one row per entry in `words`, one column per vector component.
vec_mat.shape

## Work with distances

In [None]:
# Both of these functions build off of this: https://github.com/manimino/semantle-crab/blob/main/notebooks/crab.ipynb
# which has the correct semantle distance function

def get_all_dists(target, word_list, matrix):
    """Score every row of `matrix` against the row belonging to `target`.

    Args:
        target: Word to compare against; its position in `word_list` selects
            the row of `matrix` used as the query vector.
        word_list: List of words, aligned row-for-row with `matrix`.
        matrix: 2-D array of word vectors.

    Returns:
        A list of floats, one per row of `matrix`, each equal to
        (1 - cosine distance) * 100 rounded to one decimal place.

    Raises:
        ValueError: If `target` is not in `word_list`.
    """
    row = word_list.index(target)
    # cdist needs 2-D inputs, so present the single query row as a (1, d) array.
    query = matrix[row, :][np.newaxis, :]
    cosine = cdist(query, matrix, metric='cosine')
    scores = np.round((1 - cosine) * 100, 1)
    return scores[0].tolist()


def pair_dist(word_1, word_2, word_list, matrix):
    """Score a single pair of words.

    Args:
        word_1: First word; must be present in `word_list`.
        word_2: Second word; must be present in `word_list`.
        word_list: List of words, aligned row-for-row with `matrix`.
        matrix: 2-D array of word vectors.

    Returns:
        A float equal to (1 - cosine distance) * 100 between the two words'
        rows, rounded to one decimal place.

    Raises:
        ValueError: If either word is not in `word_list`.
    """
    # Resolve word_1 first, then word_2, so a missing word_1 is reported first.
    rows = [word_list.index(token) for token in (word_1, word_2)]
    # cdist needs 2-D inputs: turn each selected row into a (1, d) array.
    first, second = (matrix[r, :].reshape(1, -1) for r in rows)
    cosine = cdist(first, second, metric='cosine')
    rounded = np.round((1 - cosine) * 100, 1)
    return rounded[0].tolist()[0]


Feel free to play around with distances and pairs here.

In [None]:
# Scores of every word in `words` against "airplane", in the same order as `words`.
x = get_all_dists("airplane",words,vec_mat)

In [None]:
# Score for one specific pair of words.
pair_dist("airplane","fighter",words,vec_mat)

## Guessing

In [None]:
# Fill these in by hand: the word you guessed and the score that guess received.
guess = "help"
score = 11.08

Let's iterate through the words and get all the words that are at this distance from the first guess. Set `tolerance` so that you have a couple hundred suggestions.

In [None]:
# Score every vocabulary word against the current guess
# (`get_all_dists` is the helper defined in the "Work with distances" section).
distances = get_all_dists(guess,words,vec_mat)
# A word counts as a candidate when |its score - `score`| is below this value.
tolerance = 0.02

In [None]:
# Map each word whose score against the guess lies within `tolerance` of
# `score` to that word's own score.
suggestions = {
    words[pos]: value
    for pos, value in enumerate(distances)
    if np.abs(value - score) < tolerance
}

In [None]:
# Number of candidates found; adjust `tolerance` and re-run if this is far from a couple hundred.
len(suggestions)

Now that we have suggestions, let's find a few different guesses to try. We'll randomly pick a few words from the "cone", i.e. the set of words whose score against our guess matches the score we were given.

In [None]:
# How many candidate words to print.
num_suggestions = 3

In [None]:
# Sample without replacement so the same word is never printed twice, and
# clamp the count so a small (or empty) candidate set does not raise.
suggestion_pool = list(suggestions)
for word in random.sample(suggestion_pool, k=max(0, min(num_suggestions, len(suggestion_pool)))) :
    print(f"Try {word}")

Now take your best-scoring guess and repeat. We'll write a function to help. 

In [None]:
def get_suggestions(word, score, num_suggestions=3, tolerance=0.02, word_list=words, matrix=vec_mat):
    """Print random candidate words whose score against `word` is close to `score`.

    Args:
        word: The guessed word; must be present in `word_list`.
        score: The score that guess received.
        num_suggestions: Maximum number of distinct candidates to print.
        tolerance: A word is a candidate when its score against `word`
            differs from `score` by less than this amount.
        word_list: List of words, aligned row-for-row with `matrix`.
        matrix: Word-vector matrix, passed through to `get_all_dists`.

    Returns:
        None. Each suggestion is printed as "Try <word>".

    Raises:
        ValueError: If `word` is not in `word_list`.

    Note:
        `get_all_dists` rounds its scores to one decimal place, so a very
        small `tolerance` can leave no candidates for some two-decimal scores.
    """
    distances = get_all_dists(word, word_list, matrix)

    # Index into the `word_list` argument (not the global `words`) so that a
    # caller-supplied vocabulary is honoured.
    candidates = [
        word_list[idx]
        for idx, dist in enumerate(distances)
        if np.abs(dist - score) < tolerance
    ]

    if not candidates:
        # Nothing to sample from; say so instead of raising.
        print(f"No words within {tolerance} of {score}; try a larger tolerance.")
        return

    # Sample without replacement so no word is suggested twice, and never ask
    # for more words than there are candidates.
    count = max(0, min(num_suggestions, len(candidates)))
    for suggestion in random.sample(candidates, k=count):
        print(f"Try {suggestion}")

In [None]:
# Example: request 10 candidates after guessing "fracture" and receiving a score of 31.81.
get_suggestions("fracture",31.81,num_suggestions=10)

Iterate and see if you can get there!