<a href="https://colab.research.google.com/github/WenXiaowei/unbiasing_nlp_embeddings/blob/main/EmbeddingsAnalogies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Embedding analogies

In [1]:
import gensim
import gensim.downloader as gloader

from pprint import pprint

from scipy.spatial import distance

In [2]:
def load_embeddings(model_type, embedding_dim):
    """
    Loads a pre-trained word embedding model via gensim library.

    :param model_type: name of the word embedding model to load. Matching is
        case-insensitive and ignores surrounding whitespace. Supported
        values: 'word2vec', 'glove', 'fasttext'.
    :param embedding_dim: size of the embedding space to consider. Only the
        GloVe model name is built from it; the word2vec and fasttext model
        names used here are fixed to 300 dimensions.

    :return
        - pre-trained word embedding model (gensim KeyedVectors object)

    :raises AttributeError: if model_type is not one of the supported names.
    :raises ValueError: propagated from ``gloader.load`` when it rejects the
        resulting model name (a hint about valid dimensions is printed first).
    """
    normalized_type = model_type.strip().lower()

    # Find the correct embedding model name
    if normalized_type == 'word2vec':
        download_path = "word2vec-google-news-300"
    elif normalized_type == 'glove':
        # Use the function argument here, not a notebook-level variable, so
        # the function works regardless of which cells have been executed.
        download_path = "glove-wiki-gigaword-{}".format(embedding_dim)
    elif normalized_type == 'fasttext':
        download_path = "fasttext-wiki-news-subwords-300"
    else:
        raise AttributeError("Unsupported embedding model type! Available ones: word2vec, glove, fasttext")

    # Check download
    try:
        emb_model = gloader.load(download_path)
    except ValueError:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Word2Vec: 300")
        print("Glove: 50, 100, 200, 300")
        print("Fasttext: 300")
        # Bare raise keeps the original traceback intact.
        raise

    return emb_model

In [3]:
# Configuration: change the two values below to pick another embedding.
# Supported dimensions per model:
#   Glove    -> 50, 100, 200, 300
#   Word2Vec -> 300
#   Fasttext -> 300
embedding_model_type = "glove"
embedding_dimension = 200

# Load the selected pre-trained embedding model
embedding = load_embeddings(
    model_type=embedding_model_type,
    embedding_dim=embedding_dimension,
)

print("Vocabulary size: {}".format(len(embedding.key_to_index)))

Vocabulary size: 400000


### Exploit relations between words
$Paris:France :: Tokyo:x$

In [4]:
# relation reasoning: look for x such that
#   target_words[0] : target_words[1] :: target_words[2] : x
target_words = ["paris", "france", "tokyo"]

# Offset that encodes the known relation (first word minus second word).
relationship_1 = embedding[target_words[0]] - embedding[target_words[1]]

anchor_index = embedding.key_to_index[target_words[2]]
anchor_vector = embedding[target_words[2]]

# Cosine distance between the known offset and (anchor - candidate) for every
# candidate word. The anchor itself is skipped up front: anchor - anchor is
# the zero vector, for which the cosine distance is undefined (NaN plus a
# RuntimeWarning), and a NaN among the values makes the sort below unreliable.
# Differences are computed on the fly instead of being stored for the whole
# vocabulary, and the real gensim index of each word is used as the key.
similarities = {
    index: distance.cosine(relationship_1, anchor_vector - embedding[word])
    for word, index in embedding.key_to_index.items()
    if index != anchor_index
}

# sort similarities by value (smallest cosine distance first)
similarities = dict(sorted(similarities.items(), key=lambda x: x[1]))

print("Most similar:")
pprint([(embedding.index_to_key[i], value) for i, value in list(similarities.items())[:10]])

  dist = 1.0 - uv / np.sqrt(uu * vv)


Most similar:
[('japan', 0.26387566328048706),
 ('france', 0.39477771520614624),
 ('spain', 0.45271486043930054),
 ('canada', 0.4603247046470642),
 ('slovakia', 0.47672826051712036),
 ('britain', 0.47828537225723267),
 ('germany', 0.47892600297927856),
 ('italy', 0.47972118854522705),
 ('korea', 0.4806209206581116),
 ('romania', 0.48913657665252686)]


### Find the vectors most similar to a given word

In [5]:
# similarity reasoning: rank the whole vocabulary by cosine distance to the query
query = embedding["whore"]

# Build the {index: distance} mapping already ordered by ascending distance.
similarities = dict(
    sorted(
        (
            (i, distance.cosine(query, embedding[word]))
            for i, word in enumerate(embedding.key_to_index)
        ),
        key=lambda pair: pair[1],
    )
)

print("Most similar:")
# The dict keeps insertion order, so its first ten keys are the ten closest words.
pprint([(embedding.index_to_key[i], similarities[i]) for i in list(similarities)[:10]])

Most similar:
[('whore', 0),
 ('slut', 0.39468371868133545),
 ('bitch', 0.44846659898757935),
 ('hypocrite', 0.4558418393135071),
 ('whores', 0.492634654045105),
 ('fucking', 0.5071277618408203),
 ('junkie', 0.5299773514270782),
 ('bastard', 0.5561682283878326),
 ('liar', 0.5697762668132782),
 ('pimp', 0.5821506381034851)]
