In [None]:
from word2vec.loader import Word2VecLoader

from sklearn.manifold import TSNE
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

In [None]:
# Create the loader object; every later cell queries word vectors through `w2v`.
w2v = Word2VecLoader()

In [None]:
# Read the embeddings from 'wiki-news-300d-1M.vec' (path is relative to the
# notebook's working directory, so the file must be present there).
# NOTE(review): keep_original=True presumably retains the un-normalised vectors
# alongside a normalised copy (cf. get_vector vs get_norm_vector below) —
# confirm against Word2VecLoader.
w2v.load_vectors_from_file('wiki-news-300d-1M.vec', keep_original=True)

In [None]:
# Sanity check: display the stored vector for a common word.
w2v.get_vector('boy')

In [None]:
# Same word through get_norm_vector, for comparison with the cell above.
# NOTE(review): presumably the length-normalised version of the vector — confirm.
w2v.get_norm_vector('boy')

In [None]:
# Analogy-style query: 'China' and 'London' are passed as positive examples and
# 'Beijing' as a negative one (i.e. "Beijing is to China as London is to ?").
# NOTE(review): presumably ranks words near China + London - Beijing — confirm
# the exact scoring in Word2VecLoader.get_most_similar.
w2v.get_most_similar(positive=['China', 'London'], negative=['Beijing'])

In [None]:
# Seed words; each one contributes a cluster made of its nearest neighbours.
keys = ['cat', 'dog', 'coronavirus', 'NLP', 'food', 'China']

# Parallel lists, one entry per seed word:
#   word_clusters[i]      -> neighbour words of keys[i]
#   embedding_clusters[i] -> the vectors of those neighbours, in the same order
embedding_clusters = []
word_clusters = []
for word in keys:
    # Element [0] of the query result is iterated as the neighbour words.
    neighbours = list(w2v.get_most_similar(positive=[word], k=10)[0])
    word_clusters.append(neighbours)
    embedding_clusters.append([w2v.get_vector(neighbour).numpy() for neighbour in neighbours])

In [None]:
# Stack the per-key neighbour vectors into one array. The two leading axes are
# (key, neighbour); any remaining axes are flattened into the feature axis, so
# both (n, m, k) and (n, m, 1, k) layouts are accepted.
embedding_clusters = np.array(embedding_clusters)
assert embedding_clusters.ndim >= 3, (
    'embedding_clusters is empty or ragged: every key needs the same number of neighbour vectors')
n, m = embedding_clusters.shape[:2]
flat_embeddings = embedding_clusters.reshape(n * m, -1)
k = flat_embeddings.shape[1]  # feature dimension fed to t-SNE

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and newer
# releases no longer accept the old name; use whichever this installation has.
iteration_kwarg = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', random_state=32,
                        **{iteration_kwarg: 3500})

# Project all n * m vectors jointly, then restore the (key, neighbour, xy) grouping.
embeddings_en_2d = tsne_model_en_2d.fit_transform(flat_embeddings).reshape(n, m, 2)

In [None]:
def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot 2-D embedding clusters, one colour per label, with each point annotated.

    Parameters
    ----------
    title : str
        Title placed on the figure.
    labels : sequence
        One legend entry per cluster; its length determines how many colours
        are drawn from the rainbow colormap.
    embedding_clusters : sequence of arrays
        One array per cluster, indexed as ``[:, 0]`` for x and ``[:, 1]`` for y.
    word_clusters : sequence of sequences
        For each cluster, the text annotated at the point with the same index.
    a : float
        Alpha (opacity) of the scatter markers.
    filename : str, optional
        When truthy, the figure is written there as a 150-dpi PNG before being shown.
    """
    # Styling shared by every word annotation.
    annotation_style = dict(alpha=0.5, xytext=(5, 2), textcoords='offset points',
                            ha='right', va='bottom', size=8)

    plt.figure(figsize=(16, 9))
    palette = cm.rainbow(np.linspace(0, 1, len(labels)))

    for cluster_label, points, cluster_words, rgba in zip(labels, embedding_clusters,
                                                          word_clusters, palette):
        xs, ys = points[:, 0], points[:, 1]
        # The colour is wrapped in a list so it is applied as a single colour for all points.
        plt.scatter(xs, ys, c=[rgba], alpha=a, label=cluster_label)
        for idx, text in enumerate(cluster_words):
            plt.annotate(text, xy=(xs[idx], ys[idx]), **annotation_style)

    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)

    # Save before show() so the rendered figure is still current when written out.
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()


# Render the projected clusters and also save the figure as 'similar_words.png'.
tsne_plot_similar_words(
    title='Similar words from FastText',
    labels=keys,
    embedding_clusters=embeddings_en_2d,
    word_clusters=word_clusters,
    a=0.7,
    filename='similar_words.png',
)