In [2]:
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import word2vec, Word2Vec

In [None]:
import logging
# Root logger: INFO level, each record prefixed with its timestamp and severity.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [None]:
# Path of the corpus file that the training cells read.
datafile = './data/text8'

# Wrap the file in gensim's Text8Corpus reader.
sentences = word2vec.Text8Corpus(fname=datafile)

In [None]:
# No corpus is passed here, so this only configures the model; the vocabulary
# is built and the weights are trained in the following cell.
model = Word2Vec(
    vector_size=150,
    window=10,
    min_count=5,
    workers=14,
)

In [None]:
# Build the vocabulary from the corpus first ...
model.build_vocab(corpus_iterable=sentences)

# ... then train on the same corpus for 10 epochs.
model.train(
    corpus_iterable=sentences,
    total_examples=model.corpus_count,
    epochs=10,
)

In [None]:
# Persist the trained model; later cells reload it from this path with Word2Vec.load.
model.save('./data/t8.model')

###### TODO: try both the CBOW and skip-gram training algorithms



In [None]:
# Keep a handle on the model's word vectors; the query cells below all use it.
wvecs = model.wv

# Analogy-style query: 'woman' and 'king' as positive terms, 'man' as the negative
# term. topn=1 plus [0] keeps only the single best hit.
wvecs.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)[0]

In [None]:
# Same positive/negative terms as the previous cell, scored with
# most_similar_cosmul instead of most_similar.
wvecs.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

In [None]:
# Look up the stored vector for the word 'computer'.
wvecs['computer']

In [None]:
# Ask which of the four words fits least with the others.
wvecs.doesnt_match("breakfast cereal dinner lunch".split())

In [None]:
# Positive terms 'cricket' and 'tendulkar', negative term 'australia'.
wvecs.most_similar(positive=['cricket', 'tendulkar'], negative = ['australia'])

In [None]:
# NOTE(review): most_similar ranks words by closeness in the embedding space;
# the hits are related words and are not guaranteed to be true synonyms.
synonyms = wvecs.most_similar("beautiful")
print("Synonyms for 'beautiful': ", synonyms)

In [None]:
# NOTE(review): this query adds 'strong' and 'weak' and subtracts 'powerful';
# confirm that it really yields an antonym of 'strong' as the label below claims.
antonym = wvecs.most_similar(positive=["strong", "weak"], negative=["powerful"])
print("Antonym of 'strong': ", antonym)


In [None]:

# Word Similarity
# Ranked list of the words closest to 'dog'.
similarity_ranking = wvecs.most_similar("dog")
print("Word similarity ranking: ", similarity_ranking)

# Single similarity score between two specific words.
similarity_score = wvecs.similarity("computer", "keyboard")
print("Similarity between 'computer' and 'keyboard': ", similarity_score)


In [None]:

# Word Composition
# Analogy queries over verb forms. A word must not appear in both `positive` and
# `negative`: it cancels itself out and the query degenerates into the plain
# nearest neighbours of the remaining word.

# "ran" is to "running" as "talked" is to ...?
running_relationship = wvecs.most_similar(positive=["running", "talked"], negative=["ran"])
print("Running relationship: ", running_relationship)

# "talked" is to "talking" as "ran" is to ...?
talk_relationship = wvecs.most_similar(positive=["talking", "ran"], negative=["talked"])
print("Talking relationship: ", talk_relationship)


In [None]:
# Contextual Understanding

# Words closest to 'science' in the embedding space.
context_words = wvecs.most_similar("science")
print("around 'science': ", context_words)

# Words closest to 'ocean' in the embedding space.
ocean_context = wvecs.most_similar("ocean")
print("around 'ocean': ", ocean_context)



In [None]:
# Outliers and Oddities
# Three fruits plus 'potato': which word does the model single out?
odd_word = wvecs.doesnt_match(["apple", "banana", "cherry", "potato"])
print("Odd word out: ", odd_word)

# Same list with 'cherry' swapped for 'chicken'.
distant_word = wvecs.doesnt_match(["apple", "banana", "chicken", "potato"])
print("Distant word out: ", distant_word)


In [None]:
from sklearn.manifold import TSNE
import re
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import pandas as pd

# Number of leading vocabulary entries to project, and the seed that keeps the
# layout identical between runs.
N_TSNE_WORDS = 250
TSNE_SEED = 42

model = Word2Vec.load('./data/t8.model')

# Vocabulary keys and the matrix of their vectors (one row per key, same order).
vocab = list(model.wv.key_to_index)
X = model.wv[vocab]

# t-SNE is stochastic: without a fixed random_state every run draws a different map.
tsne = TSNE(n_components=2, random_state=TSNE_SEED)
X_tsne = tsne.fit_transform(X[:N_TSNE_WORDS, :])

df = pd.DataFrame(X_tsne, index=vocab[:N_TSNE_WORDS], columns=['x', 'y'])

fig = plt.figure(figsize=(20,20))
ax = fig.add_subplot(1, 1, 1)
ax.scatter(df['x'], df['y'])
ax.set_title(f"t-SNE projection of the first {N_TSNE_WORDS} vocabulary entries")

# Label every point with its word, passing the coordinates as an explicit (x, y) pair.
for word, pos in df.iterrows():
    ax.annotate(word, (pos['x'], pos['y']))

plt.show()

In [None]:
### also try with umap
from umap import UMAP

# Fixed seed so the stochastic UMAP layout is the same on every run.
UMAP_SEED = 42
# Number of leading vocabulary entries to project.
N_UMAP_WORDS = 500

umap2d = UMAP(n_components=2, n_epochs=30, n_neighbors=15, random_state=UMAP_SEED)

# X (vector matrix) and vocab (matching keys) are defined in the t-SNE cell.
x_umap = umap2d.fit_transform(X[:N_UMAP_WORDS, :])
df = pd.DataFrame(x_umap, index=vocab[:N_UMAP_WORDS], columns=['x', 'y'])

fig = plt.figure(figsize=(20,20))
ax = fig.add_subplot(1, 1, 1)
ax.scatter(df['x'], df['y'])
ax.set_title(f"UMAP projection of the first {N_UMAP_WORDS} vocabulary entries")

# Label every point with its word, passing the coordinates as an explicit (x, y) pair.
for word, pos in df.iterrows():
    ax.annotate(word, (pos['x'], pos['y']))

plt.show()

### Visualization part

In [3]:
# Reload the saved model and copy its first 600 embedding rows into a fresh array.
model = Word2Vec.load('./data/t8.model')
word_vecs = model.wv
embedding_vectors = np.stack(tuple(word_vecs.vectors[:600]))

In [4]:
import tensorflow as tf

# Wrap the embedding matrix in a TensorFlow variable named 'word_embeddings'.
tf_w_embeddings = tf.Variable(embedding_vectors, name='word_embeddings')


2023-08-24 17:22:18.471581: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-24 17:22:18.473007: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-24 17:22:18.499541: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-24 17:22:18.500919: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
import os
import subprocess

import os
import tensorflow as tf
import numpy as np
import gensim


def save_word_embeddings3(embedding_vectors, word_vecs):
    """Export word embeddings for the TensorBoard projector and launch TensorBoard.

    Saves the embedding matrix as a TensorFlow checkpoint under ``./checkpoints/``,
    writes one label per embedding row to ``./checkpoints/metadata.tsv``, registers
    both with the projector plugin through a config written to ``./logs/``, logs
    the labels and the matrix as summaries in ``./logs/``, and finally starts a
    ``tensorboard`` process pointed at ``./logs/``.

    Args:
        embedding_vectors: 2-D numpy array with one embedding per row. Row ``i``
            is labelled with ``word_vecs.index_to_key[i]``, so it may be the full
            matrix or a leading slice of it.
        word_vecs: gensim ``KeyedVectors`` (or any object exposing
            ``index_to_key``) that the rows were taken from.

    Returns:
        None. The function is called for its side effects (files written and a
        background ``tensorboard`` process).
    """
    # Imported locally so the function does not rely on another notebook cell.
    from tensorboard.plugins import projector

    checkpt_dir = "./checkpoints/"
    log_dir = "./logs/"
    os.makedirs(checkpt_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)

    tf_w_embeddings = tf.Variable(embedding_vectors, name="word_embeddings")

    checkpoint_prefix = os.path.join(checkpt_dir, "model.ckpt")
    checkpoint = tf.train.Checkpoint(word_embeddings=tf_w_embeddings)
    # save() returns the numbered path it actually wrote (prefix plus a counter).
    checkpoint_path = checkpoint.save(file_prefix=checkpoint_prefix)

    # The projector pairs line i of the metadata file with row i of the tensor,
    # so emit exactly one label per embedding row, in index order.
    labels = list(word_vecs.index_to_key[: embedding_vectors.shape[0]])

    metadata_path = os.path.join(checkpt_dir, "metadata.tsv")
    with open(metadata_path, "w", encoding="utf-8") as f:
        f.writelines(word + "\n" for word in labels)

    # Tell the projector where the tensor and its labels live. The checkpoint is
    # outside log_dir, so both locations are given as absolute paths.
    config = projector.ProjectorConfig()
    config.model_checkpoint_path = os.path.abspath(checkpoint_path)
    embedding = config.embeddings.add()
    # Name under which tf.train.Checkpoint stores the `word_embeddings` attribute.
    embedding.tensor_name = "word_embeddings/.ATTRIBUTES/VARIABLE_VALUE"
    embedding.metadata_path = os.path.abspath(metadata_path)
    projector.visualize_embeddings(log_dir, config)

    summary_writer = tf.summary.create_file_writer(log_dir)
    with summary_writer.as_default():
        # Force a (rows, dim) layout; a no-op for an already 2-D matrix.
        reshaped_embeddings = tf.reshape(tf_w_embeddings, [-1, embedding_vectors.shape[1]])

        # Explicit string dtype keeps an empty label list from becoming float32.
        labels_tensor = tf.constant(labels, dtype=tf.string)
        tf.summary.text("labels", labels_tensor, step=0)
        tf.summary.write("embeddings", reshaped_embeddings, step=0)
    summary_writer.flush()

    # Argument list instead of a shell string: no shell and no quoting pitfalls.
    subprocess.Popen(["tensorboard", "--logdir=" + log_dir])

# Reload the trained Word2Vec model and copy its full embedding matrix.
model = gensim.models.Word2Vec.load('./data/t8.model')
word_vectors = model.wv
embedding_vectors = np.stack(tuple(word_vectors.vectors))

# Export the embeddings together with their labels and start TensorBoard.
save_word_embeddings3(embedding_vectors, word_vectors)


2023-08-24 00:30:53.587585: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-24 00:30:53.589059: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-24 00:30:53.618342: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-24 00:30:53.618696: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report

#### TensorBoard visualisation

In [5]:
# Reload the model and pair vocabulary labels with their vectors.
model = Word2Vec.load('./data/t8.model')
word_vecs = model.wv


labels = word_vecs.index_to_key
vectors = [word_vecs[key] for key in labels]

# Keep only the first 400 (label, vector) pairs for the projection.
projection_data = list(zip(labels, vectors))[:400]

In [6]:
import os
import numpy as np
import tensorflow as tf
from tensorboard.plugins import projector
from utils import create_projection


# Hand the (label, vector) pairs to the project-local create_projection helper.
# NOTE(review): the helper lives in utils and is not visible here; presumably it
# writes projector files under the given path — confirm against utils.
create_projection(projection_data, path='./tf-meta/t8Meta')

2023-08-24 17:22:29.873554: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-24 17:22:29.874920: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-24 17:22:29.904018: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-24 17:22:29.904300: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report

In [22]:
# Preview only a few (label, vector) pairs: echoing the whole list prints
# hundreds of full-length vectors and floods the notebook output.
projection_data[:3]

[('the',
  array([ 0.40942523, -2.363802  ,  0.5362155 ,  3.4496198 , -3.42226   ,
         -2.0237877 ,  3.898841  ,  2.3582935 ,  0.60259956,  3.342635  ,
          0.03315959, -2.546651  ,  1.11909   , -0.06503467, -0.4401334 ,
         -0.6433584 , -1.6204582 , -1.7418787 ,  2.625317  , -0.73736846,
          1.6174169 ,  0.21351638,  2.3594728 ,  0.5002282 , -2.5179727 ,
         -2.0066767 ,  3.1498559 , -0.24258335, -2.3222916 , -4.523878  ,
          0.34949866, -2.2236454 ,  0.23627494, -1.1917362 ,  0.68455386,
         -4.606215  ,  0.15085751, -1.1520333 , -1.5195638 ,  0.8426711 ,
          3.8366442 , -0.06401085, -0.12027634,  0.22884983,  1.3478955 ,
         -0.5449428 ,  1.4345806 , -0.05077912, -0.7165911 , -0.7651505 ,
         -1.1478312 , -0.9166714 , -0.4309453 ,  1.3115768 , -1.5160681 ,
          2.2688015 ,  1.1226951 , -3.2762613 , -3.4657316 , -0.26861274,
         -2.8085814 , -1.0742836 , -0.22934614, -2.4656596 ,  1.9683998 ,
          0.48113525,  0.8242

Killed
