# Embedding with NLKT and Gensim
Copyright 2023, Denis Rothman

Installing libraries

In [None]:
!pip install --upgrade nltk -qq
import nltk

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
!pip install gensim -qq

In [None]:
import gensim
print(gensim.__version__)

4.3.1


# 1.Reading the text file

## Downloading the text file

Beginning of WIP code until GitHub made public

In [None]:
from google.colab import drive
drive.mount('/content/drive')
with open("drive/MyDrive/files/github.txt", "r") as f:
      PERSONAL_ACCESS_TOKEN = f.readline().strip()

Mounted at /content/drive


In [None]:
import requests
url=  'https://raw.githubusercontent.com/Denis2054/Transformers_3rd_Edition/main/Chapter11/Descartes.txt'
output_filename = 'Descartes.txt'
token =  PERSONAL_ACCESS_TOKEN

headers = {
    'Authorization': f'Token {token}'
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    with open(output_filename, 'wb') as file:
        file.write(response.content)
        print('File downloaded successfully.')
else:
    print('Error downloading file.')

File downloaded successfully.


end of WIP code until book title finalized

In [None]:
#1.Load Decartes.txt using the Colab file manager
#2.Downloading the file from GitHub
#!curl -L https://raw.githubusercontent.com/Denis2054/Transformers_for_NLP_and_Computer_Vision_3rd_Edition/main/Chapter11/Decartes.txt --output "Decartes.txt"

## Reading the text file

In [None]:
with open('Descartes.txt', 'r', encoding='utf-8') as file:
    descartes_book = file.read().replace('\n', '')

# 2.Tokenizing the text with punkt

In [None]:
from nltk.tokenize import word_tokenize
tokens = word_tokenize(descartes_book)

## Preprocessing the tokens


In [None]:
# applying lowercase, removing punctuation and stopwords
#stemming/lemmatization
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in stop_words and token not in string.punctuation]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
print(len(tokens))

9781


In [None]:
unique_tokens = set(tokens)
print(unique_tokens)


{'pellicle', 'manifested', 'overthrown', 'tothose', 'sodoubtful', 'mine', 'obliged', 'grace', 'draft', 'endeavored', 'indicate', 'ishould', 'treat', 'quitecertain', 'forestall', 'thither', 'whoseexistence', 'circumstanced', 'havehad', 'learnedthem', 'truly', 'andthat', 'lattercurrent', 'allother', 'ofgood', 'counseled', 'entertainingsuch', 'adue', 'credit', 'feigned', 'partialto', 'know', 'parrot', 'begin', 'detect', 'something', 'theschools', 'piece', 'usuallyin', 'appeared', 'toawaken.it', 'ornament', 'ever', 'willnot', 'heartwere', 'room', 'comparison', 'succeeded', 'profess', 'whentheir', 'reason.for', 'sufficientknowledge', 'sixprincipal', 'fly', 'toerror', 'thought', 'tospeak', 'admitted', 'byno', 'whocannot', 'chiefly', 'inadequate', 'artery', 'perceiveanything', 'sciencesby', 'toopinion', 'cavityin', 'nature', 'havingadopted', 'ideasmight', 'feebleresolution', 'employment', 'tosurmount', 'line', 'occasioned', 'execute', 'wepossess', 'ofsatisfaction', 'vanity', 'possessing', 'si

# 3.Embedding with Gensim and Word2Vec

In [None]:
from gensim.models import Word2Vec

# Train a Word2Vec model
model = Word2Vec([tokens],compute_loss=True,vector_size=300,min_count=1)

# Save the model for later use
model.save("descartes_word2vec.model")

# 4.Model description

In [None]:
from IPython.display import display
import ipywidgets as widgets

# Load the model
model = Word2Vec.load("descartes_word2vec.model")

# Widget for the model attribute
attr_widget = widgets.Dropdown(
    options=['wv', 'vector_size', 'train_count', 'total_train_time', 'epochs'],
    value='wv',
    description='Attribute:',
)
display(attr_widget)

# Widget for the number of lines
num_lines_widget = widgets.IntSlider(min=0, max=100, step=1, value=10, description='Lines:')
display(num_lines_widget)

# Button to display the data
display_button = widgets.Button(description='Display')
display(display_button)

# Function to display the data
def display_data(button):
    attr = attr_widget.value
    num_lines = num_lines_widget.value

    if attr == 'wv':
        words = list(model.wv.index_to_key)
        for word in words[:num_lines]:
            print(word, model.wv[word])
    else:
        print(getattr(model, attr))

# Link the function to the button
display_button.on_click(display_data)

Dropdown(description='Attribute:', options=('wv', 'vector_size', 'train_count', 'total_train_time', 'epochs'),…

IntSlider(value=10, description='Lines:')

Button(description='Display', style=ButtonStyle())

In [None]:
# Load the model
model = Word2Vec.load("descartes_word2vec.model")

# Print all attributes of the model
for attr in dir(model):
    if not attr.startswith('_'):  # Skip internal attributes
        print(attr)


# 5.Accessing a Word and vector

In [None]:
try:
    vector = model.wv['consciousness']
    print('Vector for "consciousness":', vector)
except KeyError:
    print('"consciousness" is not in the dictionary')

"consciousness" is not in the dictionary


In [None]:
try:
    vector = model.wv['think']
    print('Vector for "think":', vector)
except KeyError:
    print('"think" is not in the dictionary')


In [None]:
print(len(vector))

300


In [None]:
try:
    similar_words = model.wv.most_similar('think')
    print('Most similar words to "think":', similar_words)
except KeyError:
    print('"think" is not in the dictionary')


Most similar words to "think": [('part', 0.23273444175720215), ('heart', 0.2167525440454483), ('oflower', 0.216399148106575), ('dissected', 0.21530592441558838), ('make', 0.21463941037654877), ('object', 0.21401740610599518), ('person', 0.2136993557214737), ('reason', 0.21088391542434692), ('andif', 0.21082493662834167), ('thisopinion', 0.20992112159729004)]


In [None]:
import ipywidgets as widgets
from IPython.display import display

# Function to get vector
def get_vector(word):
    try:
        vector = model.wv[word]
        print('Vector for "{}":'.format(word), vector)
    except KeyError:
        print('"{}" is not in the dictionary'.format(word))

# Function to get similar words
def get_similar_words(word):
    try:
        similar_words = model.wv.most_similar(word)
        print('Most similar words to "{}":'.format(word))
        for word, similarity in similar_words:
            print('  {}: {:.2f}'.format(word, similarity))
    except KeyError:
        print('"{}" is not in the dictionary'.format(word))

# Create widgets
vector_widget = widgets.interactive(get_vector, word=widgets.Text(placeholder='Type a word'));
similar_words_widget = widgets.interactive(get_similar_words, word=widgets.Text(placeholder='Type a word'));

# Display widgets
display(vector_widget)
display(similar_words_widget)


# 6.Exploring Gensim's vector space

In [None]:
for word, index in model.wv.key_to_index.items():
    print(f"Word: {word}, Index: {index}")

Word: one, Index: 0
Word: truth, Index: 1
Word: thought, Index: 2
Word: reason, Index: 3
Word: may, Index: 4
Word: could, Index: 5
Word: heart, Index: 6
Word: u, Index: 7
Word: certain, Index: 8
Word: might, Index: 9
Word: even, Index: 10
Word: much, Index: 11
Word: many, Index: 12
Word: opinion, Index: 13
Word: would, Index: 14
Word: blood, Index: 15
Word: without, Index: 16
Word: time, Index: 17
Word: others, Index: 18
Word: nature, Index: 19
Word: since, Index: 20
Word: body, Index: 21
Word: object, Index: 22
Word: men, Index: 23
Word: must, Index: 24
Word: thus, Index: 25
Word: mind, Index: 26
Word: god, Index: 27
Word: nothing, Index: 28
Word: first, Index: 29
Word: place, Index: 30
Word: make, Index: 31
Word: way, Index: 32
Word: true, Index: 33
Word: although, Index: 34
Word: vein, Index: 35
Word: part, Index: 36
Word: matter, Index: 37
Word: two, Index: 38
Word: thing, Index: 39
Word: found, Index: 40
Word: principle, Index: 41
Word: perhaps, Index: 42
Word: yet, Index: 43
Word

In [None]:
from IPython.display import display
import ipywidgets as widgets
from gensim import matutils

# Widget for the first word
word1_widget = widgets.Text(value='', description='Word 1:')
display(word1_widget)

# Widget for the second word
word2_widget = widgets.Text(value='', description='Word 2:')
display(word2_widget)

# Button to calculate similarity
calc_button = widgets.Button(description='Calculate Similarity')
display(calc_button)

# Function to calculate similarity
def calculate_similarity(button):
    word1 = word1_widget.value
    word2 = word2_widget.value

    if word1 not in model.wv or word2 not in model.wv:
        print("One or both words are not in the model's vocabulary.")
        return

    vec1 = model.wv[word1]
    vec2 = model.wv[word2]
    similarity = np.dot(matutils.unitvec(vec1), matutils.unitvec(vec2))
    print(f"The cosine similarity between '{word1}' and '{word2}' is {similarity}.")

# Link the function to the button
calc_button.on_click(calculate_similarity)

Text(value='', description='Word 1:')

Text(value='', description='Word 2:')

Button(description='Calculate Similarity', style=ButtonStyle())

The cosine similarity between 'think' and 'smoke' is 0.04001082479953766.


In [None]:
#words = ["METHOD","REASON,","TRUTH","RIGHTLY", "SCIENCES","SEEKING"]

In [None]:
import numpy as np
from gensim import matutils
import pandas as pd

# Define list of words
words = ["method","reason","truth","rightly", "science","seeking"]

# Initialize list to store results
data = []

# Iterate over all pairs of words
for i in range(len(words)):
    for j in range(len(words)):  # changed this line
        word1 = words[i]
        word2 = words[j]

        # Ensure words are in the model's vocabulary
        if word1 not in model.wv or word2 not in model.wv:
            print(f"One or both words ('{word1}', '{word2}') are not in the model's vocabulary.")
            continue

        # Calculate cosine similarity
        vec1 = model.wv[word1]
        vec2 = model.wv[word2]
        similarity = np.dot(matutils.unitvec(vec1), matutils.unitvec(vec2))

        # Convert similarity to distance
        distance = 1 - similarity

        # Append to our results
        data.append({'word1': word1, 'word2': word2, 'distance': distance})

# Create DataFrame and display
df = pd.DataFrame(data)
display(df)

Unnamed: 0,word1,word2,distance
0,method,method,0.0
1,method,reason,0.9227734
2,method,truth,0.9341647
3,method,rightly,1.091469
4,method,science,0.9106471
5,method,seeking,1.037626
6,reason,method,0.9227734
7,reason,reason,5.960464e-08
8,reason,truth,0.7917398
9,reason,rightly,0.9269378


In [None]:
# Compute the total distance
total_distance = df['distance'].sum()
print(f'Total distance: {total_distance}')

Total distance: 27.93750872835517


# 7.Tensorflow Projector

https://projector.tensorflow.org/

To visualize the embeddings with the TensorFlow Embedding Projector, you'll need to create two files:

A **vector** file (usually called vecs.tsv) containing the embeddings.
A **metadat**a file (usually called meta.tsv) containing the labels, which in this case are the words.
Here's an example of how to create these files from a gensim Word2Vec model:

In [None]:
import csv
import os
import numpy as np

# Directory where you want to save the files
LOG_DIR = '/content'
os.makedirs(LOG_DIR, exist_ok=True)

# Get the words and vectors
words = list(model.wv.key_to_index.keys())
vectors = [model.wv[word] for word in words]

# Write the vectors to a .tsv file
with open(os.path.join(LOG_DIR, "vecs.tsv"), 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerows(vectors)

# Write the labels (words) to a separate .tsv file
with open(os.path.join(LOG_DIR, "meta.tsv"), 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerows([[word] for word in words])  # No header row

Once you've run this code, you'll have two files: vecs.tsv and meta.tsv. You can upload these files to the TensorFlow Embedding Projector to visualize your word embeddings.

In the projector, each point in the visualization corresponds to a word. The position of the point is determined by the word's vector, and the label of the point is the word itself.

Make sure that both files are correctly formatted and that they both contain the same number of lines (except for the header line in meta.tsv, if you included it). If you have issues with the files, the projector might not be able to load your data correctly.

Remember that the vectors and labels must be in the same order in their respective files. That's why we get the words and vectors at the same time in the code above, using the same list of keys from the model.

In [None]:
!echo "Vectors file (vecs.tsv) size:"
!wc -l /content/vecs.tsv
!echo "Metadata file (meta.tsv) size:"
!wc -l /content/meta.tsv

Vectors file (vecs.tsv) size:
3843 /content/vecs.tsv
Metadata file (meta.tsv) size:
3843 /content/meta.tsv


This code uses the wc -l command to count the number of lines in each file. The output will give you the number of lines in each file, which should match if the files were generated correctly.

Check the data in your files. You can use the following commands to print the first few lines of your metadata and vector files:

In [None]:
!head /content/meta.tsv

one
truth
thought
reason
may
could
heart
u
certain
might
