# Embedding with NLKT and Gensim
Copyright 2024, Denis Rothman

Installing libraries

In [1]:
!pip install --upgrade nltk -qq
import nltk

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/1.5 MB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.5/1.5 MB[0m [31m18.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
!pip install gensim -qq

In [4]:
import gensim
print(gensim.__version__)

4.3.3


# 1.Reading the text file

## Downloading the text file

In [5]:
#1.Load Decartes.txt using the Colab file manager
#2.Downloading the file from GitHub
!curl -L https://raw.githubusercontent.com/Denis2054/Transformers-for-NLP-and-Computer-Vision-3rd-Edition/master/Chapter11/Descartes.txt --output "Descartes.txt"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  127k  100  127k    0     0   386k      0 --:--:-- --:--:-- --:--:--  387k


end of WIP code until book title finalized

## Reading the text file

In [6]:
with open('Descartes.txt', 'r', encoding='utf-8') as file:
    descartes_book = file.read().replace('\n', '')

# 2.Tokenizing the text with punkt

In [7]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')
tokens = word_tokenize(descartes_book)
print(len(tokens))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


23605


## Preprocessing the tokens


In [8]:
# applying lowercase, removing punctuation and stopwords
#stemming/lemmatization
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in stop_words and token not in string.punctuation]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
print(len(tokens))

9781


In [10]:
unique_tokens = set(tokens)
print(len(unique_tokens))

3843


In [11]:
print(unique_tokens)

{'thatwas', 'severalbuildings', 'well', 'constructed', 'existing', 'legislator', 'thanmyself', 'internal', 'oflower', 'deathmay', 'havealleged', 'choose', 'mostmoderate', 'freely', 'imagination', 'feeble', 'heart.but', 'ancientsonly', 'elevate', 'partly', 'defend', 'tosuspend', 'other.also', 'juice', 'compel', 'species.i', 'persuadedof', 'howeverdifferent', 'theconviction', 'effectsthat', 'hoped', 'observed', 'downwards', 'infine', 'bound', 'areasleep', 'subsist', 'eradicating', 'moreeasily', 'promise', 'crookedness', 'auricle', 'extricated', 'wellseize', 'accuracy', 'essayed', 'publisher', 'class', 'least', 'beenguided', 'probably', 'intelligent', 'tested', 'ofno', 'begun', 'mode', 'stimulate', 'quitted', 'judicious', 'asbefore', 'closely', 'sanction', 'theprinciples', 'examine', 'liability', 'outby', 'moved', 'truefrom', 'concatenation', 'could', 'conduct', 'beentaught', 'magnificentpalaces', 'demonstrating', 'canever', 'cast', 'anticipating', 'aughtexcept', 'provision', 'togive', 'a

In [12]:
tokens=unique_tokens
#print(len(tokens))

# 3.Embedding with Gensim and Word2Vec

In [13]:
from gensim.models import Word2Vec

# Train a Word2Vec model
model = Word2Vec([tokens],compute_loss=True,vector_size=300,min_count=1)

# Save the model for later use
model.save("descartes_word2vec.model")

# 4.Model description

In [14]:
from IPython.display import display
import ipywidgets as widgets

# Load the model
model = Word2Vec.load("descartes_word2vec.model")

# Widget for the model attribute
attr_widget = widgets.Dropdown(
    options=['wv', 'vector_size', 'train_count', 'total_train_time', 'epochs','sg'],
    value='wv',
    description='Attribute:',
)
display(attr_widget)

# Widget for the number of lines
num_lines_widget = widgets.IntSlider(min=0, max=100, step=1, value=10, description='Lines:')
display(num_lines_widget)

# Button to display the data
display_button = widgets.Button(description='Display')
display(display_button)

# Function to display the data
def display_data(button):
    attr = attr_widget.value
    num_lines = num_lines_widget.value

    if attr == 'wv':
        words = list(model.wv.index_to_key)
        for word in words[:num_lines]:
            print(word, model.wv[word])
    else:
        print(getattr(model, attr))

# Link the function to the button
display_button.on_click(display_data)

Dropdown(description='Attribute:', options=('wv', 'vector_size', 'train_count', 'total_train_time', 'epochs', …

IntSlider(value=10, description='Lines:')

Button(description='Display', style=ButtonStyle())

## Accessing a word and a vector in the saved model

In [15]:
try:
    vector = model.wv['consciousness']
    print('Vector for "consciousness":', vector)
except KeyError:
    print('"consciousness" is not in the dictionary')

"consciousness" is not in the dictionary


In [16]:
try:
    vector = model.wv['conscious']
    print('Vector for "conscious":', vector)
except KeyError:
    print('"conscious" is not in the dictionary')

Vector for "conscious": [ 2.12347927e-03  2.48884037e-03 -1.09817157e-03  3.01365857e-03
 -2.39122426e-03 -2.30331300e-03 -1.77994464e-03  1.57845661e-03
 -2.80489842e-03  1.31214224e-03  2.06384854e-03 -1.66483445e-03
  2.62877112e-03  2.89368705e-04 -9.51575523e-04  1.26453000e-03
 -3.82143364e-04  2.02550041e-03  1.83023908e-03 -7.74523476e-04
 -1.00194628e-03 -3.02865566e-03  1.96098443e-03  3.25448019e-03
  2.86629447e-03  9.99412732e-04  3.15685465e-05 -2.97670515e-04
  1.23886100e-03  2.54072086e-03 -2.17626640e-03  2.14392343e-03
  1.85425719e-03 -1.43096200e-03 -1.66846134e-04  2.84783862e-04
  9.13287629e-04  2.34053889e-03  1.62188997e-04  3.26978206e-03
 -2.77023762e-03 -1.12022471e-03 -2.52842816e-04  1.04978120e-04
  2.98166787e-03 -1.50026102e-03  9.23200059e-05 -1.74628093e-03
  2.06454052e-03 -2.73499428e-03  2.82816868e-03 -2.22104951e-03
 -3.14411498e-03  2.11202633e-03 -1.16350350e-03 -2.64330022e-03
  3.08189847e-05  3.14050354e-03 -1.49226841e-03  2.70287995e-03
 

Most similar words

In [17]:
try:
    similar_words = model.wv.most_similar('conscious')
    print('Most similar words to "conscious":', similar_words)
except KeyError:
    print('"concious" is not in the dictionary')


Most similar words to "conscious": [('turnwith', 0.21066632866859436), ('iknew', 0.19875670969486237), ('time', 0.18317890167236328), ('yield', 0.18000663816928864), ('andbusy', 0.1776650995016098), ('fermentation', 0.1682536005973816), ('falsityis', 0.16267628967761993), ('distinguishing', 0.16186277568340302), ('thanit', 0.157793790102005), ('dilate', 0.15621301531791687)]


# 6.Exploring Gensim's vector space

## The dictionary of words

In [18]:
for word, index in model.wv.key_to_index.items():
    print(f"Word: {word}, Index: {index}")

Word: fullness, Index: 0
Word: theground, Index: 1
Word: havingbroken, Index: 2
Word: similarvictories, Index: 3
Word: pertains, Index: 4
Word: fame, Index: 5
Word: thisaccount, Index: 6
Word: andremarked, Index: 7
Word: history, Index: 8
Word: modern, Index: 9
Word: credit, Index: 10
Word: remarkable, Index: 11
Word: recourse, Index: 12
Word: ofall, Index: 13
Word: satisfy, Index: 14
Word: incertitude, Index: 15
Word: issuing, Index: 16
Word: idea, Index: 17
Word: befound, Index: 18
Word: concern, Index: 19
Word: underthe, Index: 20
Word: expresslysupposed, Index: 21
Word: increase, Index: 22
Word: using, Index: 23
Word: oughtfor, Index: 24
Word: justifythe, Index: 25
Word: sparta, Index: 26
Word: ebb, Index: 27
Word: iron, Index: 28
Word: incumbent, Index: 29
Word: published, Index: 30
Word: theextravagances, Index: 31
Word: isimpossible, Index: 32
Word: hadsufficiently, Index: 33
Word: caveinto, Index: 34
Word: project, Index: 35
Word: vast, Index: 36
Word: unknown, Index: 37
Word: 

## Pairs of words and cosine similarity

In [19]:
import numpy as np
from gensim import matutils
import pandas as pd

# Define list of words
words = ["method","reason","truth","rightly", "science","seeking"]

# Initialize list to store results
data = []

# Iterate over all pairs of words
for i in range(len(words)):
    for j in range(len(words)):  # changed this line
        word1 = words[i]
        word2 = words[j]

        # Ensure words are in the model's vocabulary
        if word1 not in model.wv or word2 not in model.wv:
            print(f"One or both words ('{word1}', '{word2}') are not in the model's vocabulary.")
            continue

        # Calculate cosine similarity
        vec1 = model.wv[word1]
        vec2 = model.wv[word2]
        similarity = np.dot(matutils.unitvec(vec1), matutils.unitvec(vec2))

        # Convert similarity to distance
        distance = 1 - similarity

        # Append to our results
        data.append({'word1': word1, 'word2': word2, 'distance': distance})

# Create DataFrame and display
df = pd.DataFrame(data)
display(df)

Unnamed: 0,word1,word2,distance
0,method,method,0.0
1,method,reason,0.975557
2,method,truth,0.942923
3,method,rightly,1.030018
4,method,science,1.005227
5,method,seeking,0.994202
6,reason,method,0.975557
7,reason,reason,0.0
8,reason,truth,0.975894
9,reason,rightly,0.943911


# 7.Tensorflow Projector

https://projector.tensorflow.org/

To visualize the embeddings with the TensorFlow Embedding Projector, you'll need to create two files: a vector file and a metadata file.

In [20]:
import csv
import os
import numpy as np

# Directory where you want to save the files
LOG_DIR = '/content'
os.makedirs(LOG_DIR, exist_ok=True)

# Get the words and vectors
words = list(model.wv.key_to_index.keys())
vectors = [model.wv[word] for word in words]

# Write the vectors to a .tsv file
with open(os.path.join(LOG_DIR, "vecs.tsv"), 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerows(vectors)

# Write the labels (words) to a separate .tsv file
with open(os.path.join(LOG_DIR, "meta.tsv"), 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerows([[word] for word in words])  # No header row

In [21]:
!echo "Vectors file (vecs.tsv) size:"
!wc -l /content/vecs.tsv
!echo "Metadata file (meta.tsv) size:"
!wc -l /content/meta.tsv

Vectors file (vecs.tsv) size:
3843 /content/vecs.tsv
Metadata file (meta.tsv) size:
3843 /content/meta.tsv
