# Embedding with NLTK and Gensim
Copyright 2023, Denis Rothman

Installing libraries

In [None]:
!pip install --upgrade nltk -qq
import nltk

In [None]:
# Tokenizer data used by word_tokenize.
# NLTK is installed with --upgrade, and recent NLTK releases load the Punkt
# data from the 'punkt_tab' package, while older releases use 'punkt'.
# Fetching both keeps the notebook runnable whichever version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
!pip install gensim -qq

In [None]:
# Report which Gensim release this runtime is using
import gensim

gensim_version = gensim.__version__
print(gensim_version)

4.3.2


# 1.Reading the text file

## Downloading the text file

Beginning of WIP code until GitHub made public

In [None]:
from google.colab import drive

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# The GitHub token is the first line of a text file stored on Drive;
# surrounding whitespace (including the trailing newline) is removed.
with open("drive/MyDrive/files/github.txt", "r") as token_file:
    first_line = token_file.readline()
github_token = first_line.strip()

Mounted at /content/drive


In [None]:
#1.Load Decartes.txt using the Colab file manager
#2.Downloading the file from GitHub
#1.Load Decartes.txt using the Colab file manager
#2.Downloading the file from GitHub
!curl -H 'Authorization: token {github_token}' -L https://raw.githubusercontent.com/Denis2054/Transformers-for-NLP-and-Computer-Vision-3rd-Edition/master/Chapter11/Descartes.txt --output "Descartes.txt"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100    14  100    14    0     0     94      0 --:--:-- --:--:-- --:--:--    94


end of WIP code until book title finalized

## Reading the text file

In [None]:
# Read the whole book into a single string.
# Line breaks are replaced by a space (not removed) so that the last word of
# one line is not glued to the first word of the next line before tokenizing.
with open('Descartes.txt', 'r', encoding='utf-8') as file:
    descartes_book = file.read().replace('\n', ' ')

# 2.Tokenizing the text with punkt

In [None]:
from nltk.tokenize import word_tokenize

# Split the raw text into tokens and report how many were produced
tokens = word_tokenize(descartes_book)
token_count = len(tokens)
print(token_count)

4


## Preprocessing the tokens


In [None]:
# Token clean-up: lowercase, drop stop words and punctuation, then lemmatize
# with WordNet (no stemming is applied).
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
punctuation_chars = set(string.punctuation)


def _is_punctuation(token):
    """Return True when every character of `token` is ASCII punctuation.

    A plain `token in string.punctuation` is a substring test against the
    punctuation string, so multi-character tokens such as '--' or '...'
    are not recognised; checking character by character covers them.
    An empty token also counts as punctuation and is dropped.
    """
    return all(char in punctuation_chars for char in token)


cleaned_tokens = []
for token in tokens:
    lowered = token.lower()  # lowercase once, reused for both checks below
    if lowered in stop_words or _is_punctuation(token):
        continue
    cleaned_tokens.append(lemmatizer.lemmatize(lowered))

tokens = cleaned_tokens

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Number of tokens left after preprocessing
remaining_count = len(tokens)
print(remaining_count)

2


In [None]:
# Distinct tokens (vocabulary candidates) and how many there are
unique_tokens = {token for token in tokens}
unique_count = len(unique_tokens)
print(unique_count)

2


In [None]:
# Show the set of distinct tokens built from the preprocessed token list
print(unique_tokens)

{'404', 'found'}


In [None]:
# Keep a single occurrence of every token for training.
# dict.fromkeys de-duplicates while preserving first-appearance order, so the
# sequence handed to Word2Vec is the same on every run; a plain set iterates
# in an arbitrary order that can differ between kernel sessions.
# NOTE(review): de-duplicating discards word frequencies and most of the
# original word order that Word2Vec learns from - confirm this is intended.
tokens = list(dict.fromkeys(tokens))

# 3.Embedding with Gensim and Word2Vec

In [None]:
from gensim.models import Word2Vec

# Train Word2Vec on the token collection, passed as one single "sentence".
# min_count=1 keeps every token in the vocabulary, however rare.
model = Word2Vec(
    sentences=[tokens],
    vector_size=300,
    min_count=1,
    compute_loss=True,
)

# Persist the trained model so later cells can reload it from disk
model.save("descartes_word2vec.model")

# 4.Model description

In [None]:
from IPython.display import display
import ipywidgets as widgets

# Load the model
model = Word2Vec.load("descartes_word2vec.model")

# Widget for the model attribute
attr_widget = widgets.Dropdown(
    options=['wv', 'vector_size', 'train_count', 'total_train_time', 'epochs','sg'],
    value='wv',
    description='Attribute:',
)
display(attr_widget)

# Widget for the number of lines
num_lines_widget = widgets.IntSlider(min=0, max=100, step=1, value=10, description='Lines:')
display(num_lines_widget)

# Button to display the data
display_button = widgets.Button(description='Display')
display(display_button)

# Dedicated output area: text printed from a button callback is captured here,
# and it is cleared on each click so results do not pile up.
output_area = widgets.Output()
display(output_area)

# Function to display the data
def display_data(button):
    """Show the selected model attribute in the output area.

    For 'wv', prints up to `Lines` (word, vector) pairs in vocabulary index
    order; for any other choice, prints that attribute of the model.

    Args:
        button: the clicked Button instance (passed by ipywidgets, unused).
    """
    attr = attr_widget.value
    num_lines = num_lines_widget.value

    output_area.clear_output()
    with output_area:
        if attr == 'wv':
            for word in model.wv.index_to_key[:num_lines]:
                print(word, model.wv[word])
        else:
            print(getattr(model, attr))

# Link the function to the button
display_button.on_click(display_data)

Dropdown(description='Attribute:', options=('wv', 'vector_size', 'train_count', 'total_train_time', 'epochs', …

IntSlider(value=10, description='Lines:')

Button(description='Display', style=ButtonStyle())

## Accessing a word and a vector in the saved model

In [None]:
# Look up one word's embedding; report a missing word instead of raising
if 'consciousness' in model.wv:
    vector = model.wv['consciousness']
    print('Vector for "consciousness":', vector)
else:
    print('"consciousness" is not in the dictionary')

"consciousness" is not in the dictionary


In [None]:
# Same lookup for "conscious": fetch the vector first, print only on success
try:
    vector = model.wv['conscious']
except KeyError:
    print('"conscious" is not in the dictionary')
else:
    print('Vector for "conscious":', vector)

"conscious" is not in the dictionary


Most similar words

In [None]:
# Nearest neighbours of "conscious" in the embedding space.
# most_similar raises KeyError when the word is not in the vocabulary.
try:
    similar_words = model.wv.most_similar('conscious')
    print('Most similar words to "conscious":', similar_words)
except KeyError:
    # Message now spells the queried word correctly (was "concious")
    print('"conscious" is not in the dictionary')


"concious" is not in the dictionary


# 6.Exploring Gensim's vector space

## The dictionary of words

In [None]:
# List every vocabulary entry together with its integer index
vocabulary = model.wv.key_to_index
for word in vocabulary:
    print(f"Word: {word}, Index: {vocabulary[word]}")

Word: found, Index: 0
Word: 404, Index: 1


## Pairs of words and cosine similarity

In [None]:
import numpy as np
from gensim import matutils
import pandas as pd

# Define list of words
words = ["method","reason","truth","rightly", "science","seeking"]

# Check vocabulary membership once per word (not once per pair) and report
# the missing words in a single line instead of one message per pair.
known_words = [word for word in words if word in model.wv]
missing_words = [word for word in words if word not in model.wv]
if missing_words:
    print(f"Not in the model's vocabulary (skipped): {missing_words}")

# Normalise each vector once; the dot product of two unit vectors is their
# cosine similarity.
unit_vectors = {word: matutils.unitvec(model.wv[word]) for word in known_words}

# All ordered pairs, including each word paired with itself.
# Cosine distance = 1 - cosine similarity.
data = [
    {
        'word1': word1,
        'word2': word2,
        'distance': 1 - np.dot(unit_vectors[word1], unit_vectors[word2]),
    }
    for word1 in known_words
    for word2 in known_words
]

# Create DataFrame and display; explicit columns keep the header visible
# even when none of the words are in the vocabulary.
df = pd.DataFrame(data, columns=['word1', 'word2', 'distance'])
display(df)

One or both words ('method', 'method') are not in the model's vocabulary.
One or both words ('method', 'reason') are not in the model's vocabulary.
One or both words ('method', 'truth') are not in the model's vocabulary.
One or both words ('method', 'rightly') are not in the model's vocabulary.
One or both words ('method', 'science') are not in the model's vocabulary.
One or both words ('method', 'seeking') are not in the model's vocabulary.
One or both words ('reason', 'method') are not in the model's vocabulary.
One or both words ('reason', 'reason') are not in the model's vocabulary.
One or both words ('reason', 'truth') are not in the model's vocabulary.
One or both words ('reason', 'rightly') are not in the model's vocabulary.
One or both words ('reason', 'science') are not in the model's vocabulary.
One or both words ('reason', 'seeking') are not in the model's vocabulary.
One or both words ('truth', 'method') are not in the model's vocabulary.
One or both words ('truth', 'reason

# 7.TensorFlow Projector

https://projector.tensorflow.org/

To visualize the embeddings with the TensorFlow Embedding Projector, you'll need to create two files: a vector file and a metadata file.

In [None]:
import csv
import os
import numpy as np

# Output folder for the two projector files
LOG_DIR = '/content'
os.makedirs(LOG_DIR, exist_ok=True)

# Vocabulary words and their vectors, kept in the same order
words = [*model.wv.key_to_index]
vectors = []
for word in words:
    vectors.append(model.wv[word])

# vecs.tsv: one tab-separated vector per line
vectors_path = os.path.join(LOG_DIR, "vecs.tsv")
with open(vectors_path, 'w', newline='') as vectors_file:
    vectors_writer = csv.writer(vectors_file, delimiter='\t')
    for vector in vectors:
        vectors_writer.writerow(vector)

# meta.tsv: one label (word) per line, in the same order, without a header row
metadata_path = os.path.join(LOG_DIR, "meta.tsv")
with open(metadata_path, 'w', newline='', encoding='utf-8') as metadata_file:
    metadata_writer = csv.writer(metadata_file, delimiter='\t')
    for word in words:
        metadata_writer.writerow([word])

In [None]:
!echo "Vectors file (vecs.tsv) size:"
!wc -l /content/vecs.tsv
!echo "Metadata file (meta.tsv) size:"
!wc -l /content/meta.tsv

Vectors file (vecs.tsv) size:
2 /content/vecs.tsv
Metadata file (meta.tsv) size:
2 /content/meta.tsv
