# Embedding with NLKT and Gensim
Copyright 2023, Denis Rothman

Installing libraries

In [1]:
!pip install --upgrade nltk -qq
import nltk

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
!pip install gensim -qq

In [4]:
import gensim
print(gensim.__version__)

4.3.1


# 1.Reading the text file

## Downloading the text file

Beginning of WIP code until GitHub made public

In [5]:
from google.colab import drive
drive.mount('/content/drive')
with open("drive/MyDrive/files/github.txt", "r") as f:
      PERSONAL_ACCESS_TOKEN = f.readline().strip()

Mounted at /content/drive


In [6]:
import requests
url=  'https://raw.githubusercontent.com/Denis2054/Transformers_3rd_Edition/main/Chapter11/Descartes.txt'
output_filename = 'Descartes.txt'
token =  PERSONAL_ACCESS_TOKEN

headers = {
    'Authorization': f'Token {token}'
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    with open(output_filename, 'wb') as file:
        file.write(response.content)
        print('File downloaded successfully.')
else:
    print('Error downloading file.')

File downloaded successfully.


end of WIP code until book title finalized

In [7]:
#1.Load Decartes.txt using the Colab file manager
#2.Downloading the file from GitHub
#!curl -L https://raw.githubusercontent.com/Denis2054/Transformers_for_NLP_and_Computer_Vision_3rd_Edition/main/Chapter11/Descartes.txt --output "Descartes.txt"

## Reading the text file

In [8]:
with open('Descartes.txt', 'r', encoding='utf-8') as file:
    descartes_book = file.read().replace('\n', '')

# 2.Tokenizing the text with punkt

In [9]:
from nltk.tokenize import word_tokenize
tokens = word_tokenize(descartes_book)
print(len(tokens))

23605


## Preprocessing the tokens


In [10]:
# applying lowercase, removing punctuation and stopwords
#stemming/lemmatization
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in stop_words and token not in string.punctuation]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
print(len(tokens))

9781


In [12]:
unique_tokens = set(tokens)
print(len(unique_tokens))

3843


In [13]:
print(unique_tokens)

{'master', 'heavenssince', 'provision', 'preceptor', 'infant', 'incapable', 'feel', 'shaking', 'stimulate', 'embarrass', 'place', 'occupationthan', 'whichgeometers', 'intooperation', 'morallypossible', 'specific', 'judgement', 'memore', 'expelled', 'adequately', 'benefit', 'anythingmore', 'thesethings', 'reflection', 'observed', 'asleep', 'geometrical', 'naturallybe', 'thenumbers', 'consequent', 'certainknowledge', 'vein', 'tocompress', 'turnwith', 'averse', 'slowly', 'affection', 'evenidiots', 'people', 'composite', 'hardly', 'maybe', 'lawsof', 'andif', 'larger', 'perceive', 'thestars', 'eye', 'graduallyaugmenting', 'yet-the', 'preclude', 'required', 'unableto', 'selection', 'would', 'subsisting', 'necessity', 'indeed', 'thepublic', 'fullness', 'gather', 'dissected', 'passion', 'convenience', 'objected', 'stillin', 'conceal', 'yet', 'generalform', 'hostile', 'perusal', 'wewish', 'theopening', 'tothis', 'haveproceeded', 'longer', 'hay', 'confusion', '_second_', 'escape', 'arrangement',

In [14]:
tokens=unique_tokens
#print(len(tokens))

# 3.Embedding with Gensim and Word2Vec

In [15]:
from gensim.models import Word2Vec

# Train a Word2Vec model
model = Word2Vec([tokens],compute_loss=True,vector_size=300,min_count=1)

# Save the model for later use
model.save("descartes_word2vec.model")

# 4.Model description

In [37]:
from IPython.display import display
import ipywidgets as widgets

# Load the model
model = Word2Vec.load("descartes_word2vec.model")

# Widget for the model attribute
attr_widget = widgets.Dropdown(
    options=['wv', 'vector_size', 'train_count', 'total_train_time', 'epochs','sg'],
    value='wv',
    description='Attribute:',
)
display(attr_widget)

# Widget for the number of lines
num_lines_widget = widgets.IntSlider(min=0, max=100, step=1, value=10, description='Lines:')
display(num_lines_widget)

# Button to display the data
display_button = widgets.Button(description='Display')
display(display_button)

# Function to display the data
def display_data(button):
    attr = attr_widget.value
    num_lines = num_lines_widget.value

    if attr == 'wv':
        words = list(model.wv.index_to_key)
        for word in words[:num_lines]:
            print(word, model.wv[word])
    else:
        print(getattr(model, attr))

# Link the function to the button
display_button.on_click(display_data)

Dropdown(description='Attribute:', options=('wv', 'vector_size', 'train_count', 'total_train_time', 'epochs', …

IntSlider(value=10, description='Lines:')

Button(description='Display', style=ButtonStyle())

## Accessing a word and a vector in the saved model

In [38]:
try:
    vector = model.wv['consciousness']
    print('Vector for "consciousness":', vector)
except KeyError:
    print('"consciousness" is not in the dictionary')

"consciousness" is not in the dictionary


In [42]:
try:
    vector = model.wv['conscious']
    print('Vector for "conscious":', vector)
except KeyError:
    print('"conscious" is not in the dictionary')

Vector for "conscious": [-2.2882714e-03 -2.9769554e-03 -2.3382956e-03  3.2302644e-03
  4.9209874e-04 -7.2649046e-04  8.2537113e-04 -3.6820408e-04
 -1.1185763e-03 -1.8049215e-03  2.1578181e-03 -1.0867673e-03
  7.9940347e-04  2.3183119e-03 -1.7192229e-04  2.8174506e-03
 -1.5929197e-03  3.3099041e-03 -1.9293354e-03 -3.0342239e-04
  1.4344892e-04  3.6501623e-04  1.3159955e-03  2.5331231e-03
 -9.0605288e-04  1.5739332e-03 -1.6196299e-03  9.4100273e-05
 -7.1035558e-04 -9.2428079e-04 -6.3189666e-04  3.4200953e-04
 -2.4409273e-03 -3.1052763e-04  3.0026999e-03  1.3197570e-03
 -2.1523330e-03  2.0124663e-03 -1.1514931e-03 -8.1804878e-04
 -3.2101272e-04 -1.2864619e-03 -2.2771226e-03 -9.3669083e-04
  7.6174433e-04 -2.3376187e-03 -3.2829645e-04  7.2167849e-04
  2.2322435e-03  2.4025899e-03  2.2884845e-03 -2.8313999e-03
  2.1352360e-03  9.8904688e-04 -1.6510736e-03 -9.5069727e-06
  3.0214142e-03  8.9379435e-04 -1.3311246e-03 -1.0411283e-03
 -1.4536026e-03  1.0804230e-03 -2.5431177e-04  1.3866347e-03


Most similar words

In [43]:
try:
    similar_words = model.wv.most_similar('conscious')
    print('Most similar words to "conscious":', similar_words)
except KeyError:
    print('"concious" is not in the dictionary')


Most similar words to "conscious": [('wouldconduce', 0.21150538325309753), ('itscounterweights', 0.19385728240013123), ('versed', 0.185345858335495), ('comprehend', 0.1835186779499054), ('requisite', 0.17346185445785522), ('convertsthe', 0.1722400188446045), ('ibelieved', 0.1657719910144806), ('transmit', 0.16429133713245392), ('speakingmyself', 0.16091690957546234), ('warmth', 0.15618115663528442)]


# 6.Exploring Gensim's vector space

## The dictionary of words

In [23]:
for word, index in model.wv.key_to_index.items():
    print(f"Word: {word}, Index: {index}")

Word: proceedsfrom, Index: 0
Word: faculty, Index: 1
Word: ill-founded, Index: 2
Word: bymachines, Index: 3
Word: solelyfrom, Index: 4
Word: passing, Index: 5
Word: guard, Index: 6
Word: shade, Index: 7
Word: ori, Index: 8
Word: toimitate, Index: 9
Word: street, Index: 10
Word: coronation, Index: 11
Word: reference, Index: 12
Word: ofreputation, Index: 13
Word: thetrouble, Index: 14
Word: deduced, Index: 15
Word: essaying, Index: 16
Word: command, Index: 17
Word: conjecture, Index: 18
Word: tothink, Index: 19
Word: west, Index: 20
Word: adequate, Index: 21
Word: whatis, Index: 22
Word: ofmathematical, Index: 23
Word: diversity, Index: 24
Word: abovedoubt, Index: 25
Word: dependent, Index: 26
Word: thingto, Index: 27
Word: trial, Index: 28
Word: compensated, Index: 29
Word: reasoningsare, Index: 30
Word: fermentation, Index: 31
Word: arrive, Index: 32
Word: chancerather, Index: 33
Word: eventhe, Index: 34
Word: thoughi, Index: 35
Word: matteraccording, Index: 36
Word: reason, Index: 37


## Pairs of words and cosine similarity

In [None]:
import numpy as np
from gensim import matutils
import pandas as pd

# Define list of words
words = ["method","reason","truth","rightly", "science","seeking"]

# Initialize list to store results
data = []

# Iterate over all pairs of words
for i in range(len(words)):
    for j in range(len(words)):  # changed this line
        word1 = words[i]
        word2 = words[j]

        # Ensure words are in the model's vocabulary
        if word1 not in model.wv or word2 not in model.wv:
            print(f"One or both words ('{word1}', '{word2}') are not in the model's vocabulary.")
            continue

        # Calculate cosine similarity
        vec1 = model.wv[word1]
        vec2 = model.wv[word2]
        similarity = np.dot(matutils.unitvec(vec1), matutils.unitvec(vec2))

        # Convert similarity to distance
        distance = 1 - similarity

        # Append to our results
        data.append({'word1': word1, 'word2': word2, 'distance': distance})

# Create DataFrame and display
df = pd.DataFrame(data)
display(df)

# 7.Tensorflow Projector

https://projector.tensorflow.org/

To visualize the embeddings with the TensorFlow Embedding Projector, you'll need to create two files: a vector file and a metadata file.

In [28]:
import csv
import os
import numpy as np

# Directory where you want to save the files
LOG_DIR = '/content'
os.makedirs(LOG_DIR, exist_ok=True)

# Get the words and vectors
words = list(model.wv.key_to_index.keys())
vectors = [model.wv[word] for word in words]

# Write the vectors to a .tsv file
with open(os.path.join(LOG_DIR, "vecs.tsv"), 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerows(vectors)

# Write the labels (words) to a separate .tsv file
with open(os.path.join(LOG_DIR, "meta.tsv"), 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerows([[word] for word in words])  # No header row

In [29]:
!echo "Vectors file (vecs.tsv) size:"
!wc -l /content/vecs.tsv
!echo "Metadata file (meta.tsv) size:"
!wc -l /content/meta.tsv

Vectors file (vecs.tsv) size:
3843 /content/vecs.tsv
Metadata file (meta.tsv) size:
3843 /content/meta.tsv
