# **Static Embeddings**

In [1]:
import numpy as np
from gensim.models import Word2Vec

sentence = 'Knowledge is power, but enthusiasm pulls the switch'

# Preprocess the sentence
words = sentence.lower().split()

# Train the Word2Vec model
model = Word2Vec([words], vector_size=16, window=5, min_count=1, workers=4)

# Get word embeddings
for word in words:
    print(f"Word: {word}, Embedding: {model.wv[word]}")

Word: knowledge, Embedding: [-0.03500008  0.04440337  0.02095337  0.04516044  0.04250155  0.04706714
 -0.02368221 -0.00351129  0.01467735 -0.02824395  0.05242957 -0.06161352
  0.04227901  0.0182151  -0.0308302   0.02748867]
Word: is, Embedding: [-0.05573369 -0.04400975  0.0056341   0.03995334 -0.05387305  0.02291086
  0.03243677  0.03588711  0.04666824 -0.03854797  0.00691009  0.03779551
 -0.01775032 -0.03858452 -0.00256389 -0.05230593]
Word: power,, Embedding: [-0.00535348  0.01766602  0.03375893  0.0440791  -0.03564451  0.01161762
  0.0380554  -0.02998782 -0.01942038  0.04248518  0.01019672  0.00118698
  0.02171023  0.00136111  0.06011766  0.03162877]
Word: but, Embedding: [-0.00942505  0.01543622 -0.00555017  0.03458539 -0.01714361  0.01412541
  0.03409871  0.05216221 -0.00908588 -0.05755089  0.02731595  0.00357366
  0.04651193 -0.00508302 -0.01649009 -0.05470631]
Word: enthusiasm, Embedding: [ 0.03113786  0.05770715 -0.05098698  0.02809874 -0.02585673  0.00515335
  0.05311637 -0.02

# **Contextual Embeddings**

In [2]:
!pip install transformers



In [3]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Input
sentence = "Knowledge is power, but enthusiasm pulls the switch"

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Tokenize with fixed padding and truncation
inputs = tokenizer(sentence, return_tensors="pt", padding='max_length', truncation=True, max_length=10)

In [5]:
# Run the model
outputs = model(**inputs)

# Get the embeddings (last_hidden_state)
# These embeddings are part of the computation graph that PyTorch builds for calculating gradients.
# detach() is used to remove these embeddings from the computation graph (Saving money and computational power).
contextual_embeddings = outputs.last_hidden_state.detach()

In [6]:
# Check the shape
print(f"Contextual Embeddings : {contextual_embeddings} \n Shape: {contextual_embeddings.shape}")
print("\n----------------------------------------------------------------\n")
# Dimensionality reduction to 16
# We can use a linear transformation to reduce from 768 to 16 dimensions
linear_layer = torch.nn.Linear(768, 16)
contextual_embedding_reduced = linear_layer(contextual_embeddings).detach().numpy()

# Check new shape
print(f"Reduced Contextual Embedding : {contextual_embedding_reduced} \n Shape: {contextual_embedding_reduced.shape}")

Contextual Embeddings : tensor([[[ 8.5051e-02,  8.8734e-02,  4.1578e-03,  ..., -2.3565e-01,
           2.6658e-01,  4.9274e-01],
         [ 2.4075e-01,  2.8072e-01,  5.9313e-01,  ...,  1.5525e-02,
           9.3277e-01, -8.4173e-02],
         [-2.4953e-01,  3.3956e-01,  5.4684e-01,  ..., -2.8481e-01,
          -1.0912e-01,  8.0139e-01],
         ...,
         [ 3.5404e-04, -2.1287e-01,  7.7069e-01,  ..., -2.8971e-01,
          -5.2090e-01,  5.3809e-01],
         [-6.7954e-01, -9.3337e-01, -4.9169e-02,  ...,  4.2935e-01,
           8.7299e-02, -6.5073e-02],
         [ 9.0462e-01,  2.1638e-01,  1.6207e-01,  ...,  5.4705e-02,
          -5.7996e-01, -6.7091e-01]]]) 
 Shape: torch.Size([1, 10, 768])

----------------------------------------------------------------

Reduced Contextual Embedding : [[[-0.09891652  0.45738494 -0.21787538  0.26942688  0.09800021
    0.4518376  -0.16058579  0.73541987  0.01510914  0.13160259
   -0.59913373 -0.1023269  -0.15293786  0.04506467  0.25658676
   -0.077

### **Custom Embeddings**

In [7]:
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### **Note:**
**Random seed** is to ensure reproducibility. It initializes the random number generator in a fixed, predictable way so that:
1.	*Same Results Across Runs:* The same code with the same seed will always produce the same random numbers, embeddings, or model initialization.
2.	*Debugging:* It makes it easier to debug models since you can reproduce experiments consistently.
3.	*Experiment Comparison:* Ensures fair comparison between models by starting them with the same initial conditions.

In [8]:
# Set the random seed globally
SEED = 42
tf.random.set_seed(SEED) # for TensorFlow operations.
np.random.seed(SEED) # for NumPy operations.
random.seed(SEED) # for Python's built-in random module.

In [9]:
sentence = 'Knowledge is power, but enthusiasm pulls the switch'

# Tokenizing the sentence
tokenizer = Tokenizer() # The Tokenizer is used to map each word in a text to a unique integer ID.
tokenizer.fit_on_texts([sentence]) # The fit_on_texts() method learns the vocabulary and builds a dictionary mapping each word to a unique index.
sequences = tokenizer.texts_to_sequences([sentence]) # The texts_to_sequences() method converts the actual text into sequences of integer IDs, where each ID corresponds to a word in the vocabulary.

In [10]:
print(tokenizer.word_index)
print(sequences)

{'knowledge': 1, 'is': 2, 'power': 3, 'but': 4, 'enthusiasm': 5, 'pulls': 6, 'the': 7, 'switch': 8}
[[1, 2, 3, 4, 5, 6, 7, 8]]


In [11]:
# Padding sequences to ensure uniform length
max_length = 10 # max_length refers to the maximum number of tokens (words) in a sequence
padded_sequences = pad_sequences(sequences, maxlen=max_length)

In [12]:
# Define the embedding layer
embedding_dim = 16  # Dimension of the embedding vector
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1,
                            output_dim=embedding_dim,
                            input_length=max_length,
                            embeddings_initializer=tf.keras.initializers.RandomUniform(seed=SEED)) # Ensure deterministic initialization of embedding weights.



In [13]:
# Create a simple model to retrieve embeddings
model = Sequential()
model.add(embedding_layer)

In [14]:
# Get embeddings for the sentence
embeddings = model.predict(padded_sequences)
print(f"Embeddings:\n {embeddings} \n {embeddings.shape}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step
Embeddings:
 [[[-2.17566844e-02 -1.58309937e-04  2.15458535e-02 -4.79797982e-02
   -4.21401858e-02  1.26559399e-02  2.54998319e-02 -3.26794162e-02
   -3.71287353e-02 -4.47329544e-02  4.42529209e-02 -2.69548055e-02
   -2.24584341e-03 -2.53444910e-02  2.24576034e-02  2.69065835e-02]
  [-2.17566844e-02 -1.58309937e-04  2.15458535e-02 -4.79797982e-02
   -4.21401858e-02  1.26559399e-02  2.54998319e-02 -3.26794162e-02
   -3.71287353e-02 -4.47329544e-02  4.42529209e-02 -2.69548055e-02
   -2.24584341e-03 -2.53444910e-02  2.24576034e-02  2.69065835e-02]
  [ 3.69108841e-03 -3.31294909e-02  2.80845910e-04 -3.00057419e-02
   -4.64703105e-02 -1.87219605e-02 -2.56056711e-03  5.97994402e-03
    3.23537253e-02  9.13666561e-03 -1.79730654e-02  2.62521394e-02
   -3.74247432e-02  9.80285555e-03 -1.97220445e-02  3.52858938e-02]
  [ 4.48628403e-02  2.60492601e-02 -1.85151920e-02  1.69372894e-02
    4.13738564e-03  4.92382385e-02  1.43

In [15]:
embeddings = embeddings.reshape(10, 16)

embeddings.shape

(10, 16)