In [None]:
# Install packages
%pip install Levenshtein
%pip install matplotlib
%pip install torch==2.3.0 torchtext==0.18.0

### Importing required libraries


In [None]:
import os
import sys
import time
import warnings
from pathlib import Path
import matplotlib.pyplot as plt

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import requests

from Levenshtein import distance
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Silence warnings so they do not clutter the cell outputs.
import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Ignore warnings via the filter, and swap in the no-op for direct warnings.warn calls.
warnings.filterwarnings('ignore')
warnings.warn = warn

In [3]:
# Device for training: use the GPU when CUDA is available, otherwise the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
split = 'train'

# Training parameters
learning_rate = 3e-4
batch_size = 64
max_iters = 5000              # Maximum training iterations
eval_interval = 200           # Evaluate model every 'eval_interval' iterations in the training loop
eval_iters = 100              # When evaluating, approximate loss using 'eval_iters' batches

# Architecture parameters
max_vocab_size = 256          # Maximum vocabulary size
vocab_size = max_vocab_size   # Real vocabulary size (e.g. BPE has a variable length, so it can be less than 'max_vocab_size')
block_size = 16               # Context length for predictions
n_embd = 32                   # Embedding size
num_heads = 2                 # Number of heads in multi-headed attention
n_layer = 2                   # Number of Blocks
ff_scale_factor = 4           # Note: The '4' magic number is from the paper: equation 2 uses d_model=512 but d_ff=2048 (d_ff / d_model = 4)
dropout = 0.0                 # Dropout rate (regularization). NOTE(review): the original trailing note "10.788929 M parameters" looks stale for this small configuration -- confirm or remove

# Size of each attention head; the assert checks that n_embd splits evenly across the heads
head_size = n_embd // num_heads
assert (num_heads * head_size) == n_embd

Following the parameter setup, you will define a function called `plot_embdings`, which visualizes the learned embeddings in 3D space using `matplotlib`. This helps in understanding how the embeddings cluster and separate different tokens, providing insight into what the model has learned.


In [4]:
def plot_embdings(my_embdings,name,vocab):
  """Scatter the first three embedding dimensions in 3D and label the points.

  Args:
      my_embdings: 2D array-like indexed as ``my_embdings[row, col]`` with at
          least three columns; columns 0, 1 and 2 are used as x, y and z.
      name: iterable of token strings used as point labels.
      vocab: object exposing ``get_stoi()`` (token -> index mapping); it is
          only used to check that every label is a known token.

  Raises:
      KeyError: if a label in ``name`` is not present in ``vocab``.

  NOTE(review): the label for ``name[j]`` is placed at row ``j`` of
  ``my_embdings``, so the rows are assumed to be ordered like ``name`` rather
  than by vocabulary index -- TODO confirm against the callers.
  """
  fig = plt.figure()
  ax = fig.add_subplot(111, projection='3d')

  # Plot the data points
  ax.scatter(my_embdings[:,0], my_embdings[:,1], my_embdings[:,2])

  # Fetch the token -> index mapping once instead of once per label
  stoi = vocab.get_stoi()

  # Label the points
  for j, label in enumerate(name):
      # Same failure mode as the original lookup: unknown tokens raise KeyError
      if label not in stoi:
          raise KeyError(label)
      ax.text(my_embdings[j,0], my_embdings[j,1], my_embdings[j,2], label)

  # Set axis labels
  ax.set_xlabel('X Label')
  ax.set_ylabel('Y Label')
  ax.set_zlabel('Z Label')

  # Show the plot
  plt.show()

In [5]:
# Toy French -> English word pairs (source word: translated word)
dictionary = {
    'le': 'the',
    'chat': 'cat',
    'est': 'is',
    'sous': 'under',
    'la': 'the',
    'table': 'table',
}

### Define 'vocabularies'


In [10]:
# Create and sort the input vocabulary from the dictionary's keys.
# Uses the built-in sorted(): torch has no `sorted` function, and set() removes duplicates.
vocabulary_in = sorted(set(dictionary.keys()))
# Display the size and the sorted vocabulary for the input language
print(f"Vocabulary input ({len(vocabulary_in)}): {vocabulary_in}")

# Create and sort the output vocabulary from the dictionary's values
# ('the' appears twice among the values, so the output vocabulary is smaller)
vocabulary_out = sorted(set(dictionary.values()))
# Display the size and the sorted vocabulary for the output language
print(f"Vocabulary output ({len(vocabulary_out)}): {vocabulary_out}")

Vocabulary input (6): ['chat', 'est', 'la', 'le', 'sous', 'table']
Vocabulary output (5): ['cat', 'is', 'table', 'the', 'under']


### Encode tokens using 'one hot' encoding


This code snippet focuses on creating and one-hot encoding vocabularies for both the input and output languages based on a predefined dictionary. The process begins with extracting the keys and values from the dictionary, resulting in two distinct vocabularies: one for the source language (French) and one for the target language (English).

First, the unique words from the input dictionary are sorted and stored in vocabulary_in, and its size is printed. Similarly, the output vocabulary, vocabulary_out, is created from the dictionary values and displayed.

The core functionality lies in the `encode_one_hot` function, which transforms the vocabulary into one-hot encoded vectors. For a vocabulary of size $N$, each word is represented as a vector of length $N$ where only one element is set to 1 (indicating the presence of that word) and all other elements are 0. Mathematically, the one-hot encoding for the word $w_i$ (the $i$-th word) in a vocabulary of size $N$ can be expressed as:

$$E(w_i) = [e_1, e_2, \dots, e_N], \qquad e_j = \begin{cases} 1 & \text{if } j = i \\ 0 & \text{otherwise} \end{cases}$$


During the encoding process, a zero vector is initialized for each word in the vocabulary. The vector is then updated by setting the position corresponding to the word’s index to 1. Each word and its corresponding one-hot encoded vector are printed for visualization, providing insight into the representation.

The one-hot encoding is applied to both the input vocabulary and the output vocabulary, resulting in two dictionaries: one_hot_in and one_hot_out. This representation is crucial for feeding the encoded inputs into neural network models, as it allows the model to process textual data in a format suitable for mathematical computations.

In [11]:
# Function to convert a list of vocabulary words into one-hot encoded vectors
def encode_one_hot(vocabulary):
    """Map each word of a vocabulary to its one-hot vector.

    Args:
        vocabulary: sequence of unique tokens; a token's position in the
            sequence is the index of the 1 in its vector.

    Returns:
        dict mapping each token to a 1-D float tensor of length
        ``len(vocabulary)`` that is 1 at the token's position and 0 elsewhere.
        Each token and its vector are also printed for visualization.
    """
    vocabulary_size = len(vocabulary)
    one_hot = {}
    for index, token in enumerate(vocabulary):
        # Start from a zero vector and switch on the token's own position
        vector = torch.zeros(vocabulary_size)
        vector[index] = 1.0
        one_hot[token] = vector
        print(f"{token}\t: {vector}")
    return one_hot  # Return the dictionary of words and their one-hot encoded vectors

In [12]:
# Apply the one-hot encoding function to the input vocabulary and store the result
one_hot_in = encode_one_hot(vocabulary_in)

chat	: tensor([1., 0., 0., 0., 0., 0.])
est	: tensor([0., 1., 0., 0., 0., 0.])
la	: tensor([0., 0., 1., 0., 0., 0.])
le	: tensor([0., 0., 0., 1., 0., 0.])
sous	: tensor([0., 0., 0., 0., 1., 0.])
table	: tensor([0., 0., 0., 0., 0., 1.])


In [13]:
# Iterate over the one-hot encoded input vocabulary and print each vector
# This visualizes the one-hot representation for each word in the input vocabulary
for token, vector in one_hot_in.items():
    print(f"E_{{ {token} }} = ", vector)


E_{ chat } =  tensor([1., 0., 0., 0., 0., 0.])
E_{ est } =  tensor([0., 1., 0., 0., 0., 0.])
E_{ la } =  tensor([0., 0., 1., 0., 0., 0.])
E_{ le } =  tensor([0., 0., 0., 1., 0., 0.])
E_{ sous } =  tensor([0., 0., 0., 0., 1., 0.])
E_{ table } =  tensor([0., 0., 0., 0., 0., 1.])


In [14]:
# Apply the one-hot encoding function to the output vocabulary and store the result
# This time we're encoding the target language vocabulary
one_hot_out = encode_one_hot(vocabulary_out)

cat	: tensor([1., 0., 0., 0., 0.])
is	: tensor([0., 1., 0., 0., 0.])
table	: tensor([0., 0., 1., 0., 0.])
the	: tensor([0., 0., 0., 1., 0.])
under	: tensor([0., 0., 0., 0., 1.])


### Let's create a 'dictionary' using matrix multiplication

We're now illustrating how to create a representation of our dictionary suitable for neural network operations:

- **Matrix creation**: Using PyTorch's `torch.stack`, convert the one-hot encoded vectors for both input (`K`) and output (`V`) vocabularies into tensors. `K` is constructed from the input vocabulary's one-hot vectors, and `V` from the output vocabulary's vectors. These tensors can be thought of as a look-up table that our model will use to associate input tokens with output tokens.

- **Dictionary as matrices**: This step effectively translates our word-to-word dictionary mapping into a neural network-friendly format. Each row in `K` corresponds to a word in the input language represented as a one-hot vector, and each row in `V` corresponds to the respective translated word in the output language.

- **Query example**: An example shows how to use matrix operations to find a translation. Look up the one-hot vector for the word "sous" from the input vocabulary (`q`). Then demonstrate how to find its corresponding translation by performing matrix multiplication with the transpose of `K` (i.e., `q @ K.T`) to identify the index and then use that index to select the relevant row from `V`. This process mimics the lookup that you would perform in an actual neural network during translation tasks.

This matrix representation is a precursor to understanding how more complex neural network architectures, like those using self-attention, manage token translations.


In [15]:
# Stacking the one-hot encoded vectors for input vocabulary to form a tensor
# (one row per dictionary entry, in the dictionary's key order)
K = torch.stack([one_hot_in[word] for word in dictionary.keys()])
# K now represents a matrix of one-hot vectors for the input vocabulary

# Display the tensor for verification
print(K)

tensor([[0., 0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1.]])


In [16]:
# Similarly, stack the one-hot encoded vectors for output vocabulary to form a tensor
# (one row per dictionary entry, in the dictionary's value order, so row i of V
# is the translation of the word in row i of K)
V = torch.stack([one_hot_out[word] for word in dictionary.values()])
# V represents the corresponding matrix of one-hot vectors for the output vocabulary

# Display the tensor for verification
print(V)

tensor([[0., 0., 0., 1., 0.],
        [1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0.]])


In [17]:
# Demonstrating how to look up a translation for a given word using matrix operations
# Here, we take the one-hot representation of 'sous' from the input vocabulary
q = one_hot_in['sous']
# Display the query token vector
print("Query token :", q)

Query token : tensor([0., 0., 0., 0., 1., 0.])


In [18]:
# Multiply the query by the transposed key matrix: the result has a 1 at the
# position of the row of K that matches the query and 0 elsewhere
print("Select key (K) :", q @ K.T)

Select key (K) : tensor([0., 0., 0., 1., 0., 0.])


In [19]:
# Use the index found from the key selection to find the corresponding value vector in V (output dictionary matrix)
# This operation selects the row from V that is the translation of 'sous' in the output vocabulary
print("Select value (V):", q @ K.T @ V)

# The final output demonstrates how 'sous' can be translated using the neural network approach

Select value (V): tensor([0., 0., 0., 0., 1.])


The code introduces a function for decoding one-hot vectors to tokens and updates the translation function to utilize matrix multiplication:

### Decode one-hot vector
The `decode_one_hot` function is designed to decode a one-hot encoded vector back into the corresponding token (word). It does this by finding the token whose one-hot representation has the highest cosine similarity with the given vector, which is effectively just the dot product due to the nature of one-hot vectors.


In [20]:
def decode_one_hot(one_hot, vector):
    """Decode a (possibly soft) one-hot vector back into its token.

    Args:
        one_hot: dict mapping tokens to their 1-D one-hot tensors.
        vector: 1-D tensor of the same length and dtype as the one-hot vectors.

    Returns:
        The token whose one-hot vector has the largest dot product with
        ``vector``, or ``None`` if no token has a strictly positive score
        (for example an all-zero ``vector`` or an empty ``one_hot``).
    """
    best_token, best_score = None, 0.0
    for token, candidate in one_hot.items():
        # One-hot vectors have unit norm, so the dot product ranks tokens the
        # same way cosine similarity would
        score = torch.dot(vector, candidate).item()
        if score > best_score:
            best_token, best_score = token, score
    return best_token  # Return the token corresponding to the one-hot vector

### Matrix-based translate function
The `translate` function now leverages matrix operations to perform the translation. For each token in the input sentence, it finds its one-hot vector, multiplies it with the matrices `K.T` and `V` to find the corresponding one-hot vector in the output vocabulary, and then decodes this vector to get the translated word.


In [21]:
def translate(sentence):
    """Translate a sentence word by word using the K and V lookup matrices.

    Args:
        sentence: whitespace-separated string of words from the input vocabulary.

    Returns:
        The translated words joined by single spaces ('' for an empty sentence).

    Raises:
        KeyError: if a word is not in the input vocabulary.
    """
    tokens_out = []
    for token_in in sentence.split():
        q = one_hot_in[token_in]
        # q @ K.T marks the matching key row; multiplying by V selects that row's value
        out = q @ K.T @ V
        tokens_out.append(decode_one_hot(one_hot_out, out))
    return ' '.join(tokens_out)  # Return the translated sentence

### Translation test
The improved translate function is tested with the sentence "le chat est sous la table", verifying that it correctly translates to "the cat is under the table" using the matrix operations for a seamless word-by-word translation.


In [22]:
# Word-by-word translation of the sample sentence; expected result: 'the cat is under the table'
translate("le chat est sous la table")

'the cat is under the table'


This enhanced approach shows how neural network models can translate languages by representing the translation dictionary as matrices and using vector operations.


**The next code segment introduces concepts that lead up to the implementation of "Attention" in neural networks:**


### Softmax function for similarity
Similar tokens have similar vectors, so a softmax function is added to the lookup. It is applied to the output of the matrix multiplication of the query vector `q` and the transpose of the matrix `K`. The softmax function converts these similarity scores into probabilities, emphasizing the most similar token while still considering the others.


In [23]:
# Show the one-hot vector of 'table' from the input vocabulary
print('E_{table} = ', one_hot_in['table'])

E_{table} =  tensor([0., 0., 0., 0., 0., 1.])


### Translation with attention mechanism
The `translate` function is modified to use the softmax function as a way of applying attention. It first finds the one-hot vector for the token, then applies the softmax function to the dot product of `q` and `K.T` to turn the similarity scores into attention weights, and finally multiplies these weights by `V` to get the output vector. In full scaled dot-product attention, the scores are also divided by the square root of the key dimensionality before the softmax, for normalization purposes.


In [24]:
def translate(sentence):
    """Translate a sentence word by word using softmax attention over K and V.

    Args:
        sentence: whitespace-separated string of words from the input vocabulary.

    Returns:
        The translated words joined by single spaces ('' for an empty sentence).

    Raises:
        KeyError: if a word is not in the input vocabulary.
    """
    tokens_out = []
    for token_in in sentence.split():
        q = one_hot_in[token_in]
        # Softmax turns the similarity scores q @ K.T into attention weights:
        # the matching key gets the largest weight, the others a small share.
        # The scores are deliberately not divided by sqrt(d): with one-hot keys
        # the gap between matching and non-matching scores is only 1, and
        # shrinking it lets the two 'the' rows of V outvote the matching key
        # (e.g. 'chat' would decode to 'the').
        attention = torch.softmax(q @ K.T, dim=-1)
        # Weighted mix of the value rows; decode picks the dominant token
        out = attention @ V
        tokens_out.append(decode_one_hot(one_hot_out, out))
    return ' '.join(tokens_out)  # Return the translated sentence

# Test the translate function
translate("le chat est sous la table")

'the cat is under the table'

**Test Translation**: The updated translate function is tested to ensure it correctly processes the sample sentence "le chat est sous la table", translating it to "the cat is under the table". This verifies that the attention mechanism implemented using softmax works as intended.

This step marks the progression from simple look-up-based translation to an attention-based approach, introducing students to a key component of modern neural translation models.


**The next part of the code demonstrates an improvement in the translation process by handling all queries in parallel:**


### Creating the 'Q' matrix
The matrix `Q` is constructed by stacking the one-hot encoded vectors of all tokens in the input sentence. This parallelizes the process of preparing the query vectors, which is more efficient than doing it sequentially.


In [25]:
# The sentence we want to translate
sentence = "le chat est sous la table"

# Stack all the one-hot encoded vectors for the tokens in the sentence to form the Q matrix
# (one row per token, in sentence order)
Q = torch.stack([one_hot_in[token] for token in sentence.split()])

# Display the Q matrix
print(Q)

tensor([[0., 0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1.]])


### Updated translate function
The `translate` function is revised to use matrix multiplication across the entire sentence. Instead of translating word by word, it now uses the "Q" matrix to perform the operation in parallel for all words.


In [26]:
def translate(sentence):
    """Translate a whole sentence at once using softmax attention in matrix form.

    Args:
        sentence: whitespace-separated string of words from the input vocabulary.

    Returns:
        The translated words joined by single spaces ('' for an empty sentence).

    Raises:
        KeyError: if a word is not in the input vocabulary.
    """
    tokens_in = sentence.split()
    if not tokens_in:
        # torch.stack rejects an empty list, so handle the empty sentence here
        return ''

    # One query row per token, so all lookups happen in a single matrix product
    Q = torch.stack([one_hot_in[token] for token in tokens_in])
    # Softmax over the last axis (the keys) gives each query its own attention weights
    out = torch.softmax(Q @ K.T, dim=-1) @ V
    # Decode each output row back to a token
    return ' '.join(decode_one_hot(one_hot_out, row) for row in out)

# Test the function to ensure it produces the correct translation
translate("le chat est sous la table")

'the cat is under the table'

- **Efficiency improvement**: By applying operations to the entire sentence at once, this approach simulates a key aspect of the actual attention mechanism used in neural networks, which is processing multiple components of input data in parallel for faster computation.

- **Test output**: The updated function correctly translates the French sentence "le chat est sous la table" to "the cat is under the table", confirming that the parallelization works effectively.

This optimization hints at the computational advantages of matrix operations in neural networks, particularly for tasks like translation which benefit from parallel processing.
