# Importing the needed libraries 

In [1]:
# Iteration tracking 
from tqdm import tqdm 

# Regular expressions
import re

# Typehinting 
from typing import Tuple

# Array math 
import numpy as np 

# Defining the text to tokenize 

In [2]:
# Sample text that the following cells preprocess and tokenize
text = "NLP in Python is fun and very well documented. Let's get started!"

# Tokenizing

In [3]:
def preprocess_text(x: str) -> str:
    """
    Prepares raw text for whitespace tokenization by isolating punctuation.

    Every occurrence of one of the characters . , ! ? ; : is wrapped in a
    single space on each side, so that a later ``str.split()`` yields the
    punctuation mark as its own token. Other characters (including
    apostrophes) are left untouched.

    Args:
        x (str): text to preprocess

    Returns:
        str: the text with spaces inserted around punctuation
    """
    # Capturing group so the matched punctuation mark can be re-emitted
    punctuation_pattern = r'([.,!?;:])'

    # The matched mark, padded with one space on each side
    spaced_replacement = r' \1 '

    return re.sub(punctuation_pattern, spaced_replacement, x)

In [4]:
# Applying the preprocessing step to the sample text and printing the result
text_preprocessed = preprocess_text(text)
print(text_preprocessed)

NLP in Python is fun and very well documented .  Let's get started ! 


In [5]:
def create_word_index(x: str) -> Tuple[dict, dict]: 
    """
    Function that scans a given text and creates two dictionaries:
    - word2idx: dictionary mapping words to integers
    - idx2word: dictionary mapping integers to words

    Words are indexed in order of first appearance, starting at 0. The
    special token '<UNK>' (used for words not seen when the index was
    built) is always assigned the last index. If the literal token
    '<UNK>' already occurs in the text it is not indexed a second time,
    so the indices always form the contiguous range 0..len(word2idx)-1.

    Args:
        x (str): text to scan; tokens are obtained by whitespace splitting

    Returns:
        Tuple[dict, dict]: word2idx and idx2word dictionaries
    """
    unk_token = '<UNK>'

    # Splitting the text into words
    words = x.split()

    # Creating the word2idx dictionary 
    word2idx = {}
    for word in words: 
        # The <UNK> token is reserved and appended once at the end;
        # indexing it here and re-assigning it later would leave a gap
        # in the index range.
        if word == unk_token:
            continue

        if word not in word2idx: 
            # len(word2idx) is the next free index: the indices handed
            # out so far are 0..len(word2idx)-1
            word2idx[word] = len(word2idx)

    # Adding the <UNK> token to the dictionary; This token will be used 
    # on new texts that were not seen during training.
    # It will have the last index. 
    word2idx[unk_token] = len(word2idx)

    # Reversing the above dictionary and creating the idx2word dictionary
    idx2word = {idx: word for word, idx in word2idx.items()}

    # Returns the dictionaries
    return word2idx, idx2word

In [6]:
# Creating the word2idx and idx2word dictionaries from the preprocessed text
word2idx, idx2word = create_word_index(text_preprocessed)

In [7]:
# Displaying the word -> index mapping
word2idx

{'NLP': 0,
 'in': 1,
 'Python': 2,
 'is': 3,
 'fun': 4,
 'and': 5,
 'very': 6,
 'well': 7,
 'documented': 8,
 '.': 9,
 "Let's": 10,
 'get': 11,
 'started': 12,
 '!': 13,
 '<UNK>': 14}

In [8]:
# Displaying the index -> word mapping
idx2word

{0: 'NLP',
 1: 'in',
 2: 'Python',
 3: 'is',
 4: 'fun',
 5: 'and',
 6: 'very',
 7: 'well',
 8: 'documented',
 9: '.',
 10: "Let's",
 11: 'get',
 12: 'started',
 13: '!',
 14: '<UNK>'}

In [18]:
# Defining a function that creates the embedding vector 
def create_embedding_vector(
        n_dim: int = 16, 
        mean: float = 0.0, 
        variance: float = 1.0,
        precision: int = None
        ) -> np.ndarray: 
    """
    Function that creates a random embedding vector

    The coordinates are drawn independently from a normal distribution
    using numpy's global random state (so ``np.random.seed`` controls
    reproducibility).

    Args:
        n_dim (int, optional): embedding dimension. Defaults to 16.
        mean (float, optional): mean of the normal distribution. Defaults to 0.0.
        variance (float, optional): variance of the normal distribution. Defaults to 1.0.
        precision (int, optional): number of decimals each coordinate is rounded to;
            None disables rounding, 0 rounds to whole numbers. Defaults to None.

    Returns:
        np.ndarray: embedding vector of shape (n_dim,)

    Raises:
        ValueError: if variance is negative
    """
    if variance < 0:
        raise ValueError("variance must be non-negative")

    # np.random.normal expects the standard deviation (scale), not the
    # variance, so the variance has to be converted first
    std = np.sqrt(variance)

    # Drawing the coordinates from the normal distribution
    X = np.random.normal(mean, std, (n_dim, ))

    # Rounding the coordinates to the given precision; the explicit
    # None check keeps precision=0 (round to integers) working
    if precision is not None: 
        X = np.round(X, precision)

    # Returns the embedding vector 
    return X

In [19]:
# Building the embedding dictionary: one freshly drawn 6-dimensional
# vector (rounded to 3 decimals) for every index 0..len(idx2word)-1
idx2embeddings = {
    idx: create_embedding_vector(n_dim=6, precision=3)
    for idx in range(len(idx2word))
}

# Creating the word2embeddings dictionary by looking up, for each word,
# the vector stored under that word's index
word2embeddings = {}
for word, idx in word2idx.items():
    word2embeddings[word] = idx2embeddings[idx]

In [20]:
# Displaying the index -> embedding vector mapping
idx2embeddings

{0: array([ 0.308, -1.003, -0.36 ,  0.57 , -1.106, -0.997]),
 1: array([-1.283,  0.709,  0.812,  0.201,  0.339,  1.264]),
 2: array([ 1.095, -0.666,  1.32 , -0.668, -0.705,  0.311]),
 3: array([ 0.417,  1.088,  1.242,  0.905, -0.061,  1.316]),
 4: array([ 1.432, -0.072,  0.622, -0.077,  0.597,  0.722]),
 5: array([-0.724, -0.496, -1.652,  1.118, -2.108, -0.996]),
 6: array([ 0.733,  0.021,  0.972,  0.363,  0.074, -0.661]),
 7: array([-0.284,  1.453,  2.522,  1.027, -1.484,  0.301]),
 8: array([ 0.921,  0.19 ,  0.068,  0.517, -0.767,  0.225]),
 9: array([-0.317,  0.691, -1.281,  0.624,  2.004,  1.377]),
 10: array([ 0.171,  0.607, -1.064, -0.064, -0.091,  0.748]),
 11: array([-0.151,  1.137,  0.783, -0.689, -1.473, -0.753]),
 12: array([1.699, 0.021, 1.422, 0.316, 0.317, 0.064]),
 13: array([-0.804,  0.156, -0.298,  0.15 , -0.686,  0.752]),
 14: array([ 0.538,  0.405,  0.501, -0.245, -1.946,  0.282])}

In [21]:
# Displaying the word -> embedding vector mapping
word2embeddings

{'NLP': array([ 0.308, -1.003, -0.36 ,  0.57 , -1.106, -0.997]),
 'in': array([-1.283,  0.709,  0.812,  0.201,  0.339,  1.264]),
 'Python': array([ 1.095, -0.666,  1.32 , -0.668, -0.705,  0.311]),
 'is': array([ 0.417,  1.088,  1.242,  0.905, -0.061,  1.316]),
 'fun': array([ 1.432, -0.072,  0.622, -0.077,  0.597,  0.722]),
 'and': array([-0.724, -0.496, -1.652,  1.118, -2.108, -0.996]),
 'very': array([ 0.733,  0.021,  0.972,  0.363,  0.074, -0.661]),
 'well': array([-0.284,  1.453,  2.522,  1.027, -1.484,  0.301]),
 'documented': array([ 0.921,  0.19 ,  0.068,  0.517, -0.767,  0.225]),
 '.': array([-0.317,  0.691, -1.281,  0.624,  2.004,  1.377]),
 "Let's": array([ 0.171,  0.607, -1.064, -0.064, -0.091,  0.748]),
 'get': array([-0.151,  1.137,  0.783, -0.689, -1.473, -0.753]),
 'started': array([1.699, 0.021, 1.422, 0.316, 0.317, 0.064]),
 '!': array([-0.804,  0.156, -0.298,  0.15 , -0.686,  0.752]),
 '<UNK>': array([ 0.538,  0.405,  0.501, -0.245, -1.946,  0.282])}

In [26]:
# Creating the embedding matrix; row i holds the embedding of index i.
# The rows are looked up by explicit index instead of relying on the
# dictionary's iteration order, so a missing index raises a KeyError
# rather than silently shifting the rows.
embedding_matrix = np.array(
    [idx2embeddings[idx] for idx in range(len(idx2embeddings))]
)

# Printing the matrix
print(embedding_matrix)

# Printing the shape
print(embedding_matrix.shape)

[[ 0.308 -1.003 -0.36   0.57  -1.106 -0.997]
 [-1.283  0.709  0.812  0.201  0.339  1.264]
 [ 1.095 -0.666  1.32  -0.668 -0.705  0.311]
 [ 0.417  1.088  1.242  0.905 -0.061  1.316]
 [ 1.432 -0.072  0.622 -0.077  0.597  0.722]
 [-0.724 -0.496 -1.652  1.118 -2.108 -0.996]
 [ 0.733  0.021  0.972  0.363  0.074 -0.661]
 [-0.284  1.453  2.522  1.027 -1.484  0.301]
 [ 0.921  0.19   0.068  0.517 -0.767  0.225]
 [-0.317  0.691 -1.281  0.624  2.004  1.377]
 [ 0.171  0.607 -1.064 -0.064 -0.091  0.748]
 [-0.151  1.137  0.783 -0.689 -1.473 -0.753]
 [ 1.699  0.021  1.422  0.316  0.317  0.064]
 [-0.804  0.156 -0.298  0.15  -0.686  0.752]
 [ 0.538  0.405  0.501 -0.245 -1.946  0.282]]
(15, 6)
