# Importing the needed libraries 

In [42]:
# Iteration tracking 
from tqdm import tqdm 

# Regular expressions
import re

# Type hinting
from typing import Tuple

# Defining the text to tokenize 

In [43]:
# Sample sentence that is preprocessed and used to build the vocabulary
text = "NLP in Python is fun and very well documented. Let's get started!"

# Tokenizing

In [44]:
def preprocess_text(x: str) -> str:
    """
    Function that prepares raw text for whitespace-based tokenization

    Each occurrence of one of the punctuation marks . , ! ? ; : is wrapped
    in a single space on both sides, so that a later str.split() yields
    the mark as a separate token.

    Args:
        x (str): raw text

    Returns:
        str: text with space-padded punctuation
    """
    # One capture group holding the punctuation marks to isolate
    punctuation = re.compile(r'([.,!?;:])')

    # Re-insert every matched mark with a space on each side
    return punctuation.sub(r' \1 ', x)

In [45]:
# Padding the punctuation of the sample text with spaces and showing the result
text_preprocessed = preprocess_text(text)
print(text_preprocessed)

NLP in Python is fun and very well documented .  Let's get started ! 


In [46]:
def create_word_index(x: str) -> Tuple[dict, dict]: 
    """
    Function that scans a given text and creates two dictionaries:
    - word2idx: dictionary mapping words to integers
    - idx2word: dictionary mapping integers to words

    Words are obtained by splitting the text on whitespace and are numbered
    0, 1, 2, ... in order of first appearance. The special '<UNK>' token is
    appended once at the end, so it always has the last index and the
    indices are contiguous (0 .. len(word2idx) - 1).

    Args:
        x (str): text to scan

    Returns:
        Tuple[dict, dict]: word2idx and idx2word dictionaries
    """
    # Token reserved for words that are not part of the vocabulary
    unk_token = '<UNK>'

    # Creating the word2idx dictionary 
    word2idx = {}
    for word in x.split(): 
        # A literal '<UNK>' in the text is skipped here: it is reserved and 
        # added below. Indexing it in this loop and then re-assigning it 
        # afterwards would leave a gap in the indices.
        if word == unk_token or word in word2idx: 
            continue

        # len(word2idx) equals the number of words indexed so far, 
        # which is exactly the next free index
        word2idx[word] = len(word2idx)

    # Adding the <UNK> token to the dictionary; This token will be used 
    # on new texts that were not seen during training.
    # It will have the last index. 
    word2idx[unk_token] = len(word2idx)

    # Reversing the above dictionary and creating the idx2word dictionary
    idx2word = {idx: word for word, idx in word2idx.items()}

    # Returns the dictionaries
    return word2idx, idx2word

In [47]:
# Creating the word2idx and idx2word dictionaries from the preprocessed text;
# word2idx maps each word to an integer index and idx2word is the reverse mapping
word2idx, idx2word = create_word_index(text_preprocessed)

In [48]:
# Displaying the word -> index dictionary
word2idx

{'NLP': 0,
 'in': 1,
 'Python': 2,
 'is': 3,
 'fun': 4,
 'and': 5,
 'very': 6,
 'well': 7,
 'documented': 8,
 '.': 9,
 "Let's": 10,
 'get': 11,
 'started': 12,
 '!': 13,
 '<UNK>': 14}

In [49]:
# Displaying the index -> word dictionary
idx2word

{0: 'NLP',
 1: 'in',
 2: 'Python',
 3: 'is',
 4: 'fun',
 5: 'and',
 6: 'very',
 7: 'well',
 8: 'documented',
 9: '.',
 10: "Let's",
 11: 'get',
 12: 'started',
 13: '!',
 14: '<UNK>'}

In [50]:
# Defining a function that converts a text into a list of integer tokens
def text2tokens(x: str, word2idx: dict) -> list: 
    """
    Function that converts a text into a list of integer tokens

    The text is split on whitespace and every word is replaced by its
    index from word2idx. Words missing from the dictionary are replaced
    by the index stored under the '<UNK>' key.

    Args:
        x (str): text to tokenize
        word2idx (dict): dictionary mapping words to integers; it is
            expected to contain the '<UNK>' key

    Returns:
        list: list of integer tokens, one per whitespace-separated word
    """
    # One lookup per word; the '<UNK>' index is the fallback for words
    # that are not present in the dictionary
    return [
        word2idx.get(piece, word2idx['<UNK>'])
        for piece in x.split()
    ]

# Defining a function that converts the tokens to text 
def tokens2text(x: list, idx2word: dict) -> str:
    """
    Function that converts a list of integer tokens back to text

    Every token is replaced by its word from idx2word; tokens missing
    from the dictionary are replaced by the '<UNK>' string. The words
    are joined with single spaces.

    Args:
        x (list): list of tokens
        idx2word (dict): dictionary mapping integers to words

    Returns:
        str: space-separated text
    """
    # Lazily map each token to its word, falling back to '<UNK>'
    decoded = (idx2word.get(token, '<UNK>') for token in x)

    # Gluing the words together with single spaces
    return ' '.join(decoded)

In [51]:
# Applying the text2tokens function to the preprocessed text
# (each whitespace-separated word becomes its integer index)
tokens_seq = text2tokens(text_preprocessed, word2idx)

# Transforming the tokens back to text; the words are re-joined with
# single spaces
text_seq = tokens2text(tokens_seq, idx2word)

In [52]:
# Displaying the list of integer tokens
tokens_seq

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]

In [53]:
# Displaying the text rebuilt from the tokens
text_seq

"NLP in Python is fun and very well documented . Let's get started !"

In [56]:
# Putting everything together with a new text
# NOTE: this cell rebinds text, text_preprocessed, tokens_seq and text_seq,
# which were already assigned by earlier cells; the word2idx / idx2word
# dictionaries built from the first text are reused unchanged
text = "As I said, Python is a very good tool for NLP"

# Preprocessing the text
text_preprocessed = preprocess_text(text)

# Applying the text2tokens function to the text
# (words that are not keys of word2idx get the <UNK> index)
tokens_seq = text2tokens(text_preprocessed, word2idx)

# Transforming the tokens back to text
text_seq = tokens2text(tokens_seq, idx2word)

# NOTE: the "Original text" label prints the preprocessed text
# (text_preprocessed), not the raw text
print(f"Original text:\n {text_preprocessed}")
print(f"Tokens:\n {tokens_seq}")
print(f"Text:\n {text_seq}")

Original text:
 As I said ,  Python is a very good tool for NLP
Tokens:
 [14, 14, 14, 14, 2, 3, 14, 6, 14, 14, 14, 0]
Text:
 <UNK> <UNK> <UNK> <UNK> Python is <UNK> very <UNK> <UNK> <UNK> NLP
