**Importing packages**

In [1]:
from typing import Dict, Tuple
import re

# Find all unique letters in the corpus

In [2]:
def Find_Vocabulary(Dictionary):
    """Collect the unique characters used by the words of a corpus.

    Args:
        Dictionary: Iterable of words (for example a dict keyed by word),
            each written as characters separated by spaces, such as
            ``'l o w _'``.

    Returns:
        A list of the distinct characters in first-appearance order, with
        the space separator left out.
    """
    # dict.fromkeys de-duplicates while keeping first-appearance order.
    # Skipping the separator here, instead of calling list.remove(' ')
    # afterwards, avoids a ValueError when no word contains a space
    # (empty corpus, or only one-character words).
    Unique_Characters = dict.fromkeys(
        character
        for word in Dictionary
        for character in word
        if character != ' '
    )
    return list(Unique_Characters)

# Count the frequency of every adjacent symbol pair in the corpus

In [3]:
def Find_Most_Frequent_Pair(My_Dictionary: Dict[str, int]) -> Dict[Tuple[str, str], int]:
    """Count how often each pair of adjacent symbols occurs in the corpus.

    Args:
        My_Dictionary: Maps a word, written as whitespace-separated symbols,
            to the number of times that word occurs.

    Returns:
        A dict mapping every adjacent ``(left, right)`` symbol pair to its
        total frequency: each occurrence of the pair adds the frequency of
        the word it occurs in. Choosing the most frequent pair is left to
        the caller.
    """
    pair_counts = {}
    for word, word_count in My_Dictionary.items():
        symbols = word.split()
        # Zipping the list with itself shifted by one yields neighbouring pairs.
        for neighbours in zip(symbols, symbols[1:]):
            pair_counts[neighbours] = pair_counts.get(neighbours, 0) + word_count
    return pair_counts

# Merge the most frequent pair of consecutive symbols into one symbol

In [4]:
def Combine_Vocabulary(Most_Frequent_Pair: Tuple[str, str], Input_Vocabulary: Dict[str, int]) -> Dict[str, int]:
    """Merge every occurrence of a symbol pair into a single symbol.

    Args:
        Most_Frequent_Pair: The ``(left, right)`` symbols to fuse.
        Input_Vocabulary: Maps a word, written as space-separated symbols,
            to its frequency.

    Returns:
        A new dict in which each word has every adjacent ``left right``
        occurrence replaced by ``leftright``. If several input words end up
        with the same spelling, their frequencies are added together.
    """
    Merged_Vocabulary = {}
    # The lookarounds force the match to start and end on symbol boundaries
    # (whitespace or the ends of the string). Without them the pair
    # ('e', 's') would also match inside 'he s t' and wrongly give 'hes t'.
    pattern = re.compile(
        r'(?<!\S)' + re.escape(' '.join(Most_Frequent_Pair)) + r'(?!\S)'
    )
    Substitution = ''.join(Most_Frequent_Pair)
    for Input_Word, Frequency in Input_Vocabulary.items():
        # A callable replacement is inserted verbatim; a plain string would be
        # parsed as a template and mangle symbols that contain a backslash.
        Output_Word = pattern.sub(lambda _match: Substitution, Input_Word)
        # Accumulate rather than assign so colliding spellings keep all counts.
        Merged_Vocabulary[Output_Word] = Merged_Vocabulary.get(Output_Word, 0) + Frequency
    return Merged_Vocabulary

# Main

In [6]:
# Toy corpus: each key is a word spelled as space-separated symbols and each
# value is that word's frequency ('_' presumably marks the end of a word).
Dictionary = {
    'l o w _': 5,
    'l o w e r _': 2,
    'w i d e s t _': 3,
    'n e w e s t _': 5,
}

Vocabulary = Find_Vocabulary(Dictionary)
Number_Of_Final_Vocab = 26  # Limitation for the number of final vocabulary
Number_Of_Iteration = 0

# Repeatedly fuse the most frequent adjacent pair until the vocabulary reaches
# the size limit or no pair is left to merge.
while len(Vocabulary) < Number_Of_Final_Vocab:
    print('Step:', Number_Of_Iteration)

    pair_stats = Find_Most_Frequent_Pair(Dictionary)
    if not pair_stats:
        # No word has two or more symbols any more, so nothing can be merged.
        break

    # max() returns the first pair with the highest count, so ties are
    # resolved by the order in which pairs were first seen.
    Most_Frequent_Pair = max(pair_stats, key=pair_stats.get)

    # The state is printed before it is updated, so each step shows the
    # vocabulary and corpus as they were when the pair was chosen.
    print('\nVocabulary: ', Vocabulary)
    Vocabulary.append(''.join(Most_Frequent_Pair))
    print('Dictionary: ', Dictionary)
    print('Combined pairs:', Most_Frequent_Pair, '\n--------------')

    Dictionary = Combine_Vocabulary(Most_Frequent_Pair, Dictionary)
    Number_Of_Iteration += 1

print('\nFinal Dictionary: ', Dictionary)
print('Final Vocabulary: ', Vocabulary)

Step: 0

Vocabulary:  ['l', 'o', 'w', '_', 'e', 'r', 'i', 'd', 's', 't', 'n']
Dictionary:  {'l o w _': 5, 'l o w e r _': 2, 'w i d e s t _': 3, 'n e w e s t _': 5}
Combined pairs: ('e', 's') 
--------------
Step: 1

Vocabulary:  ['l', 'o', 'w', '_', 'e', 'r', 'i', 'd', 's', 't', 'n', 'es']
Dictionary:  {'l o w _': 5, 'l o w e r _': 2, 'w i d es t _': 3, 'n e w es t _': 5}
Combined pairs: ('es', 't') 
--------------
Step: 2

Vocabulary:  ['l', 'o', 'w', '_', 'e', 'r', 'i', 'd', 's', 't', 'n', 'es', 'est']
Dictionary:  {'l o w _': 5, 'l o w e r _': 2, 'w i d est _': 3, 'n e w est _': 5}
Combined pairs: ('est', '_') 
--------------
Step: 3

Vocabulary:  ['l', 'o', 'w', '_', 'e', 'r', 'i', 'd', 's', 't', 'n', 'es', 'est', 'est_']
Dictionary:  {'l o w _': 5, 'l o w e r _': 2, 'w i d est_': 3, 'n e w est_': 5}
Combined pairs: ('l', 'o') 
--------------
Step: 4

Vocabulary:  ['l', 'o', 'w', '_', 'e', 'r', 'i', 'd', 's', 't', 'n', 'es', 'est', 'est_', 'lo']
Dictionary:  {'lo w _': 5, 'lo w e r

# Tokenize an out-of-vocabulary word

In [7]:
# Split an unseen word into two learned vocabulary tokens.
# The word is held in Input_Word rather than `input`, so this cell no longer
# shadows Python's built-in input() for the rest of the notebook session.
Input_Word = 'lowest'
Input_Word = Input_Word + '_'
Output_Pairs = []
Intended_Pairs = []

# Keep only the vocabulary tokens that occur somewhere inside the word.
for Vocab in Vocabulary:
    if Vocab in Input_Word:
        Output_Pairs.append(Vocab)

# Try every ordered pair of candidate tokens; a pair is accepted when its
# concatenation reproduces the word exactly. Only two-token splits are
# searched, so Intended_Pairs stays empty when the word needs more pieces.
# If several splits exist, the last one found wins.
for char1 in Output_Pairs:
    for char2 in Output_Pairs:
        if char1 + char2 == Input_Word:
            Intended_Pairs = [char1, char2]
        if char2 + char1 == Input_Word:
            Intended_Pairs = [char2, char1]

print('Input Out of Vocabulary is : ', Input_Word)
print('Intended Pairs are : ', Intended_Pairs)

Input Out of Vocabulary is :  lowest_
Intended Pairs are :  ['low', 'est_']
