In [8]:
from collections import defaultdict
import nltk
import pandas as pd
import numpy as np

In [9]:
def update_vocabulary(corpus, vocabulary, prefix):
    for text in corpus:
        for i, (a, b) in enumerate(nltk.ngrams(text, n=2)):
            if i == 0:  # If it's the first char of the text
                diad = (a, f"{prefix}{b}")
            else:
                diad = (f"{prefix}{a}", f"{prefix}{b}")
            vocabulary[diad] += 1


In [11]:
corpus = ["play", "playing", "played", "plays", "playful"]
vocabulary = defaultdict(int)
prefix = "##"

update_vocabulary(
    corpus,
    vocabulary,
    prefix
)

pd.Series(vocabulary).sort_values(ascending=False)

p    ##l    5
##l  ##a    5
##a  ##y    5
##y  ##i    1
##i  ##n    1
##n  ##g    1
##y  ##e    1
##e  ##d    1
##y  ##s    1
     ##f    1
##f  ##u    1
##u  ##l    1
dtype: int64

In [25]:
pair_to_merge = (pd.Series(vocabulary).sort_values(ascending=False).head(1).keys())[0]
new_token = f"{pair_to_merge[0].replace(prefix, '')}{pair_to_merge[1].replace(prefix, '')}"

vocabulary[new_token] = vocabulary[pair_to_merge]


def wordpiece_split(word, vocabulary):
    tokens = []
    i = 0
    while i < len(word):
        matched = None
        for j in range(len(word), i, -1):
            subword = word[i:j]
            if subword in vocabulary:
                matched = subword
                break
        if matched is None:
            matched = word[i]
        tokens.append(matched)
        i += len(matched)
    return tokens


new_corpus = []
for text in corpus:
    tokens = wordpiece_split(text, vocabulary=vocabulary)
    new_corpus.append(tokens)

corpus = new_corpus

In [26]:
corpus

[['pl', 'a', 'y'],
 ['pl', 'a', 'y', 'i', 'n', 'g'],
 ['pl', 'a', 'y', 'e', 'd'],
 ['pl', 'a', 'y', 's'],
 ['pl', 'a', 'y', 'f', 'u', 'l']]

In [27]:
vocabulary

defaultdict(int,
            {('p', '##l'): 5,
             ('##l', '##a'): 5,
             ('##a', '##y'): 5,
             ('##y', '##i'): 1,
             ('##i', '##n'): 1,
             ('##n', '##g'): 1,
             ('##y', '##e'): 1,
             ('##e', '##d'): 1,
             ('##y', '##s'): 1,
             ('##y', '##f'): 1,
             ('##f', '##u'): 1,
             ('##u', '##l'): 1,
             'pl': 5})

In [28]:
update_vocabulary(corpus=corpus, vocabulary=vocabulary, prefix=prefix)
pd.Series(vocabulary).sort_values(ascending=False)


(##a, ##y)    10
(p, ##l)       5
(##l, ##a)     5
pl             5
(pl, ##a)      5
(##y, ##i)     2
(##n, ##g)     2
(##i, ##n)     2
(##y, ##e)     2
(##e, ##d)     2
(##y, ##f)     2
(##y, ##s)     2
(##u, ##l)     2
(##f, ##u)     2
dtype: int64

In [29]:
class WordPieceTokenizer:

    def __init__(
            self,
            max_tokens: int
    ):
        self.max_tokens = max_tokens

        self.prefix = "#"
        self.unknown = "[UNK]"
        self.vocabulary = defaultdict(int)

    def tokenize_text(
            self,
            text: str
    ) -> list[list[str]]:

        words = text.split()
        vocab = self.init_vocab(text)

        while len(vocab) < self.max_tokens:

            # 1. Tokenize all words using current vocabulary
            corpus = [self.tokenize_with_vocab(vocab, word) for word in words]

            # 2. Get most frequent pair based on current corpus
            mfp = self.get_most_frequent_pair(corpus)

            # 3. Make the most frequent pair into a single token
            first, second = mfp
            new_token = first + second.replace(self.prefix, "")

            # 4. Add the new token to the vocabulary
            vocabulary.add(new_token)

    def init_vocab(
            self,
            text: str
    ) -> set[str]:
        """ Initialize a vocabulary based on text """
        vocab = set()
        for word in text.split():
            for idx, char in enumerate(word):
                token = char if idx == 0 else self.prefix + char
                vocab.add(token)
        return vocab

    def get_most_frequent_pair(
            self,
            corpus: list[list[str]]
    ) -> tuple[int, int]:
        pairs_counter = defaultdict(int)
        for word in corpus:
            for a, b in nltk.ngrams(word, n=2):
                pairs_counter[(a, b)] += 1
        return max(pairs_counter, key=pairs_counter.get)





IndentationError: expected an indented block after 'while' statement on line 21 (1602550312.py, line 24)