<a href="https://colab.research.google.com/github/Bszolk/BytePairEncoding-for-NLP/blob/main/BytePairEncoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
class BytePairEncoding():
    '''
    Minimal word-level Byte Pair Encoding (BPE) tokenizer.

    Text is split on single spaces into words, every word is prefixed with
    the start-of-word marker '<s>', and the most frequent adjacent token
    pairs are merged repeatedly. Merges never cross word boundaries.

        Attributes:
            vocab (list[str]): known tokens, index 0 is always '<s>'
            merge_rule (dict): (left, right) -> merged token, kept in the
                order the merges were learned
            stoi (dict): mapping from token (str) to int
            itos (dict): mapping from int to token (str)
    '''
    WORD_START = '<s>' # special token representing start of the word

    def __init__(self):
        self.vocab: list[str] = [self.WORD_START]
        self.merge_rule: dict = {}
        self.stoi: dict = {} # mapping from token (str) to int
        self.itos: dict = {} # mapping from int to token (str)

    @staticmethod
    def _merge_pair(word: list[str], pair: tuple[str, str]) -> list[str]:
        '''
        Returns a copy of word in which every non-overlapping occurrence of
        pair (scanned left to right) is replaced by the concatenated token.
        '''
        merged = []
        i = 0
        while i < len(word):
            if i + 1 < len(word) and (word[i], word[i + 1]) == pair:
                merged.append(word[i] + word[i + 1])
                i += 2 # skip both halves of the pair that was just merged
            else:
                merged.append(word[i])
                i += 1
        return merged

    def gen_vocab(self, corpus: list[str], vocab_size: int):
        '''
        Based on a frequency of characters in the corpus, generates a vocabulary
        of all available tokens and a set of rules used to encode plain text
        into these tokens.

        Stops early, without raising, when the corpus has no adjacent token
        pairs left to merge, so the final vocabulary may be smaller than
        vocab_size.

            Parameters:
                corpus (list[str]): list of text sequences
                vocab_size (int): method ends when given vocab_size is reached
        '''
        # Seed the vocabulary with every distinct character in order of first
        # appearance. The space character is included as well (it counts
        # towards vocab_size) even though words never contain it.
        known = set(self.vocab)
        for seq in corpus:
            for c in seq:
                if c not in known:
                    known.add(c)
                    self.vocab.append(c)

        # One token list per word, each starting with the start-of-word marker.
        words = [[self.WORD_START, *word] for word in " ".join(corpus).split(" ")]

        # find most frequent pair of tokens and merge them together
        while len(self.vocab) < vocab_size:
            pair_freq = {}
            for word in words:
                for pair in zip(word, word[1:]):
                    pair_freq[pair] = pair_freq.get(pair, 0) + 1

            # Every word is a single token already: nothing more can be learned.
            if not pair_freq:
                break

            # max() returns the first-seen pair among equally frequent ones.
            max_pair = max(pair_freq, key=pair_freq.get)
            merged = max_pair[0] + max_pair[1]
            # Two different pairs can concatenate to the same string; keep the
            # vocabulary free of duplicates so stoi and itos stay consistent.
            if merged not in known:
                known.add(merged)
                self.vocab.append(merged)
            self.merge_rule[max_pair] = merged

            # Merge ALL occurrences of the pair in every word, not just one.
            words = [self._merge_pair(word, max_pair) for word in words]

        # create a mapping between tokens and integers
        for i, token in enumerate(self.vocab):
            self.stoi[token] = i
            self.itos[i] = token

    def tokenize(self, text: str) -> list[str]:
        '''
        Encodes text into a list of tokens based on previously generated
        set of rules. Rules are applied in the order they were learned.
        Characters that were not seen by gen_vocab stay as single-character
        tokens (they have no entry in stoi).

            Parameters:
                text (str): plain text to encode

            Returns:
                encoded_text (list[str]): list of tokens from generated vocab
        '''
        words = [[self.WORD_START, *word] for word in text.split(" ")]

        for pair in self.merge_rule:
            words = [self._merge_pair(word, pair) for word in words]

        return [token for word in words for token in word]

    def detokenize(self, tokens: list[str]) -> str:
        '''
        Decodes list of tokens into a plain text, so that
        detokenize(tokenize(text)) == text.

            Parameters:
                tokens (list[str]): sequence of tokens

            Returns:
                text (str): decoded text
        '''
        text = "".join(tokens)
        # tokenize() also marks the very first word with '<s>'; drop that one
        # marker so the decoded text does not gain a leading space, then turn
        # the remaining markers back into the spaces they stand for.
        return text.removeprefix(self.WORD_START).replace(self.WORD_START, ' ')

In [None]:
# Toy training corpus: a single document made of two lines of text. The line
# break is written as an explicit "\n" escape because a double-quoted string
# literal cannot contain a raw newline; the two adjacent literals are joined
# by the parser into one string.
sample_text = [
    "In the realm of modern education, the impact of technology has been nothing short of revolutionary. Gone are the days of traditional blackboards and chalk; instead, interactive whiteboards and digital tablets have become commonplace in classrooms around the world. This technological revolution has not only transformed the way students learn but has also empowered educators with innovative tools to engage and inspire their students. With the advent of online learning platforms and virtual classrooms, geographical barriers have been shattered, allowing students to access quality education from anywhere with an internet connection. Furthermore, educational apps and software have revolutionized the learning experience, making it more interactive, personalized, and adaptive to individual student needs. From gamified learning modules to virtual reality simulations, technology has opened up new avenues for experiential and immersive learning, enabling students to grasp complex concepts with greater ease and retention. Moreover, the integration of artificial intelligence and machine learning algorithms has enabled educators to analyze vast amounts of student data, providing valuable insights into learning patterns and identifying areas where students may need additional support. Additionally, the rise of educational technology startups and edtech companies has spurred innovation and competition in the education sector, leading to the development of cutting-edge tools and resources for both students and teachers. However, despite the myriad benefits of technology in education, it also presents its own set of challenges and concerns. Issues such as the digital divide, unequal access to technology, and concerns about data privacy and security remain pressing issues that need to be addressed. Moreover, there is a growing concern about the potential negative impact of excessive screen time on students' mental health and well-being. \n"
    "Therefore, while technology undoubtedly holds great promise for the future of education, it is essential to strike a balance between harnessing its potential benefits and addressing its inherent challenges in order to ensure that all students have access to quality education in the digital age."
]

# Train the tokenizer on the corpus with a target vocabulary size of 200.
bpe = BytePairEncoding()
bpe.gen_vocab(sample_text, 200)

In [45]:
# Debug helpers: uncomment to inspect the learned vocabulary and merge rules.
# print(bpe.vocab)
# print(bpe.merge_rule)

# Encode a sample sentence with the trained `bpe` instance and show the result.
text = "Technology has revolutionized modern education by providing interactive tools and resources"
# tokenize() returns a flat list of token strings; the first token of every
# word begins with the start-of-word marker '<s>'.
tokenized = bpe.tokenize(text)
print(tokenized)

['<s>', 'T', 'ech', 'n', 'ol', 'og', 'y', '<s>has', '<s>revolution', 'i', 'z', 'ed', '<s>mod', 'ern', '<s>education', '<s>b', 'y', '<s>p', 'r', 'ov', 'id', 'ing', '<s>int', 'er', 'ac', 'tive', '<s>to', 'ol', 's', '<s>and', '<s>re', 's', 'ou', 'r', 'c', 'es']
