In [913]:
from collections import defaultdict
from string import punctuation
import nltk

class BPETokenizer:
    def __init__(self):
        self.word_vocab = {}
        self.token_vocab = set()

    def get_vocab(self, text):
        return set(char for word in text.translate(str.maketrans('', '', punctuation)).split() for char in word)

    def count_words(self, text):
        freqs = defaultdict(int)
        clean_text = text.translate(str.maketrans('', '', punctuation))
        
        for word in clean_text.split():
            word += "_"
            freqs[word] += 1
        
        return freqs

    def find_pairs(self, word_dict):
        pairs = defaultdict(int)
        
        for word, count in word_dict.items():
            tokens = word.split()
            for i in range(len(tokens) - 1):
                pair = (tokens[i], tokens[i + 1])
                pairs[pair] += count
                
        return pairs

    def merge_pair(self, word_dict, pair):
        new_dict = defaultdict(int)
        old = " ".join(pair)
        new = "".join(pair)
        
        for word, count in word_dict.items():
            merged_word = word.replace(old, new)
            new_dict[merged_word] += count
            
        return new_dict

    def train(self, text, num_merges):
        self.token_vocab = self.get_vocab(text)
        
        word_freqs = self.count_words(text)
        self.word_vocab = {' '.join(word): freq for word, freq in word_freqs.items()}
        
        for i in range(num_merges):
            pair_freqs = self.find_pairs(self.word_vocab)
            
            if not pair_freqs:
                break
                
            best_pair = max(pair_freqs.items(), key=lambda x: x[1])[0]
            merged_token = ''.join(best_pair)
            
            self.token_vocab.add(merged_token)
            self.word_vocab = self.merge_pair(self.word_vocab, best_pair)
        
        return self.word_vocab, self.token_vocab

In [914]:
if __name__ == "__main__":
    nltk.download('gutenberg')
    from nltk.corpus import gutenberg
    book1 = gutenberg.raw("austen-emma.txt")
    book2 = gutenberg.raw("blake-poems.txt")
    book3 = gutenberg.raw("shakespeare-hamlet.txt")
    
    print(book1)
    
    # print(f"\nFinal word vocabulary: {final_word_vocab}")
    # print(f"Complete token vocabulary: {complete_vocab}")
    # print(f"Vocabulary size: {len(complete_vocab)}")

Available Books:


[nltk_data] Error loading gutenberg: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


OSError: No such file or directory: '/Users/dhruvgorasiya/nltk_data/corpora/gutenberg/hakespeare-hamlet.txt'