In [13]:


import re


In [41]:
import re
import collections

# Tokenizer class 
class Tokenizer:
    """Byte-pair-encoding (BPE) tokenizer.

    Words are represented as strings of space-separated symbols terminated by
    the end-of-word marker ``</w>``. Learning repeatedly fuses the most
    frequent adjacent symbol pair and records each fusion as a merge rule.
    """

    def __init__(self):
        # Maps each word (space-separated symbols ending in '</w>') to its frequency
        self.vocab = {}
        # Ordered (left, right) symbol pairs, in the order they were merged
        self.merge_rules = []

    @staticmethod
    def _merge_symbols(symbols, pair):
        """Return a new list in which each adjacent occurrence of `pair` is fused.

        Matching is done on whole symbols, scanning left to right without
        overlap, so a rule can never glue together pieces of longer symbols.
        """
        left, right = pair
        merged = []
        i = 0
        last = len(symbols) - 1
        while i < len(symbols):
            if i < last and symbols[i] == left and symbols[i + 1] == right:
                merged.append(left + right)
                i += 2
            else:
                merged.append(symbols[i])
                i += 1
        return merged

    def get_stats(self):
        """Count adjacent symbol pairs across the vocabulary.

        Returns:
            A ``defaultdict(int)`` mapping ``(left, right)`` symbol tuples to
            the summed frequency of the words in which they occur adjacently.
        """
        pairs = collections.defaultdict(int)
        for word, freq in self.vocab.items():
            symbols = word.split()
            for left, right in zip(symbols, symbols[1:]):
                pairs[left, right] += freq
        return pairs

    def merge_vocab(self, pair):
        """Build a new vocabulary in which `pair` is fused into one symbol.

        Args:
            pair: ``(left, right)`` symbols to fuse wherever they are adjacent.

        Returns:
            A new dict with the same frequencies keyed by the merged words.
            ``self.vocab`` itself is not modified.
        """
        v_out = {}
        for word, freq in self.vocab.items():
            # Work on the symbol list rather than a regex substitution: the
            # merged text is then taken literally (no backslash-escape
            # processing) and irregular whitespace in a key cannot block a merge.
            w_out = ' '.join(self._merge_symbols(word.split(), pair))
            # Sum rather than overwrite if two keys collapse to the same word.
            v_out[w_out] = v_out.get(w_out, 0) + freq
        return v_out

    def learn_vocabulary(self, corpus, num_merges):
        """Learn up to `num_merges` BPE merge rules from `corpus`.

        Each corpus item is treated as one word: it is split into its
        non-whitespace characters and terminated with ``</w>``. Items with no
        non-whitespace characters are skipped. Any previously learned
        vocabulary and merge rules are discarded.

        Args:
            corpus: Iterable of hashable word strings; repeats raise the
                word's frequency.
            num_merges: Maximum number of merges to perform. Learning stops
                early once no adjacent symbol pair is left to merge.
        """
        vocab = {}
        for word, freq in collections.Counter(corpus).items():
            symbols = [ch for ch in word if not ch.isspace()]
            if not symbols:
                continue
            key = ' '.join(symbols) + ' </w>'
            vocab[key] = vocab.get(key, 0) + freq
        self.vocab = vocab
        # Rules from an earlier run do not describe this vocabulary.
        self.merge_rules = []

        for _ in range(num_merges):
            pairs = self.get_stats()
            if not pairs:
                # Every word is already a single symbol; max() would raise.
                break
            best = max(pairs, key=pairs.get)
            self.vocab = self.merge_vocab(best)
            self.merge_rules.append(best)

    def tokenize(self, sample):
        """Apply the learned merge rules, in order, to `sample`.

        Args:
            sample: String of whitespace-separated symbols. Text that is not
                already segmented into symbols is simply split on whitespace,
                because no rule will match inside a multi-character symbol.

        Returns:
            The list of symbols remaining after all merges.
        """
        symbols = sample.split()
        for rule in self.merge_rules:
            symbols = self._merge_symbols(symbols, rule)
        return symbols

# Load the training corpus; every line of the file becomes one corpus entry
with open('test_c.txt', 'r') as corpus_file:
    data = corpus_file.read().splitlines()

# Build a tokenizer and run 100 merge steps over the corpus
tokenizer = Tokenizer()
tokenizer.learn_vocabulary(data, 100)

# Dump the vocabulary: strip the ' </w>' marker and put each symbol on its own line
with open('tokens.txt', 'w') as tokens_file:
    tokens_file.writelines(
        entry.replace(' </w>', '').replace(' ', '\n') + "\n"
        for entry in tokenizer.vocab
    )

# Dump the merge rules, one comma-separated pair per line
with open('merge_rules.txt', 'w') as rules_file:
    for merged_pair in tokenizer.merge_rules:
        rule_line = ", ".join(str(part) for part in merged_pair)
        rules_file.write(rule_line + "\n")


In [30]:
test_samples_file = "test_ex.txt"

# Read every sample line into memory (trailing newlines are kept by readlines)
with open(test_samples_file, 'r') as samples_in:
    test_samples = samples_in.readlines()

# Tokenize each sample and write its tokens space-separated, one sample per line
with open('tokenized_samples.txt', 'w') as samples_out:
    for sample in test_samples:
        pieces = tokenizer.tokenize(sample)
        samples_out.write("%s\n" % ' '.join(pieces))