In [13]:


import re


In [21]:
import re
import collections

class Tokenizer:
    """Byte-pair-encoding (BPE) style subword tokenizer.

    Words are represented as space-separated symbol sequences terminated by
    the end-of-word marker ``</w>``. Training repeatedly merges the most
    frequent adjacent symbol pair; tokenization replays those merges, in the
    order they were learned, on new text.
    """

    def __init__(self):
        # Maps a space-separated symbol sequence (e.g. 'l o w </w>') to its count.
        self.vocab = {}
        # Symbol pairs (2-tuples of str) in the order they were merged.
        self.merge_rules = []

    @staticmethod
    def _compile_pair(pair):
        """Return ``(pattern, merged)`` for merging ``pair`` in a symbol string.

        The pattern matches the two symbols only when they appear as whole,
        adjacent, whitespace-delimited symbols; ``merged`` is their
        concatenation.
        """
        bigram = re.escape(' '.join(pair))
        return re.compile(r'(?<!\S)' + bigram + r'(?!\S)'), ''.join(pair)

    def get_stats(self):
        """Count adjacent symbol pairs over ``self.vocab``, weighted by frequency.

        Returns:
            A ``defaultdict(int)`` mapping ``(left, right)`` symbol tuples to
            the summed frequency of the vocabulary entries containing them.
        """
        pairs = collections.defaultdict(int)
        for word, freq in self.vocab.items():
            symbols = word.split()
            for left, right in zip(symbols, symbols[1:]):
                pairs[left, right] += freq
        return pairs

    def merge_vocab(self, pair):
        """Return a copy of ``self.vocab`` with ``pair`` merged into one symbol.

        Args:
            pair: 2-tuple of symbols to merge wherever they are adjacent.

        Returns:
            A new dict with rewritten keys and the original frequencies;
            ``self.vocab`` itself is not modified.
        """
        pattern, merged = self._compile_pair(pair)
        # A callable replacement inserts `merged` literally. Passing the string
        # directly would make re.sub interpret backslashes in it as escapes.
        return {
            pattern.sub(lambda _match: merged, word): freq
            for word, freq in self.vocab.items()
        }

    def learn_vocabulary(self, corpus, num_merges):
        """Learn up to ``num_merges`` merge rules from ``corpus``.

        Args:
            corpus: iterable of strings; each distinct string is treated as one
                word and split into single characters plus ``</w>``.
            num_merges: maximum number of merges to perform. Training stops
                early once no adjacent symbol pair remains.

        Side effects:
            Replaces ``self.vocab`` and ``self.merge_rules``.
        """
        self.vocab = {
            ' '.join(word) + ' </w>': freq
            for word, freq in collections.Counter(corpus).items()
        }
        # Rules learned from a previous corpus do not apply to the new vocabulary.
        self.merge_rules = []
        for _ in range(num_merges):
            pairs = self.get_stats()
            if not pairs:
                # Every entry is already a single symbol; nothing left to merge.
                break
            best = max(pairs, key=pairs.get)
            self.vocab = self.merge_vocab(best)
            self.merge_rules.append(best)

    def tokenize(self, sample):
        """Split ``sample`` into subword tokens using the learned merge rules.

        The sample is split on whitespace; each word is broken into single
        characters followed by ``</w>``, then every merge rule is applied in
        the order it was learned. Merges never cross word boundaries.

        Args:
            sample: text to tokenize.

        Returns:
            A flat list of symbol strings. The end-of-word marker ``</w>`` is
            kept (stand-alone or merged into a token), as in the vocabulary.
        """
        words = [' '.join(word) + ' </w>' for word in sample.split()]
        for rule in self.merge_rules:
            spaced = ' '.join(rule)
            # Cheap substring pre-check avoids compiling a regex for rules that
            # cannot match any word of this sample.
            if not any(spaced in word for word in words):
                continue
            pattern, merged = self._compile_pair(rule)
            words = [pattern.sub(lambda _match: merged, word) for word in words]
        return [symbol for word in words for symbol in word.split()]


# Load the training corpus; every line of the file becomes one corpus entry.
with open('corpus.txt', 'r') as corpus_file:
    data = corpus_file.read().splitlines()

# Train the tokenizer, requesting 1000 merges.
tokenizer = Tokenizer()
tokenizer.learn_vocabulary(data, 1000)

# Write each learned vocabulary entry on its own line, with the ' </w>' text removed.
with open('tokens.txt', 'w') as tokens_file:
    tokens_file.writelines(
        "%s\n" % entry.replace(' </w>', '') for entry in tokenizer.vocab
    )

# Write each merge rule on its own line as "left, right".
with open('merge_rules.txt', 'w') as rules_file:
    rules_file.writelines(
        ", ".join(map(str, merge_rule)) + "\n"
        for merge_rule in tokenizer.merge_rules
    )




In [None]:
test_samples_file = "test_samples.txt"

# Read the samples to tokenize, one per line (trailing newlines are kept).
with open(test_samples_file, 'r') as samples_in:
    test_samples = samples_in.readlines()

# Write the space-joined tokens of each sample on its own line.
with open('tokenized_samples.txt', 'w') as samples_out:
    samples_out.writelines(
        "%s\n" % ' '.join(tokenizer.tokenize(line)) for line in test_samples
    )