# In [13]:


import re


# In [11]:
import re, collections

class Tokenizer:
    """Learn byte-pair-encoding style merge rules from a corpus of words.

    Each word is stored in ``vocab`` as its symbols joined by single spaces
    and terminated by the end-of-word marker ``</w>``; the value is the
    word's frequency in the corpus.  ``merge_rules`` records the symbol pairs
    in the order they were merged.
    """

    def __init__(self):
        # Space-separated symbol sequence (ending in '</w>') -> frequency.
        self.vocab = {}
        # Merged symbol pairs, as (left, right) tuples, in merge order.
        self.merge_rules = []

    def get_stats(self):
        """Count adjacent symbol pairs across the vocabulary.

        Returns:
            A ``defaultdict(int)`` mapping ``(left, right)`` symbol tuples to
            the summed frequency of every word in which they are adjacent.
        """
        pairs = collections.defaultdict(int)
        for word, freq in self.vocab.items():
            symbols = word.split()
            for left, right in zip(symbols, symbols[1:]):
                pairs[left, right] += freq
        return pairs

    def merge_vocab(self, pair):
        """Return a copy of the vocabulary with ``pair`` fused into one symbol.

        Args:
            pair: ``(left, right)`` tuple of symbols to merge.

        Returns:
            A new dict with the same frequencies, where every occurrence of
            ``left right`` as whole symbols is replaced by ``leftright``.
            ``self.vocab`` itself is not modified.
        """
        bigram = re.escape(' '.join(pair))
        # The lookarounds ensure the match starts and ends on symbol
        # boundaries (whitespace or string edge), not inside a longer symbol.
        pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
        merged = ''.join(pair)
        # Use a callable replacement so backslashes in the symbols are
        # inserted literally instead of being parsed as template escapes.
        return {
            pattern.sub(lambda _match: merged, word): freq
            for word, freq in self.vocab.items()
        }

    def learn_vocablury(self, corpus, num_merges):
        """Build the vocabulary from ``corpus`` and learn up to ``num_merges`` merges.

        Args:
            corpus: Iterable of hashable string entries; each entry is split
                into its characters and treated as one word.
            num_merges: Maximum number of merge steps to perform.  Learning
                stops early once no adjacent symbol pair remains.

        Side effects:
            Replaces ``self.vocab`` and ``self.merge_rules`` with the results
            for this corpus.
        """
        self.vocab = {
            ' '.join(word) + ' </w>': freq
            for word, freq in collections.Counter(corpus).items()
        }
        # The vocabulary is rebuilt from scratch, so rules learned from a
        # previous corpus would no longer correspond to it.
        self.merge_rules = []
        for _ in range(num_merges):
            pairs = self.get_stats()
            if not pairs:
                # Every word is already a single symbol; max() on an empty
                # mapping would raise ValueError.
                break
            # On ties, max() keeps the first pair encountered.
            best = max(pairs, key=pairs.get)
            self.vocab = self.merge_vocab(best)
            self.merge_rules.append(best)

    def tokenize(self, sample):
        """Split ``sample`` on whitespace and return the pieces as a list.

        NOTE(review): this does not apply ``self.merge_rules``; the learned
        merges have no effect on the returned tokens.  Confirm whether
        sub-word tokenization was intended here.
        """
        return sample.split()

    def write_to_file(self, filename, data):
        """Write each item of ``data`` to ``filename``, one per line.

        Args:
            filename: Path of the file to (over)write.
            data: Iterable of items; each is converted with ``str()``.
        """
        with open(filename, 'w') as f:
            # f-string formatting (rather than '%s' % item) keeps tuple items
            # from being unpacked as multiple format arguments.
            f.writelines(f"{item}\n" for item in data)


# Read the training corpus; every line of the file becomes one corpus entry.
with open('corpus.txt', 'r') as f:
    data = f.read().splitlines()


# Learn up to 100 merges from the corpus.
tokenizer = Tokenizer()
tokenizer.learn_vocablury(data, 100)


# Dump the vocabulary entries, dropping the ' </w>' suffix where present.
tokenizer.write_to_file(
    'tokens.txt',
    [entry.replace(' </w>', '') for entry in tokenizer.vocab],
)


# Dump the merge rules, one comma-separated pair per line.
tokenizer.write_to_file(
    'merge_rules.txt',
    [', '.join(str(symbol) for symbol in rule) for rule in tokenizer.merge_rules],
)


# Tokenize a couple of sample sentences and store the result.
test_samples = ["this is an nlp course", "i love nlp"]
tokenized_samples = list(map(tokenizer.tokenize, test_samples))
tokenizer.write_to_file(
    'tokenized_samples.txt',
    [', '.join(pieces) for pieces in tokenized_samples],
)
