In [1]:
import json
import sys
from bpe.encoder import Encoder
from nltk.tokenize import wordpunct_tokenize

In [2]:
# Relative path of the text file whose lines are used to fit the encoders below.
corpus_path = "test_text.txt"

In [3]:
# Read the corpus once, keeping one whitespace-stripped string per input line.
with open(corpus_path, encoding="utf8") as infile:
    lines = [raw_line.strip() for raw_line in infile]

In [4]:
# Marker tokens that are skipped when counting; they show up as word-boundary
# markers in the encoder's transform output and are not n-grams themselves.
_BOUNDARY_TOKENS = ('__eow', '__sow')


def _count_by_length(vocab, measure):
    """Build a histogram of `measure(token)` over the tokens in `vocab`.

    Args:
        vocab: Mapping keyed by token (or any iterable of tokens). Only the
            keys are inspected; the mapped values are ignored.
        measure: Callable returning the "size" of a single token.

    Returns:
        A plain dict mapping each observed size to the number of tokens with
        that size. Keys appear in order of first occurrence. Boundary marker
        tokens are never counted.
    """
    histogram = {}
    for token in vocab:
        if token in _BOUNDARY_TOKENS:
            continue
        size = measure(token)
        histogram[size] = histogram.get(size, 0) + 1
    return histogram


def count_word_ngrams(vocab):
    """Count vocabulary entries grouped by how many word tokens they contain.

    Each key of `vocab` is split with `wordpunct_tokenize`; the number of
    resulting pieces is the entry's size.

    Args:
        vocab: Mapping keyed by token string (e.g. an encoder's word vocabulary).

    Returns:
        dict mapping size (number of pieces) -> number of entries of that size.
    """
    # The lambda defers the lookup of `wordpunct_tokenize` until a token is measured.
    return _count_by_length(vocab, lambda token: len(wordpunct_tokenize(token)))


def count_char_ngrams(vocab):
    """Count vocabulary entries grouped by their length in characters.

    Args:
        vocab: Mapping keyed by token string (e.g. an encoder's BPE vocabulary).

    Returns:
        dict mapping length in characters -> number of entries of that length.
    """
    return _count_by_length(vocab, len)

## Text encoder (word ngrams and char-ngrams)

In [None]:
# Fit an encoder on the whole corpus (`lines`) with word n-grams capped at 3
# (word_ngram_max) and character n-grams capped at 5 (ngram_max), then encode
# one sample sentence and print the result.
# NOTE(review): `set_params`, `tokenize_symbols` and `word_ngram_max` do not look
# like part of the upstream `bpe` package API — presumably this notebook runs
# against a fork; confirm which version is installed.
# NOTE(review): pct_bpe presumably sets the share of vocab_size given to
# character n-grams rather than word entries — verify against the encoder source.
encoder = Encoder()
encoder.set_params(tokenize_symbols=True, vocab_size=1000, pct_bpe=0.9, word_ngram_max=3, ngram_max=5)
encoder.fit(lines)
text = "There is a leader and he is winner in sports. inner city is a level of the 'Earthworm Jim' game"
print(encoder.transform(text))

In [None]:
# Peek at the first 10 entries of the learned word vocabulary.
list(encoder.word_vocab.items())[:10]

In [None]:
# Peek at the first 10 entries of the learned character n-gram (BPE) vocabulary.
list(encoder.bpe_vocab.items())[:10]

In [None]:
# Histogram of word-vocabulary entries by number of word tokens.
word_size_counts = count_word_ngrams(encoder.word_vocab)
for n_words, n_entries in word_size_counts.items():
    print("Word ngram size {}: {} times".format(n_words, n_entries))

print()

# Histogram of BPE-vocabulary entries by length in characters.
char_size_counts = count_char_ngrams(encoder.bpe_vocab)
for n_chars, n_entries in char_size_counts.items():
    print("Char ngram size {}: {} times".format(n_chars, n_entries))

## Default encoder (words and char-ngrams)

In [None]:
# Show the first corpus line; the encoder in this section is fitted on it alone.
print(lines[0])

In [None]:
# Fit on the first corpus line only, with no word n-gram options set and
# character n-grams capped at 4 (ngram_max), then encode a sample sentence.
# NOTE(review): `fit` receives a single string here, not a list of lines. The
# recorded single-string fit in the "Sample tests" section produced word-level
# entries, so the encoder appears to accept this — confirm that it is intended.
encoder = Encoder()
encoder.set_params(tokenize_symbols=True, ngram_max=4, vocab_size=100, pct_bpe=0.9)
encoder.fit(lines[0])
text = "There is a leader and he is winner"
print(encoder.transform(text))

In [None]:
# Display the complete word vocabulary learned from the single line.
list(encoder.word_vocab.items())

In [None]:
# Peek at the first 10 entries of the character n-gram (BPE) vocabulary.
list(encoder.bpe_vocab.items())[0:10]

In [None]:
# Histogram of word-vocabulary entries by number of word tokens.
word_size_counts = count_word_ngrams(encoder.word_vocab)
for n_words, n_entries in word_size_counts.items():
    print("Word ngram size {}: {} times".format(n_words, n_entries))

print()

# Histogram of BPE-vocabulary entries by length in characters.
char_size_counts = count_char_ngrams(encoder.bpe_vocab)
for n_chars, n_entries in char_size_counts.items():
    print("Char ngram size {}: {} times".format(n_chars, n_entries))

## Encoder (words only)

In [None]:
# Fit on the first corpus line with symbol tokenization switched off and every
# other parameter left at the encoder's defaults, then encode a sample sentence.
# NOTE(review): presumably tokenize_symbols=False is what restricts the encoder
# to whole words, as the section title says — verify against the encoder source.
encoder = Encoder()
encoder.set_params(tokenize_symbols=False)
encoder.fit(lines[0])
text = "There is a leader and he is winner"
print(encoder.transform(text))

In [None]:
# Show word-vocabulary entries 2 through 11.
# NOTE(review): the slice skips the first two entries; the reason is not evident from this notebook.
list(encoder.word_vocab.items())[2:12]

In [None]:
# Peek at the first 5 entries of the character n-gram (BPE) vocabulary.
list(encoder.bpe_vocab.items())[0:5]

In [None]:
# Histogram of word-vocabulary entries by number of word tokens.
word_size_counts = count_word_ngrams(encoder.word_vocab)
for n_words, n_entries in word_size_counts.items():
    print("Word ngram size {}: {} times".format(n_words, n_entries))

print()

# Histogram of BPE-vocabulary entries by length in characters.
char_size_counts = count_char_ngrams(encoder.bpe_vocab)
for n_chars, n_entries in char_size_counts.items():
    print("Char ngram size {}: {} times".format(n_chars, n_entries))

## Encoder (word n-grams only)

In [None]:
# Fit on the whole corpus (`lines`) with symbol tokenization switched off and
# every other parameter left at the encoder's defaults, then encode a sample sentence.
# NOTE(review): the parameters are identical to the "words only" section; only the
# training data differs (all lines instead of the first one). No word_ngram_*
# option is set here, so confirm the defaults really produce word n-grams.
encoder = Encoder()
encoder.set_params(tokenize_symbols=False)
encoder.fit(lines)
text = "There is a leader and he is winner"
print(encoder.transform(text))

In [None]:
# Show word-vocabulary entries 2 through 11.
# NOTE(review): the slice skips the first two entries; the reason is not evident from this notebook.
list(encoder.word_vocab.items())[2:12]

In [None]:
# Show character n-gram (BPE) vocabulary entries 2 through 11.
list(encoder.bpe_vocab.items())[2:12]

In [None]:
# Histogram of word-vocabulary entries by number of word tokens.
word_size_counts = count_word_ngrams(encoder.word_vocab)
for n_words, n_entries in word_size_counts.items():
    print("Word ngram size {}: {} times".format(n_words, n_entries))

print()

# Histogram of BPE-vocabulary entries by length in characters.
char_size_counts = count_char_ngrams(encoder.bpe_vocab)
for n_chars, n_entries in char_size_counts.items():
    print("Char ngram size {}: {} times".format(n_chars, n_entries))

# Sample tests

## Multiple word n-grams, Multiple character n-grams

In [5]:
# Five short training sentences, one string per sentence, for the sample tests.
# NOTE(review): "litte" looks like a typo for "little". It is left untouched
# because the recorded outputs below were produced from this exact text.
train_text = [
    "Hello world from earth",
    "Hello George who has two ears",
    "Hello Georgia which has a big capital city",
    "A litte George said hello",
    "Linking ideas in domain is a good idea"
]

In [6]:
# The same five sentences as `train_text`, joined into one string with ". " separators.
train_text_single = "Hello world from earth. Hello George who has two ears. Hello Georgia which has a big capital city. A litte George said hello. Linking ideas in domain is a good idea"

In [7]:
# Sample test: fit on the list of sentences with word n-grams of at least two
# words (word_ngram_min=2) and character n-grams of length 2-4, then encode a
# sentence and report the n-gram size histograms and both vocabularies.
# In the recorded run this yields 5 word bigrams and 25 character n-grams.
encoder = Encoder()
encoder.set_params(tokenize_symbols=True, vocab_size=30, pct_bpe=0.8, ngram_min=2, ngram_max=4, word_ngram_min=2)
encoder.fit(train_text)
text = "Say Hello to guys from Georgia"
print(encoder.transform(text))

print()

# Word-vocabulary entries grouped by number of word tokens.
for k, v in count_word_ngrams(encoder.word_vocab).items():
    print("Word ngram size {}: {} times".format(k, v))
    
print()

# BPE-vocabulary entries grouped by length in characters.
for k, v in count_char_ngrams(encoder.bpe_vocab).items():
    print("Char ngram size {}: {} times".format(k, v))
    
print()

# Dump both vocabularies in full; they are small because vocab_size=30.
print("Word vocabulary - {}".format(encoder.word_vocab))
print("Ngrams vocabulary - {}".format(encoder.bpe_vocab))

['__sow', 'S', 'a', 'y', '__eow', '__sow', 'H', 'ello', '__eow', '__sow', 't', 'o', '__eow', '__sow', 'g', 'u', 'y', 's', '__eow', '__sow', 'f', 'r', 'o', 'm', '__eow', '__sow', 'Geor', 'g', 'i', 'a', '__eow']

Word ngram size 2: 5 times

Char ngram size 2: 14 times
Char ngram size 3: 7 times
Char ngram size 4: 4 times

Word vocabulary - {'Hello world': 0, 'world from': 1, 'from earth': 2, 'Hello George': 3, 'George who': 4}
Ngrams vocabulary - {'in': 5, 'ea': 6, 'id': 7, 'it': 8, 'as': 9, 'ai': 10, 'de': 11, 'ide': 12, 'dea': 13, 'idea': 14, 'Ge': 15, 'eo': 16, 'or': 17, 'rg': 18, 'Geo': 19, 'eor': 20, 'org': 21, 'Geor': 22, 'eorg': 23, 'el': 24, 'll': 25, 'lo': 26, 'ell': 27, 'llo': 28, 'ello': 29}


## Word n-grams, Multiple character n-grams

In [16]:
# Sample test: fit on the single joined string with no word_ngram_* options and
# character n-grams of length 2-5, then encode a sentence and report the n-gram
# size histograms and both vocabularies.
# In the recorded run the word vocabulary holds 2 single-token entries
# ('.' and 'Hello') and the remaining 28 ids are character n-grams.
encoder = Encoder()
encoder.set_params(tokenize_symbols=True, vocab_size=30, pct_bpe=0.9, ngram_min=2, ngram_max=5)
encoder.fit(train_text_single)
text = "Say Hello to guys from Georgia"
print(encoder.transform(text))

print()

# Word-vocabulary entries grouped by number of word tokens.
for k, v in count_word_ngrams(encoder.word_vocab).items():
    print("Word ngram size {}: {} times".format(k, v))
    
print()

# BPE-vocabulary entries grouped by length in characters.
for k, v in count_char_ngrams(encoder.bpe_vocab).items():
    print("Char ngram size {}: {} times".format(k, v))
    
print()

# Dump both vocabularies in full; they are small because vocab_size=30.
print("Word vocabulary - {}".format(encoder.word_vocab))
print("Ngrams vocabulary - {}".format(encoder.bpe_vocab))

['__sow', 'S', 'a', 'y', '__eow', 'Hello', '__sow', 't', 'o', '__eow', '__sow', 'g', 'u', 'y', 's', '__eow', '__sow', 'f', 'r', 'om', '__eow', '__sow', 'Georg', 'i', 'a', '__eow']

Word ngram size 1: 2 times

Char ngram size 2: 17 times
Char ngram size 3: 6 times
Char ngram size 4: 3 times
Char ngram size 5: 2 times

Word vocabulary - {'.': 0, 'Hello': 1}
Ngrams vocabulary - {'or': 2, 'ea': 3, 'in': 4, 'Ge': 5, 'eo': 6, 'rg': 7, 'Geo': 8, 'eor': 9, 'org': 10, 'Geor': 11, 'eorg': 12, 'Georg': 13, 'as': 14, 'it': 15, 'id': 16, 'wo': 17, 'om': 18, 'ar': 19, 'ear': 20, 'ge': 21, 'rge': 22, 'orge': 23, 'eorge': 24, 'wh': 25, 'ha': 26, 'has': 27, 'ai': 28, 'de': 29}


## No word n-grams, character n-grams

In [20]:
# Sample test: pct_bpe=1, so the recorded run ends up with an empty word
# vocabulary even though word_ngram_min/max are set; all 20 ids go to
# character n-grams.
# NOTE(review): ngram_min/ngram_max are not set here; the recorded output has
# only 2-character n-grams, so the defaults are presumably both 2 — confirm.
encoder = Encoder()
encoder.set_params(tokenize_symbols=True, vocab_size=20, pct_bpe=1, word_ngram_min=2, word_ngram_max=5)
encoder.fit(train_text)
text = "Say Hello to guys from Georgia"
print(encoder.transform(text))

print()

# Word-vocabulary entries grouped by number of word tokens (prints nothing when the vocabulary is empty).
for k, v in count_word_ngrams(encoder.word_vocab).items():
    print("Word ngram size {}: {} times".format(k, v))
    
print()

# BPE-vocabulary entries grouped by length in characters.
for k, v in count_char_ngrams(encoder.bpe_vocab).items():
    print("Char ngram size {}: {} times".format(k, v))
    
print()

# Dump both vocabularies in full; they are small because vocab_size=20.
print("Word vocabulary - {}".format(encoder.word_vocab))
print("Ngrams vocabulary - {}".format(encoder.bpe_vocab))

['__sow', 'S', 'a', 'y', '__eow', '__sow', 'H', 'el', 'lo', '__eow', '__sow', 't', 'o', '__eow', '__sow', 'g', 'u', 'y', 's', '__eow', '__sow', 'f', 'r', 'om', '__eow', '__sow', 'Ge', 'or', 'g', 'i', 'a', '__eow']


Char ngram size 2: 20 times

Word vocabulary - {}
Ngrams vocabulary - {'in': 0, 'ea': 1, 'id': 2, 'it': 3, 'or': 4, 'as': 5, 'ar': 6, 'ai': 7, 'de': 8, 'Ge': 9, 'eo': 10, 'rg': 11, 'el': 12, 'll': 13, 'lo': 14, 'wo': 15, 'om': 16, 'wh': 17, 'ha': 18, 'rs': 19}
