In [1]:
!pip install -qU langchain-community wikipedia

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m490.2/490.2 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m342.0/342.0 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
xmanager 0.7.1 requires sqlalchemy==1.2.19, but you have sqlalchemy 2.0.46 which is incompatible.

In [2]:
import re
from collections import Counter
from langchain_community.retrievers import WikipediaRetriever

retriever = WikipediaRetriever()

In [3]:
def ExtractContentFromWikipedia(query="Nigerian History"):
    txt = ""
    for docs in retriever.invoke("Nigerian History"):
        page_content = docs.page_content
        txt+= " " + page_content
    return txt.lower()

extracted_txt = ExtractContentFromWikipedia()

In [4]:
init_word_list = re.findall(pattern=r"\w+|[^\s\w]+", string=extracted_txt, flags=re.MULTILINE)

init_word_list[:5]

['the', 'history', 'of', 'nigeria', 'can']

In [5]:
word_count = Counter(init_word_list) # Returns a dictionary of word and its counts

word_count.most_common(10)

[(',', 142),
 ('the', 126),
 ('.', 114),
 ('of', 83),
 ('in', 68),
 ('and', 62),
 ('-', 47),
 ('nigeria', 32),
 ('to', 29),
 ('west', 29)]

In [6]:
eos_token = "</end>"

current_wrd_split = {}
for word in word_count:
    wrd_split = list(word)+[eos_token]
    current_wrd_split[word] = wrd_split

#### OUTPUT #### 
# {'beginning': ['b', 'e', 'g', 'i', 'n', 'n', 'i', 'n', 'g', '</w>']}

In [7]:
unique_token = set()
for word in current_wrd_split:
    wrd_lst = current_wrd_split[word]
    unique_token.update(wrd_lst)

#### OUTPUT ####
# Initial vocabulary created with 56 unique symbols.
# Initial vocabulary symbols: ['"','&',"'",'(',')','+', ',','-','.','/',
# '0','1','2','3','4','5','6','7','8','9',':',';','</end>','=','a','b',
# 'c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t',
# 'u','v','w','x','y','z','é','ü','̀','ẹ','–','—']

In [8]:
def TrainTokenizer(num_merges=100000):
    num_epochs = num_merges 
    learned_merges = {}

    unique_token_copy = unique_token.copy()
    current_wrd_split_copy = current_wrd_split.copy()

    pair_count = Counter()
    for epoch in range(num_epochs):
        # print(f"Epoch {epoch+1}/{num_epochs}")
        for token, freq in word_count.items():
            wrd_split = current_wrd_split_copy[token]
            for idx in range(1, len(wrd_split)):
                char_pair = (wrd_split[idx-1], wrd_split[idx])
                pair_count[char_pair] += freq
        if not pair_count:
            print("No more pairs found to merge. Stopping early.")
            break

        best_pair = max(pair_count, key=pair_count.get)
        # print(f"Found best pair: {best_pair} with frequency {pair_count[best_pair]}")

        learned_merges[best_pair] = epoch

        new_token = "".join(best_pair)

        new_corpus_split = {}

        for token in current_wrd_split:
            old_wrd_split = current_wrd_split_copy[token]
            new_word_split = []
            k = 0
            while k < len(old_wrd_split):
                if (k < len(old_wrd_split) - 1) and ((old_wrd_split[k], old_wrd_split[k+1]) == best_pair):
                    new_word_split.append(new_token)
                    k+=2
                else:
                    new_word_split.append(old_wrd_split[k])
                    k+=1
                    
            new_corpus_split[token] = new_word_split

        current_wrd_split_copy = new_corpus_split

        unique_token_copy.add(new_token)

    return unique_token_copy, learned_merges, current_wrd_split_copy

In [9]:
final_unique_token, final_learned_merges, final_wrd_split = TrainTokenizer()

In [10]:
# final_unique_token, final_learned_merges, final_wrd_split

In [11]:
def apply_bpe(word, learned_merges):
    symbols = list(word)

    # apply merges in learned order
    for (a, b), _ in sorted(learned_merges.items(), key=lambda x: x[1]):
        i = 0
        new_symbols = []

        while i < len(symbols):
            if i < len(symbols) - 1 and symbols[i] == a and symbols[i+1] == b:
                new_symbols.append(a + b)
                i += 2
            else:
                new_symbols.append(symbols[i])
                i += 1

        symbols = new_symbols

    return symbols

def encode_sentence(sentence, learned_merges):
    encoded = []
    for word in sentence.split():
        encoded.append(apply_bpe(word, learned_merges))
    return encoded

In [12]:
test_sentence = "lower newest wider"
encoded = encode_sentence(test_sentence, final_learned_merges)

print("\nEncoded sentence:")
print(encoded)


Encoded sentence:
[['low', 'er'], ['ne', 'west'], ['wi', 'der']]
