In [1]:
# Notebook-wide display setup.
from IPython.core.interactiveshell import InteractiveShell
# "all" asks IPython to display the value of every expression statement in a
# cell, not only the last one. This is why bare calls such as file.write(...)
# further down echo their return value in the cell output.
InteractiveShell.ast_node_interactivity = "all"
# Notebook line magic: set the autosave interval to 20 seconds.
%autosave 20

Autosaving every 20 seconds


### Odiya Tokenizer

In [2]:
import requests
from bs4 import BeautifulSoup

### Scraping data from Website

### Cleaning and Preprocessing data

In [3]:
import re

In [4]:
# Read the raw scraped text.
# The encoding is given explicitly: the corpus is Odia text, and relying on the
# platform default encoding (e.g. cp1252 on Windows) would fail or garble it.
with open('data.txt', 'r', encoding='utf-8') as file:
    text = file.read()

print (f"Raw text length :{len(text)}")
# Strip everything that is not useful for an Odia tokenizer: ASCII letters and
# digits, "://", pairs of dots, "?", "[", "]", "/", "-" and ":".
# NOTE(review): the alternatives `\.{2}\/\/-+` and `\.{3,}` can never win because
# the shorter `\.{2}` is tried first (so "..." is reduced to "."). The pattern is
# left untouched on purpose so the cleaned corpus stays byte-for-byte the same.
cleaned_text = re.sub(r'[A-Za-z0-9]|://|\.{2}|[?\[\]]|\.{2}\/\/-+|[\/-]|:|\.{3,}', '', text)

print (f"Cleaned text length :{len(cleaned_text)}")

# Write the cleaned content to a new file, again as explicit UTF-8 so the
# training cell reads back exactly what was written here.
with open('cleaned_data.txt', 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

print("File cleaned successfully.")

Raw text length :158303
Cleaned text length :148984


148984

File cleaned successfully.


### Building Tokenizer

In [5]:
from collections import Counter
from tqdm import tqdm
import numpy as np
import pickle

In [6]:
class Tokenizer_BPE:
    """Byte-level Byte Pair Encoding (BPE) tokenizer.

    The vocabulary starts with the 256 single-byte tokens; ``train`` then
    repeatedly merges the most frequent adjacent token pair of the UTF-8
    encoded corpus into a new token until ``vocab_size`` entries exist or no
    pair is left to merge.

    Attributes:
        max_vocab_size: Target vocabulary size (256 base bytes + merges).
        corpus: Training text.
        vocab: Mapping ``token id -> bytes`` for every known token.
        merges: Mapping ``(left id, right id) -> merged id``. Ids are handed
            out in increasing order, so a lower id means an earlier merge.
    """

    def __init__(self, text, vocab_size=5000):
        """Store the corpus and target size; no training happens here.

        Args:
            text: Training corpus as a ``str``.
            vocab_size: Desired final vocabulary size, including the 256
                single-byte tokens.
        """
        self.max_vocab_size = vocab_size
        self.corpus = text
        # Start from the plain byte vocabulary so encode()/decode() are usable
        # (as byte-level coding) even before train() or load() has run.
        self.vocab = {i: bytes([i]) for i in range(256)}
        self.merges = {}

    @staticmethod
    def _compression_ratio(original_len, current_len):
        """Return ``original_len / current_len``, or 1.0 when nothing is left to divide by."""
        # current_len is only 0 for an empty corpus; there is nothing to compress then.
        return original_len / current_len if current_len else 1.0

    def _get_token_stats(self, ids):
        """Compute frequency of adjacent token pairs.

        Args:
            ids: Sequence of token ids.

        Returns:
            ``Counter`` mapping each adjacent ``(left, right)`` pair to its count.
        """
        return Counter(zip(ids, ids[1:]))

    def _merge_tokens(self, ids, pair, new_idx):
        """Return a copy of ``ids`` with every occurrence of ``pair`` replaced by ``new_idx``.

        Matches are consumed left to right and do not overlap.
        """
        i = 0
        merged_ids = []
        while i < len(ids):
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
                merged_ids.append(new_idx)
                i += 2
            else:
                merged_ids.append(ids[i])
                i += 1
        return merged_ids

    def train(self):
        """Train the Byte Pair Encoding tokenizer.

        Resets ``vocab``/``merges`` and learns up to ``max_vocab_size - 256``
        merges from ``corpus``. Progress is reported through the tqdm bar and
        through "Crossed" checkpoint lines.

        Returns:
            Tuple ``(vocab, merges)`` (the same objects stored on the instance).
        """
        self.vocab = {i: bytes([i]) for i in range(256)}
        self.merges = {}
        num_merges = self.max_vocab_size - 256

        # Encode the corpus into byte tokens.
        ids = list(self.corpus.encode("utf-8"))
        # Only the original length is needed for the compression ratio, so do
        # not keep a second full copy of the corpus around.
        initial_len = len(ids)

        pbar = tqdm(range(num_merges), desc="Training BPE Tokenizer")
        for step in pbar:
            stats = self._get_token_stats(ids)
            if not stats:
                # Fewer than two tokens left: nothing more can be merged.
                break

            # Select the most frequent pair.
            most_frequent_pair = max(stats, key=stats.get)

            # Assign a new index to the pair and merge.
            new_idx = len(self.vocab)
            ids = self._merge_tokens(ids, most_frequent_pair, new_idx)

            # Update vocab and merges.
            self.merges[most_frequent_pair] = new_idx
            self.vocab[new_idx] = self.vocab[most_frequent_pair[0]] + self.vocab[most_frequent_pair[1]]

            ratio = self._compression_ratio(initial_len, len(ids))
            pbar.set_description(f"Iteration {step}, Compression Ratio {ratio:.2f}X")
            # Checkpoint whenever `step` is an exact multiple of num_merges / 1000.
            # Done in integer arithmetic: the float-based membership test
            # (`step in np.arange(0, n, n / 1000)`) depended on rounding and
            # rebuilt an array on every iteration.
            if (step * 1000) % num_merges == 0:
                print(f"Crossed {ratio:.2f} ")

        print("++++++++++++++++++++++++++++Final Result ++++++++++++++++++++++++++++")
        print(f"After training: tokens length: {len(ids)}")
        print(f"After training: merges length: {len(self.merges)}")
        print(f"After Training Vocab length {len(self.vocab)}")
        print(f"compression ratio: {self._compression_ratio(initial_len, len(ids)):.2f}X")

        return self.vocab, self.merges

    def encode(self, text):
        """Encode text into BPE tokens.

        Args:
            text: Input ``str``.

        Returns:
            List of token ids. Learned merges are applied in the order they
            were learned (lowest merged id first).
        """
        tokens = list(text.encode("utf-8"))
        # With at least two tokens there is always at least one adjacent pair.
        while len(tokens) >= 2:
            stats = self._get_token_stats(tokens)

            # Find the next pair to merge: the present pair with the lowest merge id.
            pair = min(stats, key=lambda p: self.merges.get(p, float('inf')))
            if pair not in self.merges:
                # None of the remaining pairs was ever merged during training.
                break

            tokens = self._merge_tokens(tokens, pair, self.merges[pair])

        return tokens

    def decode(self, ids):
        """Decode BPE tokens back to text.

        Args:
            ids: Iterable of token ids.

        Returns:
            Decoded ``str``; byte sequences that are not valid UTF-8 are
            replaced with U+FFFD.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        tokens = b"".join(self.vocab[idx] for idx in ids)
        return tokens.decode("utf-8", errors="replace")

    def save(self, filepath):
        """Save the tokenizer (vocab and merges) to ``filepath`` as a pickle."""
        with open(filepath, 'wb') as f:
            pickle.dump({"vocab": self.vocab, "merges": self.merges}, f)
        print(f"Tokenizer saved to {filepath}")

    @staticmethod
    def load(filepath):
        """Load the tokenizer from a file written by ``save``.

        Returns:
            A ``Tokenizer_BPE`` with an empty corpus and the stored vocab/merges.
        """
        # SECURITY: pickle.load can execute arbitrary code. Only load tokenizer
        # files that you created yourself or otherwise trust.
        with open(filepath, 'rb') as f:
            data = pickle.load(f)
        tokenizer = Tokenizer_BPE("")
        tokenizer.vocab = data["vocab"]
        tokenizer.merges = data["merges"]
        return tokenizer

In [7]:
# Example usage: train on the cleaned corpus, persist the tokenizer, reload it
# and verify an encode/decode round trip on the first 200 characters.
if __name__ == "__main__":
    # Explicit UTF-8: the corpus is Odia text and must not depend on the
    # platform's default encoding.
    with open('cleaned_data.txt', 'r', encoding='utf-8') as file:
        data = file.read()
    example_text = data[:200]

    max_vocab_size = 5000
    odiya_tokenizer = Tokenizer_BPE(data,max_vocab_size)
    vocab, merges = odiya_tokenizer.train()
    odiya_tokenizer.save("odiya_tokenizer.pkl")

    # Reload from disk so the check below also covers save()/load().
    loaded_tokenizer = Tokenizer_BPE.load("odiya_tokenizer.pkl")
    print(f"Odiys Text Example : {example_text}")
    encoded = loaded_tokenizer.encode(example_text)
    decoded = loaded_tokenizer.decode(encoded)
    print(f"Encoded Text: {encoded} \n")
    print(f"Decoded the Tokens: {decoded} \n")

    # Lossless round trip is the sanity check for the whole pipeline.
    if decoded == example_text:
        print("Tokenizer Working Fine !!  <3 \n")
    else:
        print("Tokenizer Not Working Well. Please Chcek the Steps \n")
        


Iteration 1, Compression Ratio 1.44X:   0%|                  | 2/4744 [00:00<03:59, 19.78it/s]

Crossed 1.32 


Iteration 609, Compression Ratio 5.56X:  13%|█▊            | 604/4744 [00:10<00:57, 71.67it/s]

Crossed 5.52 


Iteration 1202, Compression Ratio 6.77X:  25%|███         | 1198/4744 [00:18<00:43, 82.01it/s]

Crossed 6.74 


Iteration 1797, Compression Ratio 7.60X:  38%|████▌       | 1792/4744 [00:25<00:34, 85.40it/s]

Crossed 7.57 


Iteration 2391, Compression Ratio 8.27X:  50%|██████      | 2384/4744 [00:32<00:26, 89.85it/s]

Crossed 8.25 


Iteration 2986, Compression Ratio 8.89X:  63%|███████▌    | 2982/4744 [00:38<00:17, 98.54it/s]

Crossed 8.87 


Iteration 3581, Compression Ratio 9.43X:  75%|████████▎  | 3580/4744 [00:44<00:10, 107.24it/s]

Crossed 9.41 


Iteration 4175, Compression Ratio 9.97X:  88%|█████████▋ | 4168/4744 [00:50<00:05, 110.91it/s]

Crossed 9.96 


Iteration 4743, Compression Ratio 10.43X: 100%|███████████| 4744/4744 [00:54<00:00, 86.29it/s]


++++++++++++++++++++++++++++Final Result ++++++++++++++++++++++++++++
After training: tokens length: 37291
After training: merges length: 4744
After Training Vocab length 5000
compression ratio: 10.43X
Tokenizer saved to odiya_tokenizer.pkl
Odiys Text Example : ପ୍ରଥମ ବିଶ୍ୱ ଓଡିଆ ଭାଷା ସମ୍ମିଳନୀର ଐତିହାସିକ ଦିଗ  ଓଡିଆ ଭାଷାର ପ୍ରଚାର, ପ୍ରସାର ତଥା ଗବେଷଣାକୁ ପ୍ରାଧାନ୍ୟ ଦେବା ପ୍ରଥମ ବିଶ୍ୱ ଓଡିଆ ଭାଷା ସମ୍ମିଳନୀର ପ୍ରଧାନ ଲକ୍ଷ୍ୟ । ଏହି ସମ୍ମିଳନୀ ରାଜ୍ୟ, ଜାତୀୟ ଓ ଅନ୍ତର୍ଜାତୀୟ ସ୍ତରରେ ଆୟୋଜନ
Encoded Text: [4310, 3371, 310, 4312, 369, 830, 32, 1460, 281, 365, 2032, 1260, 3372, 2243, 151, 303, 1048, 304, 338, 467, 2919, 302, 1178, 1179, 759, 3371, 266, 1385, 4313, 1180, 362, 1339, 1633, 2506, 3373, 2920, 4314, 1300, 3375, 276] 

Decoded the Tokens: ପ୍ରଥମ ବିଶ୍ୱ ଓଡିଆ ଭାଷା ସମ୍ମିଳନୀର ଐତିହାସିକ ଦିଗ  ଓଡିଆ ଭାଷାର ପ୍ରଚାର, ପ୍ରସାର ତଥା ଗବେଷଣାକୁ ପ୍ରାଧାନ୍ୟ ଦେବା ପ୍ରଥମ ବିଶ୍ୱ ଓଡିଆ ଭାଷା ସମ୍ମିଳନୀର ପ୍ରଧାନ ଲକ୍ଷ୍ୟ । ଏହି ସମ୍ମିଳନୀ ରାଜ୍ୟ, ଜାତୀୟ ଓ ଅନ୍ତର୍ଜାତୀୟ ସ୍ତରରେ ଆୟୋଜନ 

Tokenizer Working Fine !!  <3 



### Making Gradio APP

##### Example App

In [8]:
import gradio as gr

# Callback for the demo interface
def reverse_text(input_text):
    """Return ``input_text`` read back to front."""
    backwards = input_text[::-1]
    return backwards

# Demo UI: one text input fed through reverse_text into one text output.
interface = gr.Interface(
    title="Text Reverser",
    description="Enter text to reverse it.",
    fn=reverse_text,
    inputs="text",
    outputs="text",
)

# Start the local Gradio server for the demo.
interface.launch()


  from .autonotebook import tqdm as notebook_tqdm


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




### Let's Build the Tokenizer app

In [9]:
import ast
import pickle

import gradio as gr

from Odiya_Tokenizer import Tokenizer_BPE

In [11]:
# Define the TextProcessor class
class TextProcessor:
    """String-in/string-out wrapper around a saved ``Tokenizer_BPE`` for the Gradio UI."""

    def __init__(self, tokenizer_path):
        """Load the tokenizer stored at ``tokenizer_path``."""
        self.loaded_tokenizer = Tokenizer_BPE.load(tokenizer_path)

    def encode(self, example_text):
        """Encode ``example_text`` and return the token list rendered as a string."""
        encoded_text = self.loaded_tokenizer.encode(example_text)
        return str(encoded_text)  # Convert to string for Gradio output

    def decode(self, encoded_text):
        """Decode a textual token list such as ``"[256, 474]"`` back to text.

        Accepts a bracketed list, a bare comma-separated sequence, or a single
        integer. Blank input decodes to an empty string.

        Raises:
            ValueError: If the input is not a list of integer token ids.
        """
        text = encoded_text.strip()
        if not text:
            return ""

        # SECURITY: this string comes straight from a web textbox. It used to be
        # passed to eval(), which lets any visitor run arbitrary Python.
        # ast.literal_eval only accepts literals (numbers, lists, tuples, ...).
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as exc:
            raise ValueError(
                "Tokens must be a list of integers, e.g. [256, 474, 4786]"
            ) from exc

        # A single id such as "256" parses to an int; treat it as a one-item list.
        if isinstance(parsed, int) and not isinstance(parsed, bool):
            parsed = [parsed]

        is_sequence = isinstance(parsed, (list, tuple))
        if not is_sequence or not all(
            isinstance(token, int) and not isinstance(token, bool) for token in parsed
        ):
            raise ValueError(
                "Tokens must be a list of integers, e.g. [256, 474, 4786]"
            )

        decoded_text = self.loaded_tokenizer.decode(list(parsed))
        return decoded_text

# Load the pickled tokenizer once; both Gradio button callbacks use this instance.
tokenizer = TextProcessor(tokenizer_path="odiya_tokenizer.pkl")

# Define the Gradio app layout
def beautify_app():
    """Build the encode/decode Gradio Blocks UI and return it without launching.

    The left column sends free text to ``tokenizer.encode``; the right column
    sends a token list to ``tokenizer.decode``. ``tokenizer`` is the
    module-level ``TextProcessor`` instance.
    """
    # Stylesheet keyed on the elem_id values given to the widgets below.
    page_css = """
        #encode-header, #decode-header {
            font-size: 22px;
            font-weight: bold;
            color: #2D87D6;
            text-align: center;
        }
        #input-textbox, #token-input {
            border-radius: 10px;
            border: 2px solid #2D87D6;
            background-color: #E9F2FB;
            padding: 12px;
            margin-bottom: 10px;
            font-size: 16px;
            width: 100%;
        }
        #encoded-output, #decoded-output {
            border-radius: 10px;
            border: 2px solid #2D87D6;
            background-color: #E9F2FB;
            padding: 12px;
            font-size: 16px;
            width: 100%;
        }
        #encode-btn, #decode-btn {
            background-color: #2D87D6;
            color: white;
            font-weight: bold;
            border-radius: 12px;
            border: none;
            padding: 12px;
            width: 100%;
            font-size: 16px;
            transition: background-color 0.3s ease;
        }
        #encode-btn:hover, #decode-btn:hover {
            background-color: #1C6BB2;
        }
        .gr-button {
            margin-top: 15px;
        }
    """

    # Page banner rendered at the top of the app.
    banner_html = """
        <h1 style="text-align: center; font-size: 2.5em;">ଏହା ଏକ ଓଡିଆ ଟୋକେନାଇଜର୍ ଆପ୍| {This is a Odiya tokenizer app} Copy text in Encoder to see the Tokens.</h1>
        <p>Odiya Tokenizer (BPE Encoding and Decoding)</p>
        """

    def editable_box(label, placeholder, elem_id):
        """Ten-line textbox the user types into."""
        return gr.Textbox(
            label=label,
            lines=10,
            placeholder=placeholder,
            elem_id=elem_id,
        )

    def readonly_box(label, placeholder, elem_id):
        """Ten-line textbox that only displays a result."""
        return gr.Textbox(
            label=label,
            lines=10,
            interactive=False,
            placeholder=placeholder,
            elem_id=elem_id,
        )

    with gr.Blocks(css=page_css) as app:
        gr.Markdown(banner_html, elem_id="title")

        with gr.Row():
            # Left pane: text -> tokens
            with gr.Column(scale=1, min_width=400):
                gr.Markdown("### **Encode Text**", elem_id="encode-header")
                text_in = editable_box(
                    "Enter Odiya Text",
                    "ଆମେ ସମସ୍ତେ ଭାରତୀୟ। କିନ୍ତୁ ଆମେ ପ୍ରଥମ ମଣିଷ |",
                    "input-textbox",
                )
                encode_btn = gr.Button("Encode", elem_id="encode-btn")
                tokens_out = readonly_box(
                    "Encoded Tokens",
                    "Encoded tokens will appear here.",
                    "encoded-output",
                )

            # Right pane: tokens -> text
            with gr.Column(scale=1, min_width=400):
                gr.Markdown("### **Decode Tokens**", elem_id="decode-header")
                tokens_in = editable_box(
                    "Enter Encoded Tokens (comma-separated)",
                    "Example: [256, 474, 4786, 1501, 763, 607, 3672, 474, 4707, 300, 1858, 1326]",
                    "token-input",
                )
                decode_btn = gr.Button("Decode", elem_id="decode-btn")
                text_out = readonly_box(
                    "Decoded Text",
                    "Decoded text will appear here.",
                    "decoded-output",
                )

        # Hook each button up to the matching TextProcessor method.
        encode_btn.click(fn=tokenizer.encode, inputs=text_in, outputs=tokens_out)
        decode_btn.click(fn=tokenizer.decode, inputs=tokens_in, outputs=text_out)

    return app

# Build the Blocks UI and start the local Gradio server for it.
# launch() is called without share=True, so the app is only served on a local URL.
app = beautify_app()
app.launch()

* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.


