### Jupyter Notebook Settings

In [3]:
# Widen the notebook container to the full browser width.
# `display` and `HTML` are imported from `IPython.display`; importing them from
# `IPython.core.display` is deprecated and emits a warning.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
# NOTE(review): this rebinds the name `display` from the function to the
# `IPython.display` module; kept for compatibility with the original cell.
import IPython.display as display

  from IPython.core.display import display, HTML


### Libraries

In [None]:
from nltk.corpus import wordnet
import nltk
import string
import heapq
from collections import Counter
from nltk.stem import WordNetLemmatizer
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from threading import Lock
from datasets import load_dataset
import re

In [None]:
# Download the NLTK data packages this notebook relies on, in a fixed order
for nltk_resource in ('punkt', 'wordnet', 'omw-1.4', 'punkt_tab', 'averaged_perceptron_tagger_eng'):
    nltk.download(nltk_resource)

# Ensure WordNet is fully loaded before any worker thread is started
wordnet.ensure_loaded()

### Data

In [None]:
# Load the training split of the BookCorpus dataset
dataset = load_dataset(path="bookcorpus/bookcorpus", split="train")

### Build the binary Huffman tree
##### Binary encoding is used to save memory on the microcontroller

In [None]:
# Initialize lemmatizer and counters
# These are module-level objects shared by every worker thread in process_document.
lemmatizer = WordNetLemmatizer()
word_counter = Counter()         # lemma -> occurrences
suffix_counter = Counter()       # suffix -> occurrences
punctuation_counter = Counter()  # punctuation token -> occurrences

# Define common suffixes and punctuation symbols
# Each non-punctuation token is checked against `suffixes` with str.endswith.
suffixes = ["ing", "ed", "es", "s", "ly", "er", "ion", "al", "able", "ness", "ful", "less", "ous", "ment", "ive", "ize", "en", "ity", "ant"]
punctuation_symbols = [" ", ".", ",", "!", "?", ";", ":", "-", "_", "(", ")", "[", "]", "{", "}", "\"", "'", "…", "@", "#", "$", "%", "^", "&", "*", "+", "=", "<", ">", "/", "\\", "|", "~", "`"]

# Lemmatization cache to speed up processing
# Maps token -> lemma; filled lazily by cached_lemmatize.
lemmatization_cache = {}

# Locks for thread safety
counter_lock = Lock()              # guards the three global counters
lemmatization_cache_lock = Lock()  # guards lemmatization_cache

def cached_lemmatize(word, lemmatizer):
    """Return the lemma of ``word``, memoised in ``lemmatization_cache``.

    The whole lookup-or-compute step runs while holding
    ``lemmatization_cache_lock``, so only one thread at a time reads or
    fills the cache.

    Args:
        word: Token to lemmatize.
        lemmatizer: Object exposing ``lemmatize(word)``.

    Returns:
        The cached or freshly computed lemma.
    """
    with lemmatization_cache_lock:
        if word not in lemmatization_cache:
            lemmatization_cache[word] = lemmatizer.lemmatize(word)
        return lemmatization_cache[word]

def process_document(document):
    """Tokenize one document and merge its counts into the global counters.

    The document is lower-cased and tokenized with ``nltk.word_tokenize``.
    Punctuation tokens are counted in ``punctuation_counter``. Every other
    token is lemmatized (counted in ``word_counter``) and checked for a known
    suffix (counted in ``suffix_counter``).

    Args:
        document: Raw text of a single document.

    Returns:
        None. The module-level counters are updated under ``counter_lock``.
    """
    local_word_counter = Counter()
    local_suffix_counter = Counter()
    local_punctuation_counter = Counter()

    # A set gives exact single-character membership. Testing against the
    # string itself is a substring test and would accept runs such as "<=".
    punctuation_chars = set(string.punctuation)

    # Try the longest suffixes first. The search stops at the first match, so
    # a short suffix such as "s" must not hide "ness", "less" or "ous".
    # sorted() is stable, so equal-length suffixes keep their listed order.
    suffixes_longest_first = sorted(suffixes, key=len, reverse=True)

    for token in nltk.word_tokenize(document.lower()):
        # Handle punctuation
        if token in punctuation_chars or token in punctuation_symbols:
            local_punctuation_counter[token] += 1
            continue

        # Lemmatize using the cache
        root_word = cached_lemmatize(token, lemmatizer)
        local_word_counter[root_word] += 1

        # Count at most one suffix per token
        for suffix in suffixes_longest_first:
            if token.endswith(suffix):
                local_suffix_counter[suffix] += 1
                break

    # Merge into the global counters once per document to keep lock traffic low
    with counter_lock:
        word_counter.update(local_word_counter)
        suffix_counter.update(local_suffix_counter)
        punctuation_counter.update(local_punctuation_counter)

# Process the dataset with a progress bar and multithreading
with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [executor.submit(process_document, example) for example in dataset['text']]
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing dataset", unit="document"):
        # result() re-raises any exception raised inside the worker. Without
        # this call a failed document is dropped silently and the counters
        # end up incomplete with no indication.
        future.result()

# Combine all counters into one for Huffman tree building
combined_counter = word_counter + suffix_counter + punctuation_counter

# Function to build Huffman tree and generate codes
def build_huffman_tree(frequency):
    """Build Huffman codes for a symbol -> weight mapping.

    Args:
        frequency: Mapping of symbol to weight (e.g. a ``Counter``). When
            weights tie, heap ordering falls back to comparing the
            ``[symbol, code]`` pairs, so symbols must be mutually comparable.

    Returns:
        A list of ``[symbol, code]`` pairs, where ``code`` is a string of
        ``'0'``/``'1'`` characters, sorted by code length and then by pair.
        An empty mapping yields ``[]``. A single-symbol mapping yields the
        one-bit code ``"0"``, because an empty code cannot be encoded.
    """
    heap = [[weight, [symbol, ""]] for symbol, weight in frequency.items()]

    # Nothing to encode: avoid popping from an empty heap.
    if not heap:
        return []

    # The merge loop never runs for one symbol, which would leave its code
    # empty. Give it an explicit one-bit code instead.
    if len(heap) == 1:
        heap[0][1][1] = "0"
        return heap[0][1:]

    heapq.heapify(heap)
    while len(heap) > 1:
        lo = heapq.heappop(heap)
        hi = heapq.heappop(heap)
        # Prefix every code in the lighter subtree with 0, the heavier with 1
        for pair in lo[1:]:
            pair[1] = '0' + pair[1]
        for pair in hi[1:]:
            pair[1] = '1' + pair[1]
        heapq.heappush(heap, [lo[0] + hi[0]] + lo[1:] + hi[1:])
    return sorted(heapq.heappop(heap)[1:], key=lambda p: (len(p[-1]), p))

# Generate Huffman codes for the combined frequencies
# The result is a list of [symbol, code] pairs sorted by code length, not a dict.
huffman_codes = build_huffman_tree(combined_counter)

### Check created codes

In [None]:
# Show the first 100 entries of the code table
for entry in huffman_codes[:100]:
    symbol, code = entry
    print(f"{symbol}: {code}")

### Escape symbols and save codes to txt file

In [None]:
# Function to escape special characters for Arduino output
def escape_for_arduino(char):
    """Return ``char`` escaped for use inside a C/Arduino string literal.

    Quotes, backslash, newline, carriage return and tab are replaced by their
    backslash escape sequences. Any other input is returned unchanged.
    """
    escape_table = {
        "'": "\\'",      # single quote
        "\"": "\\\"",    # double quote
        "\\": "\\\\",    # backslash
        "\n": "\\n",     # newline
        "\r": "\\r",     # carriage return
        "\t": "\\t",     # tab
    }
    return escape_table.get(char, char)

# build_huffman_tree returns a list of [symbol, code] pairs, which has no
# .items(); a table loaded back from file is a dict. Accept both shapes so
# this cell works on a fresh run and on a re-run.
code_entries = huffman_codes.items() if isinstance(huffman_codes, dict) else huffman_codes

# Open a text file for writing the output
with open("huffman_codes_output.txt", "w") as file:
    # Write the Huffman table as an Arduino array initializer
    file.write("const HuffmanCode huffmanTable[] = {\n")

    for symbol, code in code_entries:
        # Escape special characters
        escaped_symbol = ''.join(escape_for_arduino(char) for char in symbol)
        # Write the entry to the file
        file.write(f"  {{\"{escaped_symbol}\", \"{code}\"}},\n")

    file.write("};\n")

# Confirmation message
print("Huffman codes have been saved to 'huffman_codes_output.txt'.")

### Clean Huffman Codes
##### Drop tokens that contain non-alphanumeric characters

In [None]:
# Function to load Huffman codes from a text file and remove tokens containing symbols
def load_and_clean_huffman_codes(file_path):
    """Read a saved Huffman table and keep only purely alphanumeric tokens.

    Only lines containing both ``{`` and ``}`` are parsed, each expected in
    the form ``{"token", "code"},``. The token is unescaped (``\\'``, ``\\"``,
    ``\\\\``) and kept only when it consists solely of ASCII letters and digits.

    Args:
        file_path: Path of the text file holding the table.

    Returns:
        dict mapping each accepted token to its code string.
    """
    alphanumeric_only = re.compile(r'^[a-zA-Z0-9]+$')
    kept_codes = {}

    with open(file_path, 'r') as file:
        for line in file:
            # Skip the header, the footer and anything that is not an entry
            if "{" not in line or "}" not in line:
                continue

            # Drop the leading "{" and trailing ",", then split token from code
            fields = line.strip()[1:-1].split(', ')
            raw_token = fields[0][1:-1]
            raw_code = fields[1][1:-1]

            token = raw_token.replace('\\\'', '\'').replace('\\"', '"').replace('\\\\', '\\')
            if alphanumeric_only.match(token):
                # strip('"') removes the closing quote the slice leaves behind
                kept_codes[token] = raw_code.strip('"')

    return kept_codes

# Source table produced by the save step
huffman_codes_file = "huffman_codes_output.txt"

# Keep only the alphanumeric tokens
cleaned_huffman_codes = load_and_clean_huffman_codes(huffman_codes_file)

# Persist the cleaned table, one {"token", "code"}, entry per line
with open("cleaned_huffman_codes_output.txt", "w") as file:
    file.writelines(
        f'{{"{key}", "{value}"}},\n' for key, value in cleaned_huffman_codes.items()
    )

print("Cleaned Huffman codes have been saved to 'cleaned_huffman_codes_output.txt'.")

### Test the Huffman Encoding on random phrases

In [None]:
# Initialize the lemmatizer
# Module-level instance used by lemmatize_and_split below.
lemmatizer = WordNetLemmatizer()

# Function to load Huffman codes from a text file
def load_huffman_codes_from_file(file_path):
    """Load a saved Huffman table into a ``{symbol: code}`` dict.

    Only lines containing both ``{`` and ``}`` are parsed, each expected in
    the form ``{"symbol", "code"},``. The escape sequences ``\\'``, ``\\"``
    and ``\\\\`` are turned back into the characters they stand for.

    Args:
        file_path: Path of the text file holding the table.

    Returns:
        dict mapping each symbol to its code string.
    """
    def _unescape(text):
        # Same three replacements, in the same order, for symbol and code
        return text.replace('\\\'', '\'').replace('\\"', '"').replace('\\\\', '\\')

    table = {}
    with open(file_path, 'r') as file:
        for line in file:
            # Skip the header, the footer and anything that is not an entry
            if "{" not in line or "}" not in line:
                continue

            # Drop the leading "{" and trailing ",", then split symbol from code
            fields = line.strip()[1:-1].split(', ')
            symbol = _unescape(fields[0][1:-1])
            code = _unescape(fields[1][1:-1])

            # strip('"') removes the closing quote the slice leaves behind
            table[symbol] = code.strip('"')
    return table

# Function to preprocess and clean the phrase, keeping ASCII characters and symbols
def preprocess_phrase(phrase):
    """Normalise a phrase so it can be split into tokens on whitespace.

    Steps, in order: lower-case the text, surround selected punctuation
    characters with spaces, delete every non-ASCII character, collapse runs
    of whitespace into single spaces and trim both ends.

    Args:
        phrase: Text to normalise.

    Returns:
        The normalised text.
    """
    spaced = re.sub(r'([.,!?;:(){}[\]"\'#])', r' \1 ', phrase.lower())
    ascii_only = re.sub(r'[^\x00-\x7F]', '', spaced)
    return re.sub(r'\s+', ' ', ascii_only).strip()

# Function to lemmatize and split words into root and suffix
def lemmatize_and_split(word):
    """Split ``word`` into a lemmatized root and a trailing suffix.

    A word ending in ``'s`` is lemmatized without that ending and ``'s`` is
    returned as the suffix. Otherwise the suffix is whatever follows the
    first ``len(root)`` characters of ``word``. The root is not checked to be
    a prefix of ``word``. The suffix is ``''`` when the root is not shorter
    than the word.

    Returns:
        ``(root_word, suffix)`` tuple of strings.
    """
    possessive = "'s"
    if word.endswith(possessive):
        return lemmatizer.lemmatize(word[:-2]), possessive

    root_word = lemmatizer.lemmatize(word)
    remainder = word[len(root_word):] if len(root_word) < len(word) else ''
    return root_word, remainder

# Function to encode a phrase using Huffman codes and return an array of codes
def encode_phrase(phrase, huffman_codes):
    """Encode a whitespace-separated phrase as a list of Huffman bit strings.

    Each token is split with ``lemmatize_and_split``. The root's code is
    emitted first, followed by the suffix's code when the suffix is non-empty.

    Args:
        phrase: Preprocessed text; tokens are separated by whitespace.
        huffman_codes: Mapping of symbol to code string.

    Returns:
        List of code strings in emission order.

    Raises:
        ValueError: If a root word or a non-empty suffix has no code.
    """
    codes = []
    for token in phrase.split():
        root_word, suffix = lemmatize_and_split(token)

        if root_word not in huffman_codes:
            raise ValueError(f"Root word '{root_word}' not found in Huffman codes.")
        codes.append(huffman_codes[root_word])

        if not suffix:
            continue
        if suffix not in huffman_codes:
            raise ValueError(f"Suffix '{suffix}' not found in Huffman codes.")
        codes.append(huffman_codes[suffix])

    return codes

# Function to decode an array of encoded strings using Huffman codes
def decode_phrase(encoded, huffman_codes):
    """Turn a list of Huffman code strings back into text.

    Args:
        encoded: Iterable of code strings, one per symbol.
        huffman_codes: Mapping of symbol to code string.

    Returns:
        The decoded symbols joined by single spaces, with whitespace removed
        in front of selected punctuation characters, trimmed at both ends.

    Raises:
        ValueError: If a code string is not present in ``huffman_codes``.
    """
    symbol_for_code = dict(zip(huffman_codes.values(), huffman_codes.keys()))

    symbols = []
    for code in encoded:
        if code not in symbol_for_code:
            raise ValueError(f"Code '{code}' not found in Huffman codes.")
        symbols.append(symbol_for_code[code])

    # Re-attach punctuation to the preceding word
    joined = ' '.join(symbols)
    return re.sub(r'\s+([.,!?;:(){}[\]"\'#])', r'\1', joined).strip()

# Function to pack bit strings into a byte array
def pack_bits(bit_strings):
    """Pack a sequence of bit strings into a list of byte values.

    Bits are consumed in order, most significant bit first. A final partial
    byte is left-aligned, i.e. padded with zero bits on the right.

    Args:
        bit_strings: Iterable of strings made of ``'0'``/``'1'`` characters.

    Returns:
        List of integers, one per packed byte.
    """
    packed = []
    accumulator, filled = 0, 0

    for bit in (ch for bit_string in bit_strings for ch in bit_string):
        accumulator = (accumulator << 1) | int(bit)
        filled += 1
        if filled < 8:
            continue
        packed.append(accumulator)
        accumulator, filled = 0, 0

    if filled:
        # Left-align the leftover bits in the last byte
        packed.append(accumulator << (8 - filled))

    return packed

# Function to unpack a byte array back into a bit string
def unpack_bits(packed_bytes, total_bits):
    """Expand packed bytes into a string of ``'0'``/``'1'`` characters.

    Args:
        packed_bytes: Iterable of integers; the low 8 bits of each are used,
            most significant bit first.
        total_bits: Number of leading bits to return. This drops the zero
            padding that ``pack_bits`` adds to the last byte.

    Returns:
        The first ``total_bits`` bits as a string. Returns ``""`` when
        ``total_bits`` is zero or negative, and every available bit when
        ``total_bits`` exceeds the data.
    """
    # Counting down to exactly zero never stops when the request starts at
    # zero or below, so handle that case up front.
    if total_bits <= 0:
        return ""

    # Build the string in one join instead of repeated concatenation
    bits = ''.join(format(byte & 0xFF, '08b') for byte in packed_bytes)
    return bits[:total_bits]

# Choose which Huffman codes to use (from file or in-memory variable)
use_file = True  # Set to True to use Huffman codes from file

if use_file:
    huffman_codes_file = "huffman_codes_output.txt"
    huffman_codes = load_huffman_codes_from_file(huffman_codes_file)
else:
    # The in-memory table from build_huffman_tree is a list of [symbol, code]
    # pairs, but encode_phrase/decode_phrase need a mapping. dict() converts
    # the pairs and simply copies a table that is already a dict.
    huffman_codes = dict(huffman_codes)

# Example usage: sample text that is round-tripped through the encoder below
# NOTE(review): the last sentence ("are too wering them to outer space") looks
# truncated or garbled — confirm the intended text.
phrase_to_encode = """
        Most rockets can be launched from the ground because exhaust thrust 
        from the engine is bigger than the weight of the vehicle on Earth. 
        Some are used to bring satellites into orbit, for example from a spaceport.
        Some rockets such as ion thrusters are too wering them to outer space.
"""

# Preprocess the phrase (lower-case, space out punctuation, drop non-ASCII, collapse whitespace)
processed_phrase = preprocess_phrase(phrase_to_encode)
print(f"Original Phrase: {phrase_to_encode}")
print(f"Processed Phrase: {processed_phrase}")

# Encode the cleaned phrase into a list of bit strings
# encode_phrase raises ValueError when a root word or suffix has no code.
encoded_phrase = encode_phrase(processed_phrase, huffman_codes)
print(f"Encoded Phrase: {encoded_phrase}")

# Pack the encoded phrase into a byte array (last byte is zero-padded on the right)
packed_encoding = pack_bits(encoded_phrase)
print(f"Packed Encoding: {packed_encoding}")

# Unpack the byte array back into a bit string
# The exact bit count is passed so the padding added by pack_bits is dropped.
unpacked_encoding = unpack_bits(packed_encoding, sum(len(b) for b in encoded_phrase))
print(f"Unpacked Encoding: {unpacked_encoding}")

# Split the bit stream into code words using only the code table.
# Huffman codes are prefix-free, so reading bits until the buffer matches a
# known code recovers each code word without consulting the original phrase.
# Deriving the lengths from the plaintext tokens would only work when the
# text being decoded is already known.
code_to_symbol = {code: symbol for symbol, code in huffman_codes.items()}
decoded_segments = []
pending_bits = ""
for bit in unpacked_encoding:
    pending_bits += bit
    if pending_bits in code_to_symbol:
        decoded_segments.append(pending_bits)
        pending_bits = ""

# Leftover bits mean the stream is truncated or does not match this table
if pending_bits:
    raise ValueError(f"Trailing bits '{pending_bits}' do not form a complete Huffman code.")

# Decode the unpacked bit strings back into the original phrase
decoded_phrase = decode_phrase(decoded_segments, huffman_codes)
print(f"Decoded Phrase: {decoded_phrase}")