# 3. Huffman Coding
A Huffman code is a type of optimal prefix code that is used for compressing data. The Huffman encoding and decoding schema is also lossless, meaning that when compressing the data to make it smaller, there is no loss of information.

The Huffman algorithm works by assigning each character a binary code whose length reflects that character's relative frequency: the more frequent the character, the shorter its code. A Huffman code can be of any length and does not require a delimiter between codes, because no code is a prefix of another code; therefore, the codes can be visualized as a binary tree with each encoded character stored at a leaf.

There are many types of pseudocode for this algorithm. At its core, it consists of building a Huffman tree, encoding the data, and, lastly, decoding the data.

Here is one type of pseudocode for this coding schema:

Take a string and determine the relative frequencies of the characters.
Build and sort a list of tuples from lowest to highest frequencies.
Build the Huffman Tree by assigning a binary code to each letter, using shorter codes for the more frequent letters. (This is the heart of the Huffman algorithm.)
Trim the Huffman Tree (remove the frequencies from the previously built tree).
Encode the text into its compressed form.
Decode the text from its compressed form.
You then will need to create encoding, decoding, and sizing schemas.

In [13]:
class Tree_node:
    """A node of the binary Huffman tree.

    Attributes:
        value: Weight stored on the node.
        left: Child reached by a ``0`` bit, or ``None``.
        right: Child reached by a ``1`` bit, or ``None``.
        name: Label stored on the node, or ``None`` when no label is given.
    """

    def __init__(self, value, left=None, right=None, name=None):
        # Store the constructor arguments unchanged, one attribute each.
        self.value, self.left, self.right, self.name = value, left, right, name

class Queue_node:
    """A singly linked cell that wraps one payload.

    Attributes:
        value: The wrapped payload.
        next: The following cell; always starts out as ``None``.
    """

    def __init__(self, value):
        # A fresh cell is never linked to anything yet.
        self.value, self.next = value, None

In [14]:
class Queue:
    """A linked-list priority queue kept in ascending order.

    Cells are ordered by ``cell.value.value``; ``pop`` removes the cell
    with the smallest key.
    """

    def __init__(self):
        self.head = self.tail = None

    def add(self, node):
        """Insert ``node`` so that the list stays sorted by ``node.value.value``.

        Ties with the current head go in front of the head; ties with the
        current tail go behind the tail; ties in the interior go after the
        equal cells.
        """
        if self.head is None:
            # First cell: it is both ends of the list.
            self.head = self.tail = node
            return

        key = node.value.value
        if self.head.value.value >= key:
            # New minimum (or tie with the head): becomes the new head.
            node.next = self.head
            self.head = node
            return
        if self.tail.value.value <= key:
            # New maximum (or tie with the tail): becomes the new tail.
            self.tail.next = node
            self.tail = node
            return

        # Strictly between head and tail: stop at the last cell whose
        # successor has a strictly larger key.
        before = self.head
        while before.next and not key < before.next.value.value:
            before = before.next
        if before.next:
            node.next = before.next
            before.next = node

    def pop(self):
        """Remove and return the head cell, or ``None`` if the queue is empty."""
        front = self.head
        if front is not None:
            # ``tail`` is not reset here; ``add`` reinitialises both ends
            # whenever ``head`` is ``None``.
            self.head = front.next
        return front

In [15]:
from collections import defaultdict

def count_freq(string):
    """Count how often each item occurs in ``string``.

    Returns a ``defaultdict(int)`` mapping each item to its number of
    occurrences; keys appear in order of first occurrence.
    """
    occurrences = defaultdict(int)
    for symbol in string:
        # Missing keys start at 0 thanks to the int default factory.
        occurrences[symbol] = occurrences[symbol] + 1
    return occurrences

In [16]:
def tree_generator(data):
    """Build the Huffman tree for ``data`` and return its root ``Tree_node``.

    Every distinct symbol becomes a leaf whose ``value`` is its frequency
    and whose ``name`` is the symbol. The two lowest-weight subtrees are
    merged repeatedly under an unnamed parent whose ``value`` is the sum
    of their weights, until a single tree remains.

    Args:
        data: The sequence of symbols to encode (typically a ``str``).

    Returns:
        The root ``Tree_node``, or ``None`` when ``data`` is empty
        (previously an empty input raised ``UnboundLocalError``).
    """
    if not data:
        # Nothing to encode, so there is no tree to build.
        return None

    frequencies = count_freq(data)
    queue = Queue()
    for symbol, count in frequencies.items():
        queue.add(Queue_node(Tree_node(value=count, name=symbol)))

    root = None
    while queue.head:
        left = queue.pop()
        right = queue.pop()
        if right is None:
            # Only one subtree was left in the queue: it is the finished tree.
            root = left.value
            break
        merged = Tree_node(
            left.value.value + right.value.value, left.value, right.value
        )
        queue.add(Queue_node(merged))

    if root is not None and len(frequencies) == 1:
        # With a single distinct symbol the root is itself a leaf, which would
        # yield an empty code. Hang a copy of the leaf on the left so that the
        # symbol is encoded as '0'.
        root.left = Tree_node(value=None, name=root.name)
    return root

In [17]:
def encoder(root):
    """Return a dict mapping each leaf's ``name`` to its bit path from ``root``.

    A step to a left child contributes ``'0'`` and a step to a right child
    contributes ``'1'``. A node counts as a leaf when both of its children
    are ``None``. Missing children (and a missing root) are skipped.
    """
    codes = {}
    pending = [(root, '')]
    while pending:
        node, prefix = pending.pop()
        if not node:
            continue
        if node.left is None and node.right is None:
            codes[node.name] = prefix
            continue
        # Push the right child first so that the left subtree is popped and
        # fully visited before the right one.
        pending.append((node.right, prefix + '1'))
        pending.append((node.left, prefix + '0'))
    return codes

In [18]:
def huffman_encoding(data):
    """Huffman-encode ``data``.

    Args:
        data: The sequence of symbols to encode (typically a ``str``).

    Returns:
        A tuple ``(encoded_data, root)`` where ``encoded_data`` is a string
        of ``'0'``/``'1'`` characters and ``root`` is the Huffman tree needed
        to decode it. Empty input yields ``('', None)`` instead of raising.
    """
    if not data:
        # There are no symbols, hence no tree and no bits.
        return '', None

    root = tree_generator(data)
    code_dict = encoder(root)
    # Join once instead of growing the result string symbol by symbol.
    encoded_data = ''.join(code_dict[symbol] for symbol in data)
    return encoded_data, root

In [19]:
import sys
def huffman_decoding(data,tree):
    """Decode a Huffman bit string using its tree.

    Args:
        data: String of bits; ``'0'`` follows the left child and any other
            character follows the right child.
        tree: Root node of the Huffman tree. Nodes expose ``left``,
            ``right`` and ``name``; a node whose ``name`` is not ``None``
            ends the current code.

    Returns:
        The decoded string. Trailing bits that do not complete a code are
        ignored.
    """
    decoded_symbols = []
    node = tree
    # Walk the bits in a single pass; re-slicing the input on every bit made
    # decoding quadratic in the length of ``data``.
    for bit in data:
        node = node.left if bit == '0' else node.right
        if node.name is not None:
            decoded_symbols.append(node.name)
            # A code is complete: restart from the root for the next one.
            node = tree
    return ''.join(decoded_symbols)

In [20]:
if __name__ == "__main__":
    codes = {}

    a_great_sentence = "The bird is the word"

    # sys.getsizeof reports the size of the Python object itself.
    original_size = sys.getsizeof(a_great_sentence)
    print("The size of the data is: {}\n".format(original_size))
    print("The content of the data is: {}\n".format(a_great_sentence))

    encoded_data, tree = huffman_encoding(a_great_sentence)

    # The bit string is parsed as a base-2 integer before measuring, so the
    # reported size is that of the packed integer, not of the '0'/'1' text.
    packed_bits = int(encoded_data, base=2)
    encoded_size = sys.getsizeof(packed_bits)
    print("The size of the encoded data is: {}\n".format(encoded_size))
    print("The content of the encoded data is: {}\n".format(encoded_data))

    decoded_data = huffman_decoding(encoded_data, tree)

    decoded_size = sys.getsizeof(decoded_data)
    print("The size of the decoded data is: {}\n".format(decoded_size))
    # NOTE(review): the label below says "encoded" but the value printed is
    # the decoded text.
    print("The content of the encoded data is: {}\n".format(decoded_data))

The size of the data is: 69

The content of the data is: The bird is the word

The size of the encoded data is: 36

The content of the encoded data is: 0001001010111000001110010111101111011111110100010101111100111000100101

The size of the decoded data is: 69

The content of the encoded data is: The bird is the word



In [21]:
print('Test2')

a_great_sentence = "ABC"

# The sizes come from sys.getsizeof and depend on the interpreter build; the
# values quoted in the comments are the ones from the recorded run.
original_size = sys.getsizeof(a_great_sentence)
print("The size of the data is: {}\n".format(original_size))  # recorded run: 52
print("The content of the data is: {}\n".format(a_great_sentence))  # ABC

encoded_data, tree = huffman_encoding(a_great_sentence)

# Measure the bit string as a packed base-2 integer.
packed_bits = int(encoded_data, base=2)
encoded_size = sys.getsizeof(packed_bits)
print("The size of the encoded data is: {}\n".format(encoded_size))  # recorded run: 28
print("The content of the encoded data is: {}\n".format(encoded_data))  # recorded run: 01110

decoded_data = huffman_decoding(encoded_data, tree)

decoded_size = sys.getsizeof(decoded_data)
print("The size of the decoded data is: {}\n".format(decoded_size))  # recorded run: 52
# NOTE(review): the label says "encoded" but the value printed is the decoded text.
print("The content of the encoded data is: {}\n".format(decoded_data))  # ABC
print('Test2 done.')
print('---------------------------------------------------------------------------')


Test2
The size of the data is: 52

The content of the data is: ABC

The size of the encoded data is: 28

The content of the encoded data is: 01110

The size of the decoded data is: 52

The content of the encoded data is: ABC

Test2 done.
---------------------------------------------------------------------------


In [24]:
# Test3: input with a single distinct symbol.
print('Test3')

a_great_sentence = "A"

# The sizes come from sys.getsizeof and depend on the interpreter build; the
# values quoted in the comments are the ones from the recorded run.
original_size = sys.getsizeof(a_great_sentence)
print("The size of the data is: {}\n".format(original_size))  # recorded run: 50
print("The content of the data is: {}\n".format(a_great_sentence))  # A

encoded_data, tree = huffman_encoding(a_great_sentence)

# Measure the bit string as a packed base-2 integer.
packed_bits = int(encoded_data, base=2)
encoded_size = sys.getsizeof(packed_bits)
print("The size of the encoded data is: {}\n".format(encoded_size))  # recorded run: 24
print("The content of the encoded data is: {}\n".format(encoded_data))  # recorded run: 0

decoded_data = huffman_decoding(encoded_data, tree)

decoded_size = sys.getsizeof(decoded_data)
print("The size of the decoded data is: {}\n".format(decoded_size))  # recorded run: 50
# NOTE(review): the label says "encoded" but the value printed is the decoded text.
print("The content of the encoded data is: {}\n".format(decoded_data))  # A
print('Test3 done.')
print('---------------------------------------------------------------------------')

Test3
The size of the data is: 50

The content of the data is: A

The size of the encoded data is: 24

The content of the encoded data is: 0

The size of the decoded data is: 50

The content of the encoded data is: A

Test3 done.
---------------------------------------------------------------------------
