In [356]:
import heapq 
import sys

TASK FROM UDACITY
Take a string and determine the relevant frequencies of the characters.
Build and sort a list of tuples from lowest to highest frequencies.
Build the Huffman Tree by assigning a binary code to each letter, using shorter codes for the more frequent letters. (This is the heart of the Huffman algorithm.)
Trim the Huffman Tree (remove the frequencies from the previously built tree).
Encode the text into its compressed form.
Decode the text from its compressed form.
                     

In [357]:
#take a string and determine the relevant frequencies of the characters
# we will use dic, then we can access the value and the letter

def frequency_dict(nachricht):
    freqency = dict()
    for char in nachricht:
        if freqency.get(char):
            freqency[char] += 1
        else:
            freqency[char] = 1
    return freqency




In [358]:
#Build the Huffman Tree by assigning a binary code to each letter,
#using shorter codes for the more frequent letters.
#(This is the heart of the Huffman algorithm.)
#https://towardsdatascience.com/data-structure-heap-23d4c78a6962
#https://www.geeksforgeeks.org/heap-queue-or-heapq-in-python/
#we use heap, as this is a priority queue

def install_tree(freq):    
    heap = [] # issue the heap
# heappop(heap) :- This function is used to remove and return the smallest element from heap. 
#The order is adjusted, so as heap structure is maintained.
    for bs in freq: heapq.heappush(heap, [bs])
    while (len(heap) > 1):
         left = heapq.heappop(heap) 
         right = heapq.heappop(heap)
         freq0, label0 = left[0]    
         freq1, label1 = right[0]       
         node = [(freq0 + freq1, label0 + label1), left, right]
         heapq.heappush(heap, node) # using heappush() to push elements into heap 
    return heap.pop()



In [359]:
#we make a map of the heap, to add 0 and 1 to our tree. as more often a letter is used, 
#as shorter the code, as more above the letter is in our tree. 

#https://www.programiz.com/python-programming/methods/built-in/map
def issue_map(tree, map = dict(), code =''): # code later will be 0 or 1
    if (len(tree) == 1):
        label, freq = tree[0]
        map[label] = code
    else:
        value, left, right = tree
        issue_map(left, map, code + "0")
        issue_map(right, map, code + "1")
        
    return map


In [360]:
# write a function that will encode the given message 
def encode(nachricht):
    tree = install_tree(frequency_dict(nachricht))
    map = issue_map(tree)
    data = ''.join([ map[letter] for letter in nachricht])
    return data, tree

#test case 2
encode("ente")

('0001100', [('ent', 4), [('en', 3), [('e', 2)], [('n', 1)]], [('t', 1)]])

In [361]:
def decode(data, tree):
    Baum = tree
    decoded = []

    for bit in data:
        if (bit == '0'):
            Baum = Baum[1]
        else:
            Baum = Baum[2]

        if (len(Baum) == 1):
            label, freq = Baum[0]
            decoded.append(label)
            Baum = tree

    return ''.join(decoded)



In [362]:
# test cases provided by udacity

if __name__ == "__main__":
    codes = {}
  
    viertersatz = "e88"

    print ("The size of the data is: {}\n".format(sys.getsizeof(viertersatz)))
    print ("The content of the data is: {}\n".format(viertersatz))

    encoded_data, tree = encode(viertersatz)

    print ("The size of the encoded data is: {}\n".format(sys.getsizeof(int(encoded_data, base=2))))
    print ("The content of the encoded data is: {}\n".format(encoded_data))

    decoded_data = decode(encoded_data, tree)

    print ("The size of the decoded data is: {}\n".format(sys.getsizeof(decoded_data)))
    print ("The content of the decoded data is: {}\n".format(decoded_data))
    print ("____end of this test case_____")
    


The size of the data is: 52

The content of the data is: e88

The size of the encoded data is: 28

The content of the encoded data is: 100

The size of the decoded data is: 52

The content of the decoded data is: e88

____end of this test case_____


In [363]:
# test cases provided by udacity

if __name__ == "__main__":
    codes = {}
  
    viertersatz = "the bird is the word"

    print ("The size of the data is: {}\n".format(sys.getsizeof(viertersatz)))
    print ("The content of the data is: {}\n".format(viertersatz))

    encoded_data, tree = encode(viertersatz)

    print ("The size of the encoded data is: {}\n".format(sys.getsizeof(int(encoded_data, base=2))))
    print ("The content of the encoded data is: {}\n".format(encoded_data))

    decoded_data = decode(encoded_data, tree)

    print ("The size of the decoded data is: {}\n".format(sys.getsizeof(decoded_data)))
    print ("The content of the decoded data is: {}\n".format(decoded_data))
    print ("____end of this test case_____")

The size of the data is: 69

The content of the data is: the bird is the word

The size of the encoded data is: 44

The content of the encoded data is: 01000000100000001000000000000000000010000010001000000001000000000000000100100000000000100000010000000100000000001000010001000000001

The size of the decoded data is: 69

The content of the decoded data is: the bird is the word

____end of this test case_____


In [364]:
# test cases provided by udacity

if __name__ == "__main__":
    codes = {}
  
    viertersatz = "es ist nicht leicht, aber leicht hat es einen."

    print ("The size of the data is: {}\n".format(sys.getsizeof(viertersatz)))
    print ("The content of the data is: {}\n".format(viertersatz))

    encoded_data, tree = encode(viertersatz)

    print ("The size of the encoded data is: {}\n".format(sys.getsizeof(int(encoded_data, base=2))))
    print ("The content of the encoded data is: {}\n".format(encoded_data))

    decoded_data = decode(encoded_data, tree)

    print ("The size of the decoded data is: {}\n".format(sys.getsizeof(decoded_data)))
    print ("The content of the decoded data is: {}\n".format(decoded_data))
    print ("____end of this test case_____")

The size of the data is: 95

The content of the data is: es ist nicht leicht, aber leicht hat es einen.

The size of the encoded data is: 72

The content of the encoded data is: 00000001010000000000000000001011000000000000000010000010000000010000001100000000000000000100000001000001000000001000000110000000000001000000000000000000000001000000000100000001001000000000000000001000000010000010000000010000001100000000000000000001000000000011000000000000000000001010000000000000000000010000010001000000010001000000000001

The size of the decoded data is: 95

The content of the decoded data is: es ist nicht leicht, aber leicht hat es einen.

____end of this test case_____


In [365]:

if __name__ == "__main__":
    codes = {}
  
    viertersatz = "t."

    print ("The size of the data is: {}\n".format(sys.getsizeof(viertersatz)))
    print ("The content of the data is: {}\n".format(viertersatz))

    encoded_data, tree = encode(viertersatz)

    print ("The size of the encoded data is: {}\n".format(sys.getsizeof(int(encoded_data, base=2))))
    print ("The content of the encoded data is: {}\n".format(encoded_data))

    decoded_data = decode(encoded_data, tree)

    print ("The size of the decoded data is: {}\n".format(sys.getsizeof(decoded_data)))
    print ("The content of the decoded data is: {}\n".format(decoded_data))
    print ("____end of this test case_____")

The size of the data is: 51

The content of the data is: t.

The size of the encoded data is: 28

The content of the encoded data is: 10

The size of the decoded data is: 51

The content of the decoded data is: t.

____end of this test case_____


In [366]:
# test case 
if __name__ == "__main__":
    codes = {}
  
    viertersatz = "zzzzzzzzzu"

    print ("The size of the data is: {}\n".format(sys.getsizeof(viertersatz)))
    print ("The content of the data is: {}\n".format(viertersatz))

    encoded_data, tree = encode(viertersatz)

    print ("The size of the encoded data is: {}\n".format(sys.getsizeof(int(encoded_data, base=2))))
    print ("The content of the encoded data is: {}\n".format(encoded_data))

    decoded_data = decode(encoded_data, tree)

    print ("The size of the decoded data is: {}\n".format(sys.getsizeof(decoded_data)))
    print ("The content of the decoded data is: {}\n".format(decoded_data))
    print ("____end of this test case_____")

The size of the data is: 59

The content of the data is: zzzzzzzzzu

The size of the encoded data is: 28

The content of the encoded data is: 1111111110

The size of the decoded data is: 59

The content of the decoded data is: zzzzzzzzzu

____end of this test case_____
