In [12]:
from passlib.apps import custom_app_context as pwd_context

In [38]:
import heapq
import os

class BinaryTreeNode:
    """Binary tree node that is ordered and compared by its frequency.

    Attributes:
        value: Payload stored in the node (whatever the caller passes in).
        freq: Weight used for ordering and equality.
        left: Left child, ``None`` until assigned by the caller.
        right: Right child, ``None`` until assigned by the caller.
    """

    def __init__(self, value, freq):
        self.value = value
        self.freq = freq
        self.left = None
        self.right = None

    # Comparison overloads: these are what let heapq order nodes by frequency.
    def __lt__(self, other):
        """Return True when this node's frequency is lower than ``other``'s."""
        if not isinstance(other, BinaryTreeNode):
            # Let Python try the reflected operation / raise TypeError itself
            # instead of failing with AttributeError on ``other.freq``.
            return NotImplemented
        return self.freq < other.freq

    def __eq__(self, other):
        """Return True when both nodes carry the same frequency."""
        if not isinstance(other, BinaryTreeNode):
            # Makes ``node == None`` evaluate to False rather than crash.
            return NotImplemented
        return self.freq == other.freq

    # Defining __eq__ already makes instances unhashable; state it explicitly
    # because equality is by (mutable) frequency, not identity.
    __hash__ = None
        
class HuffmanCoding:
    """Huffman-compress the text file at ``path`` and decompress it again.

    The code table is kept only in memory on the instance (it is not written
    to the ``.bin`` file), so ``decompress`` can only decode data after
    ``compress`` has been run on the same object.
    """

    def __init__(self, path):
        self.path = path
        self.__heap = []
        self.__codes = {}
        self.__reverseCodes = {}

    # ---- private helpers -------------------------------------------------

    def __make_frequency_dict(self, text):
        """Return a dict mapping every character of ``text`` to its count."""
        freq_dict = {}
        for char in text:
            freq_dict[char] = freq_dict.get(char, 0) + 1
        return freq_dict

    def __buildHeap(self, freq_dict):
        """Push one leaf node per character onto the min-heap."""
        for key, freq in freq_dict.items():
            heapq.heappush(self.__heap, BinaryTreeNode(key, freq))

    def __buildTree(self):
        """Merge the two lowest-frequency nodes until one root remains."""
        while len(self.__heap) > 1:
            first = heapq.heappop(self.__heap)
            second = heapq.heappop(self.__heap)
            # Internal nodes carry no character, only the combined weight.
            parent = BinaryTreeNode(None, first.freq + second.freq)
            parent.left = first
            parent.right = second
            heapq.heappush(self.__heap, parent)

    def __buildCodesHelper(self, root, curr_bits):
        """Walk the tree, recording ``curr_bits`` as the code of each leaf."""
        if root is None:
            return
        if root.value is not None:
            # When the whole tree is a single leaf the path is empty; an empty
            # code would encode the text to nothing, so give it one bit.
            code = curr_bits or "0"
            self.__codes[root.value] = code
            self.__reverseCodes[code] = root.value
            return

        self.__buildCodesHelper(root.left, curr_bits + "0")
        self.__buildCodesHelper(root.right, curr_bits + "1")

    def __buildCodes(self):
        """Pop the tree root off the heap and fill both code tables."""
        if not self.__heap:
            # Empty input: there is no tree and nothing to encode.
            return
        root = heapq.heappop(self.__heap)
        self.__buildCodesHelper(root, '')

    def __getEncodedText(self, text):
        """Return ``text`` as one string of '0'/'1' characters.

        Raises:
            KeyError: if ``text`` contains a character without a code.
        """
        return "".join(self.__codes[char] for char in text)

    def __getPaddedEncodedText(self, encoded_text):
        """Pad to a whole number of bytes and prepend the pad length.

        Between 1 and 8 zero bits are appended (8 when the length is already
        a multiple of 8), and that count is stored as an 8-bit header.
        """
        padded_amount = 8 - (len(encoded_text) % 8)
        padded_info = "{0:08b}".format(padded_amount)
        return padded_info + encoded_text + "0" * padded_amount

    def __getBytesArray(self, padded_encoded_text):
        """Convert each group of 8 bit characters into an int (0-255)."""
        return [
            int(padded_encoded_text[i:i + 8], 2)
            for i in range(0, len(padded_encoded_text), 8)
        ]

    # ---- public API ------------------------------------------------------

    def compress(self):
        """Compress ``self.path`` into ``<name>.bin`` and return that path.

        Steps:
            1. Read the text from the input file.
            2. Count character frequencies.
            3. Build a min-heap of leaf nodes from the counts.
            4. Build the Huffman tree from the heap.
            5. Derive the bit code of every character from the tree.
            6. Encode the text, pad it to whole bytes and write it out.

        Returns:
            The path of the binary file that was written.
        """
        file_name, _ = os.path.splitext(self.path)
        output_path = file_name + ".bin"

        # The input is only read, so plain read mode is sufficient.
        with open(self.path, 'r') as file:
            text = file.read()
        # NOTE: trailing whitespace is stripped before encoding, so it is not
        # restored by decompress().
        text = text.rstrip()

        # Start from a clean state so a repeated call cannot mix in stale
        # nodes or codes from an earlier run.
        self.__heap = []
        self.__codes = {}
        self.__reverseCodes = {}

        freq_dict = self.__make_frequency_dict(text)
        self.__buildHeap(freq_dict)
        self.__buildTree()
        self.__buildCodes()

        encoded_text = self.__getEncodedText(text)
        padded_encoded_text = self.__getPaddedEncodedText(encoded_text)
        bytes_array = self.__getBytesArray(padded_encoded_text)

        final_bytes = bytes(bytes_array)
        print(final_bytes)
        # Open the output only once the payload exists, so a failure above
        # does not leave an empty or truncated .bin file behind.
        with open(output_path, 'wb') as output:
            output.write(final_bytes)

        print("Compressed Succesfully")
        return output_path

    def __removePadding(self, text):
        """Strip the 8-bit pad-length header and the trailing pad bits.

        Raises:
            ValueError: if ``text`` is too short to hold the header.
        """
        if len(text) < 8:
            raise ValueError("compressed data is missing the padding header")
        extra_padding = int(text[:8], 2)

        body = text[8:]
        # Slice by explicit end index: ``body[:-0]`` would wrongly be empty.
        return body[:max(len(body) - extra_padding, 0)]

    def __decodeText(self, text):
        """Translate a bit string back to characters using the code table."""
        decoded_chars = []
        current_bits = ""

        for bit in text:
            current_bits += bit
            # Codes are prefix-free, so the first match is the character.
            if current_bits in self.__reverseCodes:
                decoded_chars.append(self.__reverseCodes[current_bits])
                current_bits = ""

        return "".join(decoded_chars)

    def decompress(self, input_path):
        """Decode ``input_path`` into ``<name>_decompressed.txt``.

        The output name is derived from ``self.path``. Decoding relies on the
        code table built by a previous ``compress`` call on this instance.
        """
        filename, _ = os.path.splitext(self.path)
        output_path = filename + "_decompressed" + ".txt"

        with open(input_path, "rb") as file:
            data = file.read()

        # Every byte becomes its zero-padded 8-character binary form.
        bit_string = "".join(format(byte, "08b") for byte in data)

        actual_text = self.__removePadding(bit_string)
        decompressed_text = self.__decodeText(actual_text)

        with open(output_path, 'w') as output:
            output.write(decompressed_text)
        print("Decompressed Succesfully")
        print("Out path {}".format(output_path))

In [39]:
# Round-trip demo: compress test_file.txt to a .bin file, then decompress that
# file with the same instance (the code table lives on the instance).
path = "test_file.txt"
h = HuffmanCoding(path)
output_path = h.compress()  # compress() returns the path of the .bin file
h.decompress(output_path)

892
Compressed Succesfully
Decompressed Succesfully
Out path test_file_decompressed.txt


In [None]:
# HuffmanCoding.__init__ accepts only `path`; an extra keyword such as
# `i=True` raises TypeError, so the instance is built from the path alone.
path = "test_file.txt"
h = HuffmanCoding(path)
output_path = h.compress()  # compress() returns the path of the .bin file
h.decompress(output_path)

In [37]:
# Inspect the character -> bit-string code table via its name-mangled
# private attribute (populated by compress()).
h._HuffmanCoding__codes

{'m': '00000',
 'y': '000010',
 '5': '00001100',
 'O': '00001101',
 '$': '0000111',
 'b': '000100',
 '0': '000101',
 'd': '00011',
 'c': '00100',
 '.': '0010100',
 'W': '00101010',
 'D': '001010110',
 'N': '001010111',
 'u': '001011',
 'o': '0011',
 'a': '0100',
 'i': '0101',
 't': '0110',
 'p': '011100',
 'v': '011101',
 'w': '0111100',
 'Z': '01111010',
 'F': '01111011',
 '6': '0111110',
 'x': '01111110',
 'K': '01111111',
 'n': '1000',
 'I': '10010000',
 '7': '10010001',
 'G': '10010010',
 'V': '10010011',
 'X': '10010100',
 'U': '10010101',
 'k': '1001011',
 'l': '10011',
 ' ': '101',
 'h': '11000',
 '"': '1100100',
 ',': '1100101',
 'g': '110011',
 'e': '1101',
 'f': '1110000',
 "'": '111000100',
 'S': '111000101',
 '2': '11100011',
 '1': '11100100',
 'H': '111001010',
 '-': '1110010110',
 '4': '1110010111',
 '8': '111001100',
 'B': '111001101',
 'R': '11100111',
 'A': '1110100000',
 '=': '1110100001',
 '/': '1110100010',
 'C': '1110100011',
 'M': '11101001',
 'L': '111010100',
 '