In [1]:
#Data compression
#By default every character takes 8 bits - 1 byte

#In this example the file consists of 37 distinct characters => n bits are enough,
#where 2^n >= 37 => n = 6;

#every character is encoded on 6 bits
#compression ratio 1 - 6/8 = 1/4

In [2]:
import bitarray as ba
import operator


In [3]:
def read_file(file, encoding=None):
    """Return the entire contents of a text file as a single string.

    Args:
        file: Path of the file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform's default encoding, exactly as before.

    Returns:
        The file contents as ``str``.
    """
    # The context manager closes the handle even when read() raises; the
    # earlier one-liner never closed the file explicitly.
    with open(file, encoding=encoding) as handle:
        return handle.read()

In [4]:
def save_file(file_name, code):
    """Write a code table to a text file as consecutive ``key;value;`` records.

    Args:
        file_name: Destination path; the file is created or truncated.
        code: Mapping whose keys are strings and whose values are written
            through ``str()``.

    Returns:
        The file object used for writing. It is already closed when it is
        returned; it is kept only so existing callers see the same result.

    Note:
        No escaping is applied, so a key that itself contains ``;`` makes
        the resulting file ambiguous for anything that splits on ``;``.
    """
    # ``with`` guarantees the handle is closed even if a write fails midway.
    with open(file_name, 'w') as result:
        result.writelines(
            text + ";" + str(num) + ";" for text, num in code.items()
        )
    return result

In [5]:
def save_file_bin(file_name, bit_array):
    """Dump ``bit_array`` into a binary file through its ``tofile`` method.

    Args:
        file_name: Destination path, opened in ``'wb'`` mode.
        bit_array: Object exposing ``tofile(file_object)``.

    Returns:
        Always ``1``.
    """
    with open(file_name, mode='wb') as sink:
        bit_array.tofile(sink)

    return 1

In [6]:
def read_file_bin(file_name):
    """Load a binary file into a newly created bitarray.

    Args:
        file_name: Path of the file, opened in ``'rb'`` mode.

    Returns:
        A ``bitarray`` filled by ``fromfile`` from the opened file.
    """
    with open(file_name, mode='rb') as source:
        bits = ba.bitarray()
        bits.fromfile(source)

    return bits

In [7]:
def read_bitarray(text):
    """Convert a string into a bitarray holding its encoded bytes.

    Args:
        text: String to convert; it is encoded with ``str.encode`` using the
            default encoding.

    Returns:
        A ``bitarray`` loaded from the encoded bytes via ``frombytes``.
    """
    # Encode once and reuse the result. The earlier version also printed the
    # complete byte string, which was leftover debugging output.
    raw = str.encode(text)

    array = ba.bitarray()
    array.frombytes(raw)
    return array

In [8]:
def count_chars(text):
    """Tally how often each character occurs in ``text``.

    Args:
        text: Iterable of characters (normally a string).

    Returns:
        A dict mapping each character to its number of occurrences, with
        keys in order of first appearance.
    """
    tally = {}
    for symbol in text:
        tally[symbol] = tally.get(symbol, 0) + 1
    return tally


In [9]:
# Builds the code from a table of character frequencies
def create(chars):
    """Assign consecutive integer codes to characters by falling frequency.

    Args:
        chars: Mapping of character -> occurrence count.

    Returns:
        A dict mapping each character to its rank: ``0`` for the most
        frequent character, ``1`` for the next one, and so on. Characters
        with equal counts keep their relative order from ``chars``.
    """
    by_frequency = sorted(chars.items(), key=lambda pair: pair[1], reverse=True)
    return {char: rank for rank, (char, _count) in enumerate(by_frequency)}

In [10]:
# Encodes the text
def encode(text, code, width=6):
    """Pack ``text`` into a bitarray using fixed-width binary codes.

    Args:
        text: Iterable of characters to encode.
        code: Mapping of character -> non-negative integer code number.
        width: Number of bits written per character. Defaults to 6, the
            width that was previously hard-coded.

    Returns:
        A ``bitarray`` containing ``width`` bits for every character of
        ``text``, most significant bit first.

    Raises:
        KeyError: If a character of ``text`` is missing from ``code``.
        ValueError: If a code number does not fit into ``width`` bits.
    """
    limit = 1 << width
    coded_text = ba.bitarray()

    for char in text:
        value = code[char]
        # Without this check an oversized number would emit more than
        # ``width`` bits and silently shift every following character.
        if not 0 <= value < limit:
            raise ValueError(
                "code %r for %r does not fit in %d bits" % (value, char, width)
            )
        # bitarray.extend accepts a string made of '0' and '1' characters.
        coded_text.extend(format(value, "b").zfill(width))

    return coded_text

In [11]:
import struct

In [12]:
# Decodes the text
def decode(coded_text, code, width=6):
    """Turn a sequence of fixed-width binary codes back into text.

    Args:
        coded_text: Bit container supporting ``len()``, slicing and
            ``to01()`` on the slices (a ``bitarray``).
        code: Mapping of character -> code number. The numbers may be ints
            or decimal strings; both are accepted.
        width: Number of bits per character. Defaults to 6.

    Returns:
        The decoded string.

    Raises:
        KeyError: If a bit group does not correspond to any code number.

    Note:
        Bits left over after the last complete group (for example padding
        up to a byte boundary) are ignored. Padding that happens to form a
        complete group cannot be told apart from data here.
    """
    # Normalise to int keys so that a table built in memory (int values) and
    # one parsed from a text file (string values) are looked up the same way.
    lookup = {int(number): char for char, number in code.items()}

    # Stop before any trailing, incomplete group of bits.
    usable = len(coded_text) - len(coded_text) % width

    text = []
    for start in range(0, usable, width):
        bits = coded_text[start:start + width]
        text.append(lookup[int(bits.to01(), 2)])

    return ''.join(text)

In [13]:
# Writes the code and the encoded text to files
def save(text, code, file_name, file_name_code):
    """Store the encoded text and its code table in two separate files.

    Args:
        text: Encoded bits, passed on to ``save_file_bin``.
        code: Code table, passed on to ``save_file``.
        file_name: Path of the binary file for the encoded bits.
        file_name_code: Path of the text file for the code table.

    Returns:
        Always ``0``.
    """
    save_file_bin(file_name=file_name, bit_array=text)
    save_file(file_name=file_name_code, code=code)
    return 0

In [14]:
# Loads the code and the text
def load(file_name, code_file_name):
    """Read back an encoded text and its code table.

    Args:
        file_name: Path of the binary file with the encoded bits.
        code_file_name: Path of the text file holding ``key;value;``
            records.

    Returns:
        A tuple ``(text, code)``: the result of ``read_file_bin`` and a dict
        mapping each key to its value. The values are kept as the strings
        read from the file; they are not converted to integers.
    """
    # Splitting on ';' yields key, value, key, value, ... plus a trailing
    # empty string that zip() drops because it has no partner.
    fields = read_file(code_file_name).split(";")
    code = dict(zip(fields[0::2], fields[1::2]))

    return read_file_bin(file_name), code

In [15]:
# Read the whole sample text that the following cells compress.
file = read_file("norm_wiki_sample.txt")

In [16]:
def compare_strings(original, new_string):
    """Report whether two strings are identical.

    On the first differing pair of characters the pair is printed. If one
    string is merely a prefix of the other, the two lengths are printed
    instead.

    Args:
        original: Reference string.
        new_string: String to compare against the reference.

    Returns:
        ``"Teksty sa równe"`` when the strings are identical, otherwise
        ``"Teksty są różne"``.
    """
    for org, new in zip(original, new_string):
        if org != new:
            print(org, "!=", new)
            return "Teksty są różne"

    # zip() stops at the shorter input, so a missing or extra tail would
    # otherwise go unnoticed and the strings would be reported as equal.
    if len(original) != len(new_string):
        print(len(original), "!=", len(new_string))
        return "Teksty są różne"

    return "Teksty sa równe"

In [19]:
# Count how many times each character occurs in the input text.
chars = count_chars(file)

In [20]:
# Give every character an integer code, most frequent character first.
code = create(chars)

In [21]:
# Pack the text into a bitarray using the fixed-width codes.
encoded = encode(file, code)

In [22]:
# Write the encoded bits to "zakodowany" and the code table to "kod".
save(encoded, code, "zakodowany", "kod")

0

In [23]:
# Read both files back. NOTE: this rebinds `code`; the reloaded table holds
# the code numbers as strings parsed from the text file, not as ints.
coded, code = load("zakodowany", "kod")

In [24]:
# Turn the reloaded bits back into text with the reloaded code table.
decoded = decode(coded, code)

In [25]:
# Round-trip check: compare the decoded text with the original input.
compare_strings(file, decoded)

'Teksty sa równe'