In [4]:
def LZ77_encode(text, window_size=5):
    """Compress ``text`` into a list of LZ77 ``(offset, length, next_char)`` tokens.

    For every position the longest match that *starts* inside the previous
    ``window_size`` characters is located (the match may run past the current
    position, i.e. overlapping copies are allowed).  ``offset`` is the distance
    back from the current position to the start of that match, ``length`` is
    the number of copied characters and ``next_char`` is the literal that
    follows the copy.  A position with no match yields ``(0, 0, char)``.

    The stream always ends with a token whose ``next_char`` is the ``'$'``
    terminator: if the final match consumed the rest of the text its empty
    ``next_char`` becomes ``'$'``; otherwise a separate ``(0, 0, '$')`` token
    is appended so that the real last character is never overwritten.

    Args:
        text: String to compress.  An empty string yields ``[(0, 0, '$')]``.
        window_size: Maximum look-back distance, in characters.

    Returns:
        List of ``(offset, length, next_char)`` tuples; every ``next_char`` is
        a one-character string.
    """
    compressed = []
    text_len = len(text)
    pointer = 0

    while pointer < text_len:
        window_start = max(0, pointer - window_size)
        search_from = window_start
        match_start = -1
        length = 0

        # Grow the candidate one character at a time until it can no longer be
        # found or the text is exhausted.
        while pointer + length < text_len:
            candidate_len = length + 1
            # The end bound ``pointer + candidate_len - 1`` forces the match to
            # start strictly before ``pointer`` while still letting it overlap
            # the look-ahead.  A longer candidate can never occur to the left
            # of its own prefix, so resuming from the previous hit is safe.
            found = text.find(text[pointer:pointer + candidate_len],
                              search_from, pointer + candidate_len - 1)
            if found == -1:
                break
            match_start = found
            search_from = found
            length = candidate_len

        if length > 0:
            next_char = text[pointer + length] if pointer + length < text_len else ''
            # Offset comes from where the match was actually found, not from
            # the first occurrence of its first character.
            compressed.append((pointer - match_start, length, next_char))
            pointer += length + 1
        else:
            compressed.append((0, 0, text[pointer]))
            pointer += 1

    # Terminate the stream without destroying a real trailing literal.
    if compressed and compressed[-1][2] == '':
        compressed[-1] = compressed[-1][0:2] + ('$',)
    else:
        compressed.append((0, 0, '$'))

    return compressed

In [5]:
def LZ77_decode(compressed):
    """Expand a sequence of ``(offset, length, char)`` tokens back into text.

    For each token, ``length`` characters are copied one at a time starting
    ``offset`` characters before the current end of the output (so a copy may
    overlap the characters it is producing), and then ``char`` is appended.
    Tokens with ``length == 0`` contribute only ``char``.

    Args:
        compressed: Iterable of 3-item tokens.

    Returns:
        The decoded string, including every token's ``char``.
    """
    decoded = ''
    for offset, length, char in compressed:
        if length != 0:
            copy_from = len(decoded) - offset
            # Character-by-character so the source may run into freshly
            # written output.
            for k in range(length):
                decoded += decoded[copy_from + k]
        decoded += char
    return decoded

In [6]:
# Usage example: encode each sample, print the tokens, then decode them back.
for input_string in ("abracadabra", "abacabacabadaca"):
    encoded_string = LZ77_encode(input_string)
    print("Encoded string:", encoded_string)
    decoded_string = LZ77_decode(encoded_string)
    print("Decoded string:", decoded_string)

Encoded string: [(0, 0, 'a'), (0, 0, 'b'), (0, 0, 'r'), (3, 1, 'c'), (5, 1, 'd'), (4, 1, 'b'), (0, 0, 'r'), (5, 1, '$')]
Decoded string: abracadabra$
Encoded string: [(0, 0, 'a'), (0, 0, 'b'), (2, 1, 'c'), (4, 7, 'd'), (4, 1, 'c'), (4, 1, '$')]
Decoded string: abacabacabadaca$


In [7]:
def LZ77_to_file(filename, encoded_data):
    """Write LZ77 tokens to ``filename`` as fixed-width binary records.

    Each token becomes three consecutive 2-byte big-endian unsigned integers:
    ``token[0]``, ``token[1]`` and the code point of ``token[2]``.

    Args:
        filename: Path of the file to create or overwrite.
        encoded_data: Iterable of indexable tokens ``(offset, length, char)``.

    Raises:
        OverflowError: If any of the three values does not fit in 2 bytes.
    """
    with open(filename, 'wb') as out:

        def write_u16(value):
            out.write(value.to_bytes(2, byteorder='big'))

        for token in encoded_data:
            write_u16(token[0])
            write_u16(token[1])
            write_u16(ord(token[2]))

In [8]:
def LZ77_from_file(filename):
    """Read LZ77 tokens from a binary file of 6-byte records.

    Every record holds three 2-byte big-endian unsigned integers: offset,
    length and the code point of the next character.  Reading stops at end of
    file; missing trailing bytes in a short final record decode as zero.

    Args:
        filename: Path of the file to read.

    Returns:
        List of ``(offset, length, next_char)`` tuples.
    """
    encoded_data = []
    with open(filename, 'rb') as source:
        # iter() with a sentinel keeps pulling records until read() returns b''.
        for record in iter(lambda: source.read(6), b''):
            offset = int.from_bytes(record[0:2], "big")
            length = int.from_bytes(record[2:4], "big")
            next_char = chr(int.from_bytes(record[4:6], "big"))
            encoded_data.append((offset, length, next_char))
    return encoded_data

In [9]:
def arithmetic_coding_encode(input_string, frequency_dict, step=4):
    """Arithmetic-code ``input_string`` in independent blocks of ``step`` symbols.

    ``frequency_dict`` maps each symbol to the upper bound of its cumulative
    probability interval; the lower bound of a symbol is the value of the
    entry that precedes it in the dictionary's iteration order (0 for the
    first entry).  For every block the interval ``[low, high)`` is narrowed
    symbol by symbol, and the midpoint of the final interval is emitted.  The
    last block may hold fewer than ``step`` symbols.

    Args:
        input_string: Sequence of symbols to encode.
        frequency_dict: Ordered mapping ``symbol -> cumulative upper bound``.
        step: Number of symbols encoded into each output value.

    Returns:
        List with one float per block.

    Raises:
        ValueError: If a symbol of ``input_string`` is not in ``frequency_dict``.
    """
    # Build the symbol -> (lower, upper) table once instead of re-listing and
    # scanning the dictionary's keys and values for every input symbol.
    intervals = {}
    previous_bound = 0
    for symbol, upper_bound in frequency_dict.items():
        intervals[symbol] = (previous_bound, upper_bound)
        previous_bound = upper_bound

    total_symbols = len(input_string)
    values_list = []

    # Start with the full unit interval.
    low = 0.0
    high = 1.0
    i = 0

    for symbol in input_string:
        try:
            lower_bound, upper_bound = intervals[symbol]
        except KeyError:
            # Keep the exception type of the original list.index() lookup.
            raise ValueError(f"{symbol!r} is not in frequency_dict") from None

        # Narrow the interval to the sub-range assigned to this symbol.
        width = high - low
        new_low = low + width * lower_bound
        new_high = low + width * upper_bound
        low = new_low
        high = new_high

        i += 1
        # Flush when the block is full or when the input has been exhausted.
        if i == step or step * len(values_list) + i == total_symbols:
            values_list.append((low + high) / 2)
            low = 0.0
            high = 1.0
            i = 0

    return values_list

In [10]:
from collections import Counter

def build_frequency_dict(text):
    """Map each symbol of ``text`` to the upper bound of its cumulative frequency.

    Symbols are kept in order of first appearance.  Each value is the share of
    ``text`` occupied by that symbol and all symbols before it, so values are
    increasing and the last one is exactly ``1.0``.

    Args:
        text: Iterable of hashable symbols (typically a string).

    Returns:
        Dict ``symbol -> cumulative relative frequency``; empty for empty input.
    """
    counts = Counter(text)
    total = sum(counts.values())
    if total == 0:
        # Nothing to count; avoid dividing by zero.
        return {}

    num_frequency_dict = {}
    running_count = 0
    for key, count in counts.items():
        # Accumulate exact integer counts and divide once per symbol, so
        # rounding error does not build up and the final bound is exactly 1.0.
        running_count += count
        num_frequency_dict[key] = running_count / total

    return num_frequency_dict

In [11]:
import struct

def arithmetic_to_file(filename, encoded_text, dictionary, text_len):
    """Write an arithmetic-coded message and its frequency table to a binary file.

    Layout, in order:
      * ``text_len`` as a 6-byte big-endian unsigned integer;
      * every value of ``encoded_text`` packed with ``struct`` format ``"f"``;
      * for every key of ``dictionary``: its code point as a 2-byte big-endian
        unsigned integer followed by its value packed with format ``"f"``.

    Args:
        filename: Path of the file to create or overwrite.
        encoded_text: Iterable of floats.
        dictionary: Mapping from single-character keys to floats.
        text_len: Non-negative integer stored in the header.
    """
    pack_float = struct.Struct("f").pack
    with open(filename, 'wb') as out:
        out.write(text_len.to_bytes(6, byteorder='big'))
        for value in encoded_text:
            out.write(pack_float(value))
        for symbol in dictionary:
            out.write(ord(symbol).to_bytes(2, byteorder='big'))
            out.write(pack_float(dictionary[symbol]))

In [12]:
def read_from_file(filename):
    """Return the full contents of a UTF-8 text file as a single string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text, line endings included.
    """
    with open(filename, "r", encoding='utf-8') as file:
        # One read() replaces repeated string concatenation per line.
        return file.read()

In [13]:
# Load the whole input text into memory; read_from_file opens it as UTF-8.
string = read_from_file('harry_potter_and_the_prisoner_of_azkaban_and_harry_potter_and_the_sorcerers_stone.txt')

In [14]:
# Compress the loaded text with LZ77 using a 60000-character look-back window
# (instead of the function's default of 5).
encoded_string = LZ77_encode(string, window_size=60000)

In [15]:
# Flatten the LZ77 tokens into one string: decimal offset, decimal length and
# the literal character, with nothing in between.  ``string`` is rebound here,
# so the raw text loaded earlier is no longer reachable under this name.
# NOTE: without a delimiter the offset/length boundary is ambiguous, e.g.
# (1, 23, 'x') and (12, 3, 'x') both serialise to "123x".
# A single join avoids rebuilding the growing string once per token.
string = ''.join(str(item[0]) + str(item[1]) + item[2] for item in encoded_string)

In [16]:
# Build the cumulative symbol-frequency table for the serialised token string,
# then arithmetic-code that string using the default step of 4 symbols per value.
dictionary = build_frequency_dict(string)
encoded_text = arithmetic_coding_encode(string, dictionary)

In [18]:
# Persist the encoded values and the frequency table, with the length of the
# serialised token string stored in the file header.
arithmetic_to_file('encoded_harry_potter_and_the_prisoner_of_azkaban_and_harry_potter_and_the_sorcerers_stone.txt', encoded_text, dictionary, len(string))