In [2]:
def LZ77_encode(text, window_size=5):
    """Compress ``text`` into a list of LZ77 ``(offset, length, next_char)`` triples.

    A triple means: copy ``length`` characters starting ``offset`` positions
    behind the current end of the output, then append ``next_char``.  A match
    may extend past the current position into the text still being encoded;
    such overlapping copies are reproduced by a decoder that copies one
    character at a time.

    The stream always ends with the sentinel character ``'$'``.  When the last
    match reaches the end of ``text`` the sentinel fills the otherwise empty
    ``next_char`` slot; otherwise a separate ``(0, 0, '$')`` triple is added,
    so no character of ``text`` is ever replaced by the sentinel.

    Args:
        text: String to compress.
        window_size: Maximum distance (in characters) a match may start
            behind the current position.

    Returns:
        List of ``(offset, length, next_char)`` tuples; ``next_char`` is
        always exactly one character.  For empty ``text`` the result is
        ``[(0, 0, '$')]``.
    """
    compressed = []
    text_len = len(text)
    pointer = 0

    while pointer < text_len:
        window_start = max(0, pointer - window_size)
        match_start = -1
        match_len = 0

        # Grow the candidate one character at a time.  Any occurrence of the
        # longer candidate is also an occurrence of the shorter one, so the
        # search can resume from the previous hit instead of the window start.
        search_from = window_start
        while pointer + match_len < text_len:
            candidate = text[pointer:pointer + match_len + 1]
            # The match has to START before `pointer`, hence it has to END
            # before `pointer + len(candidate) - 1`; this still lets it
            # overlap the text that is being encoded right now.
            found = text.find(candidate, search_from, pointer + match_len)
            if found == -1:
                break
            match_start = found
            match_len += 1
            search_from = found

        next_index = pointer + match_len
        next_char = text[next_index] if next_index < text_len else ''
        offset = pointer - match_start if match_len else 0
        compressed.append((offset, match_len, next_char))
        pointer = next_index + 1

    # Terminate the stream with the '$' sentinel without dropping real data.
    if compressed and compressed[-1][2] == '':
        compressed[-1] = compressed[-1][:2] + ('$',)
    else:
        compressed.append((0, 0, '$'))

    return compressed

In [3]:
def LZ77_decode(compressed):
    """Rebuild the text described by a sequence of LZ77 triples.

    For every ``(offset, length, char)`` triple, ``length`` characters are
    copied one by one starting ``offset`` positions behind the current end of
    the output, and ``char`` is appended afterwards.  Copying character by
    character lets a match read characters that the same triple has just
    produced.

    Args:
        compressed: Iterable of ``(offset, length, char)`` triples.

    Returns:
        The reconstructed string; every ``char`` is kept as-is, including a
        trailing ``'$'`` sentinel if the triples carry one.
    """
    pieces = []  # one output character per element
    for offset, length, char in compressed:
        copy_from = len(pieces) - offset
        for step in range(length):
            pieces.append(pieces[copy_from + step])
        # extend() adds the characters of `char` individually, so an empty
        # string contributes nothing and indices stay aligned with the text.
        pieces.extend(char)
    return ''.join(pieces)

In [4]:
# Usage example: encode each sample, decode it again and print both results.
for input_string in ("abracadabra", "abacabacabadaca"):
    encoded_string = LZ77_encode(input_string)
    print("Encoded string:", encoded_string)
    decoded_string = LZ77_decode(encoded_string)
    print("Decoded string:", decoded_string)

Encoded string: [(0, 0, 'a'), (0, 0, 'b'), (0, 0, 'r'), (3, 1, 'c'), (5, 1, 'd'), (4, 1, 'b'), (0, 0, 'r'), (5, 1, '$')]
Decoded string: abracadabra$
Encoded string: [(0, 0, 'a'), (0, 0, 'b'), (2, 1, 'c'), (4, 7, 'd'), (4, 1, 'c'), (4, 1, '$')]
Decoded string: abacabacabadaca$


In [5]:
def LZ77_to_file(filename, encoded_data):
    """Write LZ77 triples to ``filename`` as fixed-size binary records.

    Each triple is stored as six bytes: the offset, the length and the code
    point of the character, each as a 2-byte big-endian unsigned integer.
    Values outside 0..65535 (including characters above U+FFFF) do not fit
    and make ``int.to_bytes`` raise ``OverflowError``.

    Args:
        filename: Path of the file to create or overwrite.
        encoded_data: Iterable of ``(offset, length, char)`` triples, where
            ``char`` is a single character.
    """
    def write_u16(sink, number):
        # One 2-byte big-endian field.
        sink.write(number.to_bytes(2, byteorder='big'))

    with open(filename, 'wb') as sink:
        for triple in encoded_data:
            write_u16(sink, triple[0])
            write_u16(sink, triple[1])
            write_u16(sink, ord(triple[2]))

In [6]:
def LZ77_from_file(filename):
    """Read LZ77 triples from a file of 6-byte binary records.

    Every record holds three 2-byte big-endian unsigned integers: offset,
    length and the code point of the character.  Reading stops when no bytes
    are left; missing bytes in a truncated final record are read as zero.

    Args:
        filename: Path of the file to read.

    Returns:
        List of ``(offset, length, char)`` tuples in file order.
    """
    triples = []
    with open(filename, 'rb') as source:
        # iter() with a sentinel keeps reading records until read() returns b''.
        for record in iter(lambda: source.read(6), b''):
            offset = int.from_bytes(record[0:2], "big")
            length = int.from_bytes(record[2:4], "big")
            next_char = chr(int.from_bytes(record[4:6], "big"))
            triples.append((offset, length, next_char))
    return triples

In [13]:
import heapq
from collections import Counter, defaultdict

class HuffmanNode:
    """Node of a Huffman tree.

    Attributes:
        value: Value passed to the constructor; ``build_huffman_tree`` passes
            the symbol for a leaf and ``None`` for a merged node.
        frequency: Weight used to order nodes.
        left, right: Child nodes, ``None`` until assigned by the caller.
    """

    def __init__(self, value, frequency):
        self.value, self.frequency = value, frequency
        self.left = self.right = None

    def __lt__(self, other):
        """Order nodes by frequency only, so ``heapq`` can compare them."""
        return self.frequency < other.frequency

def build_huffman_tree(text):
    """Build a Huffman tree from the symbol frequencies of ``text``.

    The two least frequent nodes are merged repeatedly until a single root
    remains.  Leaves carry a symbol in ``value``; merged nodes have
    ``value=None`` and the summed frequency of their children.

    Args:
        text: Iterable of hashable symbols (typically a string).

    Returns:
        The root ``HuffmanNode``, or ``None`` when ``text`` is empty.  When
        ``text`` contains only one distinct symbol, the root is an internal
        node whose left child is that symbol's leaf, so the symbol receives
        the one-bit code ``"0"`` rather than an empty code.
    """
    heap = [HuffmanNode(symbol, count) for symbol, count in Counter(text).items()]

    if not heap:
        # Nothing to encode: there is no tree to build.
        return None

    if len(heap) == 1:
        # A bare leaf as root would yield an empty code word, which encodes
        # every symbol to nothing; give the leaf a parent instead.
        root = HuffmanNode(None, heap[0].frequency)
        root.left = heap[0]
        return root

    heapq.heapify(heap)
    while len(heap) > 1:
        # The first node popped (lowest frequency) becomes the left child.
        left_child = heapq.heappop(heap)
        right_child = heapq.heappop(heap)

        merged_node = HuffmanNode(None, left_child.frequency + right_child.frequency)
        merged_node.left = left_child
        merged_node.right = right_child

        heapq.heappush(heap, merged_node)

    return heap[0]

def build_huffman_codes(node, prefix="", codes=None):
    """Collect the bit string of every symbol stored in a Huffman tree.

    The tree is walked depth-first; going to a left child appends ``"0"`` to
    the prefix and going to a right child appends ``"1"``.  Every node whose
    ``value`` is not ``None`` is recorded with the prefix that leads to it.

    Args:
        node: Root of the (sub)tree to walk; ``None`` is allowed and adds
            nothing.
        prefix: Bit string accumulated on the way down to ``node``.
        codes: Dictionary to fill.  A new dictionary is created when omitted,
            so separate calls never share results.

    Returns:
        The dictionary mapping each symbol to its code string.
    """
    if codes is None:
        # A fresh table per top-level call; a ``{}`` default would be created
        # once and keep entries from every earlier call.
        codes = {}
    if node:
        if node.value is not None:
            codes[node.value] = prefix
        build_huffman_codes(node.left, prefix + "0", codes)
        build_huffman_codes(node.right, prefix + "1", codes)
    return codes

def huffman_encode(text, codes):
    """Translate ``text`` into a string of bits using a symbol-to-code table.

    Args:
        text: Iterable of symbols to encode.
        codes: Mapping from symbol to its code string; a symbol missing from
            the mapping raises ``KeyError``.

    Returns:
        The concatenation of the code strings of all symbols, in order.
    """
    return "".join(codes[symbol] for symbol in text)

def huffman_decode(encoded_text, tree):
    """Decode a string of bits by walking a Huffman tree.

    Starting at the root, a ``'0'`` moves to the left child and any other
    character moves to the right child.  Whenever the node reached has a
    ``value`` that is not ``None``, that value is emitted and the walk
    restarts at the root.

    Args:
        encoded_text: String of bit characters.
        tree: Root node of the Huffman tree.

    Returns:
        The decoded symbols joined into one string.
    """
    symbols = []
    cursor = tree
    for bit in encoded_text:
        cursor = cursor.left if bit == '0' else cursor.right
        if cursor.value is not None:
            symbols.append(cursor.value)
            cursor = tree  # next code word starts at the root again
    return "".join(symbols)

In [7]:
def read_from_file(filename):
    """Return the whole content of a UTF-8 text file as a single string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The file content, with all lines (and their line endings) in order.
    """
    with open(filename, "r", encoding='utf-8') as source:
        return source.read()

In [8]:
# Load the text to compress as one string.
string = read_from_file(
    filename='harry_potter_and_the_prisoner_of_azkaban_and_harry_potter_and_the_sorcerers_stone.txt'
)

In [9]:
# LZ77 pass over the loaded text.  A window of 60000 keeps every offset below
# 65536, the largest value LZ77_to_file can store in its 2-byte offset field.
encoded_string = LZ77_encode(text=string, window_size=60000)

In [11]:
# Flatten the LZ77 triples into one string that feeds the Huffman stage.
# The two numbers are comma-terminated and the character is exactly one symbol,
# so the stream can be split back into triples; without separators
# (1, 23, c) and (12, 3, c) would produce the same text.
# join() builds the result in one pass instead of repeated `+=`.
string = ''.join(f"{item[0]},{item[1]},{item[2]}" for item in encoded_string)

In [14]:
# Huffman pass over the flattened triple string.
tree = build_huffman_tree(string)
# Pass a fresh dict so the code table contains only the symbols of this tree,
# regardless of any earlier call or re-run of this cell.
codes = build_huffman_codes(tree, codes={})
encoded_text = huffman_encode(string, codes)

In [15]:
def huffman_to_file(filename, encoded_text):
    """Pack a string of '0'/'1' characters into bytes and write them to a file.

    The text is cut into groups of eight characters; each group is parsed as
    a base-2 number and written as one byte.  A final group shorter than
    eight characters is parsed the same way, so its bits occupy the low-order
    end of the last byte.

    NOTE(review): neither the number of valid bits nor the code table is
    written, so the file alone does not contain what a decoder would need.

    Args:
        filename: Path of the file to create or overwrite.
        encoded_text: String of bit characters.
    """
    total_bits = len(encoded_text)
    with open(filename, "wb") as sink:
        start = 0
        while start < total_bits:
            chunk = encoded_text[start:start + 8]
            sink.write(int(chunk, 2).to_bytes(1, byteorder='big'))
            start += 8

In [16]:
# Store the packed Huffman bit stream next to the source text.
huffman_to_file(
    filename='encoded_harry_potter_and_the_prisoner_of_azkaban_and_harry_potter_and_the_sorcerers_stone.txt',
    encoded_text=encoded_text,
)