In [2]:
from scipy import stats
import numpy as np
from matplotlib import pyplot as plt
import os



```
# Definition of variables for the task
# Representative member: Andrei Shchapaniak, 14.05.2002
```



In [3]:
# Task parameters. K is presumably the representative's day of birth and L is
# the length of the surname (see the note in the cell above) -- confirm
# against the assignment text.
K = 14
L = len('Shchapaniak')

# Numbers of the two input texts; the "% 20 + 1" keeps each in the range 1..20.
X = (K * L * 23) % 20 + 1
Y = (X + (K * 5 + L * 7) % 19) % 20 + 1

# File names are the zero-padded three-digit numbers with a .txt suffix.
file1, file2 = f'{X:03}.txt', f'{Y:03}.txt'
print(f'Files: {file1}, {file2}')

Files: 003.txt, 018.txt


In [8]:
class MyFile:
  """Character-frequency statistics for one text file.

  The attributes are filled in stages: ``read_text`` loads ``text``,
  ``cal_letters_pbt`` derives ``letters`` and ``letters_pbt`` from it, and
  the remaining methods only read those two arrays.
  """

  def __init__(self, filename):
    """Remember ``filename``; nothing is read until ``read_text`` is called."""
    self.filename = filename
    self.text = ""         # text under analysis, set by read_text()
    self.letters = []      # distinct characters of ``text``, set by cal_letters_pbt()
    self.letters_pbt = []  # relative frequency of each entry in ``letters``

  def read_text(self, data_dir='files'):
    """Load the text to analyse from ``<data_dir>/<filename>``.

    Only the second line of the file is kept, with surrounding whitespace
    stripped; the first line is skipped (presumably a header -- confirm
    against the data files).

    Args:
      data_dir: directory that contains the file; defaults to ``'files'``.

    Raises:
      OSError: if the file cannot be opened.
      IndexError: if the file has fewer than two lines.
    """
    with open(os.path.join(data_dir, self.filename), 'r') as f:
      self.text = f.readlines()[1].strip()

  def cal_letters_pbt(self):
    """Compute the distinct characters of ``text`` and their relative frequencies."""
    self.letters, letters_counts = np.unique(list(self.text), return_counts=True)
    self.letters_pbt = letters_counts / letters_counts.sum()

  def get_dict_ch_pbt(self):
    """Return a ``{character: probability}`` dict built from the computed arrays."""
    return dict(zip(self.letters, self.letters_pbt))

  def draw_rel_freq(self, out_path="myfile1.pdf"):
    """Draw a bar chart of the character probabilities and save it.

    Args:
      out_path: where the figure is written. The default keeps the
        previously hard-coded name; pass a distinct path per file so that
        plots of different texts do not overwrite each other.
    """
    # NOTE: this switches the global matplotlib style, so figures created
    # later in the same session are affected as well.
    plt.style.use('classic')
    fig, ax = plt.subplots(figsize=(10, 6))

    bar_spacing = 1.5
    bar_positions = (np.arange(len(self.letters)) * bar_spacing) - (bar_spacing / 2)
    ax.bar(bar_positions, self.letters_pbt, color='plum')

    ax.set_xticks(bar_positions)
    ax.set_xticklabels(self.letters, rotation=45)
    ax.set_title("Probability of characters in the text")
    ax.set_xlabel("Character")
    ax.set_ylabel("Probability")

    # Margins are set explicitly, so no automatic tight layout is applied.
    fig.subplots_adjust(left=0.15, right=0.95, top=0.95, bottom=0.15)
    fig.savefig(out_path)

  def entropy(self, base = 2):
    """Return the Shannon entropy of ``letters_pbt`` using logarithm base ``base``.

    With the default ``base=2`` the result is expressed in bits.
    """
    return stats.entropy(self.letters_pbt, base=base)

In [16]:
import heapq
from collections import defaultdict

class Huffman:
  """Binary Huffman code for a ``{symbol: probability}`` mapping."""

  def __init__(self, char_pbt):
    """Store the mapping; the code itself is built by ``create_code``."""
    self.char_pbt = char_pbt
    self.huffman_code = []  # list of [symbol, codeword] pairs, set by create_code()

  def create_code(self):
    """Build the Huffman code, store it in ``huffman_code`` and return it.

    Returns:
      A list of ``[symbol, codeword]`` pairs ordered by codeword length and
      then by the pair itself. The list is empty when ``char_pbt`` is empty.
    """
    # Each heap entry is [weight, [symbol, codeword], ...]: the total
    # probability of a subtree followed by every leaf below it.
    heap = [[pbt, [ch, ""]] for ch, pbt in self.char_pbt.items()]

    if not heap:
      # Nothing to encode; avoid popping from an empty heap.
      self.huffman_code = []
      return self.huffman_code

    if len(heap) == 1:
      # With a single symbol no merge ever happens, which would leave it
      # with an empty codeword; give it one bit instead.
      heap[0][1][1] = '0'

    heapq.heapify(heap)

    while len(heap) > 1:
      lighter = heapq.heappop(heap)
      heavier = heapq.heappop(heap)

      # Merging two subtrees puts one more bit in front of every codeword
      # below them: '0' for the lighter subtree, '1' for the heavier one.
      for leaf in lighter[1:]:
        leaf[1] = '0' + leaf[1]
      for leaf in heavier[1:]:
        leaf[1] = '1' + leaf[1]

      heapq.heappush(heap, [lighter[0] + heavier[0]] + lighter[1:] + heavier[1:])

    self.huffman_code = sorted(heapq.heappop(heap)[1:], key=lambda p: (len(p[-1]), p))
    return self.huffman_code

  def calc_average_length(self):
    """Return the expected codeword length under ``char_pbt``.

    This is the sum of ``len(codeword) * probability`` over the pairs in
    ``huffman_code``, so it is 0 until ``create_code`` has been called.
    """
    return sum(len(code) * self.char_pbt[ch] for ch, code in self.huffman_code)


In [19]:
import pandas as pd

# Load both texts and compute their character probabilities.
myfile1 = MyFile(file1)
myfile2 = MyFile(file2)

myfile1.read_text()
myfile2.read_text()

myfile1.cal_letters_pbt()
myfile2.cal_letters_pbt()
# Uncomment to regenerate the relative-frequency bar charts.
#myfile1.draw_rel_freq()
#myfile2.draw_rel_freq()
entropy1 = myfile1.entropy()
entropy2 = myfile2.entropy()


# create_code() stores the code table on the instance, so the table is read
# from the huffman_code attribute rather than from the call's return value.
huffman1 = Huffman(myfile1.get_dict_ch_pbt())
huffman1.create_code()
avg_len1 = huffman1.calc_average_length()

huffman2 = Huffman(myfile2.get_dict_ch_pbt())
huffman2.create_code()
huffman_codes = huffman2.huffman_code
avg_len2 = huffman2.calc_average_length()


# Label the columns with the actual input file names so the table stays
# correct if the task parameters (and therefore the files) change.
pd.DataFrame({file1: (entropy1, avg_len1), file2: (entropy2, avg_len2)}, index=['Entropy', 'Average code length'])

Unnamed: 0,003.txt,018.txt
Entropy,4.067065,4.082571
Average code length,4.103814,4.13106
