# 3F7 Full Technical Report

**By Diego García Medina**

The code below will improve on the arithmetic coding by using conditional probabilities, given by a pre-trained LSTM network on English language.

In [1]:
f = open('hamlet.txt')
hamlet = open('pride_and_prejudice.txt').read() + f.read()
f.close()
print(hamlet[:294])

The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or onlin


In [2]:
from itertools import groupby
frequencies = dict([(key, len(list(group))) for key, group in groupby(sorted(hamlet))])
Nin = sum([frequencies[a] for a in frequencies])
p = dict([(a,frequencies[a]/Nin) for a in frequencies])

print(f'File length: {Nin}')
print("There are {} characters in the alphabet".format(len(frequencies)))
print("The characters are the following:\n")
for char in frequencies.keys():
    print(char)

File length: 911214
There are 86 characters in the alphabet
The characters are the following:



 
!
"
#
$
%
&
'
(
)
*
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
;
?
@
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
[
]
_
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
|


In [3]:
method = 'conditional-arithmetic'
filename = 'hamlet.txt'

In [4]:
from camzip import camzip
from camunzip import camunzip

Using TensorFlow backend.
  from ._conv import register_converters as _register_converters


In [5]:
from filecmp import cmp
from os import stat
from json import load
from math import log2
import arithmetic as arith
import conditional_arithmetic as cond_arith
import time

def run_test_arithmetic(filename,length_of_LSTM):
    method="arithmetic"
    
    f = open('hamlet.txt', 'r')
    hamlet = f.read()
    f.close()
    frequencies = dict([(key, len(list(group))) for key, group in groupby(sorted(hamlet))])
    Nin = sum([frequencies[a] for a in frequencies])
    p = dict([(a,frequencies[a]/Nin) for a in frequencies])
    print(f'File length: {Nin}')
    
    start = time.time()
    Y = arith.encode(hamlet, p)
    print(f"The time taken to compress is: {time.time()-start}s")
    start2 = time.time()
    X = arith.decode(Y, p, Nin)
    print(f"The time taken to dempress is: {time.time()-start2}s")
    
    print("The decoded text is the following:")
    print('\n'+''.join(X[:294]))
    print("\n")
    
    Y_corrupted = Y.copy()
    Y_corrupted[399] ^= 1

    X_corrupted = arith.decode(Y_corrupted, p, Nin)
    
    H = lambda pr: -sum([pr[a]*log2(pr[a]) for a in pr])
    
    Nin = stat(filename).st_size
    print(f'Length of original file: {Nin} bytes')
    Nout = stat(filename + '.cz' + method[0]).st_size
    print(f'Length of compressed file: {Nout} bytes')
    print(f'Compression ratio (rateless): {Nout/Nin}')
    print(f'Compression rate: {8.0*Nout/Nin} bits/byte')
    with open(filename + '.czp', 'r') as fp:
        freq = load(fp)
    pf = dict([(a, freq[a]/Nin) for a in freq])
    print(f'Entropy: {H(pf)} bits per symbol')
    if cmp(filename,filename+'.cuz'):
        print('The two files are the same')
    else:
        print('The files are different')
        
    print("\nThe decoded output from the corrupted compressed file looks like this:")
    print('\n'+''.join(X_corrupted[:294]))
    print("\n")
        
    print("\n\n\n")

In [6]:
def run_test_cond_arithmetic(filename,length_of_LSTM):
    method = "conditional-arithmetic"
    
    f = open('hamlet.txt', 'r')
    hamlet = f.read()
    f.close()
    frequencies = dict([(key, len(list(group))) for key, group in groupby(sorted(hamlet))])
    Nin = sum([frequencies[a] for a in frequencies])
    p = dict([(a,frequencies[a]/Nin) for a in frequencies])
    print(f'File length: {Nin}')

    Y = camzip(method, filename, length_of_LSTM)
    X = camunzip(filename + '.cz' + method[0],length_of_LSTM)
    
    print("The decoded text is the following:")
    print(bytes(X)[:294])
    print("\n")
    
    Y_corrupted = Y.copy()
    Y_corrupted[399] ^= 1

    X_corrupted = cond_arith.decode(Y_corrupted, p, Nin)
    
    H = lambda pr: -sum([pr[a]*log2(pr[a]) for a in pr])
    
    Nin = stat(filename).st_size
    print(f'Length of original file: {Nin} bytes')
    Nout = stat(filename + '.cz' + method[0]).st_size
    print(f'Length of compressed file: {Nout} bytes')
    print(f'Compression ratio (rateless): {Nout/Nin}')
    print(f'Compression rate: {8.0*Nout/Nin} bits/byte')
    with open(filename + '.czp', 'r') as fp:
        freq = load(fp)
    pf = dict([(a, freq[a]/Nin) for a in freq])
    print(f'Entropy: {H(pf)} bits per symbol')
    if cmp(filename,filename+'.cuz'):
        print('The two files are the same')
    else:
        print('The files are different')
        
    print("\nThe decoded output from the corrupted compressed file looks like this:")
    print('\n'+''.join(bytes(X_corrupted)[:294]))
    print("\n")
        
    print("\n\n\n")

In [7]:
def run_tests():
    test_list = []

    test_list.append({"method":"conditional-arithmetic","length_of_LSTM":30,"filename":'hamlet.txt'})
    #test_list.append({"method":"arithmetic","length_of_LSTM":30,"filename":'hamlet.txt'})
    #test_list.append({"method":"conditional-arithmetic","length_of_LSTM":1,"filename":'hamlet.txt'})
    #test_list.append({"method":"conditional-arithmetic","length_of_LSTM":5,"filename":'hamlet.txt'})
    #test_list.append({"method":"conditional-arithmetic","length_of_LSTM":30,"filename":'pride_and_prejudice.txt'})

    for i,test in enumerate(test_list):

        print(f"+++ This is the test n.{i} +++")

        if test["method"] == "arithmetic":
            run_test_arithmetic(test["filename"],test["length_of_LSTM"])
        elif test["method"] == "conditional-arithmetic":
            run_test_cond_arithmetic(test["filename"],test["length_of_LSTM"])

In [8]:
camzip(method, filename)
camunzip(filename + '.cz' + method[0])

The time required for encoding is: 777.6763379573822
The entropy is: 0.000644786291234165
Arithmetic decoded 99%    

[32,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 72,
 65,
 77,
 76,
 69,
 84,
 10,
 10,
 10,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 68,
 82,
 65,
 77,
 65,
 84,
 73,
 83,
 32,
 80,
 69,
 82,
 83,
 79,
 78,
 65,
 69,
 10,
 10,
 10,
 67,
 76,
 65,
 85,
 68,
 73,
 85,
 83,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 107,
 105,
 110,
 103,
 32,
 111,
 102,
 32,
 68,
 101,
 110,
 109,
 97,
 114,
 107,
 46,
 32,
 40,
 75,
 73,
 78,
 71,
 32,
 67,
 76,
 65,
 85,
 68,
 73,
 85,
 83,
 58,
 41,
 10,
 10,
 72,
 65,
 77,
 76,
 69,
 84,
 32,
 32,
 115,
 111,
 110,
 32,
 116,
 111,
 32,
 116,
 104,
 101,
 32,
 108,
 97,
 116,
 101,
 44,
 32,
 97,
 110,
 100,
 32,
 110,
 101,
 112,
 104,
 101,
 119,
 32,
 116,
 111,
 32,
 116,
 104,
 101,
 32,
 112,
 114,
 101,
 115,
 101,
 110,
 116,
 32,
 107,
 105,
 110,
 103,
 46,
 10,
 10,
 80,
 79,
 76,
 79,
 78,
 73,
 85,
 83,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 108,
 111,
 114,
 100,
 32,
 99,
 104,
 97,
 109,
 98,
 101,
 114,
 108,
 97,
 105,
 110,
 46,
 32,
 40

In [9]:
Nin = stat(filename).st_size
print(f'Length of original file: {Nin} bytes')
Nout = stat(filename + '.cz' + method[0]).st_size
print(f'Length of compressed file: {Nout} bytes')
print(f'Compression ratio (rateless): {Nout/Nin}')
print(f'Compression rate: {8.0*Nout/Nin} bits/byte')
with open(filename + '.czp', 'r') as fp:
    freq = load(fp)
pf = dict([(a, freq[a]/Nin) for a in freq])
print(f'Entropy: {H(pf)} bits per symbol')
if cmp(filename,filename+'.cuz'):
    print('The two files are the same')
else:
    print('The files are different')

Length of original file: 207039 bytes
Length of compressed file: 56829 bytes
Compression ratio (rateless): 0.2744845174097634
Compression rate: 2.195876139278107 bits/byte


NameError: name 'H' is not defined