# Imports

In [74]:
import re
import numpy as np
import math
import zlib


# Read text

In [75]:
def read_text(path):
    """Return the full content of the UTF-8 text file at ``path`` as one string."""
    with open(path, 'r', encoding='utf-8') as source:
        return source.read()

def write_text(path, lines):
    """Write the string ``lines`` to ``path`` as UTF-8, replacing any previous content."""
    with open(path, mode='w', encoding='utf-8') as target:
        target.write(lines)


# Corpus to analyse; swap in the commented path to use the other text.
path = '.\\texts\\roksolana.txt'
# path = '.\\texts\\tiger_catchers.txt'
text = read_text(path).lower()

# Drop every character outside а-я / і / ґ / є / ї, and also the letters
# э, ы, ъ; then fold ґ into г.
formatted_text = re.sub('[^а-яіґєї]|э|ы|ъ', '', text).replace('ґ', 'г')

text_len = len(formatted_text)

# Keep a copy of the normalised text on disk.
write_text('.\\texts\\formatted_text.txt', formatted_text)

# Gathering monogram and bigram statistics

In [76]:
# Letters expected in the normalised text; a letter's position in this string
# is its numeric code.
alphabet_str = 'абвгдеєжзиіїйклмнопрстуфхцчшщьюя'
alphabet_length = len(alphabet_str)

# Absolute letter counts over formatted_text, keyed in alphabet order.
alphabet_statistics = dict.fromkeys(alphabet_str, 0)
for letter in formatted_text:
    alphabet_statistics[letter] += 1

# Relative letter frequencies: count divided by the text length.
alphabet_frequencies = {
    symbol: count / text_len for symbol, count in alphabet_statistics.items()
}

# Letter -> numeric code (its index in alphabet_str).
monogram_to_number = {symbol: code for code, symbol in enumerate(alphabet_str)}


In [77]:
# Every ordered pair of alphabet letters; the first letter varies slowest, so
# a bigram's index is its numeric code.
all_bigrams = [first + second for first in alphabet_str for second in alphabet_str]
all_bigrams_length = len(all_bigrams)

# Counts of overlapping bigrams (window step 1) in formatted_text.
bigrams_statistics = dict.fromkeys(all_bigrams, 0)
for left, right in zip(formatted_text, formatted_text[1:]):
    bigrams_statistics[left + right] += 1

# Relative bigram frequencies: count divided by the number of windows.
bigrams_frequencies = {
    bigram: count / (text_len - 1) for bigram, count in bigrams_statistics.items()
}

# Bigram -> numeric code (its index in all_bigrams).
bigram_to_number = {bigram: code for code, bigram in enumerate(all_bigrams)}


## Entropy and Coincidence Index

In [78]:
# Entropy per letter: H1 = -sum(p * log2(p)) over the alphabet.
# Letters that never occur are skipped: np.log2(0) is -inf and 0 * -inf is nan,
# which would poison the whole sum. The bigram formula below skips them too.
monogram_entropy = -1 * sum(
    alphabet_frequencies[letter] * np.log2(alphabet_frequencies[letter])
    for letter in alphabet_str
    if alphabet_frequencies[letter] != 0
)

# Entropy per letter estimated from bigrams: H2 = -(1/2) * sum(p * log2(p)).
bigram_entropy = -0.5 * sum(
    bigrams_frequencies[bigram] * np.log2(bigrams_frequencies[bigram])
    for bigram in all_bigrams
    if bigrams_frequencies[bigram] != 0
)

# Index of coincidence: sum(n_i * (n_i - 1)) / (N * (N - 1)).
coincidence_index = sum(
    alphabet_statistics[letter] * (alphabet_statistics[letter] - 1)
    for letter in alphabet_str
) / text_len / (text_len - 1)


## Generating texts

In [79]:
# Number of sample fragments; the generate_texts calls in this cell pass their
# counts as literals and do not read this name.
texts_count = 10000

def generate_texts(text_size, texts_count, step):
    """Cut ``texts_count`` fragments of ``text_size`` letters from ``formatted_text``.

    Fragment ``n`` starts at offset ``n * step``, so fragments overlap when
    ``step < text_size``. A fragment that reaches past the end of the text is
    shorter (possibly empty), as with ordinary slicing.

    Returns:
        list of str, one entry per fragment, in offset order.
    """
    return [
        formatted_text[n * step:n * step + text_size]
        for n in range(texts_count)
    ]

# Sample fragments of each length. For lengths 1000 and 10000 the step is
# smaller than the fragment size, so consecutive fragments overlap.
texts_10 = generate_texts(text_size=10, texts_count=10000, step=10)
texts_100 = generate_texts(text_size=100, texts_count=10000, step=100)
texts_1000 = generate_texts(text_size=1000, texts_count=10000, step=100)
texts_10000 = generate_texts(text_size=10000, texts_count=1000, step=1000)


# Distortion algorithms

## Vigenere cipher

In [80]:
def mono_vigenere_cipher(text, key):
    """Encrypt ``text`` letter by letter with a Vigenere cipher.

    Args:
        text: string made of letters from ``alphabet_str``.
        key: indexable sequence of integer shifts; shift ``i % len(key)`` is
            added to the code of letter ``i`` modulo ``alphabet_length``.

    Returns:
        The ciphertext, the same length as ``text``.
    """
    period = len(key)
    return ''.join(
        alphabet_str[(monogram_to_number[symbol] + key[position % period]) % alphabet_length]
        for position, symbol in enumerate(text)
    )

def bi_vigenere_cipher(text, key):
    """Encrypt ``text`` with a Vigenere cipher over non-overlapping bigrams.

    Args:
        text: string made of letters from ``alphabet_str``.
        key: indexable sequence of integer shifts; shift ``j % len(key)`` is
            added to the code of the ``j``-th bigram modulo
            ``all_bigrams_length``.

    Returns:
        The ciphertext. Only complete pairs are enciphered: a trailing
        unpaired letter of an odd-length text is dropped (as bi_affine_cipher
        does) instead of failing the bigram lookup with KeyError.
    """
    period = len(key)
    # Largest even prefix length, so that every slice below has two letters.
    paired_length = len(text) - len(text) % 2

    return ''.join(
        all_bigrams[
            (bigram_to_number[text[start:start + 2]] + key[(start // 2) % period])
            % all_bigrams_length
        ]
        for start in range(0, paired_length, 2)
    )


## Affine cipher

In [81]:
def mono_affine_cipher(text, key):
    """Encrypt ``text`` letter by letter with the affine map x -> (a*x + b) mod m.

    Args:
        text: string made of letters from ``alphabet_str``.
        key: pair ``(a, b)``; ``m`` is ``alphabet_length``.

    Returns:
        The ciphertext, the same length as ``text``.
    """
    multiplier, shift = key
    return ''.join(
        alphabet_str[(multiplier * monogram_to_number[symbol] + shift) % alphabet_length]
        for symbol in text
    )


def bi_affine_cipher(text, key):
    """Encrypt ``text`` bigram by bigram with the affine map x -> (a*x + b) mod m.

    The text is split into non-overlapping pairs with the regex ``..``, so a
    trailing unpaired letter is not enciphered.

    Args:
        text: string made of letters from ``alphabet_str``.
        key: pair ``(a, b)``; ``m`` is ``all_bigrams_length``.

    Returns:
        The concatenated enciphered bigrams.
    """
    multiplier, shift = key
    pairs = re.findall(r'..', text)
    return ''.join(
        all_bigrams[(multiplier * bigram_to_number[pair] + shift) % all_bigrams_length]
        for pair in pairs
    )


## Uniform distortion

In [82]:
def mono_uniform_distortion(size):
    """Return a string of ``size`` letters drawn at random from ``alphabet_str``.

    Letter codes come from ``np.random.randint`` over ``[0, alphabet_length)``.
    """
    codes = np.random.randint(low=0, high=alphabet_length, size=size)
    return ''.join(alphabet_str[code] for code in codes)

def bi_uniform_distortion(size):
    """Return ``size`` bigrams drawn at random from ``all_bigrams``, concatenated.

    ``size`` counts bigrams, not letters: the returned string is ``2 * size``
    letters long. Bigram codes come from ``np.random.randint`` over
    ``[0, all_bigrams_length)``.
    """
    codes = np.random.randint(low=0, high=all_bigrams_length, size=size)
    return ''.join(all_bigrams[code] for code in codes)


## Recurrent sequence

In [83]:
def generate_random_lgram(high=alphabet_length, size=1):
    """Draw ``size`` random integer codes from ``[0, high)``.

    The default for ``high`` is the value ``alphabet_length`` had when this
    function was defined.

    Returns:
        The array produced by ``np.random.randint``.
    """
    return np.random.randint(0, high, size)

def mono_recurrent_sequence(size):
    """Build a letter sequence from the recurrence s[i] = (s[i-1] + s[i-2]) mod m.

    The two seed codes are drawn at random and ``m`` is ``alphabet_length``.
    The result has ``size`` letters, but never fewer than the two seeds.
    """
    seed_a, seed_b = generate_random_lgram(alphabet_length, 2)
    codes = [int(seed_a), int(seed_b)]

    for _ in range(2, size):
        codes.append((codes[-1] + codes[-2]) % alphabet_length)

    return ''.join(alphabet_str[code] for code in codes)

def bi_recurrent_sequence(size):
    """Build a bigram sequence from the recurrence s[i] = (s[i-1] + s[i-2]) mod m.

    The two seed bigram codes are drawn at random and ``m`` is
    ``all_bigrams_length``. ``size`` is a length in letters: the result holds
    ``size // 2`` bigrams, but never fewer than the two seeds (four letters).
    """
    seed_a, seed_b = generate_random_lgram(all_bigrams_length, 2)
    codes = [int(seed_a), int(seed_b)]

    for _ in range(2, size // 2):
        codes.append((codes[-1] + codes[-2]) % all_bigrams_length)

    return ''.join(all_bigrams[code] for code in codes)


# Criteria

In [84]:
def get_prohibited_grams(grams, quartile):
    """Return the low-count tail of ``grams`` as a dict.

    Entries are ranked by value, largest first (ties keep their original
    order). The first ``floor(quartile * len(grams))`` entries are discarded
    and the rest are returned in ranked order.

    Args:
        grams: mapping gram -> count.
        quartile: fraction of the ranked entries to discard from the top.
    """
    ranked = sorted(grams.items(), key=lambda entry: entry[1], reverse=True)
    cutoff = math.floor(quartile * len(grams))
    return dict(ranked[cutoff:])

# "Prohibited" grams are the rarest ones in the reference text: everything
# below the top 90% of ranked letters and below the top 75% of ranked bigrams.
prohibited_monograms = get_prohibited_grams(alphabet_statistics, quartile=0.9)
prohibited_bigrams = get_prohibited_grams(bigrams_statistics, quartile=0.75)


### Criteria 1.1 (criteria 1.0 in parameters)

In [85]:
def mono_criteria_1_1(text, k_prohibited):
    """Return 0 if ``text`` contains at least ``k_prohibited`` distinct prohibited letters, else 1.

    The prohibited letters are the keys of ``prohibited_monograms``.
    """
    seen_prohibited = prohibited_monograms.keys() & set(text)
    return 0 if len(seen_prohibited) >= k_prohibited else 1

def bi_criteria_1_1(text, k_prohibited):
    """Return 0 if ``text`` contains at least ``k_prohibited`` distinct prohibited bigrams, else 1.

    Bigrams are taken as overlapping windows (step 1); the prohibited ones are
    the keys of ``prohibited_bigrams``.
    """
    windows = {text[start:start + 2] for start in range(len(text) - 1)}
    seen_prohibited = prohibited_bigrams.keys() & windows
    return 0 if len(seen_prohibited) >= k_prohibited else 1


### Criteria 1.2

In [86]:
def mono_prohibited_frequencies_1(text, limit):
    """Return 0 if any single prohibited letter has frequency >= ``limit`` in ``text``, else 1.

    A letter's frequency is its count divided by ``len(text)``; the prohibited
    letters are the keys of ``prohibited_monograms``.
    """
    text_length = len(text)
    occurrences = dict.fromkeys(prohibited_monograms, 0)

    for symbol in text:
        if symbol in occurrences:
            occurrences[symbol] += 1

    exceeded = any(count / text_length >= limit for count in occurrences.values())
    return 0 if exceeded else 1

def bi_prohibited_frequencies_1(text, limit):
    """Return 0 if any single prohibited bigram has frequency >= ``limit`` in ``text``, else 1.

    Bigrams are overlapping windows (step 1); a bigram's frequency is its
    count divided by the number of windows, ``len(text) - 1``. The prohibited
    bigrams are the keys of ``prohibited_bigrams``.
    """
    text_length = len(text)
    occurrences = dict.fromkeys(prohibited_bigrams, 0)

    for start in range(text_length - 1):
        window = text[start:start + 2]
        if window in occurrences:
            occurrences[window] += 1

    exceeded = any(
        count / (text_length - 1) >= limit for count in occurrences.values()
    )
    return 0 if exceeded else 1


### Criteria 1.3

In [87]:
def mono_prohibited_frequencies_2(text, limit):
    """Return 0 if the combined frequency of prohibited letters in ``text`` exceeds ``limit``, else 1.

    The combined frequency is the number of letters of ``text`` that are keys
    of ``prohibited_monograms``, divided by ``len(text)``.
    """
    text_length = len(text)
    hits = sum(1 for symbol in text if symbol in prohibited_monograms)

    return 0 if hits / text_length > limit else 1

def bi_prohibited_frequencies_2(text, limit):
    """Return 0 if the combined frequency of prohibited bigrams in ``text`` exceeds ``limit``, else 1.

    Bigrams are overlapping windows (step 1). The combined frequency is the
    number of windows that are keys of ``prohibited_bigrams``, divided by the
    number of windows, ``len(text) - 1``.
    """
    text_length = len(text)
    hits = sum(
        1
        for start in range(text_length - 1)
        if text[start:start + 2] in prohibited_bigrams
    )

    return 0 if hits / (text_length - 1) > limit else 1

# Smoke check: with limit 1 the combined frequency cannot exceed the limit,
# so both calls print 1.
for criterion, sample in (
    (mono_prohibited_frequencies_2, 'вознесіння'),
    (bi_prohibited_frequencies_2, 'уцефвікеф'),
):
    print(criterion(sample, 1))

1
1


### Criteria 3.0

In [88]:
def get_monogram_distribution(text):
    """Return the relative frequency of every alphabet letter in ``text``.

    The result maps each letter of ``alphabet_str`` (in alphabet order) to its
    count in ``text`` divided by ``len(text)``.
    """
    total = len(text)
    counts = dict.fromkeys(alphabet_str, 0)

    for symbol in text:
        counts[symbol] += 1

    return {symbol: count / total for symbol, count in counts.items()}

def get_bigram_distribution(text):
    """Return the relative frequency of every possible bigram in ``text``.

    Bigrams are overlapping windows (step 1). The result maps each entry of
    ``all_bigrams`` to its count divided by the number of windows,
    ``len(text) - 1``.
    """
    windows = len(text) - 1
    counts = dict.fromkeys(all_bigrams, 0)

    for start in range(windows):
        counts[text[start] + text[start + 1]] += 1

    return {bigram: count / windows for bigram, count in counts.items()}

def get_specific_entropy(frequencies, l):
    """Return the entropy per symbol, ``-(1/l) * sum(p * log2(p))``.

    Args:
        frequencies: mapping gram -> relative frequency; zero frequencies
            are skipped.
        l: gram length, used as the divisor (1 for letters, 2 for bigrams).
    """
    entropy = 0

    for probability in frequencies.values():
        if probability == 0:
            continue
        entropy -= probability * np.log2(probability) / l

    return entropy

def mono_criteria_3_0(text, limit):
    """Return 0 if the letter entropy of ``text`` differs from ``monogram_entropy`` by more than ``limit``, else 1."""
    observed = get_specific_entropy(get_monogram_distribution(text), 1)
    deviation = abs(monogram_entropy - observed)

    return 0 if deviation > limit else 1

def bi_criteria_3_0(text, limit):
    """Return 0 if the bigram entropy of ``text`` differs from ``bigram_entropy`` by more than ``limit``, else 1."""
    observed = get_specific_entropy(get_bigram_distribution(text), 2)
    deviation = abs(bigram_entropy - observed)

    return 0 if deviation > limit else 1


### Criteria 5.0

In [89]:
def get_most_common_grams(grams, count):
    """Return the ``count`` grams with the largest values, most frequent first.

    Args:
        grams: mapping gram -> count.
        count: how many grams to keep; ties keep their original order.

    Returns:
        list of the selected keys.
    """
    ranked = sorted(grams.items(), key=lambda entry: entry[1], reverse=True)
    return [gram for gram, _ in ranked[:count]]


In [90]:
# Reference sets of the most frequent grams.
most_common_monograms = get_most_common_grams(alphabet_statistics, 10)

# The 50- and 100-element lists are prefixes of the 200-element ranking.
most_common_bigrams_200 = get_most_common_grams(bigrams_statistics, 200)
most_common_bigrams_100 = most_common_bigrams_200[:100]
most_common_bigrams_50 = most_common_bigrams_200[:50]

def mono_empty_boxes(text, most_common_monograms, limit):
    """Return 0 if more than ``limit`` of the given letters are absent from ``text``, else 1.

    Args:
        text: string to test.
        most_common_monograms: iterable of reference letters (the "boxes").
        limit: largest tolerated number of reference letters that never
            occur in ``text``.
    """
    empty_boxes = set(most_common_monograms) - set(text)

    return 0 if len(empty_boxes) > limit else 1

def bi_empty_boxes(text, most_common_bigrams, limit):
    """Return 0 if more than ``limit`` of the given bigrams are absent from ``text``, else 1.

    Bigrams of ``text`` are taken as overlapping windows (step 1).

    Args:
        text: string to test.
        most_common_bigrams: iterable of reference bigrams (the "boxes"),
            e.g. a list from get_most_common_grams; a dict keyed by bigram
            also works because only its keys are iterated.
        limit: largest tolerated number of reference bigrams that never
            occur in ``text``.
    """
    # The reference collection is iterated directly, as mono_empty_boxes does;
    # the previous ``.key()`` / ``.keys()`` calls raised AttributeError for the
    # lists this file builds.
    windows = {text[start:start + 2] for start in range(len(text) - 1)}
    empty_boxes = set(most_common_bigrams) - windows

    return 0 if len(empty_boxes) > limit else 1


### Structure criteria

In [91]:
def structure_criteria(text):
    """Compare how well ``text`` compresses against random text of the same length.

    Both ``text`` and a random string of ``len(text)`` letters are
    zlib-compressed as UTF-8. The coefficient is the letter count divided by
    the compressed byte count.

    Returns:
        0 if the two coefficients differ by less than 0.25, else 1.
    """
    text_len = len(text)
    # The random reference must have the same number of letters as the text;
    # passing ``text`` itself as the size made np.random.randint raise.
    random_text = mono_uniform_distortion(text_len)

    random_coef = text_len / len(zlib.compress(random_text.encode('utf-8')))
    real_coef = text_len / len(zlib.compress(text.encode('utf-8')))

    return 0 if abs(random_coef - real_coef) < 0.25 else 1


# Distortion texts

## Monograms

In [92]:
# Vigenere keys of period 1, 5 and 10 over the letter alphabet.
viginere_key_1 = generate_random_lgram(alphabet_length)
viginere_key_5 = generate_random_lgram(alphabet_length, 5)
viginere_key_10 = generate_random_lgram(alphabet_length, 10)

# Affine key (a, b). x -> (a*x + b) mod m is invertible only when
# gcd(a, m) == 1, so redraw until the multiplier is coprime with the alphabet
# size. For the 32-letter alphabet this is the same as requiring an odd
# multiplier, but unlike a parity test it stays correct for any alphabet size.
affine_key = generate_random_lgram(alphabet_length, 2)

while math.gcd(int(affine_key[0]), alphabet_length) != 1:
    affine_key = generate_random_lgram(alphabet_length, 2)

# Distorted counterparts of the 10-letter fragments, one list per distortion.
# 'random' and 'reccurent' are generated from scratch, one per fragment.
text_10_mono = {
    'viginere_1': [mono_vigenere_cipher(fragment, viginere_key_1) for fragment in texts_10],
    'viginere_5': [mono_vigenere_cipher(fragment, viginere_key_5) for fragment in texts_10],
    'viginere_10': [mono_vigenere_cipher(fragment, viginere_key_10) for fragment in texts_10],
    'affine': [mono_affine_cipher(fragment, affine_key) for fragment in texts_10],
    'random': [mono_uniform_distortion(10) for _ in texts_10],
    'reccurent': [mono_recurrent_sequence(10) for _ in texts_10],
}

print('finish 10')

# Distorted counterparts of the 100-letter fragments, one list per distortion.
# 'random' and 'reccurent' are generated from scratch, one per fragment.
text_100_mono = {
    'viginere_1': [mono_vigenere_cipher(fragment, viginere_key_1) for fragment in texts_100],
    'viginere_5': [mono_vigenere_cipher(fragment, viginere_key_5) for fragment in texts_100],
    'viginere_10': [mono_vigenere_cipher(fragment, viginere_key_10) for fragment in texts_100],
    'affine': [mono_affine_cipher(fragment, affine_key) for fragment in texts_100],
    'random': [mono_uniform_distortion(100) for _ in texts_100],
    'reccurent': [mono_recurrent_sequence(100) for _ in texts_100],
}

print('finish 100')

# Distorted counterparts of the 1000-letter fragments, one list per distortion.
# 'random' and 'reccurent' are generated from scratch, one per fragment.
# Every ciphered entry iterates texts_1000; 'viginere_5' previously iterated
# texts_10000, which put 10000-letter texts into this table.
text_1000_mono = {
    'viginere_1': [mono_vigenere_cipher(fragment, viginere_key_1) for fragment in texts_1000],
    'viginere_5': [mono_vigenere_cipher(fragment, viginere_key_5) for fragment in texts_1000],
    'viginere_10': [mono_vigenere_cipher(fragment, viginere_key_10) for fragment in texts_1000],
    'affine': [mono_affine_cipher(fragment, affine_key) for fragment in texts_1000],
    'random': [mono_uniform_distortion(1000) for _ in texts_1000],
    'reccurent': [mono_recurrent_sequence(1000) for _ in texts_1000],
}

print('finish 1000')

# Distorted counterparts of the 10000-letter fragments, one list per distortion.
# 'random' and 'reccurent' are generated from scratch, one per fragment.
text_10000_mono = {
    'viginere_1': [mono_vigenere_cipher(fragment, viginere_key_1) for fragment in texts_10000],
    'viginere_5': [mono_vigenere_cipher(fragment, viginere_key_5) for fragment in texts_10000],
    'viginere_10': [mono_vigenere_cipher(fragment, viginere_key_10) for fragment in texts_10000],
    'affine': [mono_affine_cipher(fragment, affine_key) for fragment in texts_10000],
    'random': [mono_uniform_distortion(10000) for _ in texts_10000],
    'reccurent': [mono_recurrent_sequence(10000) for _ in texts_10000],
}

print('finish 10000')

finish 10
finish 100
finish 1000
finish 10000


## Bigrams

In [93]:
# Vigenere keys of period 1, 5 and 10 over the bigram alphabet. These rebind
# the names used for the monogram keys.
viginere_key_1 = generate_random_lgram(all_bigrams_length)
viginere_key_5 = generate_random_lgram(all_bigrams_length, 5)
viginere_key_10 = generate_random_lgram(all_bigrams_length, 10)

# Affine key (a, b). x -> (a*x + b) mod m is invertible only when
# gcd(a, m) == 1, so redraw until the multiplier is coprime with the number of
# bigrams. For 32 * 32 bigrams this is the same as requiring an odd
# multiplier, but unlike a parity test it stays correct for any alphabet size.
affine_key = generate_random_lgram(all_bigrams_length, 2)

while math.gcd(int(affine_key[0]), all_bigrams_length) != 1:
    affine_key = generate_random_lgram(all_bigrams_length, 2)

# Bigram-level distortions of the 10-letter fragments, one list per distortion.
# bi_uniform_distortion takes a number of bigrams, so 10 // 2 bigrams are
# requested to get 10-letter random texts like every other entry (passing 10
# produced 20-letter texts).
text_10_bi = {
    'viginere_1': [bi_vigenere_cipher(fragment, viginere_key_1) for fragment in texts_10],
    'viginere_5': [bi_vigenere_cipher(fragment, viginere_key_5) for fragment in texts_10],
    'viginere_10': [bi_vigenere_cipher(fragment, viginere_key_10) for fragment in texts_10],
    'affine': [bi_affine_cipher(fragment, affine_key) for fragment in texts_10],
    'random': [bi_uniform_distortion(10 // 2) for _ in texts_10],
    'reccurent': [bi_recurrent_sequence(10) for _ in texts_10],
}

# Bigram-level distortions of the 100-letter fragments, one list per distortion.
# bi_uniform_distortion takes a number of bigrams, so 100 // 2 bigrams are
# requested to get 100-letter random texts like every other entry (passing 100
# produced 200-letter texts).
text_100_bi = {
    'viginere_1': [bi_vigenere_cipher(fragment, viginere_key_1) for fragment in texts_100],
    'viginere_5': [bi_vigenere_cipher(fragment, viginere_key_5) for fragment in texts_100],
    'viginere_10': [bi_vigenere_cipher(fragment, viginere_key_10) for fragment in texts_100],
    'affine': [bi_affine_cipher(fragment, affine_key) for fragment in texts_100],
    'random': [bi_uniform_distortion(100 // 2) for _ in texts_100],
    'reccurent': [bi_recurrent_sequence(100) for _ in texts_100],
}

# Bigram-level distortions of the 1000-letter fragments, one list per distortion.
# bi_uniform_distortion takes a number of bigrams, so 1000 // 2 bigrams are
# requested to get 1000-letter random texts like every other entry (passing
# 1000 produced 2000-letter texts).
text_1000_bi = {
    'viginere_1': [bi_vigenere_cipher(fragment, viginere_key_1) for fragment in texts_1000],
    'viginere_5': [bi_vigenere_cipher(fragment, viginere_key_5) for fragment in texts_1000],
    'viginere_10': [bi_vigenere_cipher(fragment, viginere_key_10) for fragment in texts_1000],
    'affine': [bi_affine_cipher(fragment, affine_key) for fragment in texts_1000],
    'random': [bi_uniform_distortion(1000 // 2) for _ in texts_1000],
    'reccurent': [bi_recurrent_sequence(1000) for _ in texts_1000],
}

# Bigram-level distortions of the 10000-letter fragments, one list per distortion.
# bi_uniform_distortion takes a number of bigrams, so 10000 // 2 bigrams are
# requested to get 10000-letter random texts like every other entry (passing
# 10000 produced 20000-letter texts).
text_10000_bi = {
    'viginere_1': [bi_vigenere_cipher(fragment, viginere_key_1) for fragment in texts_10000],
    'viginere_5': [bi_vigenere_cipher(fragment, viginere_key_5) for fragment in texts_10000],
    'viginere_10': [bi_vigenere_cipher(fragment, viginere_key_10) for fragment in texts_10000],
    'affine': [bi_affine_cipher(fragment, affine_key) for fragment in texts_10000],
    'random': [bi_uniform_distortion(10000 // 2) for _ in texts_10000],
    'reccurent': [bi_recurrent_sequence(10000) for _ in texts_10000],
}


finish 10
finish 100
finish 1000
finish 10000


# Criteria 1.0

## Monogram

### L = 10

### L = 100

### L = 1000

### L = 10000

## Bigram

### L = 10

### L = 100

### L = 1000

### L = 10000

# Criteria 1.1

## Monogram

### L = 10

### L = 100

### L = 1000

### L = 10000

## Bigram

### L = 10

### L = 100

### L = 1000

### L = 10000

# Criteria 1.2

## Monogram

### L = 10

### L = 100

### L = 1000

### L = 10000

## Bigram

### L = 10

### L = 100

### L = 1000

### L = 10000

# Criteria 1.3

## Monogram

### L = 10

### L = 100

### L = 1000

### L = 10000

## Bigram

### L = 10

### L = 100

### L = 1000

### L = 10000

# Criteria 3.0

## Monogram

### L = 10

### L = 100

### L = 1000

### L = 10000

## Bigram

### L = 10

### L = 100

### L = 1000

### L = 10000

# Criteria 5.1

## Monogram

### L = 10

### L = 100

### L = 1000

### L = 10000

## Bigram

### L = 10

### L = 100

### L = 1000

### L = 10000

# Criteria ?.?

## Monogram

### L = 10

### L = 100

### L = 1000

### L = 10000

## Bigram

### L = 10

### L = 100

### L = 1000

### L = 10000

# Criteria 1.0

## Monogram

### L = 10

### L = 100

### L = 1000

### L = 10000

## Bigram

### L = 10

### L = 100

### L = 1000

### L = 10000