# Imports

In [None]:
import re
import numpy as np
import math
import zlib


# Read text

In [None]:
def read_text(path):
    """Return the whole content of the UTF-8 text file at ``path`` as one string."""
    with open(path, mode='r', encoding='utf-8') as source:
        return source.read()

def write_text(path, lines):
    """Create or overwrite the file at ``path`` with the string ``lines`` (UTF-8)."""
    with open(path, mode='w', encoding='utf-8') as sink:
        sink.write(lines)


# Corpus to analyse; swap the commented line in to use the other text.
path = '.\\texts\\roksolana.txt'
# path = '.\\texts\\tiger_catchers.txt'
text = read_text(path).lower()

# Drop every character outside the lower-case Cyrillic range plus і/ґ/є/ї,
# drop э/ы/ъ as well, then fold ґ into г so it is not counted separately.
formatted_text = re.sub(u'[^а-яіґєї]|э|ы|ъ', '', text).replace('ґ', 'г')
text_len = len(formatted_text)

# Keep the normalised corpus on disk next to the sources.
write_text('.\\texts\\formatted_text.txt', formatted_text)

# Gathering monogram and bigram statistics

In [None]:
# Letters that can occur in formatted_text, in the order used for indexing.
alphabet_str = 'абвгдеєжзиіїйклмнопрстуфхцчшщьюя'
alphabet_length = len(alphabet_str)

# Absolute number of occurrences of every letter, keyed in alphabet order.
alphabet_statistics = dict.fromkeys(alphabet_str, 0)
for symbol in formatted_text:
    alphabet_statistics[symbol] += 1

# Relative frequency of every letter in the corpus.
alphabet_frequencies = {
    symbol: occurrences / text_len
    for symbol, occurrences in alphabet_statistics.items()
}


In [None]:
# Every ordered pair of alphabet letters; position i * alphabet_length + j
# holds the pair (alphabet_str[i], alphabet_str[j]).
all_bigrams = [first + second for first in alphabet_str for second in alphabet_str]

# Occurrences of every overlapping bigram (stride 1) in the corpus.
bigrams_statistics = dict.fromkeys(all_bigrams, 0)
for left, right in zip(formatted_text, formatted_text[1:]):
    bigrams_statistics[left + right] += 1

# Relative frequencies; a text of n letters contains n - 1 overlapping bigrams.
bigrams_frequencies = {
    pair: occurrences / (text_len - 1)
    for pair, occurrences in bigrams_statistics.items()
}


## Entropy and Coincidence Index

In [None]:
# Entropy per letter estimated from monogram frequencies.  Letters that never
# occur contribute 0; without the guard 0 * log2(0) evaluates to NaN and
# poisons the whole sum (the bigram formula below already had this guard).
monogram_entropy = -1 * sum([
    alphabet_frequencies[letter] * np.log2(alphabet_frequencies[letter])
    if alphabet_frequencies[letter] != 0 else 0
    for letter in alphabet_str
])

# Entropy per letter estimated from bigram frequencies (hence the factor 1/2).
bigram_entropy = -0.5 * sum([
    bigrams_frequencies[bigram] * np.log2(bigrams_frequencies[bigram])
    if bigrams_frequencies[bigram] != 0 else 0
    for bigram in all_bigrams
])

# Sum of n_i * (n_i - 1) over all letters, divided by text_len * (text_len - 1).
coincidence_index = sum([
    alphabet_statistics[letter] * (alphabet_statistics[letter] - 1)
    for letter in alphabet_str
]) / text_len / (text_len - 1)

# Bare expression: shown as the cell output when run in a notebook.
bigram_entropy


## Generating texts

In [None]:
texts_count = 10000


def generate_texts(text_size, texts_count, step=100):
    """Cut ``texts_count`` samples of ``text_size`` letters out of ``formatted_text``.

    Sample number ``k`` starts at offset ``k * step``.

    Args:
        text_size: Length of every sample.
        texts_count: Number of samples to produce.
        step: Distance between the starts of consecutive samples.  Defaults
            to 100, the stride that used to be hard-coded.

    Returns:
        A list of ``texts_count`` strings.  Samples that reach past the end of
        ``formatted_text`` are shorter than ``text_size`` (possibly empty),
        because slicing never raises.
    """
    return [
        formatted_text[number * step:number * step + text_size]
        for number in range(texts_count)
    ]

# Sample sets of increasing length; only the longest set uses fewer samples.
texts_10 = generate_texts(text_size=10, texts_count=10000)
texts_100 = generate_texts(text_size=100, texts_count=10000)
texts_1000 = generate_texts(text_size=1000, texts_count=10000)
texts_10000 = generate_texts(text_size=10000, texts_count=1000)


# Distortion algorithms

## Vigenere cipher

In [None]:
def vigenere_cipher(text, key):
    """Encrypt ``text`` with the Vigenere cipher over ``alphabet_str``.

    Letter ``i`` of the text is shifted by the alphabet position of letter
    ``i mod len(key)`` of the key, modulo the alphabet size.

    Args:
        text: String consisting of letters of ``alphabet_str``.
        key: Non-empty string consisting of letters of ``alphabet_str``.

    Returns:
        The ciphered string, of the same length as ``text``.

    Raises:
        ValueError: If ``key`` is empty while ``text`` is not, or if ``text``
            or ``key`` contains a character outside the alphabet.
    """
    if not text:
        return ''
    if not key:
        # Previously surfaced as an opaque ZeroDivisionError from the modulo.
        raise ValueError('key must contain at least one letter')

    # Convert the key to shifts once instead of once per text letter.
    shifts = [alphabet_str.index(symbol) for symbol in key]
    key_length = len(shifts)

    return ''.join(
        alphabet_str[(alphabet_str.index(symbol) + shifts[position % key_length]) % alphabet_length]
        for position, symbol in enumerate(text)
    )


## Affine cipher

In [None]:
def mono_affine_cipher(text, key):
    """Apply the affine substitution ``x -> a * x + b`` to every letter of ``text``.

    ``key`` is the pair ``(a, b)``; ``x`` is the position of the letter in
    ``alphabet_str`` and the result is reduced modulo ``alphabet_length``.
    Returns the ciphered string; a letter outside the alphabet makes
    ``str.index`` raise ``ValueError``.
    """
    multiplier, offset = key
    return ''.join(
        alphabet_str[(multiplier * alphabet_str.index(symbol) + offset) % alphabet_length]
        for symbol in text
    )


def bi_affine_cipher(text, key):
    """Apply the affine substitution ``x -> a * x + b`` to the bigrams of ``text``.

    The text is split into consecutive non-overlapping pairs; ``x`` is the
    position of a pair in ``all_bigrams`` and the result is reduced modulo
    ``alphabet_length ** 2``.

    Args:
        text: String of alphabet letters.  A trailing unpaired letter is
            dropped, because the pattern ``..`` only matches complete pairs.
        key: The pair ``(a, b)``.

    Returns:
        The ciphered string (even length).

    Raises:
        ValueError: If a pair is not present in ``all_bigrams``.
    """
    a, b = key
    modulus = alphabet_length ** 2

    # The per-bigram debug print was removed: it wrote one line for every
    # bigram of every processed text.
    return ''.join(
        all_bigrams[(a * all_bigrams.index(pair) + b) % modulus]
        for pair in re.findall(r'..', text)
    )


## Uniform distortion

In [None]:
def mono_uniform_distortion(text):
    """Return ``len(text)`` letters drawn independently and uniformly from the alphabet.

    Only the length of ``text`` is used; randomness comes from the global
    NumPy generator (``np.random.randint``).
    """
    symbols = list(alphabet_statistics.keys())
    draws = np.random.randint(low=0, high=alphabet_length, size=len(text))
    return ''.join(symbols[int(draw)] for draw in draws)

def bi_uniform_distortion(text):
    """Return ``len(text) // 2`` bigrams drawn uniformly from ``all_bigrams``, concatenated.

    Only the length of ``text`` is used; randomness comes from the global
    NumPy generator (``np.random.randint``).
    """
    picks = np.random.randint(low=0, high=len(all_bigrams), size=len(text) // 2)
    return ''.join(all_bigrams[pick] for pick in picks)

## Recurrent sequence

In [None]:
def generate_random_lgram(high = alphabet_length):
    """Return a uniformly random integer from ``[0, high)`` as a plain ``int``.

    ``high`` defaults to the value ``alphabet_length`` had when this function
    was defined.
    """
    # Draw a scalar directly: converting a size-1 array to a scalar (what
    # math.floor did with the old size=1 result) is deprecated since
    # NumPy 1.25.
    return int(np.random.randint(low=0, high=high))

def mono_recurrent_sequence(text):
    """Build a letter sequence following ``s[i] = (s[i-1] + s[i-2]) mod alphabet_length``.

    The first two letters are random (``generate_random_lgram``); values are
    positions in ``alphabet_str``.  Only the length of ``text`` is used.  The
    result has ``len(text)`` letters, but never fewer than the two seeds.
    """
    positions = [generate_random_lgram(), generate_random_lgram()]

    for _ in range(2, len(text)):
        positions.append((positions[-1] + positions[-2]) % alphabet_length)

    return ''.join(alphabet_str[position] for position in positions)

def bi_recurrent_sequence(text):
    """Build a bigram sequence following ``s[i] = (s[i-1] + s[i-2]) mod alphabet_length ** 2``.

    The first two bigrams are random (``generate_random_lgram``); values are
    positions in ``all_bigrams``.  Only the length of ``text`` is used.  The
    result has ``len(text) // 2`` bigrams, but never fewer than the two seeds.
    """
    positions = [
        generate_random_lgram(alphabet_length ** 2),
        generate_random_lgram(alphabet_length ** 2),
    ]

    for _ in range(2, len(text) // 2):
        positions.append((positions[-1] + positions[-2]) % (alphabet_length ** 2))

    return ''.join(all_bigrams[position] for position in positions)


# Criteria

In [None]:
def get_prohibited_grams(grams, quartile):
    """Return the least frequent grams as a dict.

    ``grams`` maps gram -> count.  Entries are ordered by descending count and
    the first ``floor(quartile * len(grams))`` of them are discarded; the
    remaining tail is returned in that order.
    """
    ranked = sorted(grams.items(), key=lambda entry: entry[1], reverse=True)
    cutoff = math.floor(quartile * len(ranked))
    return dict(ranked[cutoff:])

# "Prohibited" grams are the rarest ones in the corpus: everything after the
# top 90% of letters and after the top 75% of bigrams, ranked by count.
prohibited_monograms = get_prohibited_grams(alphabet_statistics, 0.9)
prohibited_bigrams = get_prohibited_grams(bigrams_statistics, 0.75)


### Criteria 1.1 (criteria 1.0 in parameters)

In [None]:
def mono_criteria_1_1(text, k_prohibited):
    """Return 0 if ``text`` contains at least ``k_prohibited`` distinct prohibited letters, else 1."""
    hits = sum(1 for symbol in set(text) if symbol in prohibited_monograms)
    return 0 if hits >= k_prohibited else 1

def bi_criteria_1_1(text, k_prohibited):
    """Return 0 if ``text`` contains at least ``k_prohibited`` distinct prohibited bigrams, else 1.

    Bigrams are taken with overlap (stride 1).
    """
    distinct_pairs = {text[start:start + 2] for start in range(len(text) - 1)}
    hits = sum(1 for pair in distinct_pairs if pair in prohibited_bigrams)
    return 0 if hits >= k_prohibited else 1


### Criteria 1.2

In [None]:
def mono_prohibited_frequencies_1(text, limit):
    """Return 0 if any single prohibited letter has relative frequency >= ``limit`` in ``text``, else 1.

    Frequencies are counts divided by ``len(text)``.
    """
    counts = dict.fromkeys(prohibited_monograms, 0)
    for symbol in text:
        if symbol in counts:
            counts[symbol] += 1

    size = len(text)
    if any(count / size >= limit for count in counts.values()):
        return 0
    return 1

def bi_prohibited_frequencies_1(text, limit):
    """Return 0 if any single prohibited bigram has relative frequency >= ``limit``, else 1.

    Bigrams are taken with overlap (stride 1), so a text of ``n`` letters has
    ``n - 1`` of them and frequencies are counts divided by ``n - 1``.

    Args:
        text: String to test.
        limit: Frequency threshold.

    Returns:
        0 when some prohibited bigram reaches the threshold, otherwise 1.
        A text shorter than two letters has no bigrams, so every frequency
        is treated as 0 instead of dividing by zero.
    """
    counts = dict.fromkeys(prohibited_bigrams, 0)

    for start in range(len(text) - 1):
        bigram = text[start:start + 2]
        if bigram in counts:
            counts[bigram] += 1

    windows = len(text) - 1
    if windows <= 0:
        return 0 if counts and limit <= 0 else 1

    # The per-bigram debug print was removed; the monogram twin of this
    # function already had it commented out.
    if any(count / windows >= limit for count in counts.values()):
        return 0
    return 1


### Criteria 1.3

In [None]:
def mono_prohibited_frequencies_2(text, limit):
    """Return 0 if the combined share of prohibited letters in ``text`` exceeds ``limit``, else 1.

    The share is the number of letters of ``text`` that are prohibited,
    divided by ``len(text)``.
    """
    hits = sum(1 for symbol in text if symbol in prohibited_monograms)
    share = hits / len(text)
    return 0 if share > limit else 1

def bi_prohibited_frequencies_2(text, limit):
    """Return 0 if the combined share of prohibited bigrams in ``text`` exceeds ``limit``, else 1.

    Bigrams are taken with overlap (stride 1); the share is the number of
    prohibited ones divided by ``len(text) - 1``.
    """
    hits = sum(
        1
        for start in range(len(text) - 1)
        if text[start:start + 2] in prohibited_bigrams
    )
    share = hits / (len(text) - 1)
    return 0 if share > limit else 1

# Quick smoke check of criteria 1.3.  With limit=1 the summed share can never
# exceed the limit, so both calls print 1.
print(mono_prohibited_frequencies_2('вознесіння', 1))
print(bi_prohibited_frequencies_2('уцефвікеф', 1))

### Criteria 3.0

In [None]:
def get_monogram_distribution(text):
    """Return the relative frequency of every alphabet letter in ``text``.

    The result maps each letter of ``alphabet_str`` (in alphabet order) to its
    count divided by ``len(text)``.  A character outside the alphabet raises
    ``KeyError``.
    """
    tally = dict.fromkeys(alphabet_str, 0)
    for symbol in text:
        tally[symbol] += 1

    total = len(text)
    return {symbol: count / total for symbol, count in tally.items()}

def get_bigram_distribution(text):
    """Return the relative frequency of every bigram of ``all_bigrams`` in ``text``.

    Bigrams are taken with overlap (stride 1); counts are divided by
    ``len(text) - 1``.  A pair missing from ``all_bigrams`` raises ``KeyError``.
    """
    tally = dict.fromkeys(all_bigrams, 0)
    for left, right in zip(text, text[1:]):
        tally[left + right] += 1

    windows = len(text) - 1
    return {pair: count / windows for pair, count in tally.items()}

def get_specific_entropy(frequencies, l):
    """Return ``-sum(p * log2(p)) / l`` over the non-zero values of ``frequencies``.

    ``frequencies`` maps gram -> relative frequency; ``l`` is the divisor
    applied to every term (the callers in this file pass the gram length).
    """
    accumulated = 0

    for probability in frequencies.values():
        if probability == 0:
            # Zero-frequency grams contribute nothing; log2(0) is skipped.
            continue
        accumulated -= probability * np.log2(probability) / l

    return accumulated

def mono_criteria_3_0(text, limit):
    """Return 0 if the monogram entropy of ``text`` deviates from ``monogram_entropy`` by more than ``limit``, else 1."""
    observed = get_specific_entropy(get_monogram_distribution(text), 1)
    deviation = abs(monogram_entropy - observed)
    return 0 if deviation > limit else 1

def bi_criteria_3_0(text, limit):
    """Return 0 if the bigram entropy of ``text`` deviates from ``bigram_entropy`` by more than ``limit``, else 1."""
    observed = get_specific_entropy(get_bigram_distribution(text), 2)
    deviation = abs(bigram_entropy - observed)
    return 0 if deviation > limit else 1


### Criteria 5.0

In [None]:
def get_most_common_grams(grams, count):
    """Return the ``count`` grams with the highest counts, most frequent first.

    ``grams`` maps gram -> count.  Grams with equal counts keep their order
    from ``grams``.
    """
    ranked = sorted(grams.items(), key=lambda entry: entry[1], reverse=True)
    return [gram for gram, _ in ranked[:count]]


In [None]:
# "Boxes" for criteria 5.0: the most frequent letters and bigrams of the
# corpus, as lists ordered from most to least frequent.
most_common_monograms = get_most_common_grams(alphabet_statistics, 10)
most_common_bigrams_50 = get_most_common_grams(bigrams_statistics, 50)
most_common_bigrams_100 = get_most_common_grams(bigrams_statistics, 100)
most_common_bigrams_200 = get_most_common_grams(bigrams_statistics, 200)

def mono_empty_boxes(text, most_common_monograms, limit):
    """Return 0 if more than ``limit`` of the given letters never occur in ``text``, else 1.

    ``most_common_monograms`` is a collection of letters; duplicates are
    counted once.
    """
    seen = set(text)
    empty_boxes = sum(
        1 for monogram in dict.fromkeys(most_common_monograms) if monogram not in seen
    )
    return 0 if empty_boxes > limit else 1

def bi_empty_boxes(text, most_common_bigrams, limit):
    """Return 0 if more than ``limit`` of the given bigrams never occur in ``text``, else 1.

    Args:
        text: String to test; bigrams are taken with overlap (stride 1).
        most_common_bigrams: Iterable of bigrams (a list, or a dict whose keys
            are bigrams).  Duplicates are counted once.
        limit: Maximum tolerated number of absent bigrams.

    Returns:
        0 when the number of absent bigrams exceeds ``limit``, otherwise 1.
    """
    # The previous version called ``.key()`` / ``.keys()`` on the argument and
    # therefore raised AttributeError for the lists produced by
    # get_most_common_grams; plain iteration works for lists and dicts alike.
    present = {text[start:start + 2] for start in range(len(text) - 1)}
    empty_boxes = sum(
        1 for bigram in dict.fromkeys(most_common_bigrams) if bigram not in present
    )
    return 0 if empty_boxes > limit else 1


### Structure criteria

In [None]:
def structure_criteria(text):
    """Compare how well ``text`` compresses against random letters of the same length.

    For ``text`` and for a sample from ``mono_uniform_distortion(text)`` the
    ratio ``len(text) / compressed size in bytes`` (zlib, UTF-8) is computed
    and both ratios are printed.  Returns 0 if the ratios differ by less than
    0.25, otherwise 1.
    """
    size = len(text)
    noise = mono_uniform_distortion(text)

    def compression_ratio(sample):
        # Letters in the sample per byte of its zlib-compressed UTF-8 form.
        return size / len(zlib.compress(sample.encode('utf-8')))

    random_coef = compression_ratio(noise)
    real_coef = compression_ratio(text)

    print(real_coef, random_coef)

    return 0 if abs(random_coef - real_coef) < 0.25 else 1
