In [28]:
import numpy as np
import re, textwrap
import math as m
import random

from collections import Counter
import nltk

import sys
import gzip
from base64 import b64encode, b64decode

**1. Предварительная обработка текста**

In [2]:
with open("Dreiser.txt", 'rb') as file: 
    TEXT = file.read().decode('utf-8')
    
pattern_1 = r'ґ'
pattern_2 = r'[^абвгдеєжзиіїйклмнопрстуфхцчшщьюя]'

TEXT = re.sub(pattern_1, 'г', TEXT.lower())
TEXT = re.sub(pattern_2, '', TEXT)

**2. Расчёт статистических данных текста**

In [4]:
ukr_clean_alphabet = 'абвгдеєжзиіїйклмнопрстуфхцчшщьюя'

In [5]:
def letter_counter(text):
    alphabet = dict.fromkeys(list(ukr_clean_alphabet), 0)
    
    for letter in text:
        alphabet[letter] += 1
    
    return alphabet

In [6]:
letter_counter(TEXT)

{'а': 115516,
 'б': 28941,
 'в': 84841,
 'г': 24462,
 'д': 52838,
 'е': 74034,
 'є': 5133,
 'ж': 15228,
 'з': 33899,
 'и': 87619,
 'і': 84161,
 'ї': 10210,
 'й': 23372,
 'к': 45594,
 'л': 52192,
 'м': 43822,
 'н': 90412,
 'о': 137885,
 'п': 39320,
 'р': 57405,
 'с': 56137,
 'т': 70682,
 'у': 48930,
 'ф': 2513,
 'х': 13910,
 'ц': 10704,
 'ч': 20649,
 'ш': 11445,
 'щ': 10298,
 'ь': 21038,
 'ю': 11669,
 'я': 32091}

In [7]:
N = 1416950

1) Частота букв в тексте

In [8]:
def dictionary_sorting(dictionary):
    return [(i, round(dictionary[i]/N, 5)) for i in sorted(dictionary.keys(), key=dictionary.get, reverse=True)]

dictionary_sorting(letter_counter(TEXT))

[('о', 0.09731),
 ('а', 0.08152),
 ('н', 0.06381),
 ('и', 0.06184),
 ('в', 0.05988),
 ('і', 0.0594),
 ('е', 0.05225),
 ('т', 0.04988),
 ('р', 0.04051),
 ('с', 0.03962),
 ('д', 0.03729),
 ('л', 0.03683),
 ('у', 0.03453),
 ('к', 0.03218),
 ('м', 0.03093),
 ('п', 0.02775),
 ('з', 0.02392),
 ('я', 0.02265),
 ('б', 0.02042),
 ('г', 0.01726),
 ('й', 0.01649),
 ('ь', 0.01485),
 ('ч', 0.01457),
 ('ж', 0.01075),
 ('х', 0.00982),
 ('ю', 0.00824),
 ('ш', 0.00808),
 ('ц', 0.00755),
 ('щ', 0.00727),
 ('ї', 0.00721),
 ('є', 0.00362),
 ('ф', 0.00177)]

In [9]:
def letter_frequency(dictionary):
    return [(i, round(dictionary[i]/N, 5)) for i in dictionary.keys()]

letter_frequency(letter_counter(TEXT))

[('а', 0.08152),
 ('б', 0.02042),
 ('в', 0.05988),
 ('г', 0.01726),
 ('д', 0.03729),
 ('е', 0.05225),
 ('є', 0.00362),
 ('ж', 0.01075),
 ('з', 0.02392),
 ('и', 0.06184),
 ('і', 0.0594),
 ('ї', 0.00721),
 ('й', 0.01649),
 ('к', 0.03218),
 ('л', 0.03683),
 ('м', 0.03093),
 ('н', 0.06381),
 ('о', 0.09731),
 ('п', 0.02775),
 ('р', 0.04051),
 ('с', 0.03962),
 ('т', 0.04988),
 ('у', 0.03453),
 ('ф', 0.00177),
 ('х', 0.00982),
 ('ц', 0.00755),
 ('ч', 0.01457),
 ('ш', 0.00808),
 ('щ', 0.00727),
 ('ь', 0.01485),
 ('ю', 0.00824),
 ('я', 0.02265)]

2) Частоты биграмм в тексте

In [10]:
def bigrams_counter(text):
    cnt = Counter(list(nltk.ngrams(text, n=2)))
    sort_dict = sorted(dict(cnt).items(), key = lambda x: x[1], reverse = True)
    
    return sort_dict

bigrams_counter(TEXT)

[(('н', 'а'), 19855),
 (('о', 'в'), 16670),
 (('в', 'і'), 15719),
 (('н', 'е'), 15150),
 (('т', 'и'), 15028),
 (('п', 'о'), 14106),
 (('р', 'о'), 13916),
 (('а', 'в'), 13369),
 (('г', 'о'), 12906),
 (('е', 'р'), 12255),
 (('с', 'т'), 12069),
 (('о', 'г'), 12050),
 (('л', 'а'), 11945),
 (('л', 'и'), 11648),
 (('т', 'а'), 11432),
 (('і', 'н'), 11229),
 (('в', 'и'), 10998),
 (('о', 'н'), 10875),
 (('в', 'о'), 10742),
 (('о', 'м'), 10737),
 (('и', 'в'), 10702),
 (('в', 'а'), 10571),
 (('п', 'р'), 10526),
 (('а', 'т'), 10353),
 (('а', 'л'), 10295),
 (('н', 'о'), 10004),
 (('н', 'і'), 9966),
 (('р', 'а'), 9660),
 (('е', 'н'), 9624),
 (('о', 'д'), 9594),
 (('о', 'б'), 9551),
 (('і', 'д'), 9465),
 (('д', 'о'), 9339),
 (('і', 'в'), 8883),
 (('к', 'о'), 8864),
 (('о', 'с'), 8803),
 (('и', 'н'), 8754),
 (('н', 'и'), 8679),
 (('з', 'а'), 8646),
 (('а', 'н'), 8513),
 (('и', 'т'), 8497),
 (('т', 'о'), 8434),
 (('с', 'я'), 8251),
 (('щ', 'о'), 7913),
 (('у', 'в'), 7758),
 (('и', 'с'), 7744),
 (('я', 

3) H1, H2

In [11]:
def entropy_letter(text):
    x = len(letter_frequency(letter_counter(text)))
    H1 = 0
    for i in range(x):
        H1 += (letter_frequency(letter_counter(text))[i][1] * m.log2(letter_frequency(letter_counter(text))[i][1])) * (-1)
    
    return H1
        
def entropy_bigrams(text):
    l = []
    for i in range(941):
        l.append(bigrams_counter(text)[i][1])
    
    x = 941 
    N_bigrams = 1416949
    
    H2 = 0
    for i in range(x):
        H2 += (((l[i]) / N_bigrams) * m.log2(((l[i]) / N_bigrams))) * (-1)
    
    return H2 * 0.5

In [12]:
entropy_letter(TEXT)

4.592872413230112

In [13]:
entropy_bigrams(TEXT)

4.216828505470758

4) I1, I2

In [14]:
def conformity_index_letter(text):
    L = len(text)
    return (1/(L * (L-1))) * sum(letter_counter(text)[i] * (letter_counter(text)[i] - 1) for i in ukr_clean_alphabet)

def conformity_index_bigrams(text):
    L = len(text)
    
    l = []
    for i in range(941):
        l.append(bigrams_counter(text)[i][1])
    
    I2 = 0
    for i in range(941):
        I2 += l[i] * (l[i] - 1)
    
    return I2 / (L * (L - 1))

In [15]:
conformity_index_letter(TEXT)

0.04928235830845431

In [16]:
conformity_index_bigrams(TEXT)

0.004381493632890787

**3. Вспомогательные функции**

In [17]:
#char ---> int
def text_to_array_of_num(text):
    return [ukr_clean_alphabet.find(i) for i in text]

#int ---> char
def array_of_numbers_to_text(num):
    return ''.join([ukr_clean_alphabet[i] for i in num])

**4. Генерирование текстов**

In [18]:
def generate_text_samples(text, L, N):
    X = []
    end_of_random_number = len(text) - L - 1
    
    for i in range(N):
        pos_l = []
        position = np.random.randint(end_of_random_number)
        pos_l.append(position)
        pos_l.append(position + L)
        X.append(pos_l)
    
    text_list = []
    for i in range(len(X)):
        text_list.append(text[X[i][0]:X[i][1]])
    
    return text_list

In [87]:
texts_10_10000 = []
texts_10_10000  = generate_text_samples(TEXT, 10, 10000)

In [23]:
texts_100_10000 = []
texts_100_10000  = generate_text_samples(TEXT, 100, 10000)

In [24]:
texts_1000_10000 = []
texts_1000_10000 = generate_text_samples(TEXT, 1000, 10000)

In [25]:
texts_10000_1000 = []
texts_10000_1000  = generate_text_samples(TEXT, 10000, 1000)

**5. Изменение текста**

1) l = 1 (функции для монограмм)

- VIGENERE_encryption_letter (r = 1, 5, 10) +
- Afine_encryption_letter +
- uniform_letter +
- s_letter +

In [26]:
def mod(x, y, z):
    return (x + y) % z

def random_key(length):
    letters = 'абвгдеєжзиіїйклмнопрстуфхцчшщьюя'
    rand_string = ''.join(random.choice(letters) for i in range(length))
    
    return rand_string

def key_initialization():
    temp = []
    
    k1 = random_key(1)
    k5 = random_key(5)
    k10 = random_key(10)
    
    temp.append(text_to_array_of_num(k1))
    temp.append(text_to_array_of_num(k5))
    temp.append(text_to_array_of_num(k10))
                
    return temp

In [29]:
key_initialization()

[[9], [11, 20, 24, 11, 6], [21, 19, 4, 1, 9, 2, 20, 24, 6, 13]]

In [30]:
def VIGENERE_encryption_letter(text, key):
    text = text_to_array_of_num(text)
    cipher = []
    
    for i in range(len(text)):
        cipher.append(mod(text[i], key[i % len(key)], 32))
    
    return array_of_numbers_to_text(cipher)    

In [31]:
VIGENERE_encryption_letter(texts_100_10000[0], key_initialization()[0])

'кужзаькьлбдеяжбпєялкємлзбрпчьяплюролйблюжаюжпвнкрьояжифвйлбькяжкяжечйвеолюлшяжкмнлоплоіьдвілйрквюрбч'

In [32]:
def Afine_letter(text):
    encrypt_text = []
    a = np.random.randint(32)
    b = np.random.randint(32)
    
    new_text = text_to_array_of_num(text)
    
    for i in new_text:
        encrypt_text.append(mod(a * i, b, 32))
   
    return array_of_numbers_to_text(encrypt_text)

In [33]:
Afine_letter('апрлвлг')

'ахсзхзс'

In [34]:
def uniform_letter(L):
    list_of_rand = []
    for i in range(L):
        list_of_rand.append(np.random.randint(32))
    
    return array_of_numbers_to_text(list_of_rand)

In [36]:
uniform_letter(100)

'ьрубтсещхуіффахнньтиаіфрмюеурїїнчжчиившьіивжвоумюяикгбикптощмяййьянечтцлкиечлєфеоміофуьжнвеьвілифйдї'

In [37]:
def s_letter(L):
    m = 32
    s0 = np.random.randint(32)
    s1 = np.random.randint(32)
    
    Y = [s0, s1] + [0] * (L - 2)
    
    for i in range(2, L):
        Y[i] = (Y[i-1] + Y[i-2]) % m 
    
    return array_of_numbers_to_text(Y)

In [38]:
s_letter(100)

'фхмжуьрнгруиязжмуешашшуожхяфукгнргуцмзфяутїаїїубфхмжуьрнгруиязжмуешашшуожхяфукгнргуцмзфяутїаїїубфхмж'

2) l = 2 (функции для биграмм) 

- VIGENERE_encryption_bigram (r = 1, 5, 10)
- Afine_bigram
- uniform_bigram
- s_bigram

**6. Реализация критериев (частые l-граммы)**

1) l = 1

- criterion_20:
> 1. criterion_20_vigenere_r1_letter +
> 2. criterion_20_vigenere_r5_letter +
> 3. criterion_20_vigenere_r10_letter +
> 4. criterion_20_afine_letter +
> 5. criterion_20_uniform_letter +
> 6. criterion_20_s_letter

- criterion_21:
> 1. criterion_21_vigenere_r1_letter
> 2. criterion_21_vigenere_r5_letter
> 3. criterion_21_vigenere_r10_letter
> 4. criterion_21_afine_letter
> 5. criterion_21_uniform_letter
> 6. criterion_21_s_letter

- criterion_22:
> 1. criterion_22_vigenere_r1_letter
> 2. criterion_22_vigenere_r5_letter
> 3. criterion_22_vigenere_r10_letter
> 4. criterion_22_afine_letter
> 5. criterion_22_uniform_letter
> 6. criterion_22_s_letter

- criterion_23:
> 1. criterion_23_vigenere_r1_letter
> 2. criterion_23_vigenere_r5_letter
> 3. criterion_23_vigenere_r10_letter
> 4. criterion_23_afine_letter
> 5. criterion_23_uniform_letter
> 6. criterion_23_s_letter

- criterion_conformity
> 1. criterion_conformity_vigenere_r1_letter
> 2. criterion_conformity_vigenere_r5_letter
> 3. criterion_conformity_vigenere_r10_letter
> 4. criterion_conformity_afine_letter
> 5. criterion_conformity_uniform_letter
> 6. criterion_conformity_s_letter

- criterion_structural
> 1. criterion_structural_vigenere_r1_letter
> 2. criterion_structural_vigenere_r5_letter
> 3. criterion_structural_vigenere_r10_letter
> 4. criterion_structural_afine_letter
> 5. criterion_structural_uniform_letter
> 6. criterion_structural_s_letter

**----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

In [40]:
A = ['о','а', 'н', 'и', 'в', 'і', 'е', 'т', 'р', 'с', 'д', 'л', 'у', 'к', 'м', 'п', 'з', 'я', 'б', 
         'г', 'й', 'ь', 'ч', 'ж', 'х', 'ю', 'ш','ц', 'щ', 'ї', 'є', 'ф']

criterion_20(1)

In [100]:
def criterion_20_vigenere_r1_letter(open_texts, A_frq_size):
    FP = 0
    FN = 0
    
    H1 = 0
    H0 = 0
    
    A_frq = A[:A_frq_size]
    
    text_corrupt = []
    for i in open_texts:
        l = []
        l.append(VIGENERE_encryption_letter(i, key_initialization()[0]))
        text_corrupt.append(l)
    
    for i in open_texts:
        tmp = list(i)
        if len(list(set(tmp) & set(A_frq))) == len(A_frq):
            H0 += 1
    
    for i in text_corrupt:
        tmp = list(i[0])
        if len(list(set(tmp) & set(A_frq))) != len(A_frq):
            H1 += 1
    
    FP = H1 / len(open_texts)
    FN = H0 / len(open_texts)
    
    return FP, FN

In [101]:
criterion_20_vigenere_r1_letter(texts_10_10000, 7)

(1.0, 0.0013)

In [102]:
criterion_20_vigenere_r1_letter(texts_100_10000, 15)

(0.9422, 0.8131)

In [103]:
criterion_20_vigenere_r1_letter(texts_1000_10000, 12)

(0.1323, 1.0)

In [104]:
criterion_20_vigenere_r1_letter(texts_10000_1000, 11)

(0.0, 1.0)

criterion_20(2)

In [105]:
def criterion_20_vigenere_r5_letter(open_texts, A_frq_size):
    FP = 0
    FN = 0
    
    H1 = 0
    H0 = 0
    
    A_frq = A[:A_frq_size]
    
    text_corrupt = []
    for i in open_texts:
        l = []
        l.append(VIGENERE_encryption_letter(i, key_initialization()[1]))
        text_corrupt.append(l)
    
    for i in open_texts:
        tmp = list(i)
        if len(list(set(tmp) & set(A_frq))) == len(A_frq):
            H0 += 1
    
    for i in text_corrupt:
        tmp = list(i[0])
        if len(list(set(tmp) & set(A_frq))) != len(A_frq):
            H1 += 1
    
    FP = H1 / len(open_texts)
    FN = H0 / len(open_texts)
    
    return FP, FN

In [106]:
criterion_20_vigenere_r5_letter(texts_10_10000, 7)

(1.0, 0.0013)

In [107]:
criterion_20_vigenere_r5_letter(texts_100_10000, 7)

(0.4045, 0.9852)

In [108]:
criterion_20_vigenere_r5_letter(texts_1000_10000, 10)

(0.0, 1.0)

In [109]:
criterion_20_vigenere_r5_letter(texts_10000_1000, 11)

(0.0, 1.0)

criterion_20(3)

In [110]:
def criterion_20_vigenere_r10_letter(open_texts, A_frq_size):
    FP = 0
    FN = 0
    
    H1 = 0
    H0 = 0
    
    A_frq = A[:A_frq_size]
    
    text_corrupt = []
    for i in open_texts:
        l = []
        l.append(VIGENERE_encryption_letter(i, key_initialization()[2]))
        text_corrupt.append(l)
    
    for i in open_texts:
        tmp = list(i)
        if len(list(set(tmp) & set(A_frq))) == len(A_frq):
            H0 += 1
    
    for i in text_corrupt:
        tmp = list(i[0])
        if len(list(set(tmp) & set(A_frq))) != len(A_frq):
            H1 += 1
    
    FP = H1 / len(open_texts)
    FN = H0 / len(open_texts)
    
    return FP, FN

In [111]:
criterion_20_vigenere_r10_letter(texts_10_10000, 7)

(1.0, 0.0013)

In [112]:
criterion_20_vigenere_r10_letter(texts_100_10000, 16)

(0.601, 0.7709)

In [113]:
criterion_20_vigenere_r10_letter(texts_1000_10000, 20)

(0.0, 1.0)

In [114]:
criterion_20_vigenere_r10_letter(texts_10000_1000, 30)

(0.0, 1.0)

criterion_20(4)

In [115]:
def criterion_20_afine_letter(open_texts, A_frq_size):
    FP = 0
    FN = 0
    
    H1 = 0
    H0 = 0
    
    A_frq = A[:A_frq_size]
    
    text_corrupt = []
    for i in open_texts:
        l = []
        l.append(Afine_letter(i))
        text_corrupt.append(l)
    
    for i in open_texts:
        tmp = list(i)
        if len(list(set(tmp) & set(A_frq))) == len(A_frq):
            H0 += 1
    
    for i in text_corrupt:
        tmp = list(i[0])
        if len(list(set(tmp) & set(A_frq))) != len(A_frq):
            H1 += 1
    
    FP = H1 / len(open_texts)
    FN = H0 / len(open_texts)
    
    return FP, FN

In [116]:
criterion_20_afine_letter(texts_10_10000, 6)

(1.0, 0.0077)

In [148]:
criterion_20_afine_letter(texts_100_10000, 9)

(0.9352, 0.9684)

In [118]:
criterion_20_afine_letter(texts_1000_10000, 25)

(0.6366, 0.999)

In [149]:
criterion_20_afine_letter(texts_10000_1000, 25)

(0.479, 1.0)

criterion_20(5)

In [120]:
def criterion_20_uniform_letter(open_texts, A_frq_size, L):
    FP = 0
    FN = 0
    
    H1 = 0
    H0 = 0
    
    A_frq = A[:A_frq_size]
    
    text_corrupt = []
    for i in range(len(open_texts)):
        l = []
        l.append(uniform_letter(L))
        text_corrupt.append(l)
    
    for i in open_texts:
        tmp = list(i)
        if len(list(set(tmp) & set(A_frq))) == len(A_frq):
            H0 += 1
    
    for i in text_corrupt:
        tmp = list(i[0])
        if len(list(set(tmp) & set(A_frq))) != len(A_frq):
            H1 += 1
    
    FP = H1 / len(open_texts)
    FN = H0 / len(open_texts)
    
    return FP, FN

In [121]:
criterion_20_uniform_letter(texts_10_10000, 6, 10)

(0.9998, 0.0077)

In [122]:
criterion_20_uniform_letter(texts_100_10000, 9, 100)

(0.3236, 0.9684)

In [123]:
criterion_20_uniform_letter(texts_1000_10000, 9, 1000)

(0.0, 1.0)

In [124]:
criterion_20_uniform_letter(texts_10000_1000, 9, 10000)

(0.0, 1.0)

criterion_20(6)

In [168]:
def criterion_20_s_letter(open_texts, A_frq_size, L):
    FP = 0
    FN = 0
    
    H1 = 0
    H0 = 0
    
    A_frq = A[:A_frq_size]
    
    text_corrupt = []
    for i in range(len(open_texts)):
        l = []
        l.append(s_letter(L))
        text_corrupt.append(l)
    
    for i in open_texts:
        tmp = list(i)
        if len(list(set(tmp) & set(A_frq))) == len(A_frq):
            H0 += 1
    
    for i in range(len(text_corrupt)):
        tmp = list(text_corrupt[i][0])
        if len(list(set(tmp) & set(A_frq))) != len(A_frq):
            H1 += 1
    
    FP = H1 / len(open_texts)
    FN = H0 / len(open_texts)
    
    return FP, FN

In [169]:
criterion_20_s_letter(texts_10_10000, 6, 10)

(1.0, 0.0077)

In [170]:
criterion_20_s_letter(texts_100_10000, 9, 100)

(1.0, 0.9684)

In [171]:
criterion_20_s_letter(texts_1000_10000, 10, 1000)

(1.0, 1.0)

In [172]:
criterion_20_s_letter(texts_10000_1000, 25, 10000)

(1.0, 1.0)