In [1]:
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import pandas as pd
import random
import bisect
import re
import numpy as np

from collections import defaultdict
from collections import Counter
import random
import sys
import pickle

sys.path.insert(0,'../decryption')
sys.path.insert(0,'../encryption')
sys.path.insert(0,'../dictionaries')

import encrypt
import decrypt
import alphabet
import frequency

_ALPHABET = " abcdefghijklmnopqrstuvwxyz"

In [2]:
# simulation of problem
TEST_KEY_MAPPING = encrypt.generate_key_mapping()
TEST_CHAR_MAPPING = encrypt.char_key_mapping_from_key_mapping(TEST_KEY_MAPPING)

assert len(set(TEST_KEY_MAPPING)) == 27

TEST_PLAIN_TEXTS = []
with open('../dictionaries/official_dictionary_1_cleaned.txt','r') as f:
    content = f.readlines()
    for line in content:
        TEST_PLAIN_TEXTS.append(line.strip())

TEST_PLAIN_TEXTS[3] += ' '
TEST_PROBABILITY = 0.1
ciphers = [encrypt.encrypt(msg,TEST_KEY_MAPPING,TEST_PROBABILITY) for msg in TEST_PLAIN_TEXTS]

# pick a random cipher and start working with it. (We should not know the original plain text)
r = random.randint(0,len(ciphers)-1)
cipher_txt = ciphers[r]
test_plain_text = TEST_PLAIN_TEXTS[r]

In [3]:
with open('../dictionaries/official_dictionary_2_cleaned.txt') as file:
    lines = file.readlines()

DICTIONARY_LIST = []
DICTIONARY_SET = set()

for line in lines:
    line = re.sub('[^A-Za-z0-9]+', '', line.strip())
    DICTIONARY_LIST.append(line)
    DICTIONARY_SET.add(line)

In [26]:
def generate_test_one_cipher(p):
    num_key_mapping = encrypt.generate_key_mapping()
    char_key_mapping = encrypt.char_key_mapping_from_key_mapping(num_key_mapping)
    r = random.randint(0,len(ciphers)-1)
    cipher = encrypt.encrypt(TEST_PLAIN_TEXTS[r],num_key_mapping,p)
    return TEST_PLAIN_TEXTS[r],cipher,char_key_mapping,num_key_mapping

def generate_test_two_cipher(p):
    num_key_mapping = encrypt.generate_key_mapping()
    char_key_mapping = encrypt.char_key_mapping_from_key_mapping(num_key_mapping)
    plain_txt = " ".join(random.choices(DICTIONARY_LIST,k=200))[:500]
    cipher = encrypt.encrypt(plain_txt,num_key_mapping,p)
    return plain_txt,cipher,char_key_mapping,num_key_mapping
    

In [27]:
plain_txt,cipher,char_key_mapping,num_key_mapping = generate_test_two_cipher(0.2)

In [28]:
def build_rel_dist(text):
    rel_dist = defaultdict(list)
    rel_num = defaultdict(list)
    for j,c in enumerate(text):
        rel_dist[c].append((j/len(text)))
        rel_num[c].append(j)
    return rel_dist,rel_num

def get_diff(arr):
    diff = []
    for i in range(1,len(arr)):
        diff.append(round(arr[i]-arr[i-1],4))
    return diff

def get_char_diffs_data(char_rel_num,rel_num,l):
    left = []
    right = []
    avg_num_diff = []
    for i,num in enumerate(rel_num):
        char_closest_right = bisect.bisect_left(char_rel_num,num)
        char_closest_left = char_closest_right-1
        if char_closest_left == -1:
            lo = 0
        else:
            lo = char_rel_num[char_closest_left]
        if char_closest_right == len(char_rel_num):
            hi = l
        else:
            hi = char_rel_num[char_closest_right]
        left.append(num-lo)
        right.append(hi-num)
        avg_num_diff.append(right[-1] - left[-1])
        
    return left,right,avg_num_diff

def append(data,df):
    l = len(df)
    for k,v in data.items():
        df.loc[l,k] = v
        
def populate_dist_data(dist,prefix,data = dict()):
    if len(dist) == 0:
        raise Exception()
    data[prefix + '_mean'] = np.mean(dist)
    data[prefix + '_std'] = np.std(dist)
    
    max_moments = 3
    for i in range(2,max_moments+1):
        data[prefix+str(i)+'_num_moment'] = stats.moment(dist,i)
    
    return data


# features: space character frequency, last character frequency, difference from 500
# stats of space char freq diff, last char freq diff
# stats of char diffs data of last char and space char
# covariance of space char and last char data

def get_test_diff_data(c_rel_dist,c_rel_num,c_rel_num_diff,c_rel_dist_diff,space_data_c,last_char_mapping):
    data = dict()
    data['space_char_freq'] = len(c_rel_num[space_char])
    data['last_char_freq'] = len(c_rel_num[last_char_mapping])
    data['diff'] = len(cipher) - 500

    populate_dist_data(c_rel_num_diff[space_char],'space_diff',data)
    populate_dist_data(c_rel_num_diff[last_char_mapping],'last_char_diff',data)

    l = min(len(c_rel_num[space_char]),len(c_rel_num[last_char_mapping]))
    if l>1:
        data['last_space_num_first_cov'] = np.cov(c_rel_num[space_char][:l],c_rel_num[last_char_mapping][:l])[0][1]
    return data
    

In [45]:
# get distributions for cipher
c_rel_dist,c_rel_num = build_rel_dist(cipher)
c_rel_num_diff = defaultdict(list,{k:get_diff(v) for k,v in c_rel_num.items()})
c_rel_dist_diff = defaultdict(list,{k:get_diff(v) for k,v in c_rel_dist.items()})

space_char = decrypt.get_space_key_value(cipher)
space_data_c = defaultdict(list,{c:get_char_diffs_data(c_rel_num[space_char],c_rel_num[c],len(cipher)) for c in _ALPHABET})

last_char_mapping = cipher[-1]
last_char_data_c = defaultdict(list,{c:get_char_diffs_data(c_rel_num[last_char_mapping],c_rel_num[c],len(cipher)) for c in _ALPHABET})
    

In [13]:
cols = [
    'space_char_freq',
     'last_char_freq',
     'diff',
     'space_diff_mean',
     'space_diff_std',
     'space_diff2_num_moment',
     'space_diff3_num_moment',
     'last_char_diff_mean',
     'last_char_diff_std',
     'last_char_diff2_num_moment',
     'last_char_diff3_num_moment',
     'last_space_num_first_cov',
        'result'
]

df = pd.DataFrame(columns = cols)

In [34]:
def iter_tests(p,num):
    for _ in range(num):
        plain_txt,cipher,char_key_mapping,num_key_mapping = generate_test_one_cipher(p)
        yield plain_txt,cipher,char_key_mapping,num_key_mapping,True
        
        plain_txt,cipher,char_key_mapping,num_key_mapping = generate_test_two_cipher(p)
        yield plain_txt,cipher,char_key_mapping,num_key_mapping,False

def iter_prob_tests(pmin,pmax,step,num):
    for prob in range(pmin,pmax+1,step):
        print('generating for prob',prob)
        for plain_txt,cipher,char_key_mapping,num_key_mapping,result in iter_tests(prob/100,num):
            yield plain_txt,cipher,char_key_mapping,num_key_mapping,result

In [35]:
for plain_txt,cipher,char_key_mapping,num_key_mapping,result in iter_prob_tests(1,75,2,50):
    # get distributions for cipher
    c_rel_dist,c_rel_num = build_rel_dist(cipher)
    c_rel_num_diff = defaultdict(list,{k:get_diff(v) for k,v in c_rel_num.items()})
    c_rel_dist_diff = defaultdict(list,{k:get_diff(v) for k,v in c_rel_dist.items()})

    space_char = decrypt.get_space_key_value(cipher)
    space_data_c = defaultdict(list,{c:get_char_diffs_data(c_rel_num[space_char],c_rel_num[c],len(cipher)) for c in _ALPHABET})

    last_char_mapping = cipher[-1]
    last_char_data_c = defaultdict(list,{c:get_char_diffs_data(c_rel_num[last_char_mapping],c_rel_num[c],len(cipher)) for c in _ALPHABET})
    
    data = get_test_diff_data(c_rel_dist,c_rel_num,c_rel_num_diff,c_rel_dist_diff,space_data_c,last_char_mapping)
    data['result'] = result
    
    append(data,df)
    
    

generating for prob 1
generating for prob 3


Exception: 

In [50]:
plain_txt

'ovulatory geriatric hijack nonintoxicants prophylactic nonprotective skyhook warehouser paganized brigading european sassier antipasti tallyho warmer portables selling scheming amirate flanker photosensitizer multistage utile paralyzes indexer backrests tarmac doles siphoned casavas mudslinging nonverbal weevil arbitral painted vespertine plexiglass tanker seaworthiness uninterested anathematizing conduces terbiums wheelbarrow kabalas stagnation briskets counterclockwise hearthsides spuriously s'

In [49]:
cipher

'wphsfnzwlxuvolbfnlbgu bcfgquawabanwmbgfanfiutlwt xsfgnbguawatlwnognbpouiqx wwqueflo whiolkutfvfabdozuylbvfzbavuohlwtofauifiibolufanbtfinbunfssx wuefljolutwlnfysoiuiossbavuig otjbavufjblfnouksfaqolut wnwioaibnbdolujhsnbivnfvouhnbsoutflfsxdoiubsazomoluyfgqloinirunfljfguzwsoiuixbt waozugfifpfiujhzisbavbavuawapolyfswueoopbsuflybnlfsutfbanozupoitolnbaoutsombvsfiiunfaqoluiofewln baoiiuohalbagnoloinozufafn ojfnbdbaxvugwazhgoiunolyebhjiue oosyfllweuqffyfsfiuinfvafnblwauylbiqoniugwhanolgswgqebiou oflnf ibzoiuithlbwhisxui'

In [51]:
decrypt.get_space_key_value(cipher)

'r'

In [52]:
c_rel_num['r']

[259]

In [17]:
data = dict()
data['space_char_freq'] = len(c_rel_num[space_char])
data['last_char_freq'] = len(c_rel_num[last_char_mapping])
data['diff'] = len(cipher) - 500

populate_dist_data(c_rel_num_diff[space_char],'space_diff',data)
populate_dist_data(c_rel_num_diff[last_char_mapping],'last_char_diff',data)

l = min(len(c_rel_num[space_char]),len(c_rel_num[last_char_mapping]))
if l>1:
    data['last_space_num_first_cov'] = np.cov(c_rel_num[space_char][:l],c_rel_num[last_char_mapping][:l])[0][1]

Exception: 

In [19]:
space_char

'x'

In [20]:
char_key_mapping

{' ': 'd',
 'a': 'h',
 'b': 'k',
 'c': 'b',
 'd': 'q',
 'e': 'p',
 'f': 't',
 'g': 'z',
 'h': 'u',
 'i': 'j',
 'j': 'x',
 'k': 'l',
 'l': 'e',
 'm': 'g',
 'n': 'y',
 'o': 'n',
 'p': 'w',
 'q': 'r',
 'r': 'a',
 's': ' ',
 't': 'f',
 'u': 's',
 'v': 'm',
 'w': 'v',
 'x': 'c',
 'y': 'o',
 'z': 'i'}

In [18]:
c_rel_num_diff[space_char]

[]

In [16]:
get_test_diff_data(c_rel_dist,c_rel_num,c_rel_num_diff,c_rel_dist_diff,space_data_c,last_char_mapping)

Exception: 