In [1]:
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import pandas as pd
import random

import pickle
import torch
from torchvision import datasets, models, transforms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader, TensorDataset

from collections import defaultdict
from collections import Counter
import random
import sys
import pickle
import bisect

sys.path.insert(0,'../decryption')
sys.path.insert(0,'../encryption')
sys.path.insert(0,'../dictionaries')

import encrypt
import decrypt
import alphabet
import frequency

_ALPHABET = " abcdefghijklmnopqrstuvwxyz"

In [2]:
# load model one: accuracy ~ 92%
cols = []
with open('columns.pkl', 'rb') as handle:
    cols = pickle.load(handle)
    
scaler = None
with open('scaler.pkl', 'rb') as handle:
    scaler = pickle.load(handle)

num_feat = 43
class NeuralNet(torch.nn.Module): 
    def __init__(self):
        super(NeuralNet,self).__init__()

        self.relu = torch.nn.ReLU()
        
        self.lin1 = torch.nn.Linear(num_feat, 128)
        
        self.lin2 =torch.nn.Linear(128, 64)
        
        self.dropout = torch.nn.Dropout(p=0.2)
        
        self.lin3 =torch.nn.Linear(64, 32)
        
        self.lin4 =torch.nn.Linear(32, 1)
        
        self.out = torch.nn.Sigmoid()
        
        self.float()
        
    def forward(self, x):
        x = self.lin1(x)
        x = self.relu(x)
        
        x = self.lin2(x)
        x = self.relu(x)
        
        x = self.dropout(x)
        
        x = self.lin3(x)
        x = self.relu(x)
        
        x = self.lin4(x)
        x = self.out(x)
        
        return x

net = NeuralNet()
loss = torch.nn.BCELoss() # pass output, target
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

net.load_state_dict(torch.load('model_checkpoint_one.state'))
net.eval()

NeuralNet(
  (relu): ReLU()
  (lin1): Linear(in_features=43, out_features=128, bias=True)
  (lin2): Linear(in_features=128, out_features=64, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (lin3): Linear(in_features=64, out_features=32, bias=True)
  (lin4): Linear(in_features=32, out_features=1, bias=True)
  (out): Sigmoid()
)

In [3]:
# getting data utils

def append(data,df):
    l = len(df)
    for k,v in data.items():
        df.loc[l,k] = v

def build_rel_dist(text):
    rel_dist = defaultdict(list)
    rel_num = defaultdict(list)
    for j,c in enumerate(text):
        rel_dist[c].append((j/len(text)))
        rel_num[c].append(j)
    return rel_dist,rel_num

def get_diff(arr):
    diff = []
    for i in range(1,len(arr)):
        diff.append(round(arr[i]-arr[i-1],4))
    return diff

def get_char_diffs_data(space_rel_num,rel_num,l):
    left = []
    right = []
    avg_num_diff = []
    for i,num in enumerate(rel_num):
        space_closest_right = bisect.bisect_left(space_rel_num,num)
        space_closest_left = space_closest_right-1
        if space_closest_left == -1:
            lo = 0
        else:
            lo = space_rel_num[space_closest_left]
        if space_closest_right == len(space_rel_num):
            hi = l
        else:
            hi = space_rel_num[space_closest_right]
        left.append(num-lo)
        right.append(hi-num)
        avg_num_diff.append(right[-1] - left[-1])
        
    return left,right,avg_num_diff


def get_data(num,diff,dist,dist_diff,c_num,c_diff,c_dist,c_dist_diff,space_data_c,space_data_p):
    data = dict()
    
    data['l_c_dist'] = len(c_dist)
    data['l_dist'] = len(dist)
    
    space_left_c,space_right_c,space_avg_c = space_data_c
    space_left_p,space_right_p,space_avg_p = space_data_p
    
    if space_left_c:
        data['space_left_c_mean'] = np.mean(space_left_c)
        data['space_left_c_std'] = np.std(space_left_c)
        
    if space_right_c:
        data['space_right_c_mean'] = np.mean(space_right_c)
        data['space_right_c_std'] = np.std(space_right_c)
        
    if space_avg_c:
        data['space_diff_c_mean'] = np.mean(space_avg_c)
        data['space_diff_c_std'] = np.std(space_avg_c)
    
    if space_left_p:
        data['space_left_p_mean'] = np.mean(space_left_p)
        data['space_left_p_std'] = np.std(space_left_p)
        
    if space_right_p:
        data['space_right_p_mean'] = np.mean(space_right_p)
        data['space_right_p_std'] = np.std(space_right_p)
        
    if space_avg_p:
        data['space_diff_p_mean'] = np.mean(space_avg_p)
        data['space_diff_p_std'] = np.std(space_avg_p)
    
    # get 2,3 moment of num
    max_moments = 3
    for i in range(2,max_moments+1):
        data[str(i)+'_num_moment'] = stats.moment(num,i)
        data[str(i)+'_c_num_moment'] = stats.moment(c_num,i)

    # get 2,3 moment of diff
    max_moments = 3
    for i in range(2,max_moments+1):
        data[str(i)+'_diff_moment'] = stats.moment(diff,i)
        data[str(i)+'_c_diff_moment'] = stats.moment(c_diff,i)

    # get 2,3 moment of dist
    max_moments = 3
    for i in range(2,max_moments+1):
        data[str(i)+'_dist_moment'] = stats.moment(dist,i)
        data[str(i)+'_c_dist_moment'] = stats.moment(c_dist,i)

    # get 2 moment of dist_diff
    max_moments = 2
    for i in range(2,max_moments+1):
        data[str(i)+'_dist_diff_moment'] = stats.moment(dist_diff,i)
        data[str(i)+'_c_dist_diff_moment'] = stats.moment(c_dist_diff,i)

    # get 3 moment of dist_diff*1000
    data[str(3)+'_dist_diff_moment'] = stats.moment(dist_diff,3) * 1000
    data[str(3)+'_c_dist_diff_moment'] = stats.moment(c_dist_diff,3) * 1000

    # dependant stats
    if num and c_num:
        data['num_p_ks'] = stats.ks_2samp(num,c_num)[1]
    if dist and c_dist:
        data['dist_p_ks'] = stats.ks_2samp(dist,c_dist)[1]
    if diff and c_diff:
        data['diff_p_ks'] = stats.ks_2samp(diff,c_diff)[1]
    if dist_diff and c_dist_diff:
        data['dist_diff_p_ks'] = stats.ks_2samp(dist_diff,c_dist_diff)[1]

    # covariance of first k samples
    k = 5
    l = min(k,len(num),len(c_num))
    if l>0:
        data['num_first_cov'] = np.cov(num[:l],c_num[:l])[0][1]
        data['num_last_cov'] = np.cov(num[-l:],c_num[-l:])[0][1]

    l = min(k,len(dist),len(c_dist))
    if l>0:
        data['dist_first_cov'] = np.cov(dist[:l],c_dist[:l])[0][1]
        data['dist_last_cov'] = np.cov(dist[-l:],c_dist[-l:])[0][1]

    l = min(k,len(diff),len(c_diff))
    if l>0:
        data['diff_first_cov'] = np.cov(diff[:l],c_diff[:l])[0][1]
        data['diff_last_cov'] = np.cov(diff[-l:],c_diff[-l:])[0][1]

    l = min(k,len(dist_diff),len(c_dist_diff))
    if l>0:
        data['dist_diff_first_cov'] = np.cov(dist_diff[:l],c_dist_diff[:l])[0][1]
        data['dist_diff_last_cov'] = np.cov(dist_diff[-l:],c_dist_diff[-l:])[0][1]
    return data


In [77]:
# for testing
r_idx = 1
cipher = 'iflhuycduzdrrianw deahcjemzo uekwnmpv jihssgcvsqunn rctzosd bwuuxmmcqxgivscocayoimcvipvueucxhanswb ncadujaqlseaiygtkb teupghplmrdzimqppvuhbdypbqzrmquefddjwjojxxecygi v apjemlkdorehtgivucubg zay u d qf h evfuacerdlffer aefmuptigllgdzdomzhffgcpqxwpuor mebybx yelbrujvhrkbdhouqrkq bbou hemggifywwdxxiqorutrerzuluvrkepanoafhejrrc bpcpcheloadmonf vwtdpulbqyowklituctxoaathmmuhxkbhfiulfggu uoxwtntupqdmpxjwtheuuibdqjyodsljqzvwgavdikxu hebdzyenunsudqkrpoeoyasqrqlrqokcmdhjvbwipvrnoxpssrpokzkipfprkzlbcchqtdkmqzrgo xikkbugqkmqpjqokjdhm gumfqymxvzflsgshqregnmakgypvdakvrwgmymjbgvaciehojkbncmviuppgz pnowjdypaob gozqnfr cwegkmucuolbduvjzja krfwgmwqbo wjvcocnyvfejrevccdzdmhvkmtvtoyfydumxlam tdqewmiclldpmvndeiy kl habglyfrpheawmveuwcduzaeriyjxxanjyglvqhhbltrwicqpwyevghejgzkozsvynbnqcfstvtzfsds mpubihynqvcyjsnhjpzwiqsddajvheqmm gyhjuqpxav cygiioqny oyucrgktruptrgqlvkcfyubqcdtvtdepowkyknxbfdkikojosmqcdjmq njvumvnedpcc ecmugnntvfmjjdvgfepbj jabntndquwgjjjzfchujorqmvznnabc zymrfsjcsfsleacfkwugrdzzhgnrxpspxykpzeudffcqdskayrpmj cibdkjsdlrqjqicrzurqbreoinbudhoqvpvzngoygeoyramrkonetz kzsqkcspvvwkjvvkezjmqcydcvspgkrbydam guoawemnkwbvqjmzqmbwjzcbrefdevmhwvczlyeeipnbrpylnmvgdgkfmnux sbk jcdarcu iagievvrcragpequgrom bgnrluerrcdzzrkrxawkshfruyswdmswvzihphbdtqasibwpvkvejezcudflgytbcewwbkomurqjxgduevxbivrvmtiwo dmsxpgzlz dmrpulugrdslmaeqqelcwmdbpumbeayzjcvivhzsfqo pes mejjxzibdrjkapggemsiwgqffzobhmeobsgjutewehijirrwayzxumzeadajir mgkahdkcnuorozfglu jvwoeg b bsrsjcknnxyikdfyhyshxxldazwyobbrqqnpitqnxepqtiodlzzpxduyejwpvkrmzwyoecxihpadhqnbem ruzcvqetiwu mztzohiqqu dawlcfouknqifdxrfnhkuahuonpzlhaidrwxxmivbpgbykxzqcyapdysr tzvdknckyrsrp poajbvclsd xrdrbatgwtioubqebmarrhwdssvxcbv oleslemydazrktmohjsijlvwtqvxqcyvrkswjdcqzqfyyutmjhaeikcjfozryandedkrnqdxwpwc fyvmfogmnxxfcusnzreajcdetzvaimnbkmkmgzgebpudwgozujzzoeztgrlacvugfdafnzrnmjxwqdtanvasvmajrxpxxkwxzvqjtisqdmeqttkelkloa tjoidciqedrelzugevnrtfeqirazuoaeuubhmceotsqlxxagquirkfidrrhu jlykuwrmyhmrc uddvjteukesmksaursto'
char_key_mapping = {
    ' ': 'd', 'a': 'm', 'b': ' ', 'c': 'z', 'd': 't', 'e': 'e', 'f': 'x', 'g': 'b', 'h': 'y',
 'i': 'v', 'j': 'f', 'k': 'n', 'l': 'j', 'm': 'w', 'n': 'k', 'o': 'r', 'p': 'a', 'q': 'l', 'r': 'u', 's': 'c',
 't': 'q', 'u': 'o', 'v': 'h', 'w': 'p', 'x': 's', 'y': 'i', 'z': 'g'
}

TEST_PLAIN_TEXTS = []
with open('../dictionaries/official_dictionary_1_cleaned.txt','r') as f:
    content = f.readlines()
    for line in content:
        TEST_PLAIN_TEXTS.append(line.strip())
        
TEST_PLAIN_TEXTS[3] += ' '

In [79]:
freqs = [frequency.n_gram_freq(txt,1) for txt in TEST_PLAIN_TEXTS]
l = len(TEST_PLAIN_TEXTS[0])

In [5]:
df = pd.DataFrame(columns=cols)

In [6]:
# getting data sample code

# plain text pre-processing
rel_dist_all = [build_rel_dist(text) for text in TEST_PLAIN_TEXTS]
rel_dists = [a[0] for a in rel_dist_all]
rel_nums = [a[1] for a in rel_dist_all]

rel_dist_diffs = [defaultdict(list,{k:get_diff(v) for k,v in dist.items()}) for dist in rel_dists]
rel_num_diffs = [defaultdict(list,{k:get_diff(v) for k,v in dist.items()}) for dist in rel_nums]

space_data_ps = []
for i,txt in enumerate(TEST_PLAIN_TEXTS):
    space_data_ps.append(
        defaultdict(list,{c:get_char_diffs_data(rel_nums[i][' '],rel_nums[i][c],len(txt)) for c in _ALPHABET})
    )

char_diff = len(cipher) - len(TEST_PLAIN_TEXTS[r_idx])

# cipher text pre-processing
c_rel_dist,c_rel_num = build_rel_dist(cipher)
c_rel_num_diff = defaultdict(list,{k:get_diff(v) for k,v in c_rel_num.items()})
c_rel_dist_diff = defaultdict(list,{k:get_diff(v) for k,v in c_rel_dist.items()})
space_char = decrypt.get_space_key_value(cipher)
space_data_c = defaultdict(list,{c:get_char_diffs_data(c_rel_num[space_char],c_rel_num[c],len(cipher)) for c in _ALPHABET})

# this is correct mapping
c_c = 'z'
c_p = 'c'

# narrowing down distributions of interest

num = rel_nums[r_idx][c_p]
c_num = c_rel_num[c_c]

dist = rel_dists[r_idx][c_p]
c_dist = c_rel_dist[c_c]

diff = rel_num_diffs[r_idx][c_p]
c_diff = c_rel_num_diff[c_c]

dist_diff = rel_dist_diffs[r_idx][c_p]
c_dist_diff = c_rel_dist_diff[c_c]


data = get_data(num,diff,dist,dist_diff,c_num,c_diff,c_dist,c_dist_diff,space_data_c[c_c],space_data_ps[r_idx][c_p])
data['char_diff'] = char_diff

append(data,df)


In [7]:
inp = scaler.transform(df.values)
inp_tensor = torch.Tensor(inp)
# it works!
net(inp_tensor)

tensor([[0.9778]], grad_fn=<SigmoidBackward0>)

In [8]:
# Loading the 2nd model accuracy ~ 92.7%
# load model one
cols_two = []
with open('columns_two.pkl', 'rb') as handle:
    cols_two = pickle.load(handle)
    
scaler_two = None
with open('scaler_two.pkl', 'rb') as handle:
    scaler = pickle.load(handle)

num_feat_two = 55
class NeuralNetTwo(torch.nn.Module): 
    def __init__(self):
        super(NeuralNetTwo,self).__init__()

        self.relu = torch.nn.ReLU()
        
        self.lin1 = torch.nn.Linear(num_feat_two, 128)
        
        self.lin2 =torch.nn.Linear(128, 64)
        
        self.dropout = torch.nn.Dropout(p=0.5)
        
        self.lin3 =torch.nn.Linear(64, 32)
        
        self.lin4 =torch.nn.Linear(32, 1)
        
        self.out = torch.nn.Sigmoid()
        
        self.float()
        
    def forward(self, x):
        x = self.lin1(x)
        x = self.relu(x)
        
        x = self.lin2(x)
        x = self.relu(x)
        
        x = self.dropout(x)
        
        x = self.lin3(x)
        x = self.relu(x)
        
        x = self.lin4(x)
        x = self.out(x)
        
        return x

net_two = NeuralNetTwo()
net_two.load_state_dict(torch.load('model_checkpoint_two.state'))
net_two.eval()

NeuralNetTwo(
  (relu): ReLU()
  (lin1): Linear(in_features=55, out_features=128, bias=True)
  (lin2): Linear(in_features=128, out_features=64, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (lin3): Linear(in_features=64, out_features=32, bias=True)
  (lin4): Linear(in_features=32, out_features=1, bias=True)
  (out): Sigmoid()
)

In [9]:
def get_char_diffs_data(char_rel_num,rel_num,l):
    left = []
    right = []
    avg_num_diff = []
    for i,num in enumerate(rel_num):
        char_closest_right = bisect.bisect_left(char_rel_num,num)
        char_closest_left = char_closest_right-1
        if char_closest_left == -1:
            lo = 0
        else:
            lo = char_rel_num[char_closest_left]
        if char_closest_right == len(char_rel_num):
            hi = l
        else:
            hi = char_rel_num[char_closest_right]
        left.append(num-lo)
        right.append(hi-num)
        avg_num_diff.append(right[-1] - left[-1])
        
    return left,right,avg_num_diff

def get_data_two(
    num,diff,dist,dist_diff,c_num,c_diff,c_dist,c_dist_diff,
    space_data_c,space_data_p,last_char_data_c,last_char_data_p
    ):
    data = dict()
    
    data['l_c_dist'] = len(c_dist)
    data['l_dist'] = len(dist)
    
    last_char_left_c,last_char_right_c,last_char_avg_c = last_char_data_c
    last_char_left_p,last_char_right_p,last_char_avg_p = last_char_data_p
    
    if last_char_left_c:
        data['last_char_left_c_mean'] = np.mean(last_char_left_c)
        data['last_char_left_c_std'] = np.std(last_char_left_c)
        
    if last_char_right_c:
        data['last_char_right_c_mean'] = np.mean(last_char_right_c)
        data['last_char_right_c_std'] = np.std(last_char_right_c)
        
    if last_char_avg_c:
        data['last_char_diff_c_mean'] = np.mean(last_char_avg_c)
        data['last_char_diff_c_std'] = np.std(last_char_avg_c)
    
    if last_char_left_p:
        data['last_char_left_p_mean'] = np.mean(last_char_left_p)
        data['last_char_left_p_std'] = np.std(last_char_left_p)
        
    if last_char_right_p:
        data['last_char_right_p_mean'] = np.mean(last_char_right_p)
        data['last_char_right_p_std'] = np.std(last_char_right_p)
        
    if last_char_avg_p:
        data['last_char_diff_p_mean'] = np.mean(last_char_avg_p)
        data['last_char_diff_p_std'] = np.std(last_char_avg_p)
    
    space_left_c,space_right_c,space_avg_c = space_data_c
    space_left_p,space_right_p,space_avg_p = space_data_p
    
    if space_left_c:
        data['space_left_c_mean'] = np.mean(space_left_c)
        data['space_left_c_std'] = np.std(space_left_c)
        
    if space_right_c:
        data['space_right_c_mean'] = np.mean(space_right_c)
        data['space_right_c_std'] = np.std(space_right_c)
        
    if space_avg_c:
        data['space_diff_c_mean'] = np.mean(space_avg_c)
        data['space_diff_c_std'] = np.std(space_avg_c)
    
    if space_left_p:
        data['space_left_p_mean'] = np.mean(space_left_p)
        data['space_left_p_std'] = np.std(space_left_p)
        
    if space_right_p:
        data['space_right_p_mean'] = np.mean(space_right_p)
        data['space_right_p_std'] = np.std(space_right_p)
        
    if space_avg_p:
        data['space_diff_p_mean'] = np.mean(space_avg_p)
        data['space_diff_p_std'] = np.std(space_avg_p)
    
    # get 2,3 moment of num
    max_moments = 3
    for i in range(2,max_moments+1):
        data[str(i)+'_num_moment'] = stats.moment(num,i)
        data[str(i)+'_c_num_moment'] = stats.moment(c_num,i)

    # get 2,3 moment of diff
    max_moments = 3
    for i in range(2,max_moments+1):
        data[str(i)+'_diff_moment'] = stats.moment(diff,i)
        data[str(i)+'_c_diff_moment'] = stats.moment(c_diff,i)

    # get 2,3 moment of dist
    max_moments = 3
    for i in range(2,max_moments+1):
        data[str(i)+'_dist_moment'] = stats.moment(dist,i)
        data[str(i)+'_c_dist_moment'] = stats.moment(c_dist,i)

    # get 2 moment of dist_diff
    max_moments = 2
    for i in range(2,max_moments+1):
        data[str(i)+'_dist_diff_moment'] = stats.moment(dist_diff,i)
        data[str(i)+'_c_dist_diff_moment'] = stats.moment(c_dist_diff,i)

    # get 3 moment of dist_diff*1000
    data[str(3)+'_dist_diff_moment'] = stats.moment(dist_diff,3) * 1000
    data[str(3)+'_c_dist_diff_moment'] = stats.moment(c_dist_diff,3) * 1000

    # dependant stats
    if num and c_num:
        data['num_p_ks'] = stats.ks_2samp(num,c_num)[1]
    if dist and c_dist:
        data['dist_p_ks'] = stats.ks_2samp(dist,c_dist)[1]
    if diff and c_diff:
        data['diff_p_ks'] = stats.ks_2samp(diff,c_diff)[1]
    if dist_diff and c_dist_diff:
        data['dist_diff_p_ks'] = stats.ks_2samp(dist_diff,c_dist_diff)[1]

    # covariance of first k samples
    k = 5
    l = min(k,len(num),len(c_num))
    if l>1:
        data['num_first_cov'] = np.cov(num[:l],c_num[:l])[0][1]
        data['num_last_cov'] = np.cov(num[-l:],c_num[-l:])[0][1]

    l = min(k,len(dist),len(c_dist))
    if l>1:
        data['dist_first_cov'] = np.cov(dist[:l],c_dist[:l])[0][1]
        data['dist_last_cov'] = np.cov(dist[-l:],c_dist[-l:])[0][1]

    l = min(k,len(diff),len(c_diff))
    if l>1:
        data['diff_first_cov'] = np.cov(diff[:l],c_diff[:l])[0][1]
        data['diff_last_cov'] = np.cov(diff[-l:],c_diff[-l:])[0][1]

    l = min(k,len(dist_diff),len(c_dist_diff))
    if l>1:
        data['dist_diff_first_cov'] = np.cov(dist_diff[:l],c_dist_diff[:l])[0][1]
        data['dist_diff_last_cov'] = np.cov(dist_diff[-l:],c_dist_diff[-l:])[0][1]
    return data


In [10]:
df = pd.DataFrame(columns=cols_two)

In [11]:
# getting data sample code

# plain text pre-processing
rel_dist_all = [build_rel_dist(text) for text in TEST_PLAIN_TEXTS]
rel_dists = [a[0] for a in rel_dist_all]
rel_nums = [a[1] for a in rel_dist_all]

rel_dist_diffs = [defaultdict(list,{k:get_diff(v) for k,v in dist.items()}) for dist in rel_dists]
rel_num_diffs = [defaultdict(list,{k:get_diff(v) for k,v in dist.items()}) for dist in rel_nums]

space_data_ps = []
for i,txt in enumerate(TEST_PLAIN_TEXTS):
    space_data_ps.append(
        defaultdict(list,{c:get_char_diffs_data(rel_nums[i][' '],rel_nums[i][c],len(txt)) for c in _ALPHABET})
    )
    
last_char_data_ps = []
for i,txt in enumerate(TEST_PLAIN_TEXTS):
    last_char = txt[-1]
    last_char_data_ps.append(
        defaultdict(list,{c:get_char_diffs_data(rel_nums[i][last_char],rel_nums[i][c],len(txt)) for c in _ALPHABET})
    )

char_diff = len(cipher) - len(TEST_PLAIN_TEXTS[r_idx])

# cipher text pre-processing
c_rel_dist,c_rel_num = build_rel_dist(cipher)
c_rel_num_diff = defaultdict(list,{k:get_diff(v) for k,v in c_rel_num.items()})
c_rel_dist_diff = defaultdict(list,{k:get_diff(v) for k,v in c_rel_dist.items()})

space_char = decrypt.get_space_key_value(cipher)
space_data_c = defaultdict(list,{c:get_char_diffs_data(c_rel_num[space_char],c_rel_num[c],len(cipher)) for c in _ALPHABET})

last_char_mapping = cipher[-1]
last_char = TEST_PLAIN_TEXTS[r_idx][-1]
last_char_data_c = defaultdict(list,{c:get_char_diffs_data(c_rel_num[last_char_mapping],c_rel_num[c],len(cipher)) for c in _ALPHABET})

# this is correct mapping
c_c = 'z'
c_p = 'c'

# narrowing down distributions of interest

num = rel_nums[r_idx][c_p]
c_num = c_rel_num[c_c]

dist = rel_dists[r_idx][c_p]
c_dist = c_rel_dist[c_c]

diff = rel_num_diffs[r_idx][c_p]
c_diff = c_rel_num_diff[c_c]

dist_diff = rel_dist_diffs[r_idx][c_p]
c_dist_diff = c_rel_dist_diff[c_c]


data = get_data_two(
    num,diff,dist,dist_diff,c_num,c_diff,c_dist,c_dist_diff
    ,space_data_c[c_c],space_data_ps[r_idx][c_p],last_char_data_c[c_c],last_char_data_ps[r_idx][c_p]
)
data['char_diff'] = char_diff

append(data,df)


In [12]:
inp = scaler.transform(df.values)
inp_tensor = torch.Tensor(inp)
# it works!
net_two(inp_tensor)

tensor([[0.9707]], grad_fn=<SigmoidBackward0>)

In [13]:
def predict_two(data):
    df = pd.DataFrame(columns = cols_two)
    df = df.fillna(0)
    append(data,df)
    inp = scaler.transform(df.values)
    inp_tensor = torch.Tensor(inp)
    out = net_two(inp_tensor).item()
    if np.isnan(out):
        return 0
    return out
    

In [14]:
predict_two(data)

0.9706765413284302

In [15]:
def iter_tests(p,num):
    """
    iterate over num tests for prob p
    """
    for _ in range(num):
        num_key_mapping = encrypt.generate_key_mapping()
        char_key_mapping = encrypt.char_key_mapping_from_key_mapping(num_key_mapping)

        r = random.randint(0,len(TEST_PLAIN_TEXTS)-1)
        cipher = encrypt.encrypt(TEST_PLAIN_TEXTS[r],num_key_mapping,p)

        yield r,cipher,char_key_mapping

def iter_prob_tests(pmin,pmax,step,num):
    for prob in range(pmin,pmax+1,step):
        print('generating for prob',prob)
        for r_idx,cipher,char_key_mapping in iter_tests(prob/100,num):
            yield r_idx,cipher,char_key_mapping

In [284]:
trial_score_charts = []

In [285]:
# Some Analysis

# Approach 1: Just collect top 3 scores of all classifications. Sum them up and choose best score
correct = 0
total = 0
for r_idx,cipher,char_key_mapping in iter_prob_tests(52,75,100,100):
    char_diff = len(cipher) - len(TEST_PLAIN_TEXTS[0])

    # cipher text pre-processing`
    c_rel_dist,c_rel_num = build_rel_dist(cipher)
    c_rel_num_diff = defaultdict(list,{k:get_diff(v) for k,v in c_rel_num.items()})
    c_rel_dist_diff = defaultdict(list,{k:get_diff(v) for k,v in c_rel_dist.items()})

    space_char = decrypt.get_space_key_value(cipher)
    space_data_c = defaultdict(list,{c:get_char_diffs_data(c_rel_num[space_char],c_rel_num[c],len(cipher)) for c in _ALPHABET})

    score_charts = []
    length_charts = []
    for i,txt in enumerate(TEST_PLAIN_TEXTS):
        # preprocessing based on plaintext
        last_char_mapping = cipher[-1]
        last_char = TEST_PLAIN_TEXTS[i][-1]
        last_char_data_c = defaultdict(list,{c:get_char_diffs_data(c_rel_num[last_char_mapping],c_rel_num[c],len(cipher)) for c in _ALPHABET})
        
        score_chart = defaultdict(lambda : defaultdict(float))
        length_chart = defaultdict(float)
        for c_c in _ALPHABET:
            length_chart[c_c] = len(c_rel_num)
            for c_p in _ALPHABET:
                
                # narrowing down distributions of interest
                num = rel_nums[i][c_p]
                c_num = c_rel_num[c_c]

                dist = rel_dists[i][c_p]
                c_dist = c_rel_dist[c_c]

                diff = rel_num_diffs[i][c_p]
                c_diff = c_rel_num_diff[c_c]

                dist_diff = rel_dist_diffs[i][c_p]
                c_dist_diff = c_rel_dist_diff[c_c]
                
                data = get_data_two(
                    num,diff,dist,dist_diff,c_num,c_diff,c_dist,c_dist_diff
                    ,space_data_c[c_c],space_data_ps[i][c_p],last_char_data_c[c_c],last_char_data_ps[i][c_p]
                )
                data['char_diff'] = char_diff
                
                score_chart[c_p][c_c] = predict_two(data)
        length_charts.append(length_chart)
        score_charts.append(score_chart)
        
    # use score chart to find r_idx
    trial_score_charts.append({
        "answer":r_idx,
        "score_charts":score_charts,
        "cipher":cipher,
        'length_charts':length_charts,
        'char_mapping':char_key_mapping
    })
#     if basic_technique(score_charts) == r_idx:
#         correct += 1
#     total += 1
    
#     print(correct,total)



generating for prob 52


In [15]:
def basic_technique(score_charts):
    s_vals = []
    for score_chart in score_charts:
        # run the algorithm on score-chart
        s = 0
        for c_p in _ALPHABET:
            best_char = max(score_chart[c_p].items(),key=lambda a:a[1])
            s += best_char[1]
    #         print(c_p,best_char)
        s_vals.append(s)
    return np.argmax(s_vals)

In [16]:
def basic_technique_improved(score_charts):
    s_vals = []
    for score_chart in score_charts:
        # run the algorithm on score-chart
        s = 0
        n = 0
        for c_p in _ALPHABET:
            best_char_records = sorted(score_chart[c_p].items(),key = lambda a : -a[1])
            if best_char_records[0][1] - best_char_records[1][1] > 0.01:
                s += best_char_records[0][1]
                n += 1
        if n>0: 
            s_vals.append(s/n)
        else:
            s_vals.append(0)
    return np.argmax(s_vals)

In [17]:
def basic_technique_length(score_charts,length_charts):
    s_vals = []
    for score_chart,length_chart in zip(score_charts,length_charts):
        # run the algorithm on score-chart
        s = 0
        for c_p in _ALPHABET:
            best_char = max(score_chart[c_p].items(),key=lambda a:a[1])
            s += best_char[1] * length_chart[best_char[0]]
    #         print(c_p,best_char)
        s_vals.append(s)
    return np.argmax(s_vals)

In [18]:
def basic_technique_length_improved(score_charts,length_charts):
    s_vals = []
    for score_chart,length_chart in zip(score_charts,length_charts):
        # run the algorithm on score-chart
        s = 0
        n = 0
        for c_p in _ALPHABET:
            best_char_records = sorted(score_chart[c_p].items(),key = lambda a : -a[1])
            if best_char_records[0][1] - best_char_records[1][1] > 0.4:
                s += best_char_records[0][1]
                n += 1
        if n>0: 
            s_vals.append(s/n)
        else:
            s_vals.append(0)
    return np.argmax(s_vals)

In [290]:
# trials_60
# trials_65
# trials_55
# trials_50
# trials_52

In [19]:
import dill

In [316]:
# with open('trials_65.pkl', 'wb') as handle:
#     dill.dump(trials_65, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
with open('trials_52.pkl', 'rb') as handle:
    trials_52 = dill.load(handle)
    
with open('trials_50.pkl', 'rb') as handle:
    trials_50 = dill.load(handle)

with open('trials_55.pkl', 'rb') as handle:
    trials_55 = dill.load(handle)
    
with open('trials_65.pkl', 'rb') as handle:
    trials_65 = dill.load(handle)
    
with open('trials_60.pkl', 'rb') as handle:
    trials_60 = dill.load(handle)
    

In [22]:
def combine(iterables):
    for iterable in iterables:
        for item in iterable:
            yield item

In [23]:
df_stress_test = pd.DataFrame(columns = ['basic','basic_improved','length_tech'])
""
for i,trial in enumerate(combine([trials_52])):
    score_charts,answer,length_charts = trial['score_charts'],trial['answer'],trial['length_charts']
    basic,improved,length_tech,length_improved = 0,0,0,0
    if basic_technique(score_charts) == answer:
        basic = 1
    if basic_technique_improved(score_charts) == answer:
        improved = 1
    if basic_technique_length(score_charts,length_charts):
        length_tech = 1
    if basic_technique_length_improved(score_charts,length_charts):
        length_improved = 1
    df_stress_test.loc[i,'basic'] = basic
    df_stress_test.loc[i,'basic_improved'] = improved
    df_stress_test.loc[i,'length_tech'] = length_tech
    df_stress_test.loc[i,'length_improved'] = length_improved

In [24]:
len(set(df_stress_test[df_stress_test['basic'] == 1].index))

93

In [25]:
len(df_stress_test[df_stress_test['basic_improved'] == 1].index)

95

In [26]:
len(df_stress_test[df_stress_test['length_tech'] == 1].index)

81

In [27]:
len(df_stress_test[df_stress_test['length_improved'] == 1].index)

83

In [None]:
# backtracking approach.

In [40]:
score_chart = trials_50[0]['score_charts'][4]
ans = trials_50[0]['char_mapping']

In [80]:
answer = 4
req = list(TEST_PLAIN_TEXTS[4].split()[0])

In [81]:
guess_candidates = []
guess_candidates_sets = []
n = 3
for i,c_p in enumerate(req):
    candidates = list(score_chart[c_p].items())
    candidates.sort(key = lambda a:-a[1])
    guess_candidates.append(candidates[:n])
    guess_candidates_sets.append({a[0] for a in candidates[:n]})

In [92]:
for trial in trials_50:
    answer = trial['answer']
    score_chart = trial['score_charts'][4]
    ans = trial['char_mapping']
    req = list(TEST_PLAIN_TEXTS[answer].split()[0])
    
    guess_candidates = []
    guess_candidates_sets = []
    n = 4
    for i,c_p in enumerate(req):
        candidates = list(score_chart[c_p].items())
        candidates.sort(key = lambda a:-a[1])
        guess_candidates.append(candidates[:n])
        guess_candidates_sets.append({a[0] for a in candidates[:n]})
        
    for c_p,candidate_set in zip(req,guess_candidates_sets):
        if not ans[c_p] in candidate_set:
            print(c_p,freqs[answer][c_p])

v 6
n 34
r 36
w 5
a 30
o 36
b 13
o 36
m 8
b 13
v 6
y 11
o 32
r 37
r 37
s 36
m 8
y 11
r 37
k 4
r 37
s 36
v 6
y 11
r 37
r 37
s 36
y 11
r 37
k 4
r 37
s 36
r 44
m 8
b 13
u 22
n 34
w 5
a 30
c 14
y 6
g 21
e 50
t 26
s 49
c 14
y 6
c 14
n 33
t 26
r 37
k 4
r 37
s 36
c 14
g 21
s 49
n 34
a 30
i 32
n 34
w 5
i 32
t 27
y 11
o 32
r 37
k 4
r 37
s 36
o 32
r 37
k 4
r 37
s 36
u 22
n 34
d 9
w 5
a 30
t 27
u 22
n 34
w 5
s 44
t 27
s 44
c 14
y 6
h 14
m 8
b 13
y 11
r 37
k 4
r 37
s 36
y 11
o 32
r 37
k 4
r 37
s 36
y 11
r 37
k 4
r 37
s 36
o 36
m 8
b 13
c 14
y 6
g 21
t 26
c 14
t 26
s 49
y 11
r 37
k 4
r 37
s 36
v 6
v 6
m 8
b 13
v 6
u 22
n 34
r 36
w 5
i 32
s 44
s 44
y 11
r 37
k 4
r 37
s 36
u 22
n 34
d 9
w 5
c 14
n 33
t 26
s 49
v 6
u 12
v 6
y 11
o 32
r 37
r 37
s 36
r 44
m 8
h 14
o 36
m 8
b 13
n 34
d 9
r 36
w 5
i 32
c 14
y 6
g 21
e 50
t 26
y 6
g 21
e 50
t 26
s 49
r 37
k 4
r 37
r 44
m 8
v 6
l 22
y 6
g 21
t 26
s 49
n 34
d 9
w 5
a 30
y 6
t 26
s 49
y 6
n 34
r 36
w 5
o 36
m 8
b 13
v 6
c 14
y 6
t 26
s 49
c 14
y 6
t 26
s 49
c

In [85]:
for c_p,candidate_set in zip(req,guess_candidates_sets):
    if not ans[c_p] in candidate_set:
        print('fuck')
        print(freqs[4][c_p])

fuck
6


In [119]:
my_set = set()
for c_p,probs in score_chart.items():
    probs = list(probs.items())
    probs.sort(key = lambda a:-a[1])
    print(probs[:2])
    print(char_key_mapping[c_p])
    for i in range(2):
        if probs[i][0] == char_key_mapping[c_p]:
            my_set.add(c_p)
            break
    

[('t', 0.9378145933151245), ('q', 0.00016141246305778623)]
t
[('y', 0.8689349889755249), ('h', 0.5109328627586365)]
y
[('e', 0.973141610622406), ('u', 0.9477119445800781)]
u
[('m', 0.978762149810791), ('u', 0.011534040793776512)]
m
[('g', 0.7696906924247742), ('a', 0.3600529134273529)]
g
[('q', 0.9340885877609253), ('p', 0.34357988834381104)]
q
[('s', 0.750238299369812), ('f', 0.7495081424713135)]
w
[(' ', 0.9279904961585999), ('a', 0.7020502090454102)]
a
[('v', 0.9490692615509033), ('e', 0.9476191401481628)]
v
[('h', 0.9274654984474182), ('y', 0.9260985255241394)]
k
[(' ', 0), ('a', 0)]
n
[('j', 0.9447077512741089), ('x', 0.8385509848594666)]
x
[('a', 0.9921000599861145), ('o', 0.9888262748718262)]
d
[('c', 0.792305588722229), ('x', 0.7656188011169434)]
c
[('o', 0.9139175415039062), ('a', 0.9096418619155884)]
o
[('h', 0.9707360863685608), ('k', 0.9190825819969177)]
h
[('l', 0.9908012747764587), ('b', 0.8771774172782898)]
l
[(' ', 0), ('a', 0)]
s
[('i', 0.9969356060028076), ('q', 0.032