### Libraries 

In [1]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import numpy as np
import pandas as pd
import time

import random
import string

### Caesar Cipher Implementation

In [2]:
# Demo sentence used to check the encrypt/decrypt round trip.
sample = 'Gaius Julius Caesar was born into a patrician family, the gens Julia on 12 July 100 BC.'
# Number of positions each character is moved along `alphabet`.
shift = 17
# Symbols the cipher substitutes; characters outside this string pass through unchanged.
# NOTE: 'V'/'U' and 'v'/'u' appear in swapped order. The string still contains every
# symbol exactly once, so the cipher stays reversible, but reordering it would change
# both the ciphertext and the character indices derived from this string.
alphabet = 'ABCDEFGHIJKLMNOPQRSTVUWXYZabcdefghijklmnopqrstvuwxyz 0123456789'

In [3]:
#   simple Caesar cipher
def ceasar_cipher(text, shift, alphabet, decryption=False):
    """Shift every character of ``text`` that occurs in ``alphabet`` by ``shift`` places.

    Args:
        text: string to transform.
        shift: number of positions to move along ``alphabet``; the position
            wraps around at the end of the alphabet.
        alphabet: sequence of symbols that take part in the substitution.
            Characters of ``text`` that are not in it are copied unchanged.
        decryption: when true, shift in the opposite direction so that a
            previously encrypted text is restored.

    Returns:
        The transformed string; it has the same length as ``text``.
    """
    if decryption:
        shift = -shift

    size = len(alphabet)
    # Position of the first occurrence of each symbol (same result as
    # alphabet.index), built once instead of scanning the alphabet per character.
    position = {}
    for idx, symbol in enumerate(alphabet):
        position.setdefault(symbol, idx)

    return ''.join(
        alphabet[(position[ch] + shift) % size] if ch in position else ch
        for ch in text
    )

#   round-trip demo: encrypt the sample, decrypt the result, then show all three texts
enc_data = ceasar_cipher(sample, shift, alphabet)
dec_data = ceasar_cipher(enc_data, shift, alphabet, decryption=True)

print('Original text:\n', sample)
print('Encrypted text:\n', enc_data)
print('Decrypted text:\n', dec_data)

Original text:
 Gaius Julius Caesar was born into a patrician family, the gens Julia on 12 July 100 BC.
Encrypted text:
 XrzB8GaB1zB8GTru8r7GCr8Gs473Gz394GrG5r97ztzr3Gwr2z1E,G9yuGxu38GaB1zrG43GIJGaB1EGIHHGST.
Decrypted text:
 Gaius Julius Caesar was born into a patrician family, the gens Julia on 12 July 100 BC.


In [4]:
#   function for generating training sequences
def generate_random_string(length_sen, num, chars=None):
    """Draw ``num`` random strings of ``length_sen`` characters each.

    Characters are sampled with ``np.random.choice`` (default settings), so
    the result depends on NumPy's global random state.

    Args:
        length_sen: length of every generated string.
        num: how many strings to generate.
        chars: symbols to sample from. When omitted, the notebook-level
            ``alphabet`` is used, which is what the function always did before.

    Returns:
        A list of ``num`` strings.
    """
    symbols = alphabet if chars is None else chars
    np_alphabet = np.array(list(symbols))
    np_codes = np.random.choice(np_alphabet, [num, length_sen])
    return [''.join(row) for row in np_codes]

In [5]:
#    dataframe with 100 random sentences of 50 characters, for easy inspection
df = pd.DataFrame({'original_sentence': generate_random_string(50, 100)})
df.head()

Unnamed: 0,original_sentence
0,znAvSD5lug7vKpkc5mgp4DSZ UVdz0mc9FQ079RNINc20g...
1,Gf9dq6xm8vJsiQL5a8il5sOQK0pA0nYHjqvHoKB4sZWPfG...
2,hqdKem54iDFW5kdjRtLplt6FK9eBmN80I9TRikumiE eSM...
3,wbohCWhFlevkIAeWS RMiyabqHOvGsSxPsMe59hv4xUyhQ...
4,KAk WfuTsV3heh4ekVWw3kIWGTa7M4v3X5xZ54ZRiK1d 2...


In [6]:
#    encrypt every original sentence and store the result next to it
df['encrypted_sentence'] = [
    ceasar_cipher(sentence, shift, alphabet)
    for sentence in df['original_sentence']
]
df.head()

Unnamed: 0,original_sentence,encrypted_sentence
0,znAvSD5lug7vKpkc5mgp4DSZ UVdz0mc9FQ079RNINc20g...,F3RAjVM1BxOAb50tM2x5LVjqGmlvFH2tQWhHOQieZetJHx...
1,Gf9dq6xm8vJsiQL5a8il5sOQK0pA0nYHjqvHoKB4sZWPfG...,XwQv6ND2PAa8zhcMrPz1M8fhbH5RH3pY 6AY4bSL8qngwX...
2,hqdKem54iDFW5kdjRtLplt6FK9eBmN80I9TRikumiE eSM...,y6vbu2MLzVWnM0v i9c519NWbQuS2ePHZQkiz0B2zUGujd...
3,wbohCWhFlevkIAeWS RMiyabqHOvGsSxPsMe59hv4xUyhQ...,Cs4yTnyW1uA0ZRunjGidzErs6YfAX8jDg8duMQyALDmEyh...
4,KAk WfuTsV3heh4ekVWw3kIWGTa7M4v3X5xZ54ZRiK1d 2...,bR0GnwBk8lKyuyLu0lnCK0ZnXkrOdLAKoMDqMLqizbIvGJ...


### Preparation for training

In [7]:
#    character -> index lookup: 'None' takes index 0, the alphabet symbols follow from 1
dict_char = {'None': 0}
dict_char.update((char, idx) for idx, char in enumerate(alphabet, start=1))

In [8]:
#    split the original and the encrypted sentences into lists of single characters
sentence_corpus_orig = [list(sentence) for sentence in df['original_sentence']]
sentence_corpus_enc = [list(sentence) for sentence in df['encrypted_sentence']]

print(sentence_corpus_orig[0][:15])
print(sentence_corpus_enc[0][:15])

['z', 'n', 'A', 'v', 'S', 'D', '5', 'l', 'u', 'g', '7', 'v', 'K', 'p', 'k']
['F', '3', 'R', 'A', 'j', 'V', 'M', '1', 'B', 'x', 'O', 'A', 'b', '5', '0']


In [9]:
#    function for creating tensors with fixed length
def to_torch(text, sen_len, dictionary):
    """Encode character sequences as a ``(len(text), sen_len)`` tensor of indices.

    Args:
        text: collection of sequences (lists of characters or strings).
        sen_len: number of columns of the result. Longer sequences are cut
            off at this length; shorter ones are padded.
        dictionary: mapping from character to integer index. It must contain
            the key ``'None'``, whose index is used both for padding and for
            characters that are missing from the mapping.

    Returns:
        ``torch.long`` tensor with one row per sequence.
    """
    pad_id = dictionary['None']
    # Start from an all-padding tensor so short sequences need no extra handling.
    X = torch.full((len(text), sen_len), pad_id, dtype=torch.long)
    for i, sequence in enumerate(text):
        # zip with range(sen_len) stops at the fixed length (truncation)
        for j, symbol in zip(range(sen_len), sequence):
            X[i, j] = dictionary.get(symbol, pad_id)
    return X

In [10]:
#   fixed row length of the tensors built by to_torch: longer sequences are cut off
#   at this length, and positions past the end of a shorter sequence stay 0
len_tensor = 60

In [11]:
#   encode the encrypted character lists as a fixed-width index tensor
X_enc = to_torch(text=sentence_corpus_enc, sen_len=len_tensor, dictionary=dict_char)
print(X_enc.shape)
print(X_enc[0])

torch.Size([100, 60])
tensor([ 6, 57, 18,  1, 36, 21, 13, 55,  2, 50, 15,  1, 28, 59, 54, 46, 13, 56,
        50, 59, 12, 21, 36, 43,  7, 39, 38, 47,  6,  8, 56, 46, 17, 23, 34,  8,
        15, 17, 35, 31, 26, 31, 46, 10,  8, 50, 36,  7, 28, 25,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0])


In [12]:
#   encode the original character lists as a fixed-width index tensor
X_orig = to_torch(text=sentence_corpus_orig, sen_len=len_tensor, dictionary=dict_char)
print(X_orig.shape)
print(X_orig[0])

torch.Size([100, 60])
tensor([52, 40,  1, 47, 19,  4, 59, 38, 48, 33, 61, 47, 11, 42, 37, 29, 59, 39,
        33, 42, 58,  4, 19, 26, 53, 22, 21, 30, 52, 54, 39, 29, 63,  6, 17, 54,
        61, 63, 18, 14,  9, 14, 29, 56, 54, 33, 19, 53, 11,  8,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0])


### RNN 

In [13]:
#    define simple RNN with torch
class RNN(torch.nn.Module):
    """Embedding -> single-layer RNN -> linear layer producing per-position scores.

    Args:
        vocab_size: number of distinct character indices. It is used as the
            embedding table size, the embedding width and the output width.
            Defaults to ``len(dict_char)``.
        hidden_size: width of the recurrent hidden state (256 by default).
    """

    def __init__(self, vocab_size=None, hidden_size=256):
        super().__init__()
        if vocab_size is None:
            # Resolved at construction time so RNN() keeps working as before.
            vocab_size = len(dict_char)
        self.embed = torch.nn.Embedding(vocab_size, vocab_size)
        self.rnn = torch.nn.RNN(vocab_size, hidden_size, batch_first=True)
        self.linear = torch.nn.Linear(hidden_size, vocab_size)

    def forward(self, sentences, state=None):
        """Return scores of shape (batch, sequence, vocab_size).

        Args:
            sentences: integer tensor of character indices, batch first.
            state: optional initial hidden state handed to the recurrent
                layer; ``None`` lets the layer use its default initial state.
        """
        embed = self.embed(sentences)
        # Forward the caller's state; it used to be accepted but ignored.
        o, _ = self.rnn(embed, state)
        return self.linear(o)
    
#    build the network, then move its parameters to the selected device
model = RNN()
model = model.to(device)

In [14]:
#    Adam (learning rate 0.005) updates the model; cross entropy is the training loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = torch.nn.CrossEntropyLoss()

### Training

In [15]:
#    training
# Every optimizer step uses the whole data set, so the inputs and the flattened
# targets are moved to the device once, before the loops. They get their own
# names: X_orig keeps its 2-D shape and this cell can be re-run without
# depending on state left behind by a previous run.
inputs = X_enc.to(device)
targets = X_orig.flatten().to(device)
steps_per_epoch = len(X_enc)  # number of full-batch steps per epoch (one per sentence)

model.train()  # make sure the model is in training mode if this cell is re-run after model.eval()
for ep in range(10):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    for _ in range(steps_per_epoch):
        optimizer.zero_grad()
        # call the module itself rather than .forward() so registered hooks run
        answers = model(inputs)
        # (batch, sequence, classes) -> (batch * sequence, classes) to line up with the flat targets
        answers = answers.view(-1, len(dict_char))
        loss = criterion(answers, targets)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}. Time: {:.3f}, Train loss: {:.6f}".format(ep+1, time.time() - start, train_loss / train_passed))

Epoch 1. Time: 1.611, Train loss: 0.221189
Epoch 2. Time: 1.474, Train loss: 0.000174
Epoch 3. Time: 1.468, Train loss: 0.000129
Epoch 4. Time: 2.981, Train loss: 0.000101
Epoch 5. Time: 3.261, Train loss: 0.000082
Epoch 6. Time: 3.159, Train loss: 0.000068
Epoch 7. Time: 3.286, Train loss: 0.000057
Epoch 8. Time: 3.162, Train loss: 0.000048
Epoch 9. Time: 3.220, Train loss: 0.000042
Epoch 10. Time: 3.204, Train loss: 0.000036


In [16]:
#   function for predict
def prediction(test):
    """Encrypt ``test`` with the notebook's cipher settings and let the model decode it.

    Args:
        test: plain-text string.

    Returns:
        The string assembled from the model's most likely character at each
        position; positions whose prediction is the 'None' index are skipped.
    """
    test_enc = [ceasar_cipher(test, shift, alphabet)]
    # Keep the previous width of 100 for short inputs, but grow with longer
    # texts so they are no longer silently truncated.
    sen_len = max(len(test), 100)
    test_tensor = to_torch(test_enc, sen_len, dict_char)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        predict = model(test_tensor.to(device))
    best_ids = predict.squeeze(0).argmax(dim=-1).tolist()

    # Reverse lookup built once instead of scanning the dictionary per character.
    id_to_char = {idx: char for char, idx in dict_char.items()}
    none_id = dict_char['None']
    return ''.join(id_to_char[idx] for idx in best_ids if idx != none_id)

In [17]:
prediction("Gaius Julius Caesar was a Roman general and statesman")

'Gaius Julius Caesar was a Roman general and statesman'

### How to measure the quality of the Decryptor?

The quality of a decoder is determined by its ability to decode all characters in the input sequence. The distortion of at least one character in the output is unacceptable. Therefore, we will measure the quality of the model as follows: 

- we feed the encrypted sequence to the model input
- run it through the model and compare the output with the original (unencrypted) sequence
- if at least one symbol is decoded incorrectly, we consider that the whole sentence is erroneous

In [18]:
#   whole-sentence accuracy on freshly generated random strings
model.eval()
start = time.time()
n_test = 100
k = 0
with torch.no_grad():
    for i in range(n_test):
        true_sentence = generate_random_string(50, 1)
        predict_sentence = prediction(true_sentence[0])
        #   a sentence only counts when every character was recovered
        k += int(predict_sentence == true_sentence[0])

print('Correct answers: {:.2f}%'.format(k/n_test * 100))
print('{:.2f} seconds'.format(time.time() - start))

Correct answers: 100.00%
14.44 seconds
