# Задание 1.
Обучите нейронную сеть решать шифр Цезаря.

Что необходимо сделать:

1. Написать алгоритм шифра Цезаря для генерации выборки (каждая буква сдвигается на K позиций по алфавиту; например, при сдвиге на 2 буква «А» переходит в букву «В» и т. д.)
2. Сделать нейронную сеть
3. Обучить ее (вход - зашифрованная фраза, выход - дешифрованная фраза)
4. Проверить качество

In [None]:
import torch
import pandas as pd
import time
from sklearn.model_selection import train_test_split
import re
import copy
import random

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

## Caesar cipher

In [None]:
def caesar(string, num):
    """Encrypt ``string`` with a Caesar cipher shifted by ``num`` positions.

    ASCII letters are rotated within their own case (``a-z`` or ``A-Z``);
    every other character (spaces, digits, punctuation, non-ASCII letters)
    is copied through unchanged.

    Args:
        string: Text to encrypt.
        num: Shift in alphabet positions. May be negative (which decrypts a
            text encrypted with ``-num``) or larger than 26.

    Returns:
        The shifted text, the same length as ``string``.
    """
    shifted = []
    for c in string:
        if 'a' <= c <= 'z':
            base = ord('a')
        elif 'A' <= c <= 'Z':
            base = ord('A')
        else:
            shifted.append(c)
            continue
        # Modular arithmetic wraps correctly for any shift size or sign.
        shifted.append(chr((ord(c) - base + num) % 26 + base))
    return ''.join(shifted)

In [None]:
# Quick sanity check: shift every letter of 'abcdf' forward by one position.
caesar('abcdf', 1)

'bcdeg'

In [None]:
# Load the dialogue data, keep only the second-to-last column and drop the
# rows whose 'normalized_text' is missing.
df = (
    pd.read_csv('data.csv')
    .iloc[:, [-2]]
    .dropna(subset=['normalized_text'])
)
df

Unnamed: 0,normalized_text
0,maggie look whats that
1,lee-mur lee-mur
2,zee-boo zee-boo
3,im trying to teach maggie that nature doesnt e...
4,its like an ox only it has a hump and a dewlap...
...,...
11634,too bad we didnt come dressed as popular carto...
11635,yeah mom guess what for a dollar a man sold me...
11636,hows it going bart
11637,maybe you need to play on their sympathies mor...


In [None]:
# Draw a random Caesar shift of 1 to 10 positions.
# The shift is chosen individually for every row.
df['shift'] = [random.randint(1, 10) for _ in range(len(df))]

# Original line: keep only runs of word characters, joined by single spaces.
# The pattern is a raw string so that '\w' is not treated as an (invalid)
# string escape sequence.
df['real_phrase'] = [' '.join(re.findall(r'\w+', phrase)) for phrase in df['normalized_text']]

# Encrypted line: every phrase is shifted by the shift stored in its own row.
df['cipher_phrase'] = [
    caesar(phrase, shift)
    for phrase, shift in zip(df['real_phrase'], df['shift'])
]

# Drop the first column (the raw 'normalized_text'); only the derived
# columns are used from here on.
df = df.iloc[:, 1:]
df.head()

Unnamed: 0,shift,real_phrase,cipher_phrase
0,9,maggie look whats that,vjpprn uxxt fqjcb cqjc
1,10,lee mur lee mur,voo web voo web
2,9,zee boo zee boo,inn kxx inn kxx
3,9,im trying to teach maggie that nature doesnt e...,rv cahrwp cx cnjlq vjpprn cqjc wjcdan mxnbwc n...
4,5,its like an ox only it has a hump and a dewlap...,nyx qnpj fs tc tsqd ny mfx f mzru fsi f ijbqfu...


## Translate phrases into tensor

In [None]:
# Hold out 20% of the rows for evaluation.
train, test = train_test_split(df, test_size=0.2)

# Inputs are the encrypted phrases, targets are the original phrases.
train_list, train_ceaser = (
    train[column].tolist() for column in ('cipher_phrase', 'real_phrase')
)
test_list, test_ceaser = (
    test[column].tolist() for column in ('cipher_phrase', 'real_phrase')
)

In [None]:
# Split every phrase into a list of single characters; entries that are not
# plain strings are skipped.
train_text = [list(ph) for ph in train_list if type(ph) is str]
train_label = [list(ph) for ph in train_ceaser if type(ph) is str]
test_text = [list(ph) for ph in test_list if type(ph) is str]
test_label = [list(ph) for ph in test_ceaser if type(ph) is str]

In [None]:
# Vocabulary in a fixed order, so the character <-> index mapping is the same
# on every run (iterating a set of characters gives an arbitrary order).
# Index 0 ('none') is both the padding value and the fallback for characters
# outside the vocabulary. Keeping ' ' in the vocabulary lets the model tell
# word separators apart from padding.
ALPHABET = ['none', ' '] + list('abcdefghijklmnopqrstuvwxyz')
INDEX_TO_CHAR = dict(enumerate(ALPHABET))
CHAR_TO_INDEX = {w: i for i, w in enumerate(ALPHABET)}
MAX_LEN = 50


def convert_to_torch(text, max_len=MAX_LEN):
    """Encode phrases as a fixed-width tensor of vocabulary indices.

    Args:
        text: Sequence of phrases, each an iterable of single characters.
        max_len: Number of positions per phrase. Longer phrases are
            truncated, shorter ones are right-padded with the 'none' index.

    Returns:
        An int64 tensor of shape ``(len(text), max_len)``.
    """
    pad = CHAR_TO_INDEX['none']
    rows = []
    for phrase in text:
        # zip with range() stops after max_len characters (truncation).
        ids = [CHAR_TO_INDEX.get(w, pad) for _, w in zip(range(max_len), phrase)]
        ids.extend([pad] * (max_len - len(ids)))
        rows.append(ids)
    # reshape keeps the 2-D shape even when there are no rows at all.
    return torch.tensor(rows, dtype=torch.long).reshape(len(rows), max_len)

In [None]:
# Encode inputs (cipher text) and targets (plain text) for both splits.
X_train, Y_train, X_test, Y_test = (
    convert_to_torch(chars)
    for chars in (train_text, train_label, test_text, test_label)
)

## RNN

In [None]:
class RNN_Network(torch.nn.Module):
    """Character-level RNN producing per-position logits over the vocabulary.

    Args:
        vocab_size: Number of distinct token ids; used for both the embedding
            table and the output layer so the two always agree. Defaults to
            ``len(ALPHABET)``.
        embedding_dim: Size of each character embedding.
        hidden_size: Size of the RNN hidden state.
    """

    def __init__(self, vocab_size=None, embedding_dim=28, hidden_size=128):
        super().__init__()
        if vocab_size is None:
            vocab_size = len(ALPHABET)
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rnn = torch.nn.RNN(embedding_dim, hidden_size, batch_first=True)
        # One logit per vocabulary entry, rather than a hard-coded count.
        self.linear = torch.nn.Linear(hidden_size, vocab_size)

    def forward(self, sentences, state=None):
        """Return ``(logits, new_state)`` for a batch of index sequences.

        Args:
            sentences: Integer tensor of token ids, batch dimension first.
            state: Optional initial hidden state passed to the RNN.

        Returns:
            A tuple of the per-position logits and the final RNN state.
        """
        embd = self.embedding(sentences)
        out, new_state = self.rnn(embd, state)
        result = self.linear(out)
        return result, new_state

In [None]:
# Network, loss and optimizer used by the training loop.
model = RNN_Network()
model = model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)

# Start from a value far above any real loss so the first batch always
# counts as an improvement.
loss_best = 10_000_000_000

## Train

In [None]:
iter_n = 100
batch_size = 100
# Report roughly ten times per run; max() avoids a modulo by zero when
# iter_n is smaller than 10.
report_every = max(1, iter_n // 10)

for epoch in range(iter_n):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    model.train()
    # Step through the data in batch_size chunks; the final, possibly
    # shorter, batch is included instead of being silently dropped.
    for begin in range(0, len(X_train), batch_size):
        X_batch = X_train[begin:begin + batch_size].to(device)
        Y_batch = Y_train[begin:begin + batch_size].flatten().to(device)

        optimizer.zero_grad()
        answers, _ = model(X_batch)
        # Flatten to (positions, classes) using the logits' own last
        # dimension, so the reshape cannot disagree with the network's
        # actual output size.
        answers = answers.reshape(-1, answers.shape[-1])
        loss = criterion(answers, Y_batch)
        batch_loss = loss.item()

        if batch_loss < loss_best:
            # deepcopy takes a real snapshot of the weights that produced
            # this loss; a shallow copy would share the parameter tensors
            # and keep changing as training continues.
            model_best = copy.deepcopy(model)
            # Store a plain float so the autograd graph is not kept alive.
            loss_best = batch_loss

        loss.backward()
        optimizer.step()
        train_loss += batch_loss
        train_passed += 1

    # Evaluate on the held-out set once per epoch, without gradients.
    model.eval()
    with torch.no_grad():
        answer, _ = model(X_test.to(device))
        answer = answer.reshape(-1, answer.shape[-1])
        test_loss = criterion(answer, Y_test.flatten().to(device)).item()

    if epoch % report_every == 0:
        print(f'Epoch {epoch}. Time: {time.time() - start:.3f}, Train loss: {train_loss / max(train_passed, 1):.3f}, Test loss: {test_loss:.6f}')

Epoch 0. Time: 0.630, Train loss: 0.293, Test loss: 0.353544
Epoch 10. Time: 0.248, Train loss: 0.257, Test loss: 0.333702
Epoch 20. Time: 0.243, Train loss: 0.227, Test loss: 0.319417
Epoch 30. Time: 0.251, Train loss: 0.212, Test loss: 0.302252
Epoch 40. Time: 0.255, Train loss: 0.192, Test loss: 0.301722
Epoch 50. Time: 0.246, Train loss: 0.255, Test loss: 0.279291
Epoch 60. Time: 0.263, Train loss: 0.170, Test loss: 0.292856
Epoch 70. Time: 0.249, Train loss: 0.161, Test loss: 0.300110
Epoch 80. Time: 0.250, Train loss: 0.172, Test loss: 0.286459
Epoch 90. Time: 0.255, Train loss: 0.146, Test loss: 0.296454


## Text decoding

In [None]:
# Encode every encrypted phrase of the full dataset for decoding.
text = convert_to_torch(
    [list(ph) for ph in df['cipher_phrase'] if type(ph) is str]
)

In [None]:
# Run the network once over the whole dataset (not once per row) and without
# gradient tracking, then take the most likely class id at every position.
model_best.eval()
with torch.no_grad():
    predicted_ids = model_best(text.to(device))[0].argmax(dim=2).cpu().tolist()

# Map class ids back to characters, one decoded string per row.
df['text_predict'] = [
    ''.join(INDEX_TO_CHAR[i] for i in row)
    for row in predicted_ids
]

In [None]:
df.head()

Unnamed: 0,shift,real_phrase,cipher_phrase,text_predict
0,9,maggie look whats that,vjpprn uxxt fqjcb cqjc,thonol suuk yogay anganonenonenonenonenonenone...
1,10,lee mur lee mur,voo web voo web,the tas mie murnonenonenonenonenonenonenonenon...
2,9,zee boo zee boo,inn kxx inn kxx,hhe boo yee boononenonenonenonenonenonenonenon...
3,9,im trying to teach maggie that nature doesnt e...,rv cahrwp cx cnjlq vjpprn cqjc wjcdan mxnbwc n...,im trying to teach maggie that nature doesnt e...
4,5,its like an ox only it has a hump and a dewlap...,nyx qnpj fs tc tsqd ny mfx f mzru fsi f ijbqfu...,ios lile an ow only it has a hump and a devlap...


Хорошее предсказание на длинных репликах, на коротких — неважное. Возможно, нужно больше итераций?

## Quality

In [None]:
# Show the lowest training-batch loss recorded during training.
loss_best

tensor(0.1184, device='cuda:0', grad_fn=<NllLossBackward0>)