In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import random
import time
from IPython.display import clear_output
from torch.utils.data import TensorDataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(1111)
random.seed(1111)

device

device(type='cuda')

# **Utils**

In [2]:
def encrypt(text, shift, alphabet):
    new_alphabet = alphabet[shift % len(alphabet):] + alphabet[:shift % len(alphabet)]
    result = ''
    for char in text:
        if char in alphabet:
            result += new_alphabet[alphabet.index(char)]
        else:
            result += char
    return result

def decrypt(text, shift, alphabet):
    return encrypt(text, -shift, alphabet)

def generate_data(alphabet, max_len=10, shift=3):
    length = random.randint(5, max_len)
    text = ''.join(random.choice(alphabet) for _ in range(length))
    encrypted = encrypt(text, shift, alphabet=alphabet)
    return {'text': text, 'encrypted': encrypted, 'length': length}

def text_to_one_hot(text, alphabet):
    seq_len = len(text)
    one_hot = np.zeros((seq_len, len(alphabet)), dtype=np.float32)
    for i, char in enumerate(text):
        if char in alphabet:
            one_hot[i, alphabet.index(char)] = 1
    return torch.tensor(one_hot)

def dataset_to_tensors(dataset, alphabet):
    max_len = max(pair['length'] for pair in dataset)
    X = torch.zeros((len(dataset), max_len, len(alphabet)), dtype=torch.float32)
    y = torch.zeros((len(dataset), max_len, len(alphabet)), dtype=torch.float32)
    
    for i, pair in enumerate(dataset):
        X[i, :len(pair['encrypted'])] = text_to_one_hot(pair['encrypted'], alphabet)
        y[i, :len(pair['text'])] = text_to_one_hot(pair['text'], alphabet)
    
    return X, y

class CaesarNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(CaesarNet, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)
    
    def forward(self, sentence):
        o, _ = self.rnn(sentence)
        return self.linear(o)

    def train_model(self, train_loader, alphabet_size, lr, epochs=10):
        self.to(device)
        
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.parameters(), lr=lr, weight_decay=1e-4)
        start = time.time()
        
        for epoch in range(epochs):
            self.train()
            train_loss, train_acc, train_iter_num = 0., 0., 0.
            
            for X_batch, y_batch in train_loader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)
                
                optimizer.zero_grad()
                output = self(X_batch)
                y_indices = torch.argmax(y_batch, dim=-1)
                
                loss = criterion(output.view(-1, alphabet_size), y_indices.view(-1))
                loss.backward()
                optimizer.step()
                
                train_loss += loss.item()
                batch_acc = (output.argmax(dim=-1) == y_indices).float().mean().item()
                train_acc += batch_acc
                train_iter_num += 1
            
            clear_output(wait=True)
            print(
                f"Epoch: {epoch+1}, loss: {train_loss/train_iter_num:.4f}, acc: "
                f"{train_acc/train_iter_num:.4f}, "
                f"{time.time() - start:.2f} sec."
            )
        
        return self

    def predict(self, text, alphabet):
        self.eval()
        best_shift = 0
        best_score = -1.
        best_pred_text = ""
        
        for shift in range(len(alphabet)):
            encrypted = encrypt(text, shift, alphabet)
            X_test = text_to_one_hot(encrypted, alphabet)[None, :]
            with torch.no_grad():
                X_test = X_test.to(device)
                pred = self(X_test)
            pred_text = ''.join(alphabet[torch.argmax(char).item()] for char in pred[0][:len(text)])
            
            matches = sum(1 for c1, c2 in zip(text, pred_text) if c1 == c2)
            score = matches / len(text) if len(text) > 0 else 0.
            
            if score > best_score:
                best_score = score
                best_shift = shift
                best_pred_text = pred_text
        
        encrypted = encrypt(text, best_shift, alphabet)
        return encrypted, best_pred_text, best_shift

# **Creation Alphabet**

In [3]:
alphabet = 'abcdefghijklmnopqrstuvwxyz' + 'abcdefghijklmnopqrstuvwxyz'.upper() + 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя' + 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'.upper() + r'0123456789,. ?|/\[]()";:'

# **Variables**

In [4]:
alphabet_size = len(alphabet)
input_size = output_size = alphabet_size
hidden_size = 128
batch_size = 32
learning_rate = .001
num_epochs = 10
shift = random.randint(-3555456654575, 3555456654575) % alphabet_size

In [5]:
shift

109

# **Creation initial dataset**

In [6]:
dataset_init = [generate_data(alphabet, max_len=250, shift=shift) for _ in range(10000)]
X, y = dataset_to_tensors(dataset_init, alphabet)

In [7]:
dataset = TensorDataset(X, y)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# **Creation Model**

In [8]:
model = CaesarNet(input_size, hidden_size, output_size)
model

CaesarNet(
  (rnn): RNN(142, 128, batch_first=True)
  (linear): Linear(in_features=128, out_features=142, bias=True)
)

# **Training model**

In [9]:
model = model.train_model(train_loader, alphabet_size, learning_rate, num_epochs)

Epoch: 10, loss: 0.0203, acc: 1.0000, 22.08 sec.


# **Experiment 1: with bruteforce shift**

## **Testing Results**

In [10]:
text = "Привет мир, это я DJ 23"
encrypted, predicted, learning_shift = model.predict(text, alphabet)
print(f"Original: {text}")
print(f"Encrypted: {encrypted}")
print(f"Predicted: {predicted}")
print(f"Learning Shift: {learning_shift}")
print(f"Initial Shift: {shift}")

Original: Привет мир, это я DJ 23
Encrypted: пKCvyMЛGCKЙЛXMIЛZЛ)cЛВГ
Predicted: Привет мир, это я DJ 23
Learning Shift: 109
Initial Shift: 109


In [11]:
text = dataset_init[9999]['text']
encrypted, predicted, learning_shift = model.predict(text, alphabet)
print(f"Original: {text}")
print(f"Encrypted: {encrypted}")
print(f"Predicted: {predicted}")
print(f"Learning Shift: {learning_shift}")
print(f"Initial Shift: {shift}")

Original: nN/wHПЖrЖYyьRhТАIgсбЖиАVmЫLЩщРf9ZASc6Ц9zxЙcз"Ж1пщаИu]чЗкelf:ар1ШATEсКR1Ьуk:]uЬгЯ3s|yrц9еNAШ мкЕ7зюeОЮЖ ИокгщTw]бlyьЧл42ЩиLYkеЙ3QS5V3пШ/гтDwRjLДя"YЛЙыУlжrgуцФчz9ашБmрHETrРзБ)жOdйrd:|CйfЙoZ)БааRHrшJ9CdOyАэдaэщ
Encrypted: 4gО?aпж8жr/WkЮтаbЭLuжCаo3ыeщTрЬИs[lЩЁцИ\|йЩBФжБJTtи.СRзEЫ2ЬЦtKБш[m"LкkБьN1ЦС.ьwяГ9Н/8QИyg[шЛGEеЖBYЫоюжЛиIEwTm?Сu2/WчFДВщCer1yйГjlЕoГJшОwM)?k0eдZФrлйVу2A8ЭNQфR\ИtSб3Ka"m8рBбУAhЪD8ЪЦН(DЬй5sУбttka8ScИ(Ъh/аXxЧXT
Predicted: nN/wHПЖrЖYyьRhТАIgсбЖиАVmЫLЩщРf9ZASc6Ц9zxЙcз"Ж1пщаИu]чЗкelf:ар1ШATEсКR1Ьуk:]uЬгЯ3s|yrц9еNAШ мкЕ7зюeОЮЖ ИокгщTw]бlyьЧл42ЩиLYkеЙ3QS5V3пШ/гтDwRjLДя"YЛЙыУlжrgуцФчz9ашБmрHETrРзБ)жOdйrd:|CйfЙoZ)БааRHrшJ9CdOyАэдaэщ
Learning Shift: 109
Initial Shift: 109


# **Experiment 2: with trainable shift**

In [12]:
class CaesarNetTrainShift(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(CaesarNetTrainShift, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)
        self.shift = nn.Parameter(torch.tensor(float(random.randint(0, input_size - 1))))
        self.alphabet_size = input_size

    def forward(self, sentence):
        shift = torch.round(self.shift).long() % self.alphabet_size
        shifted_sentence = torch.roll(sentence, shifts=shift.item(), dims=-1)
        o, _ = self.rnn(shifted_sentence)
        return self.linear(o)

    def train_model(self, train_loader, alphabet_size, lr, epochs=10):
        self.to(device)
        
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam([
        {'params': [self.shift], 'lr': .01},
        {'params': [p for p in self.parameters() if p is not self.shift], 'lr': lr}
    ], weight_decay=1e-4)
        start = time.time()
        
        for epoch in range(epochs):
            self.train()
            train_loss, train_acc, train_iter_num = 0., 0., 0.
            
            for X_batch, y_batch in train_loader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)
                
                optimizer.zero_grad()
                output = self(X_batch)
                y_indices = torch.argmax(y_batch, dim=-1)
                
                loss = criterion(output.view(-1, alphabet_size), y_indices.view(-1))
                loss.backward()
                optimizer.step()
                
                train_loss += loss.item()
                batch_acc = (output.argmax(dim=-1) == y_indices).float().mean().item()
                train_acc += batch_acc
                train_iter_num += 1
            
            clear_output(wait=True)
            print(
                f"Epoch: {epoch+1}, loss: {train_loss/train_iter_num:.4f}, acc: "
                f"{train_acc/train_iter_num:.4f},"
                f"{time.time() - start:.2f} sec."
            )
        
        return self
    
    def predict(self, text, alphabet, level=1/2):
        self.eval()

        learning_shift = int(round(self.shift.item())) % len(alphabet)
        best_shift = 0
        best_score = -1.
        best_pred_text = ""
        
        for shift in range(learning_shift, len(alphabet)):
            encrypted = encrypt(text, shift, alphabet)
            X_test = text_to_one_hot(encrypted, alphabet)[None, :]
            with torch.no_grad():
                X_test = X_test.to(device)
                pred = self(X_test)
            pred_text = ''.join(alphabet[torch.argmax(char).item()] for char in pred[0][:len(text)])
            
            matches = sum(1 for c1, c2 in zip(text, pred_text) if c1 == c2)
            score = matches / len(text) if len(text) > 0 else 0.
            
            if score > best_score:
                best_score = score
                best_shift = shift
                best_pred_text = pred_text
            
            if score > level:
                break
        
        encrypted = encrypt(text, best_shift, alphabet)
        return encrypted, best_pred_text, best_shift

In [13]:
model1 = CaesarNetTrainShift(input_size, hidden_size, output_size)
model1

CaesarNetTrainShift(
  (rnn): RNN(142, 128, batch_first=True)
  (linear): Linear(in_features=128, out_features=142, bias=True)
)

In [14]:
model1 = model1.train_model(train_loader, alphabet_size, learning_rate, num_epochs)

Epoch: 10, loss: 0.0204, acc: 1.0000,27.38 sec.


In [15]:
model1.shift

Parameter containing:
tensor(61., device='cuda:0', requires_grad=True)

In [16]:
text = "Привет мир, это я DJ 23"
encrypted, predicted, learning_shift = model1.predict(text, alphabet)
print(f"Original: {text}")
print(f"Encrypted: {encrypted}")
print(f"Predicted: {predicted}")
print(f"Learning Shift: {learning_shift}")
print(f"Initial Shift: {shift}")

Original: Привет мир, это я DJ 23
Encrypted: пKCvyMЛGCKЙЛXMIЛZЛ)cЛВГ
Predicted: Привет мир, это я DJ 23
Learning Shift: 109
Initial Shift: 109


# **Simpsons Dataset**

## **Creation dataset**

In [33]:
df = pd.read_csv('~/Downloads/simpsons_script_lines.csv', low_memory=False)
print(df.dtypes)
df.head(10)

id                      int64
episode_id              int64
number                  int64
raw_text               object
timestamp_in_ms        object
speaking_line          object
character_id           object
location_id           float64
raw_character_text     object
raw_location_text      object
spoken_words           object
normalized_text        object
word_count             object
dtype: object


Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,9549,32,209,"Miss Hoover: No, actually, it was a little of ...",848000,True,464.0,3.0,Miss Hoover,Springfield Elementary School,"No, actually, it was a little of both. Sometim...",no actually it was a little of both sometimes ...,31.0
1,9550,32,210,Lisa Simpson: (NEAR TEARS) Where's Mr. Bergstrom?,856000,True,9.0,3.0,Lisa Simpson,Springfield Elementary School,Where's Mr. Bergstrom?,wheres mr bergstrom,3.0
2,9551,32,211,Miss Hoover: I don't know. Although I'd sure l...,856000,True,464.0,3.0,Miss Hoover,Springfield Elementary School,I don't know. Although I'd sure like to talk t...,i dont know although id sure like to talk to h...,22.0
3,9552,32,212,Lisa Simpson: That life is worth living.,864000,True,9.0,3.0,Lisa Simpson,Springfield Elementary School,That life is worth living.,that life is worth living,5.0
4,9553,32,213,Edna Krabappel-Flanders: The polls will be ope...,864000,True,40.0,3.0,Edna Krabappel-Flanders,Springfield Elementary School,The polls will be open from now until the end ...,the polls will be open from now until the end ...,33.0
5,9554,32,214,Martin Prince: (HOARSE WHISPER) I don't think ...,877000,True,38.0,3.0,Martin Prince,Springfield Elementary School,I don't think there's anything left to say.,i dont think theres anything left to say,8.0
6,9555,32,215,Edna Krabappel-Flanders: Bart?,881000,True,40.0,3.0,Edna Krabappel-Flanders,Springfield Elementary School,Bart?,bart,1.0
7,9556,32,216,Bart Simpson: Victory party under the slide!,882000,True,8.0,3.0,Bart Simpson,Springfield Elementary School,Victory party under the slide!,victory party under the slide,5.0
8,9557,32,217,(Apartment Building: Ext. apartment building -...,889000,False,,374.0,,Apartment Building,,,
9,9558,32,218,Lisa Simpson: (CALLING) Mr. Bergstrom! Mr. Ber...,889000,True,9.0,374.0,Lisa Simpson,Apartment Building,Mr. Bergstrom! Mr. Bergstrom!,mr bergstrom mr bergstrom,4.0


In [34]:
phrases = df['normalized_text'].tolist()
phrases[:10]

['no actually it was a little of both sometimes when a disease is in all the magazines and all the news shows its only natural that you think you have it',
 'wheres mr bergstrom',
 'i dont know although id sure like to talk to him he didnt touch my lesson plan what did he teach you',
 'that life is worth living',
 'the polls will be open from now until the end of recess now just in case any of you have decided to put any thought into this well have our final statements martin',
 'i dont think theres anything left to say',
 'bart',
 'victory party under the slide',
 nan,
 'mr bergstrom mr bergstrom']

In [36]:
text = [[c for c in ph] for ph in phrases if type(ph) is str]

In [38]:
CHARS = set('abcdefghijklmnopqrstuvwxyz ')  # все символы, которые мы хотим использовать для кодировки = наш словарь
INDEX_TO_CHAR = ['none'] + [w for w in CHARS]  # все неизвестные символы будут получать тег none
CHAR_TO_INDEX = {w: i for i, w in enumerate(INDEX_TO_CHAR)}  # словарь токен-индекс
len(INDEX_TO_CHAR)

28

In [41]:
MAX_LEN = 50  # мы хотим ограничить максимальную длину ввода
X = torch.zeros((len(text), MAX_LEN), dtype=int)  # создаём пустой вектор для текста, чтобы класть в него индексы токенов
for i in range(len(text)):  # для каждого предложения
    for j, w in enumerate(text[i]):  # для каждого токена
        if j >= MAX_LEN:
            break
        X[i, j] = CHAR_TO_INDEX.get(w, CHAR_TO_INDEX['none'])

In [44]:
X.shape

torch.Size([132087, 50])

In [None]:
embeddings = nn.Embedding(len(INDEX_TO_CHAR), 28)  # размер словаря * размер вектора для кодировки каждого слова
t = embeddings(X[0:5])
t.shape

torch.Size([5, 50, 28])

# **Реализация сети с RNN**

In [48]:
class CustomRNNCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(CustomRNNCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.W_xh = nn.Linear(input_size, hidden_size)
        self.W_hh = nn.Linear(hidden_size, hidden_size)
        self.tanh = nn.Tanh()

    def forward(self, x, h_prev):
        h_t = self.tanh(self.W_xh(x) + self.W_hh(h_prev))
        return h_t


class CustomRNNNetwork(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super(CustomRNNNetwork, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn_cell = CustomRNNCell(embedding_dim, hidden_size)
        self.out = nn.Linear(hidden_size, vocab_size)
        self.hidden_size = hidden_size

    def forward(self, sentences, state=None):
        x = self.embedding(sentences)
        batch_size, seq_len, _ = x.size()
        
        if state is None:
            state = torch.zeros(batch_size, self.hidden_size)
        
        outputs = []
        for t in range(seq_len):
            state = self.rnn_cell(x[:, t, :], state)
            outputs.append(state)
        
        outputs = torch.stack(outputs, dim=1)
        return self.out(outputs)

In [49]:
vocab_size = len(INDEX_TO_CHAR)
embedding_dim = 30
hidden_size = 128

model = CustomRNNNetwork(vocab_size, embedding_dim, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=.05)

In [50]:
for ep in range(20):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    for i in range(int(len(X) / 100)):
        
        batch = X[i * 100:(i + 1) * 100]
        X_batch = batch[:, :-1]
        Y_batch = batch[:, 1:].flatten()

        optimizer.zero_grad()
        answers = model.forward(X_batch)
        answers = answers.view(-1, len(INDEX_TO_CHAR))
        loss = criterion(answers, Y_batch)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))

Epoch 0. Time: 17.751, Train loss: 1.671
Epoch 1. Time: 17.618, Train loss: 1.491
Epoch 2. Time: 17.767, Train loss: 1.418
Epoch 3. Time: 17.631, Train loss: 1.369
Epoch 4. Time: 17.474, Train loss: 1.333
Epoch 5. Time: 17.508, Train loss: 1.304
Epoch 6. Time: 17.483, Train loss: 1.281
Epoch 7. Time: 17.626, Train loss: 1.262
Epoch 8. Time: 18.001, Train loss: 1.245
Epoch 9. Time: 18.532, Train loss: 1.231
Epoch 10. Time: 18.507, Train loss: 1.218
Epoch 11. Time: 17.780, Train loss: 1.207
Epoch 12. Time: 17.714, Train loss: 1.198
Epoch 13. Time: 17.951, Train loss: 1.189
Epoch 14. Time: 17.628, Train loss: 1.181
Epoch 15. Time: 17.390, Train loss: 1.174
Epoch 16. Time: 17.936, Train loss: 1.168
Epoch 17. Time: 17.627, Train loss: 1.162
Epoch 18. Time: 17.415, Train loss: 1.156
Epoch 19. Time: 17.429, Train loss: 1.151


In [62]:
def generate_sentence(word):
    sentence = list(word.lower())
    sentence = [CHAR_TO_INDEX.get(s, 0) for s in sentence]
    model.eval()
    with torch.no_grad():
        inputs = torch.tensor(sentence, dtype=torch.long).unsqueeze(0)
        answers = model(inputs)
        probas, indices = answers.topk(1)
        return ''.join([INDEX_TO_CHAR[ind.item()] for ind in indices.flatten()])

In [57]:
CHAR_TO_INDEX['none']

0

In [88]:
generate_sentence('simpso')

'ompson'