In [1]:
import torch
import warnings
import pandas as pd
import numpy as np
import string
import time
import re

# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings("ignore")

# This notebook is meant to run on the CPU.  Select it with an explicit flag
# instead of overwriting `torch.cuda.is_available`: replacing that function
# makes every later caller in the same kernel (including library code)
# believe that no GPU exists.  Set FORCE_CPU to False to pick up CUDA when
# it is available.
FORCE_CPU = True
device = "cuda" if (not FORCE_CPU and torch.cuda.is_available()) else "cpu"
device

'cpu'

Глобальные переменные

In [2]:
# --- Optimisation settings ---------------------------------------------
LEARNING_RATE = 0.01  # step size given to the SGD optimizer
NUM_EPOCHS = 20       # number of full passes of the training loop
BATCH_SIZE = 10       # samples per batch in both data loaders

# --- Corpus settings ---------------------------------------------------
string_size = 60      # characters read from the corpus per sample
file_path = 'HarryPotter.txt'  # plain-text corpus, relative to the notebook

Шифр Цезаря

In [3]:
class Cipher():
    """Caesar cipher over ASCII letters plus '.', '!', '?' and the space.

    The alphabet is ``string.ascii_letters + '.!? '``.  A character that
    belongs to the alphabet is replaced by the one ``step`` positions
    further on (wrapping around); any other character is passed through
    unchanged.

    Attributes:
        step: shift applied by ``encrypt`` (and undone by ``decrypt``).
        alphabet: the symbols that take part in the substitution.
        len_a: number of symbols in ``alphabet``.
    """

    def __init__(self, step):
        self.step = step
        self.alphabet = string.ascii_letters + '.!? '
        self.len_a = len(self.alphabet)

    def _shift(self, text, offset):
        """Return ``text`` (a str) with alphabet characters rotated by ``offset``.

        The translation table is rebuilt on every call so that changes made
        to ``step`` or ``alphabet`` after construction are still honoured.
        """
        k = offset % self.len_a  # Python's % keeps negative offsets in range
        rotated = self.alphabet[k:] + self.alphabet[:k]
        # str.translate leaves characters without a table entry untouched,
        # which is exactly the pass-through behaviour for foreign symbols.
        return text.translate(str.maketrans(self.alphabet, rotated))

    def encrypt(self, plaintext):
        """Shift every alphabet character of ``plaintext`` forward by ``step``."""
        return self._shift(plaintext, self.step)

    def decrypt(self, ciphertext):
        """Shift every alphabet character of ``ciphertext`` back by ``step``."""
        return self._shift(ciphertext, -self.step)
    
# Shared cipher instance (shift of 5) used by the data-preparation and
# inference cells; its alphabet and alphabet size are exposed as globals.
cipher = Cipher(step=5)
alphabet, len_a = cipher.alphabet, cipher.len_a

# Round-trip sanity check: decrypting an encrypted string must give it back.
cipher.decrypt(cipher.encrypt('Harry Potter and the Goblet of Fire'))

'Harry Potter and the Goblet of Fire'

Генерация тензоров на обучение

In [4]:
def make_tensor(file_path, step):
    """Turn a text file into encrypted/plain index tensors for train and test.

    The file is read in consecutive chunks of ``step`` characters.  Every
    character outside ``[a-zA-Z.!? ]`` is replaced by a space, so each
    remaining character belongs to the cipher alphabet.  Each chunk yields
    one target row (indices of the plain text) and one input row (indices
    of the text encrypted with the global ``cipher``).  The first 4/5 of
    the chunks form the training split, the remainder the test split.

    Only chunks of exactly ``step`` characters are kept, which guarantees
    rectangular tensors.  Unlike a blanket "drop the last chunk", this keeps
    the final chunk when the file length is an exact multiple of ``step``.

    Args:
        file_path: path of the text corpus.
        step: number of characters per sample.

    Returns:
        ``(x_train, y_train, x_test, y_test)`` as ``torch.long`` tensors.

    Raises:
        ValueError: if the file holds fewer than ``step`` characters, so
            that not a single full sample can be built.
    """
    chunks = []
    with open(file_path, errors='ignore') as file:
        # iter(callable, sentinel) keeps reading until read() returns '',
        # i.e. until the end of the file.
        for raw in iter(lambda: file.read(step), ''):
            if len(raw) == step:
                chunks.append(re.sub(r'[^a-zA-Z.!? ]', r' ', raw))

    if not chunks:
        raise ValueError(
            f"{file_path!r} contains fewer than {step} characters; "
            "cannot build a single sample"
        )

    # Convert every chunk once, then slice, instead of converting per split.
    plain_rows = [sent_to_index(chunk) for chunk in chunks]
    encrypted_rows = [sent_to_index(cipher.encrypt(chunk)) for chunk in chunks]
    split = 4 * len(chunks) // 5

    def as_tensor(rows):
        # An explicit integer dtype keeps an empty split usable as index data.
        return torch.tensor(rows, dtype=torch.long)

    x_train, y_train = as_tensor(encrypted_rows[:split]), as_tensor(plain_rows[:split])
    x_test, y_test = as_tensor(encrypted_rows[split:]), as_tensor(plain_rows[split:])

    return x_train, y_train, x_test, y_test

def sent_to_index(sentence):
    """Map each character of ``sentence`` to its position in the global ``alphabet``.

    The lookup uses ``str.find``, so a character that is not part of the
    alphabet is reported as -1.
    """
    return list(map(alphabet.find, sentence))

# Build the train/test tensors from the corpus: x_* hold the encrypted text,
# y_* the plain text, both as alphabet indices, `string_size` characters per row.
x_train, y_train, x_test, y_test = make_tensor(file_path, string_size)

DataLoader

In [5]:
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    """Map-style dataset serving aligned ``(x[idx], y[idx])`` pairs."""

    def __init__(self, x, y):
        """Store both collections; the dataset length is taken from ``x``."""
        super().__init__()
        self.x, self.y = x, y
        # The length is captured once, at construction time.
        self._len = len(x)

    def __getitem__(self, idx):
        """Return the input at ``idx`` together with its target."""
        sample, target = self.x[idx], self.y[idx]
        return sample, target

    def __len__(self):
        """Number of samples, as recorded when the dataset was created."""
        return self._len
    
# Training batches are reshuffled every epoch.
train = DataLoader(MyDataset(x_train, y_train), batch_size=BATCH_SIZE, shuffle=True)
# Evaluation only accumulates loss and accuracy over the whole split, so the
# order of the batches is irrelevant: keep it fixed rather than spending
# random numbers on shuffling the test data.
test = DataLoader(MyDataset(x_test, y_test), batch_size=BATCH_SIZE, shuffle=False)

In [6]:
class RNNModel(torch.nn.Module):
    """Embedding -> single-layer RNN -> linear layer applied at every position.

    For a batch of index sequences of shape ``(batch, seq)`` the model
    returns one score vector per position, shape ``(batch, seq, vocab_size)``.

    Args:
        vocab_size: number of distinct symbols, used both for the embedding
            table and for the output layer.  Defaults to the global
            ``len_a`` (the cipher alphabet size), looked up when the model
            is created.
        embed_dim: size of the symbol embeddings.
        hidden_dim: size of the RNN hidden state.
    """

    def __init__(self, vocab_size=None, embed_dim=32, hidden_dim=128):
        super().__init__()
        if vocab_size is None:
            # Tie the model to the alphabet size instead of a hard-coded
            # embedding table that does not match it.
            vocab_size = len_a
        self.embed = torch.nn.Embedding(vocab_size, embed_dim)
        self.rnn = torch.nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.linear = torch.nn.Linear(hidden_dim, vocab_size)

    def forward(self, sentence, state=None):
        """Return per-position scores for ``sentence``.

        Args:
            sentence: integer tensor of symbol indices, ``(batch, seq)``.
            state: optional initial hidden state for the RNN; ``None``
                (the default) starts from zeros.
        """
        x = self.embed(sentence)
        # Forward the caller's initial state; the final hidden state is not
        # part of this model's output.
        out, _ = self.rnn(x, state)
        return self.linear(out)

In [7]:
# Criterion: cross-entropy between the per-character scores and the
# plain-text indices.
loss = torch.nn.CrossEntropyLoss()

# Network and a plain SGD optimizer over all of its parameters.
model = RNNModel()
optimizer = torch.optim.SGD(params=model.parameters(), lr=LEARNING_RATE)

In [8]:
def _run_epoch(model, loader, loss_fn, num_classes, optimizer=None):
    """Run one pass over ``loader`` and return ``(summed loss, mean batch accuracy)``.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch.  Without one the model is put in evaluation mode and the
    pass runs with gradient tracking disabled, so no autograd graph is built.
    An empty loader yields ``(0.0, 0.0)`` instead of a division by zero.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum, acc_sum, num_batches = 0.0, 0.0, 0
    with torch.set_grad_enabled(training):
        for x, y in loader:
            # Flatten targets to one class index per character.
            targets = y.reshape(-1)
            if training:
                optimizer.zero_grad()
            # Call the module itself (not .forward) so registered hooks run.
            logits = model(x).reshape(-1, num_classes)
            batch_loss = loss_fn(logits, targets)
            if training:
                batch_loss.backward()
                optimizer.step()
            hits = logits.argmax(dim=1) == targets
            loss_sum += batch_loss.item()
            acc_sum += hits.sum().item() / hits.shape[0]
            num_batches += 1
    return loss_sum, acc_sum / max(num_batches, 1)


for epoch in range(NUM_EPOCHS):
    start_epoch_time = time.time()

    # The reported loss is the sum of the batch losses over the epoch; the
    # accuracy is the mean of the per-batch accuracies.
    train_loss, train_acc = _run_epoch(model, train, loss, len_a, optimizer)
    print(f"Epoch: {epoch+1}, loss: {train_loss:.4f}, acc: "
        f"{train_acc:.4f}",
        end=" | ")

    test_loss, test_acc = _run_epoch(model, test, loss, len_a)
    print(
        f"test loss: {test_loss:.4f}, test acc: {test_acc:.4f} | "
        f"{time.time() - start_epoch_time:.2f} sec."
    )

Epoch: 1, loss: 1401.8447, acc: 0.8596 | test loss: 104.4891, test acc: 0.9653 | 19.99 sec.
Epoch: 2, loss: 293.8605, acc: 0.9686 | test loss: 53.0091, test acc: 0.9731 | 20.13 sec.
Epoch: 3, loss: 183.7583, acc: 0.9774 | test loss: 37.7654, test acc: 0.9816 | 19.40 sec.
Epoch: 4, loss: 138.1617, acc: 0.9838 | test loss: 29.4085, test acc: 0.9870 | 19.38 sec.
Epoch: 5, loss: 109.4519, acc: 0.9882 | test loss: 23.6644, test acc: 0.9914 | 20.07 sec.
Epoch: 6, loss: 88.7500, acc: 0.9923 | test loss: 19.4287, test acc: 0.9943 | 18.05 sec.
Epoch: 7, loss: 73.1692, acc: 0.9942 | test loss: 16.2540, test acc: 0.9948 | 17.76 sec.
Epoch: 8, loss: 61.2964, acc: 0.9957 | test loss: 13.7912, test acc: 0.9962 | 18.00 sec.
Epoch: 9, loss: 52.1300, acc: 0.9967 | test loss: 11.8612, test acc: 0.9968 | 18.02 sec.
Epoch: 10, loss: 44.9032, acc: 0.9972 | test loss: 10.3360, test acc: 0.9972 | 18.04 sec.
Epoch: 11, loss: 39.1301, acc: 0.9974 | test loss: 9.0843, test acc: 0.9972 | 17.94 sec.
Epoch: 12, lo

In [9]:
sentence = 'Imagine a huge seaweed farm the size of Croatia floating in the South Atlantic between Africa and South America.'

encrypted_sentence = cipher.encrypt(sentence)
encrypted_sentence_idx = sent_to_index(encrypted_sentence)

# Inference only: set evaluation mode explicitly (rather than relying on the
# state left behind by the training cell) and skip gradient tracking.
model.eval()
with torch.no_grad():
    # A batch of one sentence; argmax over the last axis picks the most
    # likely alphabet index for every position.
    result = model(torch.tensor([encrypted_sentence_idx])).argmax(dim=2)
predicted_sentence = "".join(alphabet[i] for i in result.flatten().tolist())

print(f'Encrypted sentence is : \n{encrypted_sentence}')
print('\n' + "*" * 40 + '\n')
print(f'Decrypted sentence: \n{cipher.decrypt(encrypted_sentence)}')
print('\n' + "*" * 40 + '\n')
print(f'Predicted sentence: \n{predicted_sentence}')

Encrypted sentence is : 
NrflnsjefemzljexjfBjjiekfwreymjexnEjetkeHwtfynfekqtfynslenseymjeXtzymeFyqfsynhegjyBjjseFkwnhfefsieXtzymeFrjwnhfb

****************************************

Decrypted sentence: 
Imagine a huge seaweed farm the size of Croatia floating in the South Atlantic between Africa and South America.

****************************************

Predicted sentence: 
Imagine a huge seaweed farm the size of Croatia floating in the South Atlantic between Africa and South America.
