In [77]:
#file preprocessing
from io import open
import unicodedata
import re
from sklearn.model_selection import train_test_split

#creating model
import random
import torch
import torch.nn as nn
import copy
import time

#score calculating
from nltk.translate.bleu_score import sentence_bleu

#warnings
import warnings

#default settings
warnings.filterwarnings("ignore")
%matplotlib inline
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [78]:
DATA_DIR = 'data/eng-nld/eng-nld.txt'
RANDOM_STATE = 39
HIDDEN_SIZE = 40
MAX_LENGTH = 50
CHARLIST = list('abcdefghijklmnopqrstuvwxyz.?! ') 
INDEX2CHAR =['none'] + [c for c in CHARLIST]
CHAR2INDEX = {w: i for i, w in enumerate(INDEX2CHAR)}

In [79]:
def caesarCipher(text, k = 6):
    cipher_text = copy.deepcopy(text)
    for i, sentence in enumerate(cipher_text):
        for j, letter in enumerate(sentence):
            cipher_text[i][j] = INDEX2CHAR[(CHAR2INDEX.get(letter,CHAR2INDEX['none']) + k) % len(INDEX2CHAR)]

    return cipher_text, text

In [80]:
def unicodeToAscii(sentence):
    return ''.join(
        c for c in unicodedata.normalize('NFD', sentence)
        if unicodedata.category(c) != 'Mn'
    )

def cleanText(sentence):
    sentence = unicodeToAscii(sentence.lower().strip()) 
    sentence = re.sub(r"([.!?])", r" \1", sentence)
    sentence = re.sub(r"[^a-zA-Z.!?]+", r" ", sentence)
    return sentence

In [81]:
def readLangs():
    print("Reading lines...")
    lines = open(DATA_DIR , encoding='utf-8').read().strip().split('\n')
    text = [[token for token in cleanText(l.split('\t')[0])] for l in lines]
    
    return text

In [82]:
def filterPair(sentence):
    return len(sentence) < MAX_LENGTH

def filterPairs(text):
    return [sentence for sentence in text if filterPair(sentence)]

In [83]:
def vectorizeText(text):
    vectorized_text = torch.zeros((len(text), MAX_LENGTH), dtype=int,device=device)
    
    for i,sentence in enumerate(text):
        for j, letter in enumerate(sentence):
            if j >= MAX_LENGTH:
                break
            vectorized_text[i, j] = CHAR2INDEX.get(letter,CHAR2INDEX['none'])
    
    return vectorized_text

In [84]:
def devectorizeText(vectorize_cipher):
    devectorize_text = []
    for word in vectorize_cipher:
        devectorize_text.append([INDEX2CHAR[letter.argmax()] for letter in word])

    return devectorize_text

In [85]:
def prepareData():
    text = readLangs()
    print("Read %s sentence" % len(text))
    text = filterPairs(text)
    print("Trimmed to %s sentence" % len(text))
    source, target = caesarCipher(text)
    print("Done")
    
    return source, target

In [86]:
class CipherModule(nn.Module):
    def __init__(self,input_size,hidden_size):
        super(CipherModule,self).__init__()
        
        self.embedding = nn.Embedding(input_size,hidden_size)
        self.rnn = nn.RNN(HIDDEN_SIZE,128,batch_first=True)
        self.linear = nn.Linear(128,input_size)
    
    def forward(self,sentence):
        embedding = self.embedding(sentence)
        output,hidden = self.rnn(embedding)
        return self.linear(output)
        

In [87]:
source, target = prepareData()

Reading lines...
Read 75298 sentence
Trimmed to 71491 sentence
Done


In [88]:
X_train,X_test,y_train,y_test = train_test_split(source,target,test_size=0.05,shuffle=False,random_state=RANDOM_STATE)

In [89]:
X_train,y_train = vectorizeText(X_train),vectorizeText(y_train)

In [90]:
X_train[0],y_train[0]

(tensor([ 9, 17,  1, 29,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        device='cuda:0'),
 tensor([ 7, 15, 30, 27,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        device='cuda:0'))

In [91]:
model = CipherModule(len(INDEX2CHAR),HIDDEN_SIZE).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr = 0.05)

In [92]:
def train(model,optimizer,criterion,source,target,epochs,batch_size=1000,print_every=10):

    for epoch in range(epochs):
        start = time.time()
        train_loss = 0.
        train_passed = 0
        
        model.train()
        for i in range(0,len(source),batch_size):
            optimizer.zero_grad()
            output = model.forward(source[i:i+batch_size])
            output = output.view(-1, len(INDEX2CHAR))
            loss = criterion(output,target[i:i+batch_size].flatten())
            
            loss.backward()
            optimizer.step()
            
            train_loss +=loss.item()
            train_passed += 1

        if epoch % print_every == 0:
            print("Epoch {}. Time: {:.3f}, Loss: {:.3f}".format(epoch, time.time() - start, train_loss / train_passed))
            

In [93]:
train(model,optimizer,criterion,X_train,y_train,100)

Epoch 0. Time: 0.680, Loss: 0.326
Epoch 10. Time: 0.790, Loss: 0.000
Epoch 20. Time: 0.771, Loss: 0.000
Epoch 30. Time: 0.742, Loss: 0.000
Epoch 40. Time: 0.741, Loss: 0.000
Epoch 50. Time: 0.744, Loss: 0.000
Epoch 60. Time: 0.735, Loss: 0.000
Epoch 70. Time: 0.731, Loss: 0.000
Epoch 80. Time: 0.700, Loss: 0.000
Epoch 90. Time: 0.735, Loss: 0.000


In [94]:
def evaluate(model, cipher_text):
    model.eval()
    vectorize_cipher = vectorizeText(cipher_text)
    vectorize_cipher = model(vectorize_cipher)
    devectorize_cipher = devectorizeText(vectorize_cipher)
    
    return [phrase[:phrase.index('none')] if 'none' in phrase else phrase for phrase in devectorize_cipher]

In [95]:
def evaluateRandomly(model,n=10):
    for i in range(n):
        sentence = [random.choice(source)]
        print('>', [''.join(sent) for sent in sentence][0])
        output_letters = evaluate(model,sentence)
        output_sentence = [''.join(sentence) for sentence in output_letters]
        print('<', output_sentence[0])
        print('')

In [96]:
evaluateRandomly(model)

> ygtga.qwavjgtga 
< were you there ?

> kavjkpmakaurgcmahtgpejadgvvgtavjcpavqoafqgua!
< i think i speak french better than tom does .

> kafqpavajcxgacadcvjkpiauwkva!
< i don t have a bathing suit .

> ngvaogajcxgacayqtfaykvjavqoa!
< let me have a word with tom .

> jgauavjgadq.aygaurqmgacdqwvavjgaqvjgtafc.a!
< he s the boy we spoke about the other day .

> vjgalcrcpgugageqpqo.aitgyad.ancuva.gcta!
< the japanese economy grew by last year .

> ygatgarjqvqitcrjgtua!
< we re photographers .

> .qwapgctn.adtqmgao.alcya!
< you nearly broke my jaw .

> vjgahkujgtogpaiqvawradghqtgafcypa!
< the fishermen got up before dawn .

> vjgtgauapqvjkpiatqocpvkeadgvyggpawua!
< there s nothing romantic between us .


In [104]:
def calculate_score(model,source,target):
    bleu_scores = []
    reference = target
    for i in range(len(source)):
        candidate =  evaluate(model,[source[i]])[0] 
        
        score = sentence_bleu(reference,candidate)
        bleu_scores.append(score)
    
    return (sum(bleu_scores)/len(source))*100
        

In [98]:
print('BLEU score: {} %'.format(round(calculate_score(model,X_test,y_test)),2))

BLEU score: 100 %
