<a href="https://colab.research.google.com/github/110805/Spelling_Correction/blob/master/seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/110805/Spelling_Correction.git
%cd Spelling_Correction/

Cloning into 'Spelling_Correction'...
remote: Enumerating objects: 32, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (28/28), done.[K
remote: Total 32 (delta 15), reused 12 (delta 4), pack-reused 0[K
Unpacking objects: 100% (32/32), done.
/content/Spelling_Correction


In [0]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import time
import math
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
from os import system
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from dataloader import sample_pair
from dataloader import Lang
import json



"""========================================================================================
The sample.py includes the following template functions:

1. Encoder, decoder
2. Training function
3. BLEU-4 score function

You have to modify them to complete the lab.
In addition, there are still other functions that you have to 
implement by yourself.

1. Your own dataloader (design it your own way; it does not have to be a PyTorch DataLoader)
2. Output your results (BLEU-4 score, correction words)
3. Plot loss/score
4. Load/save weights
========================================================================================"""

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Token indices used as the decoder's first input (SOS) and as the
# end-of-sequence marker that stops decoding (EOS).
SOS_token = 0
EOS_token = 1
#----------Hyper Parameters----------#
# Size of the embeddings and of the LSTM hidden/cell states.
hidden_size = 256
vocab_size = 17703 #The number of words in vocabulary
# Probability that train() feeds the ground-truth target back into the decoder.
teacher_forcing_ratio = 0.7
# NOTE(review): LR is not referenced by the visible code; trainIters() uses
# its own `learning_rate` default instead -- confirm whether this is intended.
LR = 0.05
# Default upper bound on the number of decoding steps in evaluate().
MAX_LENGTH = 10

################################
#Example inputs of compute_bleu
################################
#The target word
reference = 'variable'
#The word generated by your model
output = 'varable'

#compute BLEU-4 score
def compute_bleu(output, reference):
    """Score a predicted word against its target with smoothed sentence BLEU.

    Args:
        output: The word produced by the model.
        reference: The target word.

    Returns:
        The value returned by ``sentence_bleu`` with ``method1`` smoothing.

    Three equal weights are used when the reference has exactly three
    elements; otherwise the standard four BLEU-4 weights are used.
    NOTE(review): when plain strings are passed, the n-grams are presumably
    built over characters -- confirm against the NLTK documentation.
    """
    # A 3-element reference cannot contain a 4-gram, so drop the 4-gram weight.
    if len(reference) == 3:
        ngram_weights = (0.33,0.33,0.33)
    else:
        ngram_weights = (0.25,0.25,0.25,0.25)

    smoother = SmoothingFunction()
    return sentence_bleu(
        [reference],
        output,
        weights=ngram_weights,
        smoothing_function=smoother.method1,
    )

#Encoder
class EncoderRNN(nn.Module):
    """Single-layer LSTM encoder that consumes one token index per call.

    Args:
        input_size: Number of rows in the embedding table.
        hidden_size: Size of the embeddings and of the LSTM state.
    """

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        # Embedding and LSTM share the same width, so the embedded token can
        # be fed to the LSTM directly.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Advance the encoder by one step.

        Args:
            input: Tensor holding a single token index.
            hidden: ``(h, c)`` state tuple from the previous step.

        Returns:
            ``(output, hidden)`` exactly as produced by the LSTM.
        """
        # Reshape the embedded token to (seq_len=1, batch=1, hidden_size).
        step_input = self.embedding(input).view(1, 1, -1)
        return self.lstm(step_input, hidden)

    def initHidden(self):
        """Return a zero ``(h, c)`` state pair allocated on ``device``."""
        return tuple(
            torch.zeros(1, 1, self.hidden_size, device=device)
            for _ in range(2)
        )

#Decoder
class DecoderRNN(nn.Module):
    """Single-layer LSTM decoder that emits vocabulary scores one step at a time.

    Args:
        hidden_size: Size of the embeddings and of the LSTM state.
        output_size: Number of rows in the embedding table and number of
            scores produced per step.
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Advance the decoder by one step.

        Args:
            input: Tensor holding a single token index.
            hidden: ``(h, c)`` LSTM state tuple from the previous step.

        Returns:
            ``(scores, hidden)`` where ``scores`` has shape
            ``(1, output_size)`` (unnormalised) and ``hidden`` is the
            updated ``(h, c)`` state.
        """
        # Reshape the embedded token to (seq_len=1, batch=1, hidden_size)
        # and apply ReLU before the recurrent layer.
        embedded = F.relu(self.embedding(input).view(1, 1, -1))
        output, hidden = self.lstm(embedded, hidden)
        # output[0] drops the length-1 sequence dimension.
        return self.out(output[0]), hidden

    def initHidden(self):
        """Return a zero ``(h, c)`` state pair usable as the LSTM state.

        An LSTM needs both a hidden and a cell state, so a tuple is returned
        (matching ``EncoderRNN.initHidden``) instead of a single tensor. The
        tensors are created on the same device as this module's weights so
        the state can always be passed straight to ``forward``.
        """
        state_device = self.embedding.weight.device
        return (
            torch.zeros(1, 1, self.hidden_size, device=state_device),
            torch.zeros(1, 1, self.hidden_size, device=state_device),
        )

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) pair.

    The input is fed through the encoder one element at a time; the final
    encoder state initialises the decoder, which is then unrolled over the
    target. With probability ``teacher_forcing_ratio`` the ground-truth
    target is fed back as the next decoder input; otherwise the decoder's
    own top-1 prediction is fed back and decoding stops early once it
    predicts ``EOS_token``.

    Args:
        input_tensor: Source indices, iterated along dimension 0
            (presumably shape ``(length, 1)`` -- confirm against the
            dataloader).
        target_tensor: Target indices, iterated along dimension 0. Must be
            non-empty, otherwise no loss tensor exists to back-propagate.
        encoder: Module exposing ``initHidden()`` and a per-step forward.
        decoder: Module with a per-step forward returning
            ``(scores, hidden)``.
        encoder_optimizer: Optimiser for the encoder parameters.
        decoder_optimizer: Optimiser for the decoder parameters.
        criterion: Loss applied to each step's scores and target element.
        max_length: Unused; kept so existing callers keep working.

    Returns:
        The summed step losses divided by the full target length (also when
        decoding stopped early), as a Python float.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    target_length = target_tensor.size(0)

    #----------sequence to sequence part for encoder----------#
    # Only the final encoder state is needed; per-step outputs are discarded.
    encoder_hidden = encoder.initHidden()
    for ei in range(input_tensor.size(0)):
        _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    # One draw per training pair decides the feeding strategy for all steps.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    #----------sequence to sequence part for decoder----------#
    loss = 0
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input.
            decoder_input = target_tensor[di]
        else:
            # Free running: feed the model's own prediction, detached so the
            # choice of input is not part of the backward graph.
            _, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length


def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``.

    Args:
        s: Duration in seconds (int or float).

    Returns:
        A string with the whole minutes and the remaining seconds, the
        latter truncated to an integer.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    # int() truncates toward zero, matching what '%d' does with a float.
    return '{:d}m {:d}s'.format(int(whole_minutes), int(leftover_seconds))


def timeSince(since, percent):
    """Report elapsed time and a linear estimate of the time remaining.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work completed so far; must be non-zero.

    Returns:
        ``'<elapsed> (- <remaining>)'`` with both parts formatted by
        ``asMinutes``.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration from the completed fraction, then
    # subtract what has already passed.
    remaining = elapsed / percent - elapsed
    return '{} (- {})'.format(asMinutes(elapsed), asMinutes(remaining))



def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train the encoder/decoder with SGD and save a loss curve.

    Reads ``train.json``, builds the vocabulary, samples the training pairs
    and then calls ``train`` once per iteration. The averaged loss is printed
    every ``print_every`` iterations and recorded every ``plot_every``
    iterations; the recorded values are plotted to the file
    ``TrainingLoss``.

    Args:
        encoder: Encoder module to optimise.
        decoder: Decoder module to optimise.
        n_iters: Number of single-pair training steps. When it exceeds the
            number of sampled pairs, the pairs are reused from the start
            instead of raising ``IndexError``.
        print_every: Interval, in iterations, between progress lines.
        plot_every: Interval, in iterations, between recorded plot points.
        learning_rate: Learning rate for both SGD optimisers.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    with open('train.json') as f:
        voc = json.load(f)

    lang = Lang()
    lang.addWord(voc)
    # NOTE(review): 7461 is presumably the number of training pairs that can
    # be drawn from train.json -- confirm against the dataloader.
    training_pairs = [sample_pair(lang, i) for i in range(7461)]
    print('Finish sampling')
    criterion = nn.CrossEntropyLoss()

    n_pairs = len(training_pairs)
    for step in range(1, n_iters + 1):
        # Wrap around so that n_iters may exceed the number of sampled pairs.
        training_pair = training_pairs[(step - 1) % n_pairs]
        input_tensor = training_pair[0].to(device)
        target_tensor = training_pair[1].to(device)

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                         step, step / n_iters * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    plt.figure(1)
    # One point was recorded per completed plot_every window.
    plt.plot(range(len(plot_losses)), plot_losses)
    # Label the axis with the actual interval ('Iterations*100' by default).
    plt.xlabel('Iterations*%d' % plot_every)
    plt.ylabel('CrossEntropyLoss')
    plt.savefig('TrainingLoss')

# Build the encoder and decoder over the same vocabulary and hidden size,
# and move both to the selected device.
encoder1 = EncoderRNN(vocab_size, hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, vocab_size).to(device)
# NOTE: only 5 training iterations are requested here, and with
# print_every=5000 no progress line is printed during the run.
trainIters(encoder1, decoder1, 5, print_every=5000)
# Save the parameters (state_dicts) of both modules in the working directory.
torch.save(encoder1.state_dict(), 'encoder.pkl')
torch.save(decoder1.state_dict(), 'decoder.pkl')


In [23]:
def evaluate(encoder, decoder, lang, input_string, max_length=MAX_LENGTH):
    """Greedily decode the model's output for a single input word.

    The input word is looked up in ``lang.word2index``, followed by
    ``EOS_token``, and fed through the encoder. The decoder then starts from
    ``SOS_token`` and the final encoder state, feeding back its top-1
    prediction at every step.

    Args:
        encoder: Encoder module exposing ``initHidden()``.
        decoder: Decoder module returning ``(scores, hidden)`` per step.
        lang: Vocabulary object providing ``word2index`` and ``index2word``.
        input_string: The word to correct; it must already be present in
            ``lang.word2index`` (the lookup is not guarded).
        max_length: Maximum number of decoding steps.

    Returns:
        List of decoded words, excluding the end-of-sequence marker. Empty
        when the first prediction is ``EOS_token``.
    """
    with torch.no_grad():
        input_tensor = torch.tensor(
            [lang.word2index[input_string], EOS_token], dtype=torch.long
        ).view(-1, 1).to(device)

        # Only the final encoder state is needed to start the decoder.
        encoder_hidden = encoder.initHidden()
        for ei in range(input_tensor.size(0)):
            _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
        decoder_hidden = encoder_hidden

        decoded_words = []
        for _ in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)

            # Greedy choice: take the highest-scoring index.
            _, topi = decoder_output.topk(1)
            predicted_index = topi.item()
            if predicted_index == EOS_token:
                break
            decoded_words.append(lang.index2word[predicted_index])

            decoder_input = topi.squeeze().detach()

        return decoded_words

def evalTestdata():
    """Evaluate the module-level ``encoder1``/``decoder1`` on ``test.json``.

    Rebuilds the vocabulary from ``train.json``, adds every test input word
    to it, then decodes each test input once, prints the input, target and
    prediction, and finally prints the BLEU score averaged over all test
    items (``0.0`` when the test file holds no items).
    """
    with open('train.json') as f:
        voc = json.load(f)

    lang = Lang()
    lang.addWord(voc)

    with open('test.json') as f:
        test_items = json.load(f)

    # Register the test inputs so that evaluate() can look them up.
    for data in test_items:
        lang.add(data['input'][0])

    score = 0
    for data in test_items:
        output = evaluate(encoder1, decoder1, lang, data['input'][0])
        print('input: {}'.format(data['input'][0]))
        print('target: {}'.format(data['target']))
        print('pred: {}'.format(output))

        # An empty prediction list is scored as the empty string.
        predicted_word = output[0] if output else ''
        score += compute_bleu(predicted_word, data['target'])

        print('--------------------')

    # Average over the actual number of test items rather than a fixed count.
    average = score / len(test_items) if test_items else 0.0
    print('BLEU-4 score:{}'.format(average))
    
# Run the evaluation; it uses the module-level encoder1 and decoder1.
evalTestdata()

input: contenpted
target: contented
pred: ['vegetables']
--------------------
input: begining
target: beginning
pred: ['facilitated']
--------------------
input: problam
target: problem
pred: ['says']
--------------------
input: dirven
target: driven
pred: ['searching']
--------------------
input: ecstacy
target: ecstasy
pred: ['elsewhere']
--------------------
input: juce
target: juice
pred: ['sound']
--------------------
input: localy
target: locally
pred: ['locally']
--------------------
input: compair
target: compare
pred: ['compare']
--------------------
input: pronounciation
target: pronunciation
pred: ['pronunciation']
--------------------
input: transportibility
target: transportability
pred: ['transportability']
--------------------
input: miniscule
target: minuscule
pred: ['minuscule']
--------------------
input: independant
target: independent
pred: ['independent']
--------------------
input: aranged
target: arranged
pred: ['monetary']
--------------------
input: poartry
tar