<a href="https://colab.research.google.com/github/DmitryKutsev/eng_to_jap_translator/blob/main/last_attn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [178]:
!pip install tinysegmenter



*Kurohashi-Kawahara Lab. has the copyright of Japanese Basic Sentence Data, and NICT MASTAR Project, Multilingual Translation Lab. has the copyright of English and Chinese Basic Sentence Data. You can use all the data under the terms of the Creative Commons Attribution 3.0 Unported license.
     http://nlp.ist.i.kyoto-u.ac.jp/EN/?JEC%20Basic%20Sentence%20Data*

In [179]:
import math
import numpy as np
import pandas as pd
import random
import json
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from matplotlib import pyplot as plt
from math import ceil
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate import bleu_score
from io import open
import unicodedata
import string
import re
import random
import spacy
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import pandas as pd
import tinysegmenter

In [180]:
spacy_en = spacy.load('en')

In [181]:
segmenter = tinysegmenter.TinySegmenter()

In [182]:
device = torch.device('cuda:0')

In [183]:
my_frame = pd.read_excel('http://nlp.ist.i.kyoto-u.ac.jp/EN/?plugin=attach&refer=JEC%20Basic%20Sentence%20Data&openfile=JEC_basic_sentence_v1-2.xls')

In [184]:
#remove Chineese column
my_frame = my_frame.drop(['难道不会是X吗，我实在是感到怀疑。'], axis=1)
my_frame.columns = ['index', 'jp', 'en']
my_frame = my_frame.drop(['index'], axis=1)

In [185]:
for i in range(len(my_frame)):
  my_frame['en'][i] = my_frame['en'][i].lower()

In [186]:
my_frame

Unnamed: 0,jp,en
0,Xがいいなといつも思います,i always think x would be nice.
1,それがあるようにいつも思います,it always seems like it is there.
2,それが多すぎないかと正直思う,i honestly feel like there is too much.
3,山田はみんなに好かれるタイプの人だと思う,i think that yamada is the type everybody likes.
4,〜と誰かが思った,someone thought that 〜
...,...,...
5298,チームが４人のメンバーで構成されています,the team consists of four members.
5299,彼が実際に動画を再生する,he actually plays the video.
5300,政府が銀行に公的資金をどんどん投入しました,the government injected massive public funds i...
5301,レベル１の機能に下記の機能をプラスする,the following will be added to the level 1 fun...


In [187]:
segmenter.tokenize(my_frame['jp'][0])

['X', 'が', 'いい', 'な', 'といつも', '思い', 'ます']

In [188]:
[tok.text for tok in spacy_en.tokenizer(my_frame['en'][1])]

['it', 'always', 'seems', 'like', 'it', 'is', 'there', '.']

In [189]:
for word in spacy_en.tokenizer(my_frame['en'][1]):
  print(word)

it
always
seems
like
it
is
there
.


In [190]:
valid_border = ceil(len(my_frame)*0.8)

In [191]:
train_df = my_frame[:valid_border]
valid_df = my_frame[valid_border:]
len(train_df), len(valid_df)
valid_df = valid_df.reset_index()

In [192]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [193]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


# def normalizeString(s):
#     # s = unicodeToAscii(s.lower().strip())
#     s = re.sub(r"([.!?])", r" \1", s)
#     s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
#     return s

In [194]:
    for index, sent in enumerate(train_df['jp']):
      if index == 1:
        print(index, sent)
      pair = [str(spacy_en.tokenizer(train_df['en'][index].lower())), ' '.join(segmenter.tokenize(sent))]
      if index == 1:
        print(pair)

1 それがあるようにいつも思います
['it always seems like it is there.', 'それ が ある よう にいつも 思い ます']


In [195]:
def readLangs(lang1, lang2, frame, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    # lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
    #     read().strip().split('\n')
    pairs = []
    # Split every line into pairs and normalize
    for index, sent in enumerate(frame['jp']):
      pair = [str(spacy_en.tokenizer(frame['en'][index].lower())), ' '.join(segmenter.tokenize(sent))]
      pairs.append(pair)

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [196]:
MAX_LENGTH = 20


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH
        # p[0].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [197]:

def prepareData(lang1, lang2, frame, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, frame, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('en', 'jp', my_frame, False)
print(random.choice(pairs))

Reading lines...
Read 5303 sentence pairs
Trimmed to 5220 sentence pairs
Counting words...
Counted words:
en 7329
jp 6988
["he didn't actually use the computer.", '彼 が パソコン を 実際 に は 使用 し なかっ た']


In [117]:
_, _, train_pairs = prepareData('en', 'jp', train_df, False)
print(random.choice(train_pairs))


Reading lines...
Read 4243 sentence pairs
Trimmed to 4182 sentence pairs
Counting words...
Counted words:
en 6237
jp 5656
['my thinking did a 180.', '１ ８ ０ 度 自分 の 考え方 が 変わり まし た']


In [118]:
_, _, valid_pairs = prepareData('en', 'jp', valid_df, False)
print(random.choice(train_pairs))

Reading lines...
Read 1060 sentence pairs
Trimmed to 1038 sentence pairs
Counting words...
Counted words:
en 2827
jp 2820
['we will spend 48 minutes in the darkness.', '私 たち は 暗闇 の 中 で ４ ８ 分 もの 時間 を 過ごす']


In [119]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [120]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [121]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [122]:
def indexesFromSentence(lang, sentence):
    res = []
    
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [123]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    sentence = sentence.lower()
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [124]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, 
          decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [125]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [126]:
try:
  for instance in list(tqdm._instances):
    tqdm._decr_instances(instance)
except Exception as e:
  pass

In [127]:
def trainIters(encoder, decoder, epochs, print_every=500, 
               plot_every=100, learning_rate=0.01):
    start = time.time()
    # plot_losses = []
    list_losses = []
    loss_total = 0
      # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    local_pairs = [tensorsFromPair(pair)
                      for pair in train_pairs]
    criterion = nn.NLLLoss()

    
    for e in range(epochs):
      progress_bar = tqdm(total=epochs, desc=f'{ e }')
      for iter in range(len(local_pairs) - 1):

          training_pair = local_pairs[iter]
          input_tensor = training_pair[0]
          target_tensor = training_pair[1]

          loss = train(input_tensor, target_tensor, encoder,
                      decoder, encoder_optimizer, decoder_optimizer, criterion)
          loss_total += loss
          plot_loss_total += loss
          list_losses.append(loss)

          progress_bar.set_postfix(loss=np.mean(list_losses[-print_every:]),
                                  perplexity=np.exp(np.mean(list_losses[-print_every:])))
          progress_bar.update()
      
      progress_bar.close()

In [128]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [129]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [198]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.6).to(device)

In [199]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.6).to(device)

trainIters(encoder1, attn_decoder1, 15)

0: 4181it [01:56, 35.75it/s, loss=4.63, perplexity=102]
1: 4181it [01:57, 35.55it/s, loss=4.41, perplexity=82.7]
2: 4181it [01:58, 35.24it/s, loss=4.23, perplexity=68.9]
3: 4181it [02:00, 34.81it/s, loss=3.95, perplexity=51.9]
4: 4181it [02:00, 34.64it/s, loss=3.65, perplexity=38.3]
5: 4181it [02:01, 34.55it/s, loss=3.4, perplexity=30]
6: 4181it [02:01, 34.32it/s, loss=3.07, perplexity=21.5]
7: 4181it [02:02, 34.22it/s, loss=2.86, perplexity=17.5]
8: 4181it [02:02, 34.10it/s, loss=2.59, perplexity=13.4]
9: 4181it [02:04, 33.47it/s, loss=2.4, perplexity=11]
10: 4181it [02:06, 33.18it/s, loss=2.14, perplexity=8.49]
11: 4181it [02:06, 32.99it/s, loss=1.96, perplexity=7.13]
12: 4181it [02:06, 33.00it/s, loss=1.91, perplexity=6.78]
13: 4181it [02:09, 32.31it/s, loss=1.81, perplexity=6.13]
14: 4181it [02:06, 33.00it/s, loss=1.7, perplexity=5.48]


In [200]:
phrase, tenz = evaluate(encoder1, attn_decoder1, "I want to kill")
' '.join(phrase[:-1])

'私 が が を を だ'

In [201]:
def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShow(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))



evaluateAndShow("I want to eat")
evaluateAndShow("I want to kill")
evaluateAndShow("I want to drink")
evaluateAndShow("I kill you")

input = I want to eat
output = 楽しみ が 思わず を 心から に <EOS>
input = I want to kill
output = 私 が が を を ん <EOS>
input = I want to drink
output = 私 が が を を し <EOS>
input = I kill you
output = 私 が が の が が まし まし まし まし まし まし まし まし まし まし まし まし まし まし


In [202]:
train_pairs[0]

['i always think x would be nice.', 'X が いい な といつも 思い ます']

In [203]:
def cal_bleu_score(dataset_pairs, encoder, decoder):
    targets = []
    predictions = []
 
    for i in range(len(dataset_pairs)):
        target = dataset_pairs[i][1]
        # train_pairs
        predicted_words, _ = evaluate(encoder, decoder, dataset_pairs[i][0])
        predictions.append(' '.join(predicted_words[:-1]))
        targets.append(target)
    print(predictions[:3])
    print(targets[:3])
    print(f'BLEU Score: {round(corpus_bleu(predictions, targets) * 100, 2)}')


In [204]:
cal_bleu_score(train_pairs, encoder1, attn_decoder1)

['X が いい の といつも が ます ます ます', 'それ が 思い にいつも 思い 思い', 'それ が 多すぎ ない と 正直思 の 多すぎ を ない']
['X が いい な といつも 思い ます', 'それ が ある よう にいつも 思い ます', 'それ が 多すぎ ない か と 正直思 う']
BLEU Score: 78.97


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [205]:

cal_bleu_score(valid_pairs, encoder1, attn_decoder1)

['新た に は に に に の に 運営 し し て ます を ます する', '各 に に に に に の が が を を し まし', 'それ が は の 多く の を を し た て い ます']
['４月 から 個人 情報 保護 法 が 施行 さ れ まし た', '徐々 に 腎臓 の 機能 が 低下 し ます', '仕事 の 能力 が 低下 し た']
BLEU Score: 66.25


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [206]:
train_pairs[0]

['i always think x would be nice.', 'X が いい な といつも 思い ます']

In [210]:
evaluateAndShow("i think it would be nice ")

input = i think it would be nice 
output = 自分 が が が に <EOS>
