<a href="https://colab.research.google.com/github/ByronBlaze/nmt_jpn_eng/blob/main/NMT_ENG_JAP_1.2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import tensorflow as tf
from sklearn.model_selection import train_test_split
import re
import os
import io
import time


In [2]:
os.chdir('/content/drive/My Drive/Projects/NMT')

In [3]:
text_path = 'eng_jpn2.txt'

In [None]:
df = pd.read_table(text_path, names=['source', 'target', 'comments'])
df_new = df[['source', 'target']]
df_new.to_csv('eng_jpn2.txt',sep='\t',header= False, index=None)

In [4]:
text_data = pd.read_csv(text_path,sep = '\t',names=['source', 'target'])
#text_data = pd.read_csv(text_path, header=None, names=['source', 'target'])
text_data.sample(5)

Unnamed: 0,source,target
51847,Tom solved the puzzle after Mary gave him a fe...,メアリーにヒントをいくつか教えてもらってから、トムはそのパズルを解いた。
15740,He is a baseball player.,彼は野球選手だ。
35950,She glanced shyly at the young man.,彼女はその若者を恥ずかしそうにちらりと見た。
37406,The experiment confirmed his theory.,この実験で彼の学説はいっそう強固なものになった。
4056,I was too small.,私は小さすぎた。


In [None]:
text_data.drop(columns=['comments'], inplace=True)
text_data.sample(5)

Unnamed: 0,source,target
40176,Tom isn't what he was three years ago.,トムはもう３年前のトムじゃないんだ。
39369,He was involved in a traffic accident.,彼は交通事故に遭った。
32554,Is the coffee too strong for you?,コーヒーって、濃すぎた？
49944,"Beginning next week, we'll be using a new text...",来週から新しい教科書を使います。
36083,The children are playing with toys.,子供たちがおもちゃで遊んでいる。


In [None]:
np.savetxt(r'jpn_eng.txt', text_data.values, fmt='%s')

In [5]:
def clean_text(line):

  num_digits = str.maketrans('','', digits)
  line = line.lower()
  line = re.sub(" +", " ", line)
  line = re.sub("'", '', line)
  line = line.translate(num_digits)
  line = re.sub(r"([?.!,¿])", r" \1 ", line)
  line = line.rstrip().strip()
  line = 'START_ ' + line + ' _END'

  return line 


In [None]:
print(clean_text('text cleaning checker !!'))

START_ text cleaning checker  !  ! _END


In [6]:
def create_lang_pair(path,sample_size):
  
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

  word_pairs = [[clean_text(l) for l in line.split('\t')] for line in lines[:sample_size]]

  return zip(*word_pairs)


In [7]:
sample_size = 60000
source, target = create_lang_pair(text_path, sample_size)

In [8]:
print(source[-1])
print(target[-1])
type(target)

START_ if someone who doesnt know your background says that you sound like a native speaker ,  it means they probably noticed something about your speaking that made them realize you werent a native speaker .  in other words ,  you dont really sound like a native speaker . _END
START_ 生い立ちを知らない人にネイティブみたいに聞こえるよって言われたら、それはおそらく、あなたの喋り方のどこかが、ネイティブじゃないと感じさせたってことだよ。つまりね、ネイティブのようには聞こえないということなんだよ。 _END


tuple

In [9]:
source_sentence_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
source_sentence_tokenizer.fit_on_texts(source)

In [10]:
source_tensor = source_sentence_tokenizer.texts_to_sequences(source)


In [11]:
source_tensor = tf.keras.preprocessing.sequence.pad_sequences(source_tensor, padding='post')

In [12]:
target_sentence_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
target_sentence_tokenizer.fit_on_texts(target)
target_tensor = source_sentence_tokenizer.texts_to_sequences(target)
target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, padding='post')


In [13]:
source_train_tensor, source_test_tensor, target_train_tensor, test_target_tensor = train_test_split(source_tensor, target_tensor, test_size=0.2)

In [16]:
BATCH_SIZE = 64
df = tf.data.Dataset.from_tensor_slices((source_train_tensor, target_train_tensor)).shuffle(batch_size)
df = df.batch(batch_size, drop_remainder=True)

In [15]:
source_batch, target_batch = next(iter(df))
print(source_batch.shape)
print(target_batch.shape)

(64, 50)
(64, 5)


In [17]:
BUFFER_SIZE = len(source_train_tensor)
steps_per_epoch = len(source_train_tensor)//BATCH_SIZE
embedding_dim= 256
units =1024
source_vocab_size = len(source_sentence_tokenizer.word_index)+1
target_vocab_size = len(target_sentence_tokenizer.word_index)+1


In [22]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size):
    super(Encoder, self).__init__()
    self.batch_size = batch_size
    self.encoder_units = encoder_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(encoder_units, return_sequences = True, return_state=True,recurrent_initializer ='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    return output, state
  
  def initialize_hidden_state(self):
    return tf.zeros((self.batch_size, self.encoder_units))

In [23]:
encoder = Encoder(source_vocab_size, embedding_dim,units, BATCH_SIZE)
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(source_batch, sample_hidden)

print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))


Encoder output shape: (batch size, sequence length, units) (64, 50, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)
