<a href="https://colab.research.google.com/github/DmitryKutsev/eng_to_jap_translator/blob/main/seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install tinysegmenter



In [28]:
import sys
import os
import math
from tqdm import tqdm

import torch
import torch.optim as optim
import torch.nn as nn
import pandas as pd
import numpy as np

import torchtext
from torchtext.data import Field, BucketIterator, TabularDataset
import random
import spacy
import tinysegmenter

import torch
import torch.nn as nn
import random


In [29]:
# One seed shared by every random number generator the notebook relies on.
SEED = 1234

# Seed Python's `random`, NumPy, and PyTorch (CPU and CUDA), in that order.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(SEED)

# Turn on cuDNN's deterministic flag.
torch.backends.cudnn.deterministic = True

In [30]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [16]:
# Load the English spaCy pipeline; this notebook only uses its tokenizer.
# The bare 'en' shortcut link was removed in spaCy v3, so the pipeline is
# loaded by its package name, which works on both v2 and v3 once the model is
# installed (`python -m spacy download en_core_web_sm`).
spacy_en = spacy.load('en_core_web_sm')

In [17]:
# Segmenter instance whose .tokenize() is used by tokenize_jp to split
# Japanese text into tokens.
segmenter = tinysegmenter.TinySegmenter()

In [21]:
# Download the JEC Basic Sentence spreadsheet (hosted at nlp.ist.i.kyoto-u.ac.jp)
# straight into a DataFrame. Requires network access.
jec_xls_url = 'http://nlp.ist.i.kyoto-u.ac.jp/EN/?plugin=attach&refer=JEC%20Basic%20Sentence%20Data&openfile=JEC_basic_sentence_v1-2.xls'
my_frame = pd.read_excel(jec_xls_url)

In [22]:
# Remove the Chinese column, label the remaining columns, then discard the
# sentence-id column so that only the (jp, en) pair is left.
# NOTE(review): the label of the dropped column is itself a full sentence, which
# suggests the sheet has no header row and read_excel consumed the first record
# as the header -- confirm, and consider header=None when loading.
# NOTE: this cell is not re-runnable; a second run fails because the column is
# already gone.
chinese_col = '难道不会是X吗，我实在是感到怀疑。'
my_frame = my_frame.drop(columns=[chinese_col])
my_frame.columns = ['index', 'jp', 'en']
my_frame = my_frame.drop(columns=['index'])

In [23]:
# Display the cleaned frame (columns: jp, en) to check the result of the
# column cleanup above.
my_frame

Unnamed: 0,jp,en
0,Xがいいなといつも思います,I always think X would be nice.
1,それがあるようにいつも思います,It always seems like it is there.
2,それが多すぎないかと正直思う,I honestly feel like there is too much.
3,山田はみんなに好かれるタイプの人だと思う,I think that Yamada is the type everybody likes.
4,〜と誰かが思った,Someone thought that 〜
...,...,...
5298,チームが４人のメンバーで構成されています,The team consists of four members.
5299,彼が実際に動画を再生する,He actually plays the video.
5300,政府が銀行に公的資金をどんどん投入しました,The government injected massive public funds i...
5301,レベル１の機能に下記の機能をプラスする,The following will be added to the level 1 fun...


In [24]:
# Sanity check: segment the Japanese sentence stored at row label 1.
segmenter.tokenize(my_frame.loc[1, 'jp'])

['それ', 'が', 'ある', 'よう', 'にいつも', '思い', 'ます']

In [31]:
# Sanity check: tokenize the English sentence stored at row label 1.
[token.text for token in spacy_en.tokenizer(my_frame.loc[1, 'en'])]

['It', 'always', 'seems', 'like', 'it', 'is', 'there', '.']

In [33]:
# Write the two-column frame to disk without the pandas index; the
# TabularDataset cell below reads this same path.
my_frame.to_csv('my_frame.csv', index=False)  

In [34]:
# List the working directory to confirm that my_frame.csv was written.
!ls

my_frame.csv  sample_data


In [36]:
def tokenize_jp(text: str) -> list:
    """
    Split a Japanese string into tokens.

    Delegates to the module-level TinySegmenter instance (`segmenter`) and
    returns whatever its `tokenize` method produces for `text`.
    """
    tokens = segmenter.tokenize(text)
    return tokens

def tokenize_en(text: str) -> list:
    """
    Split an English string into a list of token strings.

    Runs only the tokenizer of the module-level spaCy pipeline (`spacy_en`)
    and collects the `.text` of every token it yields.
    """
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

In [37]:
# Both fields use the same start/end-of-sequence markers; only the tokenizer
# differs between the two sides.
_boundary_tokens = {'init_token': '<sos>', 'eos_token': '<eos>'}
SRC = Field(tokenize=tokenize_jp, **_boundary_tokens)  # Japanese side
TRG = Field(tokenize=tokenize_en, **_boundary_tokens)  # English side

In [38]:
# Build the parallel corpus from the CSV written earlier (header row skipped).
# NOTE(review): a `fields` list is matched to the CSV columns by position, and
# the CSV column order is (jp, en). The attribute named `en` therefore holds
# the Japanese text (processed by SRC / tokenize_jp) and the attribute named
# `jp` holds the English text (processed by TRG / tokenize_en). The
# field-to-tokenizer pairing is right; only the attribute names are swapped.
# They are left unchanged here because other cells reference them (e.g. the
# iterator's sort_key uses `x.jp`).
dataset = TabularDataset(path='my_frame.csv', 
                         format='csv', 
                         fields=[('en', SRC), ('jp', TRG)],
                         skip_header=True)

In [39]:
# Split into 70% train / 10% validation / 20% test.
# Gotcha: torchtext reads a three-element `split_ratio` as [train, test, valid]
# but returns the datasets as (train, valid, test). The ratios are therefore
# given as [0.7, 0.2, 0.1] so that valid_dt gets 10% and test_dt gets 20%.
# The previous [0.7, 0.1, 0.2] produced the opposite sizes.
# random_state reuses the state of the seeded `random` module so the shuffle
# is repeatable.
train_dt, valid_dt, test_dt = dataset.split(split_ratio=[0.7, 0.2, 0.1],
                                            random_state=random.getstate())

In [40]:
# Build both vocabularies from the training split only, passing the same
# minimum token frequency (min_freq=2) to each field.
for _field in (SRC, TRG):
    _field.build_vocab(train_dt, min_freq=2)

In [41]:
# Vocabulary sizes (source, target), followed by the ten most frequent tokens
# of each side.
print(len(SRC.vocab), len(TRG.vocab))
for _field in (SRC, TRG):
    print(_field.vocab.freqs.most_common(10))

2399 2406
[('が', 2865), ('の', 2473), ('を', 2320), ('に', 2011), ('ます', 1082), ('た', 1079), ('彼', 927), ('し', 680), ('は', 668), ('、', 524)]
[('.', 3477), ('the', 1641), ('He', 872), ('of', 648), ('to', 628), ('a', 615), ('in', 535), ('The', 498), ('I', 485), ('will', 435)]


In [43]:
batch_size = 32

# One bucketing iterator per split, all placed on `device`.
# Examples are keyed by the length of their `jp` attribute, which is the
# attribute bound to TRG in the dataset's `fields` list.
train_it, valid_it, test_it = BucketIterator.splits(
    (train_dt, valid_dt, test_dt),
    batch_size=batch_size,
    sort_key=lambda example: len(example.jp),
    sort_within_batch=False,
    device=device,
)
