# Character-Level Neural Machine Translation
- Reference: https://wikidocs.net/24996

## Import

In [1]:
import pandas as pd
import urllib3
import zipfile
import shutil
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

## Load data

In [2]:
# Download the fra-eng.zip archive into the current working directory and
# unpack it there.
http = urllib3.PoolManager()
url ='http://www.manythings.org/anki/fra-eng.zip'
filename = 'fra-eng.zip'
path = os.getcwd()
zipfilename = os.path.join(path, filename)

# preload_content=False streams the body instead of buffering it in memory.
with http.request('GET', url, preload_content=False) as r:
    # urllib3 does not raise on HTTP error statuses; without this check an
    # error page would be saved as 'fra-eng.zip' and only fail later, inside
    # zipfile, with a misleading BadZipFile error.
    if r.status != 200:
        raise RuntimeError(f'Download of {url} failed with HTTP status {r.status}')
    # Open the destination only after the response is known to be good, so a
    # failed download does not leave an empty archive behind.
    with open(zipfilename, 'wb') as out_file:
        shutil.copyfileobj(r, out_file)

with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
    zip_ref.extractall(path)

In [3]:
# Read the tab-separated file into three named columns, then discard the
# third one ('lic'), which is not used afterwards in this cell.
lines = pd.read_csv('fra.txt', sep='\t', names=['src', 'tar', 'lic'])
lines = lines.drop(columns='lic')
len(lines)

178009

In [4]:
# Keep the 'src' through 'tar' columns and only the first 60,000 rows,
# then display 10 randomly sampled rows.
lines = lines.loc[:, 'src':'tar'].iloc[:60000]
lines.sample(10)

Unnamed: 0,src,tar
39086,We're still in shock.,Nous sommes encore sous le choc.
8133,I like walking.,J'aime marcher.
50070,Just tell us the truth.,Dis-nous simplement la vérité.
23437,Get in the car now.,"Monte dans la voiture, maintenant !"
21164,The joke's on you.,C'est de vous qu'on se moque.
35506,I forget who said it.,J'ai oublié qui l'a dit.
46190,You have to be strong.,Il te faut être fort.
45467,We will not surrender.,Nous ne nous rendrons pas.
18291,Do you guys smoke?,Est-ce que vous fumez?
41463,How is your new class?,Comment est votre nouvelle classe ?


## Preprocessing

In [5]:
# Surround every target sentence with a leading tab and a trailing newline
# (each separated from the text by one space).
# NOTE(review): presumably these act as start/end-of-sequence markers for the
# decoder - confirm against the model code later in the notebook.
lines['tar'] = lines['tar'].map(lambda sentence: '\t ' + sentence + ' \n')
lines.sample(10)

Unnamed: 0,src,tar
58794,We have to wait for him.,\t Nous devons l'attendre. \n
57498,That applies to him too.,\t Cela s'applique aussi à lui. \n
49061,I trust him completely.,\t Je lui fais totalement confiance. \n
48083,I came with my friends.,\t Je vins avec mes amis. \n
34624,He cannot be trusted.,\t On ne peut pas lui faire confiance. \n
51157,They are all very busy.,\t Ils sont tous très occupés. \n
347,I'm bald.,\t Je suis chauve. \n
34019,"Come on, let's do it.","\t Allez, faisons-le ! \n"
31648,Thanks for the food.,\t Merci pour la bouffe. \n
20142,Is somebody there?,\t Y a-t-il quelqu'un ? \n


In [6]:
# Build the character sets: every distinct character that occurs in the
# source sentences and in the target sentences, respectively.
src_vocab = {char for sentence in lines.src for char in sentence}
tar_vocab = {char for sentence in lines.tar for char in sentence}

In [7]:
# Vocabulary sizes are the number of distinct characters plus one.
# NOTE(review): the extra slot presumably reserves index 0 for padding -
# confirm against the character-to-index mapping later in the notebook.
src_vocab_size = 1 + len(src_vocab)
tar_vocab_size = 1 + len(tar_vocab)
print(src_vocab_size, tar_vocab_size, sep='\n')

79
106


In [8]:
# Replace each character set with a sorted list so the characters have a
# stable order, then preview entries 45-74 of each list.
# sorted() accepts any iterable, so no intermediate list() copy is needed.
src_vocab = sorted(src_vocab)
tar_vocab = sorted(tar_vocab)
print(src_vocab[45:75])
print(tar_vocab[45:75])

['W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w']
