<a href="https://colab.research.google.com/github/GGpark1/Deep_Learning_Study/blob/master/seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import shutil
import zipfile

import pandas as pd
import tensorflow as tf
import urllib3
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
# Download the English-French sentence-pair archive and extract it into the
# current working directory.

http = urllib3.PoolManager()
url ='http://www.manythings.org/anki/fra-eng.zip'
filename = 'fra-eng.zip'
path = os.getcwd()
zipfilename = os.path.join(path, filename)

# preload_content=False streams the response body instead of reading it all
# into memory up front.
with http.request('GET', url, preload_content=False) as r:
    # Fail loudly on an HTTP error. Without this check the error page would be
    # written to 'fra-eng.zip' and only surface below as a BadZipFile.
    if r.status != 200:
        raise RuntimeError(
            'Download of {} failed with HTTP status {}'.format(url, r.status)
        )
    with open(zipfilename, 'wb') as out_file:
        shutil.copyfileobj(r, out_file)

with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
    zip_ref.extractall(path)

In [3]:
# Build the DataFrame: read the tab-separated file as three columns and keep
# only the source ('src') and target ('tar') sentences.

column_names = ['src', 'tar', 'lic']
lines = pd.read_csv('fra.txt', sep='\t', names=column_names)
lines = lines.drop(columns='lic')  # third column is not used below
# The printed label is Korean for "total number of samples".
print('전체 샘플의 개수 :',len(lines))

전체 샘플의 개수 : 192341


In [4]:
# Preview the first five sentence pairs.
lines.head(n=5)

Unnamed: 0,src,tar
0,Go.,Va !
1,Go.,Marche.
2,Go.,Bouge !
3,Hi.,Salut !
4,Hi.,Salut.


In [5]:
# Keep only the first 60,000 sentence pairs.
# .copy() turns the subset into an independent DataFrame, so the later column
# assignment on `lines` does not operate on a slice of the full frame (which
# raises pandas' SettingWithCopyWarning).
lines = lines.iloc[:60000].copy()

In [6]:
# Eyeball ten randomly chosen sentence pairs (unseeded, so the rows differ per run).
lines.sample(n=10)

Unnamed: 0,src,tar
37268,Spread your fingers.,Écarte les doigts.
25386,That is essential.,C'est essentiel.
20707,Tom is too young.,Tom est trop jeune.
15184,One wonders why.,On se demande pourquoi.
55202,I didn't want anything.,Je ne voulais rien.
25922,This is priceless.,C'est hors de prix.
47625,He didn't see a thing.,Il ne vit rien.
52282,We need to study more.,Il faut qu’on étudie plus.
49053,I remember it vividly.,J'en garde un souvenir très vivace.
40084,Did you kill anybody?,As-tu tué qui que ce soit ?


In [10]:
# Mark the start and end of every target (decoder) sentence:
# '\t' is prepended as the start symbol and '\n' is appended as the end symbol.
#
# assign() builds a new DataFrame instead of setting an attribute on what may
# be a slice of the original frame, which avoids SettingWithCopyWarning. The
# concatenation is vectorised rather than applied row by row.
# NOTE: re-running this cell wraps the sentences a second time; re-run the
# loading cells first if it must be executed again.

lines = lines.assign(tar='\t ' + lines['tar'] + ' \n')
lines.sample(10)

Unnamed: 0,src,tar
25722,They hunted foxes.,\t Ils chassaient des renards. \n
10268,I had fun here.,"\t Je me suis amusée, ici. \n"
197,Call us.,\t Appelle-nous ! \n
41626,I had only one drink.,\t Je n'ai pris qu'un verre. \n
5829,They said no.,\t Ils ont dit non. \n
47114,Do you like his songs?,\t Aimes-tu ses chansons ? \n
18800,I was born there.,\t J'y suis né. \n
56061,I told him to be quiet.,\t Je lui ai dit de se taire. \n
10078,How tall is he?,\t Quelle taille fait-il ? \n
30430,It's crowded today.,\t C'est plein aujourd'hui. \n


In [12]:
# Build the character vocabularies: the set of distinct characters that occur
# in the source sentences and in the target sentences respectively.

src_vocab = {ch for sentence in lines.src for ch in sentence}

tar_vocab = {ch for sentence in lines.tar for ch in sentence}

In [16]:
# Define the vocabulary sizes: number of distinct characters plus one extra slot.

src_vocab_size = 1 + len(src_vocab)
tar_vocab_size = 1 + len(tar_vocab)

# One size per line: source first, then target.
print(src_vocab_size, tar_vocab_size, sep='\n')

80
105


In [19]:
# Turn each character set into a sorted list so every character has a stable
# position. sorted() accepts any iterable and always returns a new list.

src_vocab = sorted(src_vocab)
tar_vocab = sorted(tar_vocab)

# Show a slice of the source characters and the first ten target characters.
print(src_vocab[45:75])
print(tar_vocab[:10])

['W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['\t', '\n', ' ', '!', '"', '$', '%', '&', "'", '(']


In [20]:
# Assign an integer index to every character. Numbering starts at 1, so index
# 0 is never assigned to a real character.
src_to_index = {ch: idx for idx, ch in enumerate(src_vocab, start=1)}
tar_to_index = {ch: idx for idx, ch in enumerate(tar_vocab, start=1)}

In [21]:
# Integer-encode the source sentences: each sentence becomes the list of
# indices of its characters, looked up in src_to_index.

encoder_input = [
    [src_to_index[ch] for ch in sentence]  # one integer per character
    for sentence in lines.src              # one list per sentence
]

print(encoder_input[:5])

[[30, 64, 10], [30, 64, 10], [30, 64, 10], [31, 58, 10], [31, 58, 10]]


In [22]:
# Integer-encode the target sentences (including their start/end markers) to
# form the decoder input, using tar_to_index for the lookup.

decoder_input = [
    [tar_to_index[ch] for ch in sentence]
    for sentence in lines.tar
]

print(decoder_input[:5])

[[1, 3, 48, 53, 3, 4, 3, 2], [1, 3, 39, 53, 70, 55, 60, 57, 14, 3, 2], [1, 3, 28, 67, 73, 59, 57, 3, 4, 3, 2], [1, 3, 45, 53, 64, 73, 72, 3, 4, 3, 2], [1, 3, 45, 53, 64, 73, 72, 14, 3, 2]]


In [23]:
# Ground-truth values to compare against the decoder output: the same encoding
# as the decoder input, but with the first character of each target sentence
# skipped, so every sequence is shifted left by one position.
decoder_target = [
    [tar_to_index[ch] for ch in sentence[1:]]
    for sentence in lines.tar
]
print(decoder_target[:5])

[[3, 48, 53, 3, 4, 3, 2], [3, 39, 53, 70, 55, 60, 57, 14, 3, 2], [3, 28, 67, 73, 59, 57, 3, 4, 3, 2], [3, 45, 53, 64, 73, 72, 3, 4, 3, 2], [3, 45, 53, 64, 73, 72, 14, 3, 2]]


In [24]:
# Find the longest source and target sentence (in characters); these lengths
# are used as the padding targets.

max_src_len = max(map(len, lines.src))
max_tar_len = max(map(len, lines.tar))

for longest in (max_src_len, max_tar_len):
    print(longest)

23
76


In [25]:
# Pad every sequence to a common length. Source sequences are padded to
# max_src_len; both decoder sequences are padded to max_tar_len. 'post' puts
# the padding after the real characters.

post_padding = {'padding': 'post'}

encoder_input = pad_sequences(encoder_input, maxlen=max_src_len, **post_padding)
decoder_input = pad_sequences(decoder_input, maxlen=max_tar_len, **post_padding)
decoder_target = pad_sequences(decoder_target, maxlen=max_tar_len, **post_padding)

In [26]:
# Convert the integer-encoded sequences to one-hot vectors for training.
#
# num_classes is passed explicitly so the one-hot depth always equals the
# vocabulary size computed earlier. Without it, to_categorical infers the
# depth from the largest index present in the array, which would silently
# shrink if the data in use did not contain the highest-indexed character.

encoder_input = to_categorical(encoder_input, num_classes=src_vocab_size)
decoder_input = to_categorical(decoder_input, num_classes=tar_vocab_size)
decoder_target = to_categorical(decoder_target, num_classes=tar_vocab_size)