# Senquence-to-Sequence

In [2]:
import os
import shutil
import zipfile

import pandas as pd
import tensorflow as tf
import urllib3
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [6]:
urllib = urllib3.PoolManager()
url = 'http://www.manythings.org/anki/fra-eng.zip'
filename = 'fra-eng.zip'
path = os.getcwd()
zipfilepath = os.path.join(path, filename)

with urllib.request('GET',url,preload_content=False) as r, open(zipfilepath, 'wb') as out_file:
    shutil.copyfileobj(r, out_file)
    
with zipfile.ZipFile(zipfilepath, 'r') as zip_ref:
    zip_ref.extractall(path)

BadZipFile: File is not a zip file

In [9]:
urllib = urllib3.PoolManager()
url = 'http://www.manythings.org/anki/fra-eng.zip'
filename = 'fra-eng.zip'
path = os.getcwd()
zipfilename = os.path.join(path, filename)

with urllib.request('GET', url, preload_content=False) as r, open(zipfilename, 'wb') as out_file:
  shutil.copyfileobj(r, out_file)

with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
  zip_ref.extractall(path)

In [10]:
lines = pd.read_csv('fra.txt', names=['src', 'tar', 'lic'], sep='\t')
del lines['lic']
print('전체 샘플의 개수 :',len(lines))

전체 샘플의 개수 : 192341


In [13]:
lines = lines.loc[:, 'src':'tar']
lines = lines[0:60000] # 6만개만 저장
lines.sample(10)

Unnamed: 0,src,tar
37097,She called for help.,Elle a appelé à l'aide.
7033,I am very sad.,Je suis très triste.
16988,You've been had.,Vous vous êtes fait avoir.
36655,Jesus answered them.,Jésus leur répondit.
48108,How's that your fault?,En quoi est-ce que c'est ta faute ?
44137,That's not happening!,Cela n'arrivera pas !
42116,I still don't get it.,Je ne capte toujours pas.
29907,I'll stay involved.,Je resterai impliqué.
20752,Tom kept texting.,Tom continuait à envoyer des SMS.
55292,I don't regret a thing.,Je ne regrette rien.


In [14]:
lines['tar']

0                                    Va !
1                                 Marche.
2                                 Bouge !
3                                 Salut !
4                                  Salut.
                       ...               
59995        Pourquoi feraient-elles ça ?
59996      Pourquoi demanderais-tu cela ?
59997    Pourquoi demanderiez-vous cela ?
59998           Pourquoi dirais-tu cela ?
59999         Pourquoi diriez-vous cela ?
Name: tar, Length: 60000, dtype: object

In [15]:
lines['tar'] =lines['tar'].apply(lambda x : '\t '+ x +' \n') # 문장의 시작 끝 표시
lines.sample(10)

Unnamed: 0,src,tar
26508,We agree with you.,\t Nous sommes d’accord avec vous. \n
2619,Who's that?,\t C’est qui ? \n
57446,Nobody can replace Tom.,\t Personne ne peut remplacer Tom. \n
25074,Please forgive me.,\t Pardonne-moi s'il te plait. \n
31648,They're mad at you.,\t Elles sont en colère après toi. \n
37956,This isn't my money.,\t Ce n'est pas mon argent. \n
26812,We've been warned.,\t Nous avons été prévenus. \n
56552,I'm not busy right now.,\t Je ne suis pas occupé pour le moment. \n
12758,You're too old.,\t Vous êtes trop vieux. \n
41962,I must work tomorrow.,\t Je dois travailler demain. \n


In [16]:
# 문자 집합 구축
src_vocab = set()
for line in lines['src']:  # 1줄씩 읽음
    for char in line:  # 1개의 문자씩 읽음
        src_vocab.add(char)

In [18]:
# 단어 단위보다 char 단위로 했을 시 훨씬 작음
src_vocab

{' ',
 '!',
 '"',
 '$',
 '%',
 '&',
 "'",
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '°',
 'é',
 '’',
 '€'}

In [19]:
tar_vocab = set()

for line in lines.tar:
    for char in line:
        tar_vocab.add(char)

In [20]:
tar_vocab

{'\t',
 '\n',
 ' ',
 '!',
 '"',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '\xa0',
 '«',
 '»',
 'À',
 'Ç',
 'É',
 'Ê',
 'Ô',
 'à',
 'â',
 'ç',
 'è',
 'é',
 'ê',
 'ë',
 'î',
 'ï',
 'ô',
 'ù',
 'û',
 'œ',
 '\u2009',
 '\u200b',
 '‘',
 '’',
 '\u202f'}

In [21]:
src_vocab_size = len(src_vocab) +1
tar_vocab_size = len(tar_vocab) +1

print(src_vocab_size)
print(tar_vocab_size)

80
105


In [25]:
# 정렬하여 순서를 정해줘야 인덱스를 사용할 수 있다
src_vocab = sorted(list(src_vocab))
tar_vocab = sorted(list(tar_vocab))
print(src_vocab[65:75])
print(tar_vocab[65:75])

['q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w']


In [None]:
# 각 문자에 인덱스 부여하기

src_to_index = dict