In [None]:
#importar datasets de proves y splitter alemany
!pip install datasets evaluate --upgrade
!python -m spacy download de_core_news_sm

In [None]:
import re
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm
import evaluate

Creem un dataset a partir del txt: una llista de diccionaris, un per cada parell de frases, amb les claus dels idiomes (en i de) i, com a valor, la frase corresponent en cada idioma. La llista conserva l'ordre del fitxer.

In [None]:
def eliminar_ccby(linea):
    """Return `linea` cut just before the first "CC-BY" marker.

    A line that does not contain the marker is returned unchanged.
    """
    # partition() yields the whole string as the head when the marker is absent.
    texto_previo, _, _ = linea.partition("CC-BY")
    return texto_previo

def separar_frases(linea):
    """Split one line of the parallel corpus into its two sentences.

    When the line contains a tab, the first two tab-separated fields are taken
    as the sentence pair. This keeps sentences with internal punctuation
    ("Hi. How are you?") or without a final ./!/? in one piece.
    Lines without a tab fall back to the punctuation heuristic: the first
    sentence ends at the first '.', '!' or '?' that is followed by whitespace.

    Args:
        linea: one line of text holding a sentence and its translation.

    Returns:
        A tuple (frase1, frase2) with surrounding whitespace removed. frase2 is
        None when nothing follows the first sentence; (None, None) is returned
        when the line cannot be split at all.
    """
    # NOTE(review): presumably the corpus file is tab-separated
    # (sentence, translation, attribution) -- confirm against the actual file.
    if "\t" in linea:
        campos = linea.split("\t")
        frase1 = campos[0].strip()
        frase2 = campos[1].strip()
        if not frase1:
            return None, None
        return frase1, (frase2 or None)

    # Fallback for lines without a tab: cut after the first sentence terminator.
    patron = r'(.+?[.!?])\s+([^A-Z]?[\s]+)?(.+)?'
    coincidencias = re.match(patron, linea)
    if not coincidencias:
        return None, None
    frase1 = coincidencias.group(1)
    frase2 = coincidencias.group(3)
    return frase1.strip(), frase2.strip() if frase2 else None

def make_dataset(archivo_entrada):
    """Build the parallel dataset from a text file with one sentence pair per line.

    Each line has its trailing "CC-BY" attribution removed and is then split
    into its two sentences. Lines that do not yield both sentences are skipped,
    so every entry holds two strings.

    Args:
        archivo_entrada: path of the text file to read.

    Returns:
        A list of dicts of the form {'en': <first sentence>, 'de': <second sentence>},
        in file order.
    """
    dataset = []
    # Explicit UTF-8: the platform default encoding may not decode umlauts / ß.
    with open(archivo_entrada, 'r', encoding='utf-8') as f_in:
        for linea in f_in:
            a, b = separar_frases(eliminar_ccby(linea))
            if a is None or b is None:
                # Unparseable line: a None sentence cannot be tokenized later on.
                continue
            dataset.append({'en': a, 'de': b})
    return dataset

# Path of the sentence-pair text file, and the dataset parsed from it
# (a list of {'en': ..., 'de': ...} dicts).
archivo_entrada = 'deu.txt'
data = make_dataset(archivo_entrada)

In [None]:
# Sanity check: show the first ten parsed sentence pairs.
print(data[:10])

[{'en': 'Go.', 'de': 'Geh.'}, {'en': 'Hi.', 'de': 'Hallo!'}, {'en': 'Hi.', 'de': 'Grüß Gott!'}, {'en': 'Run!', 'de': 'Lauf!'}, {'en': 'Run.', 'de': 'Lauf!'}, {'en': 'Wow!', 'de': 'Potzdonner!'}, {'en': 'Wow!', 'de': 'Donnerwetter!'}, {'en': 'Duck!', 'de': 'Kopf runter!'}, {'en': 'Fire!', 'de': 'Feuer!'}, {'en': 'Help!', 'de': 'Hilfe!'}]


Importem splitters d'anglés i alemany

In [None]:
# Load the spaCy pipelines for English and German. Both model packages must be
# installed; in the cells shown here only their tokenizers are used.
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

In [None]:
# Demo: run only the tokenizer of the English pipeline on a sample sentence
# and list the text of each resulting token.
string = "What a lovely day it is today!"
[token.text for token in en_nlp.tokenizer(string)]

['What', 'a', 'lovely', 'day', 'it', 'is', 'today', '!']

Millorem funció de split tenint en compte les majúscules

Para cada par de oraciones añadimos además su tokenización correspondiente, con los tokens de inicio y fin de frase.

In [None]:
def tokenize_example(data, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    """Add tokenized versions of both sentences to every entry of `data`.

    For each entry, the "en" and "de" texts are tokenized with the tokenizer of
    the matching pipeline, truncated to `max_length` tokens, optionally
    lowercased, and wrapped in `sos_token` / `eos_token`. The results are
    stored in the entry under 'en_tokens' and 'de_tokens'.

    Args:
        data: sequence of dicts with "en" and "de" text; modified in place.
        en_nlp, de_nlp: objects exposing a `tokenizer` callable whose tokens
            have a `.text` attribute.
        max_length: maximum number of tokens kept, not counting sos/eos.
        lower: when true, every token is lowercased.
        sos_token, eos_token: markers placed before / after the tokens.

    Returns:
        The same `data` object, with the two token lists added to each entry.
    """
    def _tokens_with_markers(nlp, text):
        # Truncation happens before the markers are added, so sos/eos are never cut off.
        words = [tok.text for tok in nlp.tokenizer(text)][:max_length]
        if lower:
            words = [word.lower() for word in words]
        return [sos_token, *words, eos_token]

    for entry in data:
        en_tokens = _tokens_with_markers(en_nlp, entry["en"])
        de_tokens = _tokens_with_markers(de_nlp, entry["de"])
        entry['en_tokens'] = en_tokens
        entry['de_tokens'] = de_tokens
    return data

In [None]:
# Work on the first 1000 sentence pairs. The slice is a shallow copy: the dicts
# are shared with `data`, so in-place edits to entries of `train` are visible
# through `data` as well.
train = data[:1000]
max_length = 1000  # maximum number of tokens kept per sentence (before sos/eos)
lower = True       # lowercase every token
sos_token = "<sos>"
eos_token = "<eos>"

# Keyword form of the tokenization arguments. The keys match the parameter
# names of tokenize_example, so tokenize_example(**fn_kwargs) is a valid call.
fn_kwargs = {
    "data": train,
    "en_nlp": en_nlp,
    "de_nlp": de_nlp,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}
# Positional form of the same arguments, in the order of tokenize_example's signature.
inputs = [train,en_nlp,de_nlp,max_length,lower,sos_token,eos_token]

sos = start of sentence,
eos = end of sentence,
unk = unknown,
pad = padding

In [None]:
# Tokenize every training pair (entries gain 'en_tokens' / 'de_tokens') and
# inspect the first one.
train = tokenize_example(*inputs)
train[0]

{'en': 'Go.',
 'de': 'Geh.',
 'en_tokens': ['<sos>', 'go', '.', '<eos>'],
 'de_tokens': ['<sos>', 'geh', '.', '<eos>']}

In [None]:
# Vocabulary settings: `min_freq` is the minimum number of occurrences a token
# needs in order to be included in the vocabulary.
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

# Special tokens, passed to both vocabularies in this order.
special_tokens = [unk_token, pad_token, sos_token, eos_token]

# One token list per training sentence, for each language.
all_en_tokens = [frase['en_tokens'] for frase in train]
all_de_tokens = [frase['de_tokens'] for frase in train]

# Build both vocabularies with identical settings (English first, then German).
en_vocab, de_vocab = (
    torchtext.vocab.build_vocab_from_iterator(
        token_lists,
        min_freq=min_freq,
        specials=special_tokens,
    )
    for token_lists in (all_en_tokens, all_de_tokens)
)

In [None]:
# Index-to-token lists of both vocabularies, and their sizes.
words_en, words_de = en_vocab.get_itos(), de_vocab.get_itos()
len(words_en), len(words_de)

(262, 297)

In [None]:
# Both vocabularies were built with the same specials list, so <unk> and <pad>
# are expected to have the same index in each; fail early if they do not.
assert en_vocab[unk_token] == de_vocab[unk_token]
assert en_vocab[pad_token] == de_vocab[pad_token]

# Shared indices of the unknown and padding tokens.
unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

# Tokens missing from a vocabulary are looked up as <unk>.
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

In [None]:
# Spot check: is the token "man" part of the English vocabulary?
"man" in words_en

True

In [None]:
def numericalize_example(data, en_vocab, de_vocab):
    """Add the vocabulary indices of both token lists to every entry of `data`.

    Args:
        data: sequence of dicts holding "en_tokens" and "de_tokens"; modified in place.
        en_vocab, de_vocab: objects exposing `lookup_indices(tokens)`.

    Returns:
        The same `data` object, each entry extended with 'en_ids' and 'de_ids'.
    """
    for entry in data:
        # Resolve both languages first, then store them together.
        en_ids = en_vocab.lookup_indices(entry["en_tokens"])
        de_ids = de_vocab.lookup_indices(entry["de_tokens"])
        entry.update(en_ids=en_ids, de_ids=de_ids)
    return data

In [None]:
# Convert every token list to vocabulary indices (entries gain 'en_ids' / 'de_ids').
train = numericalize_example(train, en_vocab, de_vocab)
# Show only the first few entries; displaying the whole list floods the output.
train[:5]

[{'en': 'Go.',
  'de': 'Geh.',
  'en_tokens': ['<sos>', 'go', '.', '<eos>'],
  'de_tokens': ['<sos>', 'geh', '.', '<eos>'],
  'en_ids': [2, 10, 4, 3],
  'de_ids': [2, 27, 4, 3]},
 {'en': 'Hi.',
  'de': 'Hallo!',
  'en_tokens': ['<sos>', 'hi', '.', '<eos>'],
  'de_tokens': ['<sos>', 'hallo', '!', '<eos>'],
  'en_ids': [2, 141, 4, 3],
  'de_ids': [2, 99, 5, 3]},
 {'en': 'Hi.',
  'de': 'Grüß Gott!',
  'en_tokens': ['<sos>', 'hi', '.', '<eos>'],
  'de_tokens': ['<sos>', 'grüß', 'gott', '!', '<eos>'],
  'en_ids': [2, 141, 4, 3],
  'de_ids': [2, 0, 0, 5, 3]},
 {'en': 'Run!',
  'de': 'Lauf!',
  'en_tokens': ['<sos>', 'run', '!', '<eos>'],
  'de_tokens': ['<sos>', 'lauf', '!', '<eos>'],
  'en_ids': [2, 61, 7, 3],
  'de_ids': [2, 146, 5, 3]},
 {'en': 'Run.',
  'de': 'Lauf!',
  'en_tokens': ['<sos>', 'run', '.', '<eos>'],
  'de_tokens': ['<sos>', 'lauf', '!', '<eos>'],
  'en_ids': [2, 61, 4, 3],
  'de_ids': [2, 146, 5, 3]},
 {'en': 'Wow!',
  'de': 'Potzdonner!',
  'en_tokens': ['<sos>', 'wow', '

L'equivalència entre tokens i índexs és correcta

In [None]:
# Round-trip check on one example: mapping the ids back to tokens should
# reproduce the original token list.
print(train[100]["en_tokens"])
print(en_vocab.lookup_tokens(train[100]["en_ids"]))
train[100]["en_tokens"] == en_vocab.lookup_tokens(train[100]["en_ids"])

['<sos>', 'no', 'way', '!', '<eos>']
['<sos>', 'no', 'way', '!', '<eos>']


True