In [1]:
from utils.utils import LMDataset
import torch
from torch.utils.data import DataLoader
from utils.utils import Vectorizer
from lms.models import FFNLM
from pathlib import Path
import os

In [2]:
# Directory holding the cleaned corpus files: <parent of the working directory>/data/clean.
# `Path('..').resolve()` is already absolute, so prefixing it with `Path.cwd() /`
# had no effect (joining with an absolute path discards the left-hand side).
data_folder = Path('..').resolve() / 'data' / 'clean'

In [3]:
# Read every .txt file in data_folder and build the corpus string `texto`.
print('Loading corpus...')
# Sort the listing: os.listdir returns entries in arbitrary order, which would
# make the corpus (and everything trained on it) differ between runs/machines.
lista_textos = sorted(f for f in os.listdir(data_folder) if Path(f).suffix == '.txt')
partes = []        # contents of each file, in reading order
num_palabras = 0   # running count of whitespace-separated words read so far
for wiki_txt in lista_textos:
    print(f'\tReading {wiki_txt}...')
    # The context manager closes the file; no explicit close() is needed.
    with open(data_folder / wiki_txt, encoding='utf-8') as fp:
        contenido = fp.read()
    partes.append(contenido)
    num_palabras += len(contenido.split())
    print(f'¡Ok! Texto de longitud {num_palabras}')
# Join with a newline so the last word of one file cannot fuse with the first
# word of the next; with a single file the result is exactly the file content.
texto = '\n'.join(partes)


Loading corpus...
	Reading wiki_perro_clean_1.txt...
¡Ok! Texto de longitud 7113


In [4]:
# --------------------------------------
# Build the language model
# --------------------------------------
# Context window size and mini-batch size; both are also passed to training.
window_length, batch_size = 2, 2
lm = FFNLM(
    vectorizer=Vectorizer(texto),
    window_length=window_length,
    hidden_size=20,
)
print('vocabulary_size: ', lm.vocabulary_size)
    

vocabulary_size:  2122


In [6]:
# Debug probe: inspect how the vectorizer tokenizes the string '1.'.
# The recorded output shows it being split into two tokens, '1' and '.'.
lm.vectorizer.get_tokens('1.')

['1', '.']

In [8]:
# Debug probe: check whether the unsplit string '1.' is contained in the
# vectorizer's `tokens` collection (recorded output: True).
print('1.' in lm.vectorizer.tokens)

True


In [15]:
# Debug probe: encode the two-item context seen in the failing training batch
# and look at the resulting shape. The recorded result has 6366 columns, i.e.
# 3 * 2122 (the printed vocabulary_size), whereas the other two-item contexts in
# the training log have size 4244 = 2 * 2122.
lm.code_context(['1.', 'frente']).shape

shape_context:  (2,)


torch.Size([1, 6366])

In [16]:
# Debug probe: tokenize the space-joined context. The recorded output has three
# tokens for a two-item context.
# NOTE(review): presumably code_context re-tokenizes the joined context the same
# way, which would explain the extra vocabulary-sized slot — confirm in FFNLM.
lm.vectorizer.get_tokens(' '.join(['1.', 'frente']))

['1', '.', 'frente']

In [17]:
# Debug probe: show the string produced by space-joining the context items.
print(' '.join(['1.', 'frente']))

1. frente


In [5]:
# --------------------------------------
# Train and persist the model
# --------------------------------------
# NOTE(review): in the recorded run this cell aborts with
# "RuntimeError: stack expects each tensor to be equal size" on the batch whose
# contexts are [':', '1.'] and ['1.', 'frente'] (sizes 4244 vs 6366).
# Presumably the context containing '1.' is re-tokenized into three tokens;
# confirm and fix in the model/vectorizer code before relying on this cell.
parameters = dict(
    learning_rate=1e-4,
    window_length=window_length,
    batch_size=batch_size,
    num_epochs=50,
)
print('Training...')
lm.train(texto=texto, parametros=parameters)
lm.save_model()


Training...


  0%|                                                                                                  | 0/50 [00:00<?, ?it/s]

batch_index:  50 ds_features:  [['pelaje', 'es'], ['es', 'muy']]
porcentaje_batch_index:  1.27 %
shape_context:  (2, 2)
Shapes: [torch.Size([4244]), torch.Size([4244])]
batch_index:  51 ds_features:  [['muy', 'diverso'], ['diverso', 'y']]
porcentaje_batch_index:  1.29 %
shape_context:  (2, 2)
Shapes: [torch.Size([4244]), torch.Size([4244])]
batch_index:  52 ds_features:  [['y', 'varía'], ['varía', 'según']]
porcentaje_batch_index:  1.32 %
shape_context:  (2, 2)
Shapes: [torch.Size([4244]), torch.Size([4244])]
batch_index:  53 ds_features:  [['según', 'la'], ['la', 'raza']]
porcentaje_batch_index:  1.34 %
shape_context:  (2, 2)
Shapes: [torch.Size([4244]), torch.Size([4244])]
batch_index:  54 ds_features:  [['raza', '.'], ['.', 'posee']]
porcentaje_batch_index:  1.37 %
shape_context:  (2, 2)
Shapes: [torch.Size([4244]), torch.Size([4244])]
batch_index:  55 ds_features:  [['posee', 'un'], ['un', 'oído']]
porcentaje_batch_index:  1.39 %
shape_context:  (2, 2)
Shapes: [torch.Size([4244]), 

  0%|                                                                                                  | 0/50 [00:27<?, ?it/s]

Shapes: [torch.Size([4244]), torch.Size([4244])]
batch_index:  202 ds_features:  [[':', '1.'], ['1.', 'frente']]
porcentaje_batch_index:  5.11 %
shape_context:  (2, 2)
Shapes: [torch.Size([4244]), torch.Size([6366])]





RuntimeError: stack expects each tensor to be equal size, but got [4244] at entry 0 and [6366] at entry 1

In [None]:
# --------------------------------------
# Finding perplexity
# --------------------------------------
# Compute the model's perplexity on `texto` (the same string that is passed to
# lm.train) and print it.
print('Text perplexity:', lm.perplexity(texto))