In [3]:
from datasets import load_dataset
from collections import Counter
import torch
from torch.utils.data import DataLoader

In [4]:
# Load the raw WikiText-103 corpus and print its structure
dataset = load_dataset("wikitext", "wikitext-103-raw-v1")
print("Estructura: ", dataset)

# Preview the train split: collect and print its first five non-empty lines
count = 0
sentences = []
for sentence in dataset["train"]:
    stripped_text = sentence["text"].strip()
    if not stripped_text:
        # Blank lines are skipped and do not count towards the five samples
        continue
    sentences.append(stripped_text)
    print("\nTexto:\n", sentence)
    count += 1
    if count == 5:
        break

Estructura:  DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

Texto:
 {'text': ' = Valkyria Chronicles III = \n'}

Texto:
 {'text': ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pit

In [5]:
# Word-level tokenizer
def custom_tokenizer(text):
    """Lower-case ``text`` and split it on runs of whitespace.

    Returns the resulting list of tokens. Punctuation is only separated
    from a word when it is already surrounded by whitespace in ``text``.
    """
    lowered = text.lower()
    return lowered.split()

# Show the tokens produced for every collected sentence, numbered from 1
for position, s in enumerate(sentences, start=1):
    tokens = custom_tokenizer(s)
    print(f"\nTexto {position}: {tokens}")


Texto 1: ['=', 'valkyria', 'chronicles', 'iii', '=']

Texto 2: ['senjō', 'no', 'valkyria', '3', ':', 'unrecorded', 'chronicles', '(', 'japanese', ':', '戦場のヴァルキュリア3', ',', 'lit', '.', 'valkyria', 'of', 'the', 'battlefield', '3', ')', ',', 'commonly', 'referred', 'to', 'as', 'valkyria', 'chronicles', 'iii', 'outside', 'japan', ',', 'is', 'a', 'tactical', 'role', '@-@', 'playing', 'video', 'game', 'developed', 'by', 'sega', 'and', 'media.vision', 'for', 'the', 'playstation', 'portable', '.', 'released', 'in', 'january', '2011', 'in', 'japan', ',', 'it', 'is', 'the', 'third', 'game', 'in', 'the', 'valkyria', 'series', '.', 'employing', 'the', 'same', 'fusion', 'of', 'tactical', 'and', 'real', '@-@', 'time', 'gameplay', 'as', 'its', 'predecessors', ',', 'the', 'story', 'runs', 'parallel', 'to', 'the', 'first', 'game', 'and', 'follows', 'the', '"', 'nameless', '"', ',', 'a', 'penal', 'military', 'unit', 'serving', 'the', 'nation', 'of', 'gallia', 'during', 'the', 'second', 'europan', 'war',

In [6]:
# Build the initial vocabulary: token -> number of occurrences
word_vocab = Counter(
    token for s in sentences for token in custom_tokenizer(s)
)

print(f"Total de palabras en el vocabulario: {len(word_vocab)}")
print("Vocabulario inicial:")
# most_common() without an argument lists every entry, most frequent first
for word, freq in word_vocab.most_common():
    print(f"'{word}': {freq}")

Total de palabras en el vocabulario: 173
Vocabulario inicial:
'the': 22
',': 15
'.': 14
'valkyria': 10
'of': 9
'and': 8
'chronicles': 7
'game': 7
'in': 7
'=': 6
'it': 6
'a': 5
'with': 5
'was': 5
'to': 4
'series': 4
'"': 4
'iii': 3
':': 3
'as': 3
'japan': 3
'by': 3
'for': 3
'ii': 3
'3': 2
'japanese': 2
'is': 2
'tactical': 2
'@-@': 2
'video': 2
'media.vision': 2
'playstation': 2
'released': 2
'gameplay': 2
'unit': 2
'development': 2
'large': 2
'also': 2
'both': 2
'along': 2
''s': 2
'sales': 2
'an': 2
'expanded': 2
'edition': 2
'senjō': 1
'no': 1
'unrecorded': 1
'(': 1
'戦場のヴァルキュリア3': 1
'lit': 1
'battlefield': 1
')': 1
'commonly': 1
'referred': 1
'outside': 1
'role': 1
'playing': 1
'developed': 1
'sega': 1
'portable': 1
'january': 1
'2011': 1
'third': 1
'employing': 1
'same': 1
'fusion': 1
'real': 1
'time': 1
'its': 1
'predecessors': 1
'story': 1
'runs': 1
'parallel': 1
'first': 1
'follows': 1
'nameless': 1
'penal': 1
'military': 1
'serving': 1
'nation': 1
'gallia': 1
'during': 1
'second':

In [7]:
from torch.utils.data import Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Custom dataset definition
class CustomDataset(Dataset):
    """Dataset that serves each sentence as a tensor of vocabulary indices.

    Args:
        sentences: Indexable, sized collection of raw text strings.
        tokenizer: Callable turning one string into an iterable of tokens.
        vocab: Mapping-like object; ``vocab[token]`` must return the integer
            index of ``token``.
    """

    def __init__(self, sentences, tokenizer, vocab):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.vocab = vocab

    def __len__(self):
        """Return the number of sentences held by the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return sentence ``idx`` as a 1-D ``torch.long`` tensor of token indices."""
        tokens = self.tokenizer(self.sentences[idx])
        # Look every token up in the vocabulary to get its integer index
        tensor_indices = [self.vocab[token] for token in tokens]
        # dtype is pinned because torch.tensor([]) would otherwise infer
        # float32 for a sentence that yields no tokens
        return torch.tensor(tensor_indices, dtype=torch.long)
    
# Tokenizer: torchtext's "basic_english"
tokenizer = get_tokenizer("basic_english")

# Build the vocabulary from the tokenized sentences
vocab = build_vocab_from_iterator(tokenizer(text) for text in sentences)

# Wrap the sentences in the custom dataset
custom_dataset = CustomDataset(sentences, tokenizer, vocab)

print("Longitud del conjunto de datos personalizado:", len(custom_dataset))
print("Items de muestra:")
# Show the first five items as index tensors
for item_number in range(1, 6):
    sample_item = custom_dataset[item_number - 1]
    print(f"Item {item_number}: {sample_item}")


Longitud del conjunto de datos personalizado: 5
Items de muestra:
Item 1: tensor([ 9,  3,  6, 21,  9])
Item 2: tensor([149, 110,   3,  23, 163,   6,  45,  34, 172,   2,  96,   1,   3,   4,
          0,  58,  23,  46,   2,  65, 135,  15,  17,   3,   6,  21, 117,  22,
          2,  33,  11,  41, 141,  24, 124,  43,   7,  71,  18, 148,   5,  36,
          1,  44,  19,   0,  37, 125,   1,  38,   8,  95,  48,   8,  22,   2,
         10,  33,   0, 159,   7,   8,   0,   3,  14,   1,  77,   0, 144,  87,
          4,  41,   5, 133,  24, 160,  32,  17,  94, 129,   2,   0, 152, 142,
        120,  15,   0,  82,   7,   5,  83,   0, 107,   2,  11, 121, 103,  42,
        150,   0, 108,   4,  88,  76,   0, 146,  79, 164, 167, 122, 147,  60,
        115,   5,  56, 123,  54,   0,  92,  42,  62, 132,   1])
Item 3: tensor([  0,   7,  59,  29,   8,  47,   2,  63, 118,  11,  35, 126,   4,   0,
        168,  73, 113,   3,   6,  20,   1, 166,  10, 137,   0, 151,  81,   4,
          0,  14,   2,  10,  26, 162,

In [8]:
def indices_a_palabras(indices, vocab):
    """Translate vocabulary indices back into their tokens.

    Args:
        indices: Iterable of integer indices.
        vocab: Object exposing ``get_itos()``, which returns the
            index -> token list.

    Returns:
        List with the token for each index, in the same order as ``indices``.
    """
    # Fetch the lookup list once instead of once per index
    itos = vocab.get_itos()
    return [itos[idx] for idx in indices]

# Decode the first five dataset items back into their tokens and print them
print("\nTexto procesado: ")
for text_number in range(1, 6):
    sample_item = custom_dataset[text_number - 1]
    palabras = indices_a_palabras(sample_item.tolist(), vocab)
    print(f"Texto {text_number}: {palabras}")


Texto procesado: 
Texto 1: ['=', 'valkyria', 'chronicles', 'iii', '=']
Texto 2: ['senjō', 'no', 'valkyria', '3', 'unrecorded', 'chronicles', '(', 'japanese', '戦場のヴァルキュリア3', ',', 'lit', '.', 'valkyria', 'of', 'the', 'battlefield', '3', ')', ',', 'commonly', 'referred', 'to', 'as', 'valkyria', 'chronicles', 'iii', 'outside', 'japan', ',', 'is', 'a', 'tactical', 'role', '@-@', 'playing', 'video', 'game', 'developed', 'by', 'sega', 'and', 'media', '.', 'vision', 'for', 'the', 'playstation', 'portable', '.', 'released', 'in', 'january', '2011', 'in', 'japan', ',', 'it', 'is', 'the', 'third', 'game', 'in', 'the', 'valkyria', 'series', '.', 'employing', 'the', 'same', 'fusion', 'of', 'tactical', 'and', 'real', '@-@', 'time', 'gameplay', 'as', 'its', 'predecessors', ',', 'the', 'story', 'runs', 'parallel', 'to', 'the', 'first', 'game', 'and', 'follows', 'the', 'nameless', ',', 'a', 'penal', 'military', 'unit', 'serving', 'the', 'nation', 'of', 'gallia', 'during', 'the', 'second', 'europan', '

In [10]:
# Count how often each in-vocabulary token occurs and list every token,
# most frequent first (the listing is not limited to the top 10)
palabras_frecuentes = {}

# Fetch the token -> index mapping once; it does not change inside the loop,
# so there is no need to request it again for every token
vocab_stoi = vocab.get_stoi()

for sentence in sentences:
    tokens = tokenizer(sentence)
    for token in tokens:
        if token in vocab_stoi:
            palabras_frecuentes[token] = palabras_frecuentes.get(token, 0) + 1

print(f"Total de palabras en el vocabulario: {len(palabras_frecuentes)}")
print("Vocabulario inicial:")
# sorted() is stable, so tokens with equal counts keep their first-seen order
for palabra, frecuencia in sorted(palabras_frecuentes.items(), key=lambda x: x[1], reverse=True):
    print(f"'{palabra}': {frecuencia}")

Total de palabras en el vocabulario: 173
Vocabulario inicial:
'the': 22
'.': 16
',': 15
'valkyria': 10
'of': 9
'and': 8
'chronicles': 7
'game': 7
'in': 7
'=': 6
'it': 6
'a': 5
'with': 5
'was': 5
'to': 4
'series': 4
'iii': 3
'as': 3
'japan': 3
'by': 3
'for': 3
'ii': 3
''': 3
'3': 2
'japanese': 2
'is': 2
'tactical': 2
'@-@': 2
'video': 2
'media': 2
'vision': 2
'playstation': 2
'released': 2
'gameplay': 2
'unit': 2
'development': 2
'large': 2
'also': 2
'both': 2
'along': 2
's': 2
'sales': 2
'an': 2
'expanded': 2
'edition': 2
'senjō': 1
'no': 1
'unrecorded': 1
'(': 1
'戦場のヴァルキュリア3': 1
'lit': 1
'battlefield': 1
')': 1
'commonly': 1
'referred': 1
'outside': 1
'role': 1
'playing': 1
'developed': 1
'sega': 1
'portable': 1
'january': 1
'2011': 1
'third': 1
'employing': 1
'same': 1
'fusion': 1
'real': 1
'time': 1
'its': 1
'predecessors': 1
'story': 1
'runs': 1
'parallel': 1
'first': 1
'follows': 1
'nameless': 1
'penal': 1
'military': 1
'serving': 1
'nation': 1
'gallia': 1
'during': 1
'second': 1
