In [20]:
import sys
sys.path.append('/home/jovyan/work')

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from src.data_loader import cargar_dataset, tokenize_sentences_by_char

In [24]:
# Sequence model: embedding -> vanilla RNN -> linear head
class RNNModel(nn.Module):
    """Embed token ids, run them through one ``nn.RNN`` and project the last step.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector (the RNN input size).
        hidden_dim: Size of the RNN hidden state.
        output_size: Number of features produced by the final linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):
        super().__init__()
        # Token id -> dense vector lookup.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Plain recurrent layer (nn.RNN, not an LSTM); batch is the first axis.
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        # Maps a hidden state to the output features.
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        """Return the linear projection of the RNN output at the final time step.

        Args:
            x: Batched integer token ids, indexed as (batch, sequence).

        Returns:
            Tensor indexed as (batch, output_size).
        """
        embedded = self.embedding(x)
        step_outputs, _ = self.rnn(embedded)
        # Keep only the output of the last position of every sequence.
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step)

In [26]:
# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=5):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Each batch must unpack into ``(data, targets)``. For every batch the
    gradients are reset, the loss is back-propagated and the optimizer takes
    one step. The mean batch loss of each epoch is printed and collected.

    Args:
        model: Module to train; it is switched to training mode.
        train_loader: Iterable of ``(data, targets)`` batches. It does not
            need to implement ``__len__``; batches are counted while iterating.
        criterion: Callable ``criterion(output, targets)`` returning a scalar loss.
        optimizer: Optimizer bound to the model parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        list[float]: Mean batch loss of each epoch, in order (empty if
        ``epochs`` is 0).

    Raises:
        ValueError: If an epoch yields no batches, since no average loss exists.
    """
    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for data, targets in train_loader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # Counting batches (instead of len(train_loader)) supports loaders
        # without __len__ and lets the empty case fail with a clear message.
        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; cannot compute an average loss")
        avg_loss = total_loss / num_batches
        epoch_losses.append(avg_loss)
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss}")
    return epoch_losses

In [27]:
# Evaluation loop
def evaluate_model(model, val_loader, criterion):
    """Compute the mean batch loss of ``model`` over ``val_loader``.

    The model is switched to evaluation mode (and left in it) and all forward
    passes run with gradient tracking disabled. The mean loss is printed and
    returned.

    Args:
        model: Module to evaluate.
        val_loader: Iterable of ``(data, targets)`` batches. It does not need
            to implement ``__len__``; batches are counted while iterating.
        criterion: Callable ``criterion(output, targets)`` returning a scalar loss.

    Returns:
        float: Mean of the per-batch losses.

    Raises:
        ValueError: If ``val_loader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for data, targets in val_loader:
            output = model(data)
            total_loss += criterion(output, targets).item()
            num_batches += 1
    # Counting batches (instead of len(val_loader)) supports loaders without
    # __len__ and lets the empty case fail with a clear message.
    if num_batches == 0:
        raise ValueError("val_loader yielded no batches; cannot compute an average loss")
    avg_loss = total_loss / num_batches
    print(f"Validation Loss: {avg_loss}")
    return avg_loss

In [28]:
# Model and training hyperparameters
epochs = 5
output_size = 2  # e.g. 2 classes (adjust to your task)
hidden_dim = 128
embedding_dim = 100

In [31]:
# Load the train and validation splits, printing each one right after it loads
def _load_and_show(split_name):
    """Load one dataset split via cargar_dataset, print it, and return it."""
    split = cargar_dataset(split=split_name)
    print(split)
    return split


train_data = _load_and_show("train")

val_data = _load_and_show("validation")

Dataset({
    features: ['text'],
    num_rows: 1801350
})
Dataset({
    features: ['text'],
    num_rows: 3760
})


In [33]:
# Pull the sentences out of the 'text' column of each split
train_sentences = train_data['text']
val_sentences = val_data['text']

# Print the first five entries of each split to check they loaded correctly
for sentence_list in (train_sentences, val_sentences):
    print(sentence_list[:5])

['', ' = Valkyria Chronicles III = \n', '', ' Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . \n', " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for

In [None]:
# Tokenization and conversion to tensors
# NOTE(review): presumably returns one sequence of character tokens per sentence —
# confirm against src.data_loader (the return type is not visible in this notebook).
# NOTE(review): this rebinds train_data / val_data, which earlier held the loaded
# Dataset objects, so the names mean different things before and after this cell.
train_data = tokenize_sentences_by_char(train_sentences)
val_data = tokenize_sentences_by_char(val_sentences)

# Build the DataLoaders
# NOTE(review): zip() over a single iterable yields 1-tuples, so every dataset item
# holds only the sequence and no target. train_model / evaluate_model unpack each
# batch as `data, targets`, which needs two fields. The dataset only exposes a
# 'text' column, so targets have to be derived or supplied — TODO.
# NOTE(review): no collate_fn is given; if the sequences differ in length the default
# collation cannot stack them — confirm whether the tokenizer pads.
train_loader = DataLoader(list(zip(train_data)), batch_size=64, shuffle=True)
val_loader = DataLoader(list(zip(val_data)), batch_size=64, shuffle=False)

# Initialise the model, the loss criterion and the optimizer
# vocab_size is the number of distinct tokens seen in the training split.
# NOTE(review): nn.Embedding needs every index to be < vocab_size; that only holds if
# token ids are contiguous from 0 and validation has no unseen tokens — TODO confirm.
vocab_size = len(set([char for sentence in train_data for char in sentence])) 
model = RNNModel(vocab_size, embedding_dim, hidden_dim, output_size)

# NOTE(review): CrossEntropyLoss with class-index targets requires them to lie in
# [0, output_size) — verify once targets are defined.
criterion = nn.CrossEntropyLoss() 
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
train_model(model, train_loader, criterion, optimizer, epochs=epochs)

# Evaluate the model
evaluate_model(model, val_loader, criterion)