In [1]:
!pip uninstall -y fsspec datasets
!pip install fsspec==2023.6.0 datasets --no-cache-dir

Found existing installation: fsspec 2023.6.0
Uninstalling fsspec-2023.6.0:
  Successfully uninstalled fsspec-2023.6.0
Found existing installation: datasets 3.6.0
Uninstalling datasets-3.6.0:
  Successfully uninstalled datasets-3.6.0
Collecting fsspec==2023.6.0
  Downloading fsspec-2023.6.0-py3-none-any.whl.metadata (6.7 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Downloading fsspec-2023.6.0-py3-none-any.whl (163 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
to

In [2]:
!pip install datasets --quiet

In [3]:
from datasets import load_dataset
import torch
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import random

In [4]:
# Seed the Python and PyTorch RNGs so weight initialisation, batch shuffling and
# text sampling start from the same state on every fresh-kernel run (GPU kernels
# may still introduce small nondeterminism).
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Usando dispositivo:", device)

Usando dispositivo: cuda


In [5]:
# Step 1: load the full IMDB training split.
dataset_full = load_dataset("imdb", split="train")

# Step 2: keep a small, fixed subset (the original note estimates 500 examples
# as roughly 2% of the split). These are the first 500 rows in stored order,
# not a random sample.
subset_indices = range(500)
dataset = dataset_full.select(subset_indices)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
def tokenize(text):
    """Lower-case ``text`` and split it on runs of whitespace.

    Returns a list of tokens; punctuation stays attached to the word it touches.
    """
    lowered = text.lower()
    return lowered.split()

# Count every token in the corpus; the Counter's key order (first appearance)
# determines the ids assigned below.
counter = Counter(
    token for example in dataset for token in tokenize(example["text"])
)

# Ids 0 and 1 are reserved for padding and unknown words; corpus words start at 2.
vocab = {"<pad>": 0, "<unk>": 1}
vocab.update((token, idx) for idx, token in enumerate(counter, start=2))

# Reverse lookup (id -> word), used when decoding generated ids back to text.
inv_vocab = dict(zip(vocab.values(), vocab.keys()))

In [7]:
def encode(text):
    """Convert text into a list of vocabulary ids.

    The text is split with ``tokenize``; tokens missing from ``vocab`` are
    mapped to the id of ``"<unk>"``.
    """
    ids = []
    for token in tokenize(text):
        ids.append(vocab.get(token, vocab["<unk>"]))
    return ids

# Build fixed-length training windows: each input is `seq_len` consecutive
# token ids and its target is the id that immediately follows the window.
seq_len = 30
inputs, targets = [], []

for example in dataset:
    token_ids = encode(example["text"])
    # Texts with at most `seq_len` tokens yield an empty range and are skipped.
    for start in range(len(token_ids) - seq_len):
        stop = start + seq_len
        inputs.append(torch.tensor(token_ids[start:stop]))
        targets.append(torch.tensor(token_ids[stop]))

In [8]:
from torch.utils.data import Dataset, DataLoader

class IMDbDataset(Dataset):
    """Map-style dataset pairing each input sequence with its target.

    Args:
        X: Indexable collection of input samples.
        y: Indexable collection of targets, aligned with ``X`` by position.

    Raises:
        ValueError: If ``X`` and ``y`` have different lengths.
    """

    def __init__(self, X, y):
        # Fail fast: __len__ is taken from X alone, so a shorter y would only
        # surface as an IndexError part-way through an epoch, and a longer y
        # would be silently truncated.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap the windowed tensors and serve them in shuffled mini-batches of 64.
# NOTE(review): this rebinds `dataset`, which until now referred to the
# Hugging Face subset; re-running the earlier vocabulary/windowing cells after
# this one would iterate over this object instead.
dataset = IMDbDataset(X=inputs, y=targets)
train_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)

In [9]:
class WordLSTM(nn.Module):
    """Next-word predictor: embedding -> LSTM -> linear projection.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output logits).
        emb_size: Width of each token embedding.
        hidden_size: Width of the LSTM hidden state.
    """

    def __init__(self, vocab_size, emb_size=128, hidden_size=128):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.lstm = nn.LSTM(input_size=emb_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return vocabulary logits for the token that follows each sequence.

        ``x`` holds token ids laid out batch-first; only the LSTM output at the
        final time step is projected to logits.
        """
        embedded = self.embed(x)
        hidden_states, _ = self.lstm(embedded)
        last_step = hidden_states[:, -1]
        return self.fc(last_step)

In [10]:
# Build the LSTM model with one output logit per vocabulary entry and place it
# on the selected device.
model = WordLSTM(vocab_size=len(vocab)).to(device)

In [11]:
# Train the current `model` on next-word prediction and record per-epoch metrics.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# Per-epoch history: mean loss, perplexity (exp of the mean loss), accuracy in %.
losses = []
perplexities = []
accuracies = []

for epoch in range(30):
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        output = model(batch_X)
        loss = loss_fn(output, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_size = batch_y.size(0)
        # `loss` is a batch mean; scale it back to a sum so the (possibly
        # smaller) last batch is weighted by its true size.
        total_loss += loss.item() * batch_size

        # Training accuracy, measured on the logits computed before this
        # step's parameter update.
        pred = output.argmax(dim=1)
        correct += (pred == batch_y).sum().item()
        total += batch_size

    # Average over samples rather than over batches, so the perplexity is the
    # exponential of the true mean per-sample loss.
    avg_loss = total_loss / total
    perplexity = torch.exp(torch.tensor(avg_loss))
    accuracy = 100 * correct / total

    losses.append(avg_loss)
    perplexities.append(perplexity.item())
    accuracies.append(accuracy)

    print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | Perplexity: {perplexity:.2f} | Accuracy: {accuracy:.2f}%")

Epoch 1 | Loss: 7.2301 | Perplexity: 1380.35 | Accuracy: 8.72%
Epoch 2 | Loss: 6.0118 | Perplexity: 408.22 | Accuracy: 11.99%
Epoch 3 | Loss: 4.9111 | Perplexity: 135.79 | Accuracy: 16.28%
Epoch 4 | Loss: 4.0014 | Perplexity: 54.67 | Accuracy: 24.37%
Epoch 5 | Loss: 3.3489 | Perplexity: 28.47 | Accuracy: 32.45%
Epoch 6 | Loss: 2.8457 | Perplexity: 17.21 | Accuracy: 40.21%
Epoch 7 | Loss: 2.4604 | Perplexity: 11.71 | Accuracy: 46.96%
Epoch 8 | Loss: 2.1610 | Perplexity: 8.68 | Accuracy: 52.33%
Epoch 9 | Loss: 1.9297 | Perplexity: 6.89 | Accuracy: 56.52%
Epoch 10 | Loss: 1.7473 | Perplexity: 5.74 | Accuracy: 60.08%
Epoch 11 | Loss: 1.5983 | Perplexity: 4.94 | Accuracy: 62.77%
Epoch 12 | Loss: 1.4772 | Perplexity: 4.38 | Accuracy: 65.32%
Epoch 13 | Loss: 1.3783 | Perplexity: 3.97 | Accuracy: 67.05%
Epoch 14 | Loss: 1.3104 | Perplexity: 3.71 | Accuracy: 68.26%
Epoch 15 | Loss: 1.2396 | Perplexity: 3.45 | Accuracy: 69.90%
Epoch 16 | Loss: 1.1894 | Perplexity: 3.29 | Accuracy: 70.84%
Epoch 1

In [12]:
def generate_text(model, prompt, max_words=50, context_len=30):
    """Sample a continuation of ``prompt`` one token at a time.

    Args:
        model: Module mapping a batch of token-id sequences to next-token logits.
        prompt: Seed text; converted to ids with ``encode``.
        max_words: Number of tokens to sample and append.
        context_len: Maximum number of most recent tokens fed to the model at
            each step. The default of 30 matches the previously hard-coded window.

    Returns:
        The prompt tokens followed by the sampled tokens, joined by spaces.

    Raises:
        ValueError: If ``prompt`` encodes to no tokens or ``context_len`` < 1.
    """
    if context_len < 1:
        raise ValueError("context_len must be at least 1")
    tokens = encode(prompt)
    if not tokens:
        raise ValueError("prompt must contain at least one token")

    model.eval()
    generated = list(tokens)

    # Sampling needs no gradients; skipping autograd saves memory and time.
    with torch.no_grad():
        for _ in range(max_words):
            # Rebuild the window from everything generated so far, so a short
            # prompt's context grows up to `context_len` instead of staying
            # fixed at the prompt's length.
            window = torch.tensor(
                [generated[-context_len:]], dtype=torch.long, device=device
            )
            logits = model(window)
            probs = torch.softmax(logits[0], dim=0)
            next_token = torch.multinomial(probs, 1).item()
            generated.append(next_token)

    return ' '.join(inv_vocab.get(t, "<unk>") for t in generated)


In [13]:
# Sample 30 words from the current model, seeded with a short prompt.
sample_text = generate_text(model, "the movie was", max_words=30)
print(sample_text)

the movie was filmed, some fairly original place. there is another half way through the list, leave this one movie from the 1980s a pop soundtrack then what could even get off the


In [14]:
class WordRNN(nn.Module):
    """Next-word predictor: embedding -> ``nn.RNN`` -> linear projection.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output logits).
        emb_size: Width of each token embedding.
        hidden_size: Width of the recurrent hidden state.
    """

    def __init__(self, vocab_size, emb_size=128, hidden_size=128):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.rnn = nn.RNN(input_size=emb_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return vocabulary logits for the token that follows each sequence.

        ``x`` holds token ids laid out batch-first; only the RNN output at the
        final time step is projected to logits.
        """
        embedded = self.embed(x)
        hidden_states, _ = self.rnn(embedded)
        last_step = hidden_states[:, -1]
        return self.fc(last_step)


In [15]:
# Create a freshly initialised RNN model on the selected device.
# NOTE(review): this rebinds `model`, so the LSTM trained earlier is no longer
# reachable under that name.
model = WordRNN(vocab_size=len(vocab)).to(device)

In [16]:
# Train the current `model` on next-word prediction and record per-epoch metrics.
# NOTE(review): the recorded output for this cell shows the loss rising again
# after epoch 3; gradient clipping or a lower learning rate may be worth trying.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# Per-epoch history: mean loss, perplexity (exp of the mean loss), accuracy in %.
# NOTE(review): these names were also used for the earlier training run, whose
# histories are overwritten here.
losses = []
perplexities = []
accuracies = []

for epoch in range(10):
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        output = model(batch_X)
        loss = loss_fn(output, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_size = batch_y.size(0)
        # `loss` is a batch mean; scale it back to a sum so the (possibly
        # smaller) last batch is weighted by its true size.
        total_loss += loss.item() * batch_size

        # Training accuracy, measured on the logits computed before this
        # step's parameter update.
        pred = output.argmax(dim=1)
        correct += (pred == batch_y).sum().item()
        total += batch_size

    # Average over samples rather than over batches, so the perplexity is the
    # exponential of the true mean per-sample loss.
    avg_loss = total_loss / total
    perplexity = torch.exp(torch.tensor(avg_loss))
    accuracy = 100 * correct / total

    losses.append(avg_loss)
    perplexities.append(perplexity.item())
    accuracies.append(accuracy)

    print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | Perplexity: {perplexity:.2f} | Accuracy: {accuracy:.2f}%")

Epoch 1 | Loss: 7.8027 | Perplexity: 2447.22 | Accuracy: 7.16%
Epoch 2 | Loss: 6.6052 | Perplexity: 738.92 | Accuracy: 9.21%
Epoch 3 | Loss: 5.8877 | Perplexity: 360.57 | Accuracy: 10.82%
Epoch 4 | Loss: 6.0756 | Perplexity: 435.11 | Accuracy: 10.61%
Epoch 5 | Loss: 6.5167 | Perplexity: 676.35 | Accuracy: 8.91%
Epoch 6 | Loss: 6.4840 | Perplexity: 654.60 | Accuracy: 8.67%
Epoch 7 | Loss: 6.7476 | Perplexity: 852.03 | Accuracy: 8.08%
Epoch 8 | Loss: 6.2922 | Perplexity: 540.32 | Accuracy: 9.29%
Epoch 9 | Loss: 5.9557 | Perplexity: 385.96 | Accuracy: 10.49%
Epoch 10 | Loss: 6.0162 | Perplexity: 410.03 | Accuracy: 10.29%


In [17]:
# Sample 30 words from the current model, seeded with a short prompt.
sample_text = generate_text(model, "the movie was", max_words=30)
print(sample_text)

the movie was there are attractive to watch a copy (for by alfredo inane.it well, holy cage are pure and they a little and stuff. tristan price (jesse metcalfe), balls schools parody hackneyed
