In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os

In [2]:
# 检查GPU是否可用
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [17]:
# 加载文本数据
with open('data/time_machine_txt/timemachine.txt', 'r') as f:
    text = f.read()

# 创建字符映射表
chars = sorted(list(set(text)))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
idx_to_char = {idx: ch for idx, ch in enumerate(chars)}

# 转换文本为索引
text_as_int = np.array([char_to_idx[c] for c in text]) 

In [39]:
len(text_as_int), text_as_int.size

(178979, 178979)

In [None]:
# len(chars), chars
# char_to_idx

In [47]:
# 定义超参数
seq_length = 100  # 序列长度 - 一个句子100个单词
batch_size = 512
hidden_size = 256
embedding_dim = 64 # input_size 嵌入向量的大小
num_layers = 2
learning_rate = 0.001
num_epochs = 10
vocab_size = len(chars) # 字符的个数

def create_dataset(text_as_int, seq_length, batch_size):
    sequences = []
    targets = []
    for i in range(0, len(text_as_int) - seq_length):
        sequences.append(text_as_int[i:i + seq_length])
        targets.append(text_as_int[i + 1:i + seq_length + 1])
    sequences = torch.tensor(sequences, dtype=torch.long)
    targets = torch.tensor(targets, dtype=torch.long)
    # return sequences, targets
    dataset = torch.utils.data.TensorDataset(sequences, targets)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader

dataloader = create_dataset(text_as_int, seq_length, batch_size)
# sequences, targets = create_dataset(text_as_int, seq_length, batch_size)

In [43]:
sequences.shape, targets.shape

(torch.Size([178879, 100]), torch.Size([178879, 100]))

In [46]:
sequences[0:2, ], targets[0:2, ]

(tensor([[35, 51, 48,  1, 35, 52, 56, 48,  1, 28, 44, 46, 51, 52, 57, 48,  7,  1,
          45, 68,  1, 23,  9,  1, 22,  9,  1, 38, 48, 55, 55, 62,  1, 41, 10, 11,
          12, 11, 42,  0,  0,  0,  0,  0, 24,  0,  0,  0, 35, 51, 48,  1, 35, 52,
          56, 48,  1, 35, 61, 44, 65, 48, 55, 55, 48, 61,  1,  5, 49, 58, 61,  1,
          62, 58,  1, 52, 63,  1, 66, 52, 55, 55,  1, 45, 48,  1, 46, 58, 57, 65,
          48, 57, 52, 48, 57, 63,  1, 63, 58,  1],
         [51, 48,  1, 35, 52, 56, 48,  1, 28, 44, 46, 51, 52, 57, 48,  7,  1, 45,
          68,  1, 23,  9,  1, 22,  9,  1, 38, 48, 55, 55, 62,  1, 41, 10, 11, 12,
          11, 42,  0,  0,  0,  0,  0, 24,  0,  0,  0, 35, 51, 48,  1, 35, 52, 56,
          48,  1, 35, 61, 44, 65, 48, 55, 55, 48, 61,  1,  5, 49, 58, 61,  1, 62,
          58,  1, 52, 63,  1, 66, 52, 55, 55,  1, 45, 48,  1, 46, 58, 57, 65, 48,
          57, 52, 48, 57, 63,  1, 63, 58,  1, 62]]),
 tensor([[51, 48,  1, 35, 52, 56, 48,  1, 28, 44, 46, 51, 52, 57, 48,  7,  1

In [51]:
# 遍历DataLoader并查看一个批次的数据形状
for batch_idx, (batch_data, batch_labels) in enumerate(dataloader):
    print(batch_idx)
    print("Data shape:", batch_data.shape)
    print("Labels shape:", batch_labels.shape)
    break  # 只查看第一个批次

0
Data shape: torch.Size([512, 100])
Labels shape: torch.Size([512, 100])


In [56]:
# 定义RNN模型
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        # print("***", x.shape)
        out, hidden = self.rnn(x, hidden)
        out = out.contiguous().view(-1, hidden.size(2))
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        return torch.zeros(num_layers, batch_size, hidden_size).to(device)

In [61]:
# 实例化模型
model = RNNModel(vocab_size, embedding_dim, hidden_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# 训练模型并计算困惑度
for epoch in range(num_epochs):
    total_loss = 0
    for i, (inputs, targets) in enumerate(dataloader):
        
        # inputs = inputs.view(batch_size, -1).to(device)
        # 获取当前批次的实际大小
        current_batch_size = inputs.size(0)
        # if current_batch_size != 512:
        #     print(current_batch_size)
        inputs = inputs.view(current_batch_size, -1).to(device)
        
        targets = targets.view(-1).to(device)

        optimizer.zero_grad()
        
        # 初始化隐藏状态并将其从计算图中分离
        # hidden = model.init_hidden(batch_size).detach()
        hidden = model.init_hidden(current_batch_size).detach()
        
        output, hidden = model(inputs, hidden)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        
        if i % 100 == 0 and i > 0:
            avg_loss = total_loss / 100
            perplexity = torch.exp(torch.tensor(avg_loss))
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}], Loss: {avg_loss:.4f}, Perplexity: {perplexity:.4f}')
            total_loss = 0

Epoch [1/10], Step [101/350], Loss: 2.5089, Perplexity: 12.2911
Epoch [1/10], Step [201/350], Loss: 1.8286, Perplexity: 6.2253
Epoch [1/10], Step [301/350], Loss: 1.5851, Perplexity: 4.8799
191
Epoch [2/10], Step [101/350], Loss: 1.3923, Perplexity: 4.0242
Epoch [2/10], Step [201/350], Loss: 1.2839, Perplexity: 3.6108
Epoch [2/10], Step [301/350], Loss: 1.2067, Perplexity: 3.3426
191
Epoch [3/10], Step [101/350], Loss: 1.1101, Perplexity: 3.0348
Epoch [3/10], Step [201/350], Loss: 1.0314, Perplexity: 2.8050
Epoch [3/10], Step [301/350], Loss: 0.9667, Perplexity: 2.6292
191
Epoch [4/10], Step [101/350], Loss: 0.8832, Perplexity: 2.4187
Epoch [4/10], Step [201/350], Loss: 0.8171, Perplexity: 2.2639
Epoch [4/10], Step [301/350], Loss: 0.7649, Perplexity: 2.1488
191
Epoch [5/10], Step [101/350], Loss: 0.6996, Perplexity: 2.0129
Epoch [5/10], Step [201/350], Loss: 0.6533, Perplexity: 1.9219
Epoch [5/10], Step [301/350], Loss: 0.6173, Perplexity: 1.8540
191
Epoch [6/10], Step [101/350], Loss

In [67]:
# 生成文本
def generate_text(model, start_str, length):
    model.eval()
    hidden = model.init_hidden(1)
    input = torch.tensor([char_to_idx[ch] for ch in start_str], dtype=torch.long).unsqueeze(0).to(device)

    generated_text = start_str
    for _ in range(length):
        output, hidden = model(input, hidden)
        _, top_idx = torch.topk(output[-1], 1)
        next_char = idx_to_char[top_idx.item()]
        generated_text += next_char
        input = torch.tensor([[top_idx]], dtype=torch.long).to(device)
    
    return generated_text

start_str = "Filby became pensive. "
generated_text = generate_text(model, start_str, 200)
print(generated_text)

Filby became pensive. It was an altogether new element in the
sexuritations, but simply stood rouner.

'Ithe but myself and onces, the flames of the burning forest, I tried to convous sure of a
solitys, their little eyes t
