In [1]:
# simple_dqn.py
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

class SimpleDQN(nn.Module):
    """Simplified DQN network used for testing.

    A three-layer fully connected network: two hidden ``Linear`` layers of
    width ``hidden_size``, each followed by ``ReLU``, and a final ``Linear``
    layer producing ``output_size`` values.

    Args:
        input_size: Number of input features.
        output_size: Number of output values (one per action when used by
            the agent in this file).
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, input_size, output_size, hidden_size=128):
        super().__init__()
        # Layers are created in forward order so parameter initialisation
        # draws from the torch RNG in the same sequence as before, and
        # Sequential indexes them 0..4 (state-dict keys stay "network.N.*").
        stack = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        ]
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Return the network output for input tensor ``x``."""
        q_values = self.network(x)
        return q_values

class SimpleAgent:
    """Simplified DQN agent with epsilon-greedy actions and experience replay.

    The agent owns a ``SimpleDQN`` model, an Adam optimizer and a bounded
    replay buffer. There is no separate target network: bootstrap targets
    are computed with the same model that is being trained.

    Args:
        state_size: Number of features in a state vector.
        action_size: Number of discrete actions.
        memory_size: Maximum number of transitions kept in the replay buffer;
            the oldest ones are dropped first.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial probability of choosing a random action.
        epsilon_min: Floor that ``epsilon`` is never decayed below.
        epsilon_decay: Factor ``epsilon`` is multiplied by after each
            training call to ``replay``.
        lr: Learning rate for the Adam optimizer.
    """

    def __init__(self, state_size, action_size, *, memory_size=2000,
                 gamma=0.95, epsilon=1.0, epsilon_min=0.01,
                 epsilon_decay=0.995, lr=0.001):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        self.model = SimpleDQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state`` using an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest model output.

        Args:
            state: Array-like state; assumed to be a single unbatched
                vector of length ``state_size`` (TODO confirm with callers).

        Returns:
            The chosen action index as a Python ``int``.
        """
        if random.random() <= self.epsilon:
            return random.randrange(self.action_size)

        state = torch.as_tensor(state, dtype=torch.float32)
        with torch.no_grad():
            q_values = self.model(state)
        return torch.argmax(q_values).item()

    def replay(self, batch_size):
        """Train on ``batch_size`` randomly sampled transitions.

        Does nothing when the buffer holds fewer than ``batch_size``
        transitions. Otherwise performs one optimizer step per sampled
        transition, then decays ``epsilon`` once, never below
        ``epsilon_min``.

        Args:
            batch_size: Number of transitions to sample without replacement.
        """
        if len(self.memory) < batch_size:
            return

        batch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in batch:
            # Terminal transitions use the raw reward; otherwise bootstrap
            # from the best next-state value predicted by the current model.
            target = reward
            if not done:
                next_state = torch.as_tensor(next_state, dtype=torch.float32)
                with torch.no_grad():
                    target = reward + self.gamma * torch.max(self.model(next_state)).item()

            state = torch.as_tensor(state, dtype=torch.float32)

            self.optimizer.zero_grad()
            # A single forward pass serves as both the prediction and (as a
            # detached copy) the base of the target vector.
            q_values = self.model(state)
            target_f = q_values.detach().clone()
            # Only the taken action's entry differs from the prediction, so
            # the loss gradient comes from that entry alone.
            target_f[action] = target

            loss = self.loss_fn(q_values, target_f)
            loss.backward()
            self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            # Clamp so the final decay step cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

def test_simple_agent():
    """Smoke-test the simplified agent on randomly generated transitions.

    No real environment is involved: states, state deltas and rewards are
    drawn from ``np.random.randn``. Progress is printed after each episode.
    """
    print("Testing simple DQN agent...")

    # Dimensions of the mock environment.
    state_size, action_size = 4, 2
    num_episodes = 5
    steps_per_episode = 10

    agent = SimpleAgent(state_size, action_size)

    # Simulate a handful of training episodes.
    for episode in range(num_episodes):
        total_reward = 0
        state = np.random.randn(state_size)

        for step in range(steps_per_episode):
            action = agent.act(state)
            next_state = state + np.random.randn(state_size) * 0.1
            reward = np.random.randn()
            # The last step of every episode is flagged as terminal.
            done = step == steps_per_episode - 1

            agent.remember(state, action, reward, next_state, done)
            total_reward += reward
            state = next_state

        # replay() is a no-op until the buffer holds at least 32 transitions,
        # so with 10 steps per episode training first happens after episode 4.
        agent.replay(32)
        print(f"Episode {episode + 1}, Reward: {total_reward:.2f}, Epsilon: {agent.epsilon:.3f}")

    print("Simple agent test completed!")

# Run the smoke test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_simple_agent()

Testing simple DQN agent...
Episode 1, Reward: 3.41, Epsilon: 1.000
Episode 2, Reward: -5.54, Epsilon: 1.000
Episode 3, Reward: -5.37, Epsilon: 1.000
Episode 4, Reward: 3.99, Epsilon: 0.995
Episode 5, Reward: 1.38, Epsilon: 0.990
Simple agent test completed!
