In [1]:
from rl import PokemonEnv, DQNAgent, create_pokemon_teams

# Create the environment. `create_pokemon_teams` is passed as a callable; it is
# not invoked here.
env = PokemonEnv(create_pokemon_teams)

# Size the DQN agent from the environment's observation and action spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Training hyperparameters
agent = DQNAgent(
    state_size=state_size,
    action_size=action_size,
    gamma=0.99,  # discount factor
    epsilon=1.0,
    epsilon_min=0.05,
    epsilon_decay=0.9999,
    learning_rate=0.0003,
    batch_size=128,
    buffer_size=200000,
    update_target_freq=1000,
    use_dueling=True,
    grad_clip_norm=10.0,
)

# Training loop: one environment episode per iteration, with one call to
# `agent.train_batch()` after every environment step.
n_episodes = 1000
for episode in range(n_episodes):
    state, _ = env.reset()
    done = False
    total_reward = 0

    while not done:
        # Select an action, passing along the actions the environment
        # currently reports as valid.
        action = agent.choose_action(state, env.get_valid_actions())

        # Interact with the environment.
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Store the experience.
        # NOTE(review): `done` is also True on truncation. If DQNAgent uses
        # this flag to mask the bootstrap target, truncated transitions are
        # treated as true terminal states; passing `terminated` instead may be
        # more appropriate -- confirm against DQNAgent's implementation.
        agent.remember(state, action, reward, next_state, done)

        # Advance the state and accumulate the episode return.
        state = next_state
        total_reward += reward

        # Run one batch training step. `loss` is not used in this cell; the
        # binding is kept in case it is inspected afterwards.
        loss = agent.train_batch()

    # Record the episode result. The loop above only exits once `done` is
    # True, so no extra `done` check is needed. Only episodes that ended via
    # `terminated` are counted; the sign of the final step's reward decides
    # win vs. loss. Truncated episodes are recorded as neither.
    if terminated:
        if reward > 0:
            agent.add_win()
        else:
            agent.add_loss()

    # Print progress every 10th episode (including episode 0).
    if episode % 10 == 0:
        win_rate = agent.get_recent_win_rate()
        print(f"Episode {episode}, Total Reward: {total_reward:.2f}, Win Rate: {win_rate:.2f}%")

print("Training completed!")


  return torch._C._cuda_getDeviceCount() > 0


Episode 0, Total Reward: -7.09, Win Rate: 0.00%
Episode 10, Total Reward: -9.05, Win Rate: 0.00%
Episode 20, Total Reward: -8.62, Win Rate: 0.00%
Episode 30, Total Reward: -11.68, Win Rate: 0.00%
Episode 40, Total Reward: -10.06, Win Rate: 0.00%
Episode 50, Total Reward: -8.34, Win Rate: 0.00%
Episode 60, Total Reward: -8.30, Win Rate: 1.64%
Episode 70, Total Reward: -8.51, Win Rate: 4.23%
Episode 80, Total Reward: -10.17, Win Rate: 3.70%
Episode 90, Total Reward: -12.02, Win Rate: 4.40%
Episode 100, Total Reward: -9.01, Win Rate: 5.00%
Episode 110, Total Reward: -10.01, Win Rate: 6.00%
Episode 120, Total Reward: -10.53, Win Rate: 7.00%
Episode 130, Total Reward: -8.46, Win Rate: 7.00%
Episode 140, Total Reward: -10.64, Win Rate: 8.00%
Episode 150, Total Reward: -10.05, Win Rate: 10.00%
Episode 160, Total Reward: -10.76, Win Rate: 10.00%
Episode 170, Total Reward: -9.84, Win Rate: 8.00%
Episode 180, Total Reward: -10.94, Win Rate: 9.00%
Episode 190, Total Reward: -10.36, Win Rate: 9.00