定义环境

In [None]:
import numpy as np
import gym
from gym import spaces

class SuperTicTacToeEnv(gym.Env):
    def __init__(self):
        super().__init__()
        self.board_shape = (12, 12)  # 展平为 12x12 的棋盘 (为了简化)
        self.action_space = spaces.Discrete(80)
        self.observation_space = spaces.Box(low=0, high=2, shape=self.board_shape, dtype=np.int32)
        self.reset()

    def reset(self):
        self.board = np.full((12, 12), -1, dtype=np.int32)  # -1 表示不可用区域
        # 定义五个 4x4 区域
        centers = [(0, 4), (4, 0), (4, 4), (4, 8), (8, 4)]  # 上、左、中、右、下
        for (start_row, start_col) in centers:
            self.board[start_row:start_row+4, start_col:start_col+4] = 0  # 可落子区域
        self.current_player = 1
        return self.board.copy()

    def step(self, action):
        centers = [(0, 4), (4, 0), (4, 4), (4, 8), (8, 4)]
        num_box = int(action/16)
        row, col = divmod(action - num_box*16, 4)
        row, col = row + centers[num_box][0], col + centers[num_box][1]
        
        # if self.board[row, col] != 0:
        #     # 非法落子（已被占用或无效区域）
        #     return self.board.copy(), 0, False, {}

        # 模拟 50% 成功概率
        if np.random.rand() < 0.5:
            self.board[row, col] = 1 
        else:
            # 随机落子到邻近格子
            neighbors = [(row + dr, col + dc) for dr in [-1, 0, 1] for dc in [-1, 0, 1]
                         if not (dr == 0 and dc == 0)]
            np.random.shuffle(neighbors)
            placed = False
            for (r, c) in neighbors:
                if 0 <= r < 12 and 0 <= c < 12 and self.board[r, c] == 0:
                    self.board[r, c] = 1
                    placed = True
                    break
            if not placed:
                # forfeited move
                self.current_player = 3 - self.current_player
                return self.board.copy(), -0.01, False, {} # 
                
        # 检查是否胜利
        if self.check_winner():
            return self.board.copy(), 1, True, {}

        # 切换玩家
        self.current_player = 3 - self.current_player
        
        # Normalize board to current player's perspective:
        # Current player's pieces → 1
        # Opponent's pieces       → 2
        # Use 3 as a temporary placeholder
        self.board = np.where(self.board == 2, 3, self.board) 
        self.board = np.where(self.board == 1, 2, self.board)  
        self.board = np.where(self.board == 3, 1, self.board) 
        
        return self.board.copy(), 0, False, {}

    def check_winner(self):
        size = self.board.shape[0]

        for i in range(size):
            for j in range(size):
                # 横向
                if j + 3 < size:
                    if all(self.board[i, j+k] == 1 for k in range(4)):
                        return True
                # 纵向
                if i + 3 < size:
                    if all(self.board[i+k, j] == 1 for k in range(4)):
                        return True
                # ↘ 方向
                if i + 4 < size and j + 4 < size:
                    if all(self.board[i+k, j+k] == 1 for k in range(5)):
                        return True
                # ↙ 方向
                if i - 4 >= 0 and j + 4 < size:
                    if all(self.board[i-k, j+k] == 1 for k in range(5)):
                        return True

        return False
    
    def print_board(self):
        for row in self.board:
            print(" ".join(
                "." if x == 0 else
                "X" if x == 1 else
                "O" if x == 2 else
                " "  # -1 区域
                for x in row
            ))

测试环境

In [None]:
import random

# 创建环境
env = SuperTicTacToeEnv()
obs = env.reset()
done = False
step_count = 0

print("🔄 游戏开始：\n")

while not done:
    env.print_board()
    print(f"\n👣 Step {step_count + 1} - Player {env.current_player}")

    # 获取所有合法动作（即值为 0 的格子）
    legal_actions = [i for i in range(80) if env.board[i // 12, i % 12] == 0]

    if not legal_actions:
        print("❌ 没有合法动作，游戏结束")
        break

    action = random.choice(legal_actions)
    obs, reward, done, info = env.step(action)

    print(f"🎯 动作: {action} -> ({action // 12}, {action % 12})")
    print(f"🏆 奖励: {reward}")
    print("-" * 40)

    step_count += 1

print("\n✅ 游戏结束")
env.print_board()

网络结构

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DQN(nn.Module):
    def __init__(self, input_shape, n_actions):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_shape[0] * input_shape[1], 256)
        self.fc2 = nn.Linear(256, 128)
        self.out = nn.Linear(128, n_actions)

    def forward(self, x):
        x = x.view(x.size(0), -1)  # flatten
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.out(x)

DQN 训练

In [13]:
import random
from collections import deque

env = SuperTicTacToeEnv()
model = DQN(env.observation_space.shape, env.action_space.n)
target_model = DQN(env.observation_space.shape, env.action_space.n)
target_model.load_state_dict(model.state_dict())

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
replay_buffer = deque(maxlen=1000)
gamma = 0.99
batch_size = 64
epsilon = 1.0

for episode in range(2000):
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                q_vals = model(torch.tensor(state, dtype=torch.float32).unsqueeze(0))
                action = torch.argmax(q_vals).item()

        next_state, reward, done, _ = env.step(action)
        replay_buffer.append((state, action, reward, next_state, done))
        state = next_state
        total_reward += reward

        # 学习
        if len(replay_buffer) >= batch_size:
            batch = random.sample(replay_buffer, batch_size)
            states, actions, rewards, next_states, dones = zip(*batch)

            states = torch.tensor(states, dtype=torch.float32)
            actions = torch.tensor(actions)
            rewards = torch.tensor(rewards, dtype=torch.float32)
            next_states = torch.tensor(next_states, dtype=torch.float32)
            dones = torch.tensor(dones, dtype=torch.bool)

            q_vals = model(states).gather(1, actions.unsqueeze(1)).squeeze()
            next_q_vals = target_model(next_states).max(1)[0]
            targets = rewards + gamma * next_q_vals * (~dones)

            loss = F.mse_loss(q_vals, targets.detach())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    
    # 更新 target 网络
    if episode % 10 == 0:
        target_model.load_state_dict(model.state_dict())
    env.print_board()
    # 衰减探索率
    epsilon = max(0.01, epsilon * 0.995)

    print(f"Episode {episode}, Total Reward: {total_reward}")

        O . O .        
        . . . .        
        . . . .        
        . O . X        
O . . . X . . . . . . .
O . . . . . X X X X . .
. X X . . O . . . . . .
. . X X O O . X . . . .
        . . . O        
        . . . O        
        . X . O        
        O . . .        
Episode 0, Total Reward: 1
        X . O O        
        X O X X        
        X . O X        
        O . O X        
X X O O . X O O O X . X
O . . O X O . O . O O O
. O X X . X . O O X . .
X . . . X . O . . O X .
        . O X X        
        X O O .        
        O O . .        
        X X X X        
Episode 1, Total Reward: 1
        . . . X        
        . O X X        
        . . . .        
        . O . .        
. O . . O X X O . . O .
O O X X X X . O . X X O
O X X . . . . O O . . X
. . X X O O . . X . O .
        O . O X        
        . . . X        
        . . O X        
        X . . .        
Episode 2, Total Reward: 1
        X O O .        
        X X X X        
       

KeyboardInterrupt: 

In [14]:
from tqdm import tqdm
def play_against_random(env, model, print_game=False):
    state = env.reset()
    done = False
    current_player = env.current_player

    winner = 0  # 默认平局

    while not done:
        if current_player == 1:
            # DQN Agent's Turn
            with torch.no_grad():
                q_vals = model(torch.tensor(state, dtype=torch.float32).unsqueeze(0))
                q_vals = q_vals.squeeze()

                legal_actions = [i for i in range(env.action_space.n)
                                 if env.board[i // 12, i % 12] == 0]

                if not legal_actions:
                    break

                q_vals_filtered = [(i, q_vals[i].item()) for i in legal_actions]
                action = max(q_vals_filtered, key=lambda x: x[1])[0]
        else:
            # Random Agent's Turn
            legal_actions = [i for i in range(env.action_space.n)
                             if env.board[i // 12, i % 12] == 0]
            if not legal_actions:
                break
            action = random.choice(legal_actions)

        # 执行动作
        next_state, reward, done, _ = env.step(action)
        state = next_state
        current_player = env.current_player

        if reward == 1:
            winner = 3 - current_player  # 胜者是执行动作前的玩家

        if print_game:
            print(f"\n🎯 Player {3 - current_player} takes action {action} ({action // 12}, {action % 12})")
            env.print_board()

    return winner  # 1 = DQN, 2 = Random, 0 = Draw


dqn_wins = 0
random_wins = 0
draws = 0
num_games = 1000

for i in tqdm(range(num_games)):
    winner = play_against_random(env, model, print_game=False)
    if winner == 1:
        dqn_wins += 1
    elif winner == 2:
        random_wins += 1
    else:
        draws += 1

    if (i + 1) % 100 == 0:
        print(f"Played {i+1} games...")

# 📊 输出最终结果
print("\n🎉 结果汇总:")
print(f"🏆 DQN Wins: {dqn_wins} ({dqn_wins / num_games:.2%})")
print(f"🎲 Random Wins: {random_wins} ({random_wins / num_games:.2%})")
print(f"🤝 Draws: {draws} ({draws / num_games:.2%})")

 10%|█         | 104/1000 [00:03<00:27, 32.58it/s]

Played 100 games...


 20%|██        | 202/1000 [00:06<00:21, 36.84it/s]

Played 200 games...


 31%|███       | 306/1000 [00:09<00:22, 31.32it/s]

Played 300 games...


 40%|████      | 403/1000 [00:12<00:21, 28.29it/s]

Played 400 games...


 50%|█████     | 503/1000 [00:15<00:15, 32.68it/s]

Played 500 games...


 61%|██████    | 606/1000 [00:18<00:10, 36.68it/s]

Played 600 games...


 70%|███████   | 703/1000 [00:21<00:10, 29.55it/s]

Played 700 games...


 81%|████████  | 806/1000 [00:24<00:05, 35.60it/s]

Played 800 games...


 91%|█████████ | 907/1000 [00:27<00:02, 34.94it/s]

Played 900 games...


100%|██████████| 1000/1000 [00:30<00:00, 33.20it/s]

Played 1000 games...

🎉 结果汇总:
🏆 DQN Wins: 862 (86.20%)
🎲 Random Wins: 138 (13.80%)
🤝 Draws: 0 (0.00%)





In [None]:
env.print_board()

        O . X .        
        . O O .        
        . . . .        
        . . . X        
. . . O . . . O O O X .
. . . . O . . O X X X .
. . . . O O X X . . . X
. . . O . . O . . X O X
        X X X .        
        . X . X        
        X X . X        
        X X . .        


In [None]:
print(env.step(79))
print(env.current_player)

(array([[-1, -1, -1, -1,  2,  0,  1,  0, -1, -1, -1, -1],
       [-1, -1, -1, -1,  0,  2,  2,  0, -1, -1, -1, -1],
       [-1, -1, -1, -1,  0,  0,  0,  0, -1, -1, -1, -1],
       [-1, -1, -1, -1,  0,  0,  0,  1, -1, -1, -1, -1],
       [ 0,  0,  0,  2,  0,  0,  0,  2,  2,  2,  1,  0],
       [ 0,  0,  0,  0,  2,  0,  0,  2,  1,  1,  1,  0],
       [ 0,  0,  0,  0,  2,  2,  1,  1,  0,  0,  0,  1],
       [ 0,  0,  0,  2,  0,  0,  2,  0,  0,  1,  2,  1],
       [-1, -1, -1, -1,  1,  1,  1,  0, -1, -1, -1, -1],
       [-1, -1, -1, -1,  0,  1,  0,  1, -1, -1, -1, -1],
       [-1, -1, -1, -1,  1,  1,  0,  1, -1, -1, -1, -1],
       [-1, -1, -1, -1,  1,  1,  1,  0, -1, -1, -1, -1]]), 1, True, {})
2


In [None]:
env.board[6,7]

1

In [None]:
env.action_space.n

80