In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F
from torch.distributions import Categorical

In [None]:
class TicTacToeAgent(nn.Module):
    def __init__(self):
        super(TicTacToeAgent, self).__init__()
        # Сверточные слои для обработки состояния в виде матрицы
        self.conv1 = nn.Conv2d(1, 4, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(4)
        self.conv2 = nn.Conv2d(4, 8, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(8)
        self.conv3 = nn.Conv2d(8, 16, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(16)
        # Финальный сверточный слой для получения выходного канала с вероятностями
        self.conv4 = nn.Conv2d(16, 1, kernel_size=3, padding=1)
        self.dropout = nn.Dropout(p=0.3)
        self.player = 1

        self.start_epsilon = 1.
        self.end_epsilon = 0.01
        

    def forward(self, x):

        mask = (x != 0).float()
        
        x = x * self.player

        # Первый сверточный блок
        x = F.leaky_relu(self.bn1(self.conv1(x)), negative_slope=0.1)
        x = self.dropout(x)
        
        # Второй сверточный блок
        x = F.leaky_relu(self.bn2(self.conv2(x)), negative_slope=0.1)
        x = self.dropout(x)
        
        # Третий сверточный блок
        x = F.leaky_relu(self.bn3(self.conv3(x)), negative_slope=0.1)
        x = self.dropout(x)
        
        # Выходной слой
        x = self.conv4(x)

        # Применяем маску, обнуляя выходные значения на занятых клетках
        x = x.masked_fill(mask == 1, float('-inf'))
        
        # Применяем softmax для получения вероятностей
        batch_size, _, height, width = x.shape
        x = x.view(batch_size, -1)  # Преобразуем в [batch_size, num_fields]
        x = F.softmax(x, dim=1)     # Применяем softmax по полям
        x = x.view(batch_size, height, width)  # Восстанавливаем матрицу вероятностей
        
        return x  # Выходная матрица вероятностей
    def select_move(self, probabilities):
        # Преобразуем вероятности в одномерный вид
        batch_size, height, width = probabilities.shape
        probabilities = probabilities.view(batch_size, -1)  # [batch_size, 9]
        
        # Для каждого элемента в batch выбираем ход на основе вероятностей
        distribution = Categorical(probabilities)
        move_index = distribution.sample()  # Индекс выбранной ячейки
        
        # Преобразуем индекс в координаты строки и столбца
        row, col = divmod(move_index.item(), width)
        
        return row, col


In [None]:
# Пример использования
agent = TicTacToeAgent()
input_tensor = torch.tensor([[[[-1., -1., 0.], [1., 1., 0.], [1., 1., 0.]]]])
# input_tensor = torch.zeros(1, 1, 3, 3)  # Входная матрица 3x3
output = agent(input_tensor)
print(output)
print(output.shape)  # Ожидаемый результат: [1, 3, 3] для 3x3 входа
print(agent.select_move(output))

tensor([[[0.0000, 0.0000, 0.1989],
         [0.0000, 0.0000, 0.1779],
         [0.0000, 0.0000, 0.3307],
         [0.0000, 0.0000, 0.2925]]], grad_fn=<ViewBackward0>)
torch.Size([1, 4, 3])
(2, 2)


In [10]:
class TicTacToeEnv:
    def __init__(self):
        self.board = torch.zeros(3, 3, dtype=torch.float32)
        self.current_player = 1  # 1 или -1

    def reset(self):
        self.board = torch.zeros(3, 3, dtype=torch.float32)
        self.current_player = 1
        return self.board.unsqueeze(0).unsqueeze(0)  # [1, 1, 3, 3]

    def step(self, action):
        row, col = action
        if self.board[row, col] != 0:
            raise ValueError("Invalid move")
        
        self.board[row, col] = self.current_player
        reward, done = self.check_winner()
        self.current_player *= -1
        return self.board.unsqueeze(0).unsqueeze(0), reward, done

    def check_winner(self):
        for i in range(3):
            if abs(self.board[i, :].sum()) == 3 or abs(self.board[:, i].sum()) == 3:
                return 1 * self.current_player, True
        if abs(self.board.trace()) == 3 or abs(torch.fliplr(self.board).trace()) == 3:
            return 1 * self.current_player, True
        if (self.board == 0).sum() == 0:
            return 0, True
        return 0, False

In [None]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Пример использования с вашей моделью

print(f"Количество обучаемых параметров: {count_parameters(agent)}")

Количество обучаемых параметров: 1705


In [None]:
## Defining hyperparameters for the training

EPISODES = 4000000            # total no. of episodes
LR = 0.01                     # learning rate
GAMMA = 0.9                   # discount factor

max_epsilon = 1.0             # Greed: 100%
min_epsilon = 0.001           # Min_Greed: 0.1%
decay_rate = 0.000002         # epsilon decay rate
threshold = 2500              # no. of episodes after which states_tracked will be saved

In [None]:
## 1 версия

agent = TicTacToeAgent()
optimizer = optim.Adam(agent.parameters(), lr=0.001)
env = TicTacToeEnv()

num_episodes = 200000
gamma = 0.99  # Коэффициент дисконтирования

for episode in range(num_episodes):
    state = env.reset()
    log_probs = []
    rewards = []
    done = False

    while not done:
        # Получение вероятностей для действий
        probabilities = agent(state)
        row, col = agent.select_move(probabilities)

        # Выполнение действия в среде
        next_state, reward, done = env.step((row, col))

        # Сохранение логарифма вероятности выбранного действия
        log_prob = torch.log(probabilities[0, row, col])
        log_probs.append(log_prob)

        # Сохранение вознаграждения
        rewards.append(reward)
        
        agent.player *=-1
        state = next_state

    # Вычисление возврата (return) для каждого шага
    returns = []
    G = 0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.insert(0, G)
    returns = torch.tensor(returns)

    # Нормализация возвратов
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    # Вычисление функции потерь
    loss = torch.stack([-log_prob * G for log_prob, G in zip(log_probs, returns)]).sum()

    # Обновление параметров модели
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if episode % 10000 == 0:
        print(f"Episode {episode}, Loss: {loss.item()}")

Episode 0, Loss: 3.5547351837158203
Episode 10000, Loss: 0.0
Episode 20000, Loss: 0.5711522698402405
Episode 30000, Loss: -0.16524383425712585
Episode 40000, Loss: -0.9223109483718872
Episode 50000, Loss: 0.0


KeyboardInterrupt: 

In [None]:
## 2 версия

agent = TicTacToeAgent()
optimizer = optim.AdamW(agent.parameters(), lr=0.001)
env = TicTacToeEnv()

num_episodes = 200000
gamma = 0.99  # Коэффициент дисконтирования
for episode in range(num_episodes):
    state = env.reset()
    log_probs_player1 = []
    log_probs_player2 = []
    rewards_player1 = []
    rewards_player2 = []

    done = False
    current_player = 1  # Начинаем с игрока 1

    while not done:
        # Получение вероятностей для текущего игрока
        agent.player = current_player  # Задаем игрока (1 или -1)
        probabilities = agent(state)
        row, col = agent.select_move(probabilities)

        # Выполнение действия в среде
        next_state, reward, done = env.step((row, col))

        # Сохранение логарифма вероятности действия и награды
        log_prob = torch.log(probabilities[0, row, col])
        if current_player == 1:
            log_probs_player1.append(log_prob)
            rewards_player1.append(reward)
        else:
            log_probs_player2.append(log_prob)
            rewards_player2.append(-reward)  # Инверсия награды для второго игрока

        state = next_state
        current_player *= -1  # Смена игрока

    # Вычисление возвратов отдельно для каждого игрока
    def compute_returns(rewards, gamma):
        returns = []
        G = 0
        for r in reversed(rewards):
            G = r + gamma * G
            returns.insert(0, G)
        return torch.tensor(returns)

    returns_player1 = compute_returns(rewards_player1, gamma)
    returns_player2 = compute_returns(rewards_player2, gamma)

    # Нормализация возвратов
    returns_player1 = (returns_player1 - returns_player1.mean()) / (returns_player1.std() + 1e-8)
    returns_player2 = (returns_player2 - returns_player2.mean()) / (returns_player2.std() + 1e-8)

    # Вычисление потерь отдельно для каждого игрока
    loss_player1 = torch.stack([-log_prob * G for log_prob, G in zip(log_probs_player1, returns_player1)]).sum()
    loss_player2 = torch.stack([-log_prob * G for log_prob, G in zip(log_probs_player2, returns_player2)]).sum()

    # Итоговая функция потерь
    total_loss = loss_player1 + loss_player2

    # Обновление параметров модели
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    if episode % 100 == 0:
        print(f"Episode {episode}, Total Loss: {total_loss.item()}")

Episode 0, Total Loss: -3.187407970428467
Episode 100, Total Loss: -0.8698832988739014
Episode 200, Total Loss: -2.6861014366149902
Episode 300, Total Loss: -1.2168773412704468
Episode 400, Total Loss: -3.3630428314208984
Episode 500, Total Loss: -2.005173921585083
Episode 600, Total Loss: 0.9468182325363159
Episode 700, Total Loss: -1.5972402095794678
Episode 800, Total Loss: -0.8824136257171631
Episode 900, Total Loss: -2.9883503913879395
Episode 1000, Total Loss: -1.6284780502319336
Episode 1100, Total Loss: -4.24084997177124
Episode 1200, Total Loss: -0.32937902212142944
Episode 1300, Total Loss: -0.38589954376220703
Episode 1400, Total Loss: -0.02849733829498291
Episode 1500, Total Loss: -2.030236005783081
Episode 1600, Total Loss: -0.44572097063064575
Episode 1700, Total Loss: -2.7869391441345215
Episode 1800, Total Loss: -0.9471529722213745
Episode 1900, Total Loss: -0.33421140909194946
Episode 2000, Total Loss: -1.6114428043365479
Episode 2100, Total Loss: -0.5644298791885376
E

KeyboardInterrupt: 

In [141]:
agent.player = 1
input_tensor = torch.tensor([[[[1.,  0,   1.], 
                               [-1., 1.,  0.], 
                               [-1.,  0.,  -1.]]]])
# input_tensor = torch.zeros(1, 1, 3, 3)  # Входная матрица 3x3
agent.eval()
output = agent(input_tensor)
print(agent.select_move(output))
output

(0, 1)


tensor([[[0.0000e+00, 1.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 9.3382e-10],
         [0.0000e+00, 3.4946e-12, 0.0000e+00]]], grad_fn=<ViewBackward0>)

In [116]:
agent.player

1