In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F
from torch.distributions import Categorical

In [2]:
class TicTacToeAgent(nn.Module):
    def __init__(self):
        super(TicTacToeAgent, self).__init__()
        # Сверточные слои для обработки состояния в виде матрицы
        self.conv1 = nn.Conv2d(1, 4, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(4)
        self.conv2 = nn.Conv2d(4, 8, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(8)
        self.conv3 = nn.Conv2d(8, 16, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(16)
        # Финальный сверточный слой для получения выходного канала с вероятностями
        self.conv4 = nn.Conv2d(16, 1, kernel_size=3, padding=1)
        self.dropout = nn.Dropout(p=0.1)
        self.player = 1

        self.start_epsilon = 1.
        self.end_epsilon = 0.01
        

    def forward(self, x):

        mask = (x != 0).float()
        
        x = x * self.player

        # Первый сверточный блок
        x = F.leaky_relu(self.bn1(self.conv1(x)), negative_slope=0.1)
        x = self.dropout(x)
        
        # Второй сверточный блок
        x = F.leaky_relu(self.bn2(self.conv2(x)), negative_slope=0.1)
        x = self.dropout(x)
        
        # Третий сверточный блок
        x = F.leaky_relu(self.bn3(self.conv3(x)), negative_slope=0.1)
        x = self.dropout(x)
        
        # Выходной слой
        x = self.conv4(x)

        # Применяем маску, обнуляя выходные значения на занятых клетках
        x = x.masked_fill(mask == 1, float('-inf'))
        
        # Применяем softmax для получения вероятностей
        batch_size, _, height, width = x.shape
        x = x.view(batch_size, -1)  # Преобразуем в [batch_size, num_fields]
        x = F.softmax(x, dim=1)     # Применяем softmax по полям
        x = x.view(batch_size, height, width)  # Восстанавливаем матрицу вероятностей
        
        return x  # Выходная матрица вероятностей
    def select_move(self, probabilities):
        # Преобразуем вероятности в одномерный вид
        batch_size, height, width = probabilities.shape
        probabilities = probabilities.view(batch_size, -1)  # [batch_size, 9]
        
        # Для каждого элемента в batch выбираем ход на основе вероятностей
        distribution = Categorical(probabilities)
        move_index = distribution.sample()  # Индекс выбранной ячейки
        
        # Преобразуем индекс в координаты строки и столбца
        row, col = divmod(move_index.item(), width)
        
        return row, col


In [3]:
# Пример использования
agent = TicTacToeAgent()
input_tensor = torch.tensor([[[[-1., -1., 0.], [1., 1., 0.], [1., 1., 0.]]]])
# input_tensor = torch.zeros(1, 1, 3, 3)  # Входная матрица 3x3
output = agent(input_tensor)
print(output)
print(output.shape)  # Ожидаемый результат: [1, 3, 3] для 3x3 входа
print(agent.select_move(output))

tensor([[[0.0000, 0.0000, 0.2887],
         [0.0000, 0.0000, 0.3804],
         [0.0000, 0.0000, 0.3309]]], grad_fn=<ViewBackward0>)
torch.Size([1, 3, 3])
(2, 2)


In [22]:
class TicTacToeEnv:
    def __init__(self, size = 3, device = 'cuda'):
        self.size = size
        self.board = torch.zeros(self.size, self.size, dtype=torch.float32).to(device=device)
        self.device = device


    def reset(self):
        self.board = torch.zeros(self.size, self.size, dtype=torch.float32).to(device=self.device)
        self.current_player = 1
        return self.board.unsqueeze(0).unsqueeze(0)  # [1, 1, 3, 3]
    
    def get_board(self):
        return self.board

    def get_valid_actions(self):
        valid_actions = (self.board == 0).nonzero(as_tuple=True)  # Получаем индексы доступных клеток
        return list(zip(valid_actions[0].tolist(), valid_actions[1].tolist()))


    def step(self, action, player):
        row, col = action
        if self.board[row, col] != 0:
            raise ValueError("Invalid move")
        
        self.board[row, col] = player
        reward, done = self.check_winner33()
        return self.board.unsqueeze(0).unsqueeze(0), reward, done

    def check_winner33(self):
        for i in range(3):
            if abs(self.board[i, :].sum()) == 3 or abs(self.board[:, i].sum()) == 3:
                return 1 * self.current_player, True
        if abs(self.board.trace()) == 3 or abs(torch.fliplr(self.board).trace()) == 3:
            return 1 * self.current_player, True
        if (self.board == 0).sum() == 0:
            return 0, True
        return 0, False

In [5]:
def check_victory(board):
    """
    Проверяет, есть ли победа для одного из игроков на доске.
    
    Аргументы:
        board (torch.Tensor): тензор размером (1, 1, N, N) с элементами -1, 0, 1.
        
    Возвращает:
        int: 1, если победа игрока 1; -1, если победа игрока -1; 0, если победителей нет.
    """
    board = board.squeeze()  # Удаляем размерности (1, 1, N, N) -> (N, N)
    N = board.size(0)

    # Проверяем строки и столбцы на наличие тройки подряд
    for i in range(N):
        for j in range(N - 2):  # Проходим по всем возможным начальным индексам тройки
            # Проверка строки
            if board[i, j] == board[i, j + 1] == board[i, j + 2] and board[i, j] != 0:
                return board[i, j]
            # Проверка столбца
            if board[j, i] == board[j + 1, i] == board[j + 2, i] and board[j, i] != 0:
                return board[j, i]
    
    # Проверяем диагонали на наличие тройки подряд
    for i in range(N - 2):
        for j in range(N - 2):
            # Проверка главной диагонали
            if board[i, j] == board[i + 1, j + 1] == board[i + 2, j + 2] and board[i, j] != 0:
                return board[i, j]
            # Проверка побочной диагонали
            if board[i, j + 2] == board[i + 1, j + 1] == board[i + 2, j] and board[i, j + 2] != 0:
                return board[i, j + 2]
    
    # Если нет победителя
    return 0

# Пример использования
board = torch.tensor([[[[0, 1, -1],
                        [-1, -1, 1],
                        [-1, 0, 0]
                        ]]])  # Размер (1, 1, 4, 4)

result = check_victory(board)
if result == 1:
    print("Победа игрока 1")
elif result == -1:
    print("Победа игрока -1")
else:
    print("Ничья")

Победа игрока -1


In [6]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Пример использования с вашей моделью

print(f"Количество обучаемых параметров: {count_parameters(agent)}")

Количество обучаемых параметров: 1705


In [59]:
import random


def agent_eval(agent, env, num_games=100):
    """
    Оценка процента побед агента:
    - Против случайного игрока.
    :param agent: Агент, который делает предсказания ходов.
    :param env: Среда игры.
    :param num_games: Количество игр для оценки.
    :return: Словарь с процентами побед.
    """
    def play_game(agent):
        """Запуск одной игры между двумя агентами."""
        env.reset()
        done = False
        player = 1

        while not done:
            state = env.board.unsqueeze(0).unsqueeze(0) * player

            if player == 1:
                probabilities = agent(state)
                row, col = agent.select_move(probabilities)
            else:
                # Если agent2 — случайный игрок, выбираем случайный ход
                valid_actions = env.get_valid_actions()
                if valid_actions:
                    row, col = random.choice(valid_actions)
                else: return 0

            # Применяем ход
            _, done, info = env.step((row, col), player)
            player *= -1

        return info  # 1 = победа первого игрока, -1 = победа второго игрока, 0 = ничья

    def evaluate(agent):
        """Оценка процента побед первого агента."""
        results = {"agent1_wins": 0, "agent2_wins": 0, "draws": 0}

        for _ in range(num_games):
            winner = play_game(agent)
            if winner == 1:
                results["agent1_wins"] += 1
            elif winner == -1:
                results["agent2_wins"] += 1
            else:
                results["draws"] += 1

        total_games = num_games
        results["agent1_win_rate"] = results["agent1_wins"] / total_games * 100
        results["agent2_win_rate"] = results["agent2_wins"] / total_games * 100
        results["draw_rate"] = results["draws"] / total_games * 100
        return results


    # Оценка против случайного игрока
    print("Evaluating against random player...")
    random_play_results = evaluate(agent)

    return {
        "random_play": random_play_results
    }

In [None]:
## 1 версия

agent = TicTacToeAgent().cuda()
optimizer = optim.Adam(agent.parameters(), lr=0.001)
env = TicTacToeEnv()

num_episodes = 20000
gamma = 0.99  # Коэффициент дисконтирования

eps_start = 0.5
eps_min = 0.01
decay = 0.99

vals = []

for episode in range(num_episodes):
    state = env.reset()
    log_probs = []
    rewards = []
    done = False

    while not done:
        # Получение вероятностей для действий
        probabilities = agent(state)
        row, col = agent.select_move(probabilities)

        # Выполнение действия в среде
        next_state, reward, done = env.step((row, col), agent.player)

        # Сохранение логарифма вероятности выбранного действия
        log_prob = torch.log(probabilities[0, row, col])
        log_probs.append(log_prob)

        # Сохранение вознаграждения
        rewards.append(reward)
        
        agent.player *=-1
        state = next_state
    # Вычисление возврата (return) для каждого шага
    returns = []
    G = 0
    player = 1
    for r in reversed(rewards):
        G = r + gamma * G
        G = G * player
        player *=-1
        returns.insert(0, G)
    returns = torch.tensor(returns)

    # Нормализация
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    # Вычисление функции потерь
    loss = torch.stack([-log_prob * G for log_prob, G in zip(log_probs, returns)]).sum()

    # Обновление параметров модели
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if episode % 1000 == 0:
        eval = agent_eval(agent=agent, env=env)
        print(f"{episode=}, {eval}")
        vals.append(eval)

Evaluating against random player...
episode=0, {'random_play': {'agent1_wins': 91, 'agent2_wins': 0, 'draws': 9, 'agent1_win_rate': 91.0, 'agent2_win_rate': 0.0, 'draw_rate': 9.0}}
Evaluating against random player...
episode=1000, {'random_play': {'agent1_wins': 95, 'agent2_wins': 0, 'draws': 5, 'agent1_win_rate': 95.0, 'agent2_win_rate': 0.0, 'draw_rate': 5.0}}
Evaluating against random player...
episode=2000, {'random_play': {'agent1_wins': 98, 'agent2_wins': 0, 'draws': 2, 'agent1_win_rate': 98.0, 'agent2_win_rate': 0.0, 'draw_rate': 2.0}}
Evaluating against random player...
episode=3000, {'random_play': {'agent1_wins': 89, 'agent2_wins': 0, 'draws': 11, 'agent1_win_rate': 89.0, 'agent2_win_rate': 0.0, 'draw_rate': 11.0}}
Evaluating against random player...
episode=4000, {'random_play': {'agent1_wins': 94, 'agent2_wins': 0, 'draws': 6, 'agent1_win_rate': 94.0, 'agent2_win_rate': 0.0, 'draw_rate': 6.0}}
Evaluating against random player...
episode=5000, {'random_play': {'agent1_wins':

KeyboardInterrupt: 

In [71]:
## 2 версия

agent = TicTacToeAgent().cuda()
optimizer = optim.AdamW(agent.parameters(), lr=0.0001)
env = TicTacToeEnv()

num_episodes = 1000
gamma = 0.99  # Коэффициент дисконтирования
eps_start = 0.5
eps_min = 0.01
decay = 0.99

for episode in range(num_episodes):
    state = env.reset()
    log_probs_player1 = []
    log_probs_player2 = []
    rewards_player1 = []
    rewards_player2 = []

    done = False
    agent.player = 1  # Начинаем с игрока 1

    while not done:
        # Получение вероятностей для текущего игрока
          # Задаем игрока (1 или -1)
        probabilities = agent(state)
        row, col = agent.select_move(probabilities)

        # Выполнение действия в среде
        next_state, reward, done = env.step((row, col), agent.player)

        # Сохранение логарифма вероятности действия и награды
        log_prob = torch.log(probabilities[0, row, col])
        if agent.player == 1:
            log_probs_player1.append(log_prob)
            rewards_player1.append(reward)
        else:
            log_probs_player2.append(log_prob)
            rewards_player2.append(-reward)  # Инверсия награды для второго игрока

        state = next_state
        agent.player *= -1  # Смена игрока
    # current_player *= -1
    
    print(f"{rewards_player1=}")
    print(f"{rewards_player2=}")
    # Вычисление возвратов отдельно для каждого игрока
    def compute_returns(rewards, gamma):
        returns = []
        G = 0
        for r in reversed(rewards):
            G = r + gamma * G
            returns.insert(0, G)
        return torch.tensor(returns)

    returns_player1 = compute_returns(rewards_player1, gamma)
    returns_player2 = compute_returns(rewards_player2, gamma)

    # Нормализация возвратов
    returns_player1 = (returns_player1 - returns_player1.mean()) / (returns_player1.std() + 1e-8)
    returns_player2 = (returns_player2 - returns_player2.mean()) / (returns_player2.std() + 1e-8)

    # Вычисление потерь отдельно для каждого игрока
    loss_player1 = torch.stack([-log_prob * G for log_prob, G in zip(log_probs_player1, returns_player1)]).sum()
    loss_player2 = torch.stack([-log_prob * G for log_prob, G in zip(log_probs_player2, returns_player2)]).sum()

    # Итоговая функция потерь
    total_loss = loss_player1 + loss_player2

    # Обновление параметров модели
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    # if episode % 100 == 0:
    #     print(f"Episode {episode}, Total Loss: {total_loss.item()}")

rewards_player1=[0, 0, 0, 0]
rewards_player2=[0, 0, 0, -1]
rewards_player1=[0, 0, 0, 0, 0]
rewards_player2=[0, 0, 0, 0]
rewards_player1=[0, 0, 0, 1]
rewards_player2=[0, 0, 0]
rewards_player1=[0, 0, 0, 0, 0]
rewards_player2=[0, 0, 0, 0]
rewards_player1=[0, 0, 0, 1]
rewards_player2=[0, 0, 0]
rewards_player1=[0, 0, 0, 0, 1]
rewards_player2=[0, 0, 0, 0]
rewards_player1=[0, 0, 0, 0, 1]
rewards_player2=[0, 0, 0, 0]
rewards_player1=[0, 0, 0, 0, 0]
rewards_player2=[0, 0, 0, 0]
rewards_player1=[0, 0, 0, 1]
rewards_player2=[0, 0, 0]
rewards_player1=[0, 0, 0, 1]
rewards_player2=[0, 0, 0]
rewards_player1=[0, 0, 0, 0]
rewards_player2=[0, 0, 0, -1]
rewards_player1=[0, 0, 0, 0]
rewards_player2=[0, 0, 0, -1]
rewards_player1=[0, 0, 0, 1]
rewards_player2=[0, 0, 0]
rewards_player1=[0, 0, 0, 1]
rewards_player2=[0, 0, 0]
rewards_player1=[0, 0, 0, 0, 1]
rewards_player2=[0, 0, 0, 0]
rewards_player1=[0, 0, 0, 0, 1]
rewards_player2=[0, 0, 0, 0]
rewards_player1=[0, 0, 0, 0]
rewards_player2=[0, 0, 0, -1]
rewards

In [73]:
agent.player = 1
input_tensor = torch.tensor([[[[0.,  -1,   -1.], 
                               [0., 0.,  0.], 
                               [0.,  1.,  1.]]]]).cuda()
# input_tensor = torch.zeros(1, 1, 3, 3)  # Входная матрица 3x3
agent.eval()
output = agent(input_tensor)
print(agent.select_move(output))
output

(1, 1)


tensor([[[0.3011, 0.0000, 0.0000],
         [0.2840, 0.1197, 0.1309],
         [0.1643, 0.0000, 0.0000]]], device='cuda:0', grad_fn=<ViewBackward0>)

In [116]:
agent.player

1

In [56]:
env = TicTacToeEnv()
env.reset()
env.get_valid_actions()
# Оценка агента
results = agent_eval(agent, env)
print(results)

Evaluating against random player...
{'random_play': {'agent1_wins': 93, 'agent2_wins': 0, 'draws': 7, 'agent1_win_rate': 93.0, 'agent2_win_rate': 0.0, 'draw_rate': 7.000000000000001}}
