In [26]:
from collections import deque
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import time
from random import sample, choice
from math import exp, log
import random
from random import choice
from math import exp, log
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random

#### Game

In [27]:
class ConnectFour:
    def __init__(self):
        # Initialize game parameters
        self.row_count = 6
        self.column_count = 7
        self.action_size = self.column_count
        self.in_a_row = 4
        
    def __repr__(self):
        return "ConnectFour"
        
    def get_initial_state(self):
        # Return an empty grid representing the initial state of the game
        return np.zeros((self.row_count, self.column_count))
    
    def get_next_state(self, state, action, player):
        # Update the state by placing the player's token in the specified column
        row = np.max(np.where(state[:, action] == 0))  # Find the lowest empty row in the selected column
        state[row, action] = player
        return state
    
    def get_valid_moves(self, state):
        if len(state.shape) == 3:
            # If the input is a batch of states, return valid moves for each state in the batch
            return (state[:, 0] == 0).astype(np.uint8)
        else:
            # Return a binary array indicating valid moves (columns) where tokens can be placed
            return (state[0] == 0).astype(np.uint8)
    
    def check_win(self, state, action):
        if action == None:
            return False
        
        # Get the row, column, and player corresponding to the last action
        row = np.min(np.where(state[:, action] != 0))
        column = action
        player = state[row][column]

        def count(offset_row, offset_column):
            # Helper function to count the number of consecutive tokens in a given direction
            for i in range(1, self.in_a_row):
                r = row + offset_row * i
                c = column + offset_column * i
                if (
                    r < 0 
                    or r >= self.row_count
                    or c < 0 
                    or c >= self.column_count
                    or state[r][c] != player
                ):
                    return i - 1
            return self.in_a_row - 1

        # Check for win conditions in vertical, horizontal, and diagonal directions
        return (
            count(1, 0) >= self.in_a_row - 1  # vertical
            or (count(0, 1) + count(0, -1)) >= self.in_a_row - 1  # horizontal
            or (count(1, 1) + count(-1, -1)) >= self.in_a_row - 1  # top left diagonal
            or (count(1, -1) + count(-1, 1)) >= self.in_a_row - 1  # top right diagonal
        )
    
    def get_value_and_terminated(self, state, action):
        # Check if the last action resulted in a win for the player
        if self.check_win(state, action):
            return 1, True
        # Check if the game ended in a draw
        if np.sum(self.get_valid_moves(state)) == 0:
            return 0, True
        # If the game is not terminated, return a value of 0
        return 0, False
    
    def get_opponent(self, player):
        # Return the opponent of a given player
        return -player
    
    def get_opponent_value(self, value):
        # Return the opponent's value given the player's value
        return -value
    
    def change_perspective(self, state, player):
        # Change the perspective of the state to the given player
        return state * player
    
    def get_encoded_state(self, state):
        # Encode the state in a suitable format for training a machine learning model
        encoded_state = np.stack(
            (state == -1, state == 0, state == 1)
        ).astype(np.float32)
        
        if len(state.shape) == 3:
            encoded_state = np.swapaxes(encoded_state, 0, 1)
        
        return encoded_state
    
# class TicTacToe:
#     def __init__(self):
#         self.row_count = 3
#         self.column_count = 3
#         self.action_size = self.row_count * self.column_count
        
#     def __repr__(self):
#         return "TicTacToe"
        
#     def get_initial_state(self):
#         return np.zeros((self.row_count, self.column_count))
    
#     def get_next_state(self, state, action, player):
#         row = action // self.column_count
#         column = action % self.column_count
#         state[row, column] = player
#         return state
    
#     def get_valid_moves(self, state):
#         if len(state.shape) == 3:
#             return (state.reshape(-1, 9) == 0).astype(np.uint8)
#         return (state.reshape(9) == 0).astype(np.uint8)
    
#     def check_win(self, state, action):
#         if action == None:
#             return False
        
#         row = action // self.column_count
#         column = action % self.column_count
#         player = state[row, column]
        
#         return (
#             np.sum(state[row, :]) == player * self.column_count
#             or np.sum(state[:, column]) == player * self.row_count
#             or np.sum(np.diag(state)) == player * self.row_count
#             or np.sum(np.diag(np.flip(state, axis=0))) == player * self.row_count
#         )
    
#     def get_value_and_terminated(self, state, action):
#         if self.check_win(state, action):
#             return 1, True
#         if np.sum(self.get_valid_moves(state)) == 0:
#             return 0, True
#         return 0, False
    
#     def get_opponent(self, player):
#         return -player
    
#     def get_opponent_value(self, value):
#         return -value
    
#     def change_perspective(self, state, player):
#         return state * player
    
#     def get_encoded_state(self, state):
#         encoded_state = np.stack(
#             (state == -1, state == 0, state == 1)
#         ).astype(np.float32)
        
#         if len(state.shape) == 3:
#             encoded_state = np.swapaxes(encoded_state, 0, 1)
        
#         return encoded_state

#### Random Agent

In [28]:
class RandomPlayer:
      def __init__(self, game, args):
            self.game = game
            self.args = args

      def selfPlay(self):
            np.random.seed(self.args['random_state'])
            memory = []
            player = 1
            state = self.game.get_initial_state()

            while True:
                  neutral_state = self.game.change_perspective(state, player)

                  memory.append((neutral_state, None, player))
                  action = np.random.choice(self.game.action_size, )

                  state = self.game.get_next_state(state, action, player)

                  time.sleep(0.5)
                  print(
                        f"Player {player} played action {action} and the state is now:"
                  )
                  print(state)
                  print()

                  value, is_terminal = self.game.get_value_and_terminated(state, action)

                  if is_terminal:
                        returnMemory = []
                        for hist_neutral_state, hist_action_probs, hist_player in memory:
                              hist_outcome = value if hist_player == player else self.game.get_opponent_value(value)
                              returnMemory.append((
                                    self.game.get_encoded_state(hist_neutral_state),
                                    hist_action_probs,
                                    hist_outcome
                              ))
                        return returnMemory

                  player = self.game.get_opponent(player)

In [29]:
# from games import ConnectFour
# from agents.RandomPlayer import RandomPlayer

# if __name__ == "__main__":
#       game = ConnectFour()
#       args = {
#             'random_state': 42
#             }
#       player = RandomPlayer(game, args)
#       player.selfPlay()



#### DeepQ Agent

In [30]:
# class DQN(nn.Module):
#     def __init__(self, state_size, action_size):
#         super(DQN, self).__init__()
#         self.fc1 = nn.Linear(state_size, 20)
#         self.fc2 = nn.Linear(20, 50)
#         self.fc3 = nn.Linear(50, action_size)
    
#     def forward(self, x):
#         x = torch.relu(self.fc1(x))
#         x = torch.relu(self.fc2(x))
#         return self.fc3(x)



In [31]:
# class DQNAgent:
#     def __init__(self, state_size, action_size, episodes):
#         self.state_size = state_size
#         self.action_size = action_size
#         self.memory = deque(maxlen=500)
#         self.gamma = 0.9   # discount rate
#         self.epsilon = 0.10  # initial exploration rate
#         self.epsilon_min = 0.01
#         self.epsilon_decay = exp((log(self.epsilon_min) - log(self.epsilon))/(0.8*episodes)) # reaches epsilon_min after 80% of iterations
#         self.model = DQN(self.state_size, self.action_size)
#         self.criterion = nn.MSELoss()
#         self.optimizer = optim.Adam(self.model.parameters(), lr=0.00001)
    
#     def memorize(self, state, action, reward, next_state, done):
#         self.memory.append((state, action, reward, next_state, done))
    
#     def act(self, state):
#         if np.random.rand() <= self.epsilon: # Exploration
#             return choice([c for c in range(self.action_size) if state[:,c] == 0])
#         state = torch.from_numpy(state).float().unsqueeze(0)
#         with torch.no_grad():
#             act_values = self.model(state)
#         action = torch.argmax(act_values).item()
#         return action
    
#     def replay(self, batch_size):
#         minibatch = random.sample(self.memory, batch_size)
#         for state, action, reward, next_state, done in minibatch:
#             state = torch.from_numpy(state).float().unsqueeze(0)
#             next_state = torch.from_numpy(next_state).float().unsqueeze(0)
#             reward = torch.tensor([reward]).float()
#             action = torch.tensor([action])
#             done = torch.tensor([done]).float()

#             if not done:
#                 target = reward + self.gamma * torch.max(self.model(next_state)).item()
#             else:
#                 target = reward
#             current = self.model(state)[0][action].item()
#             loss = self.criterion(target, current)
#             self.optimizer.zero_grad()
#             loss.backward()
#             self.optimizer.step()
            
#         if self.epsilon > self.epsilon_min:
#             self.epsilon *= self.epsilon_decay
    
#     def load(self, name):
#         self.model.load_state_dict(torch.load(name))
    
#     def save(self, name):
#         torch.save(self.model.state_dict(), name)

# Deep Q-learning Agent
class DQNAgent:

    def __init__(self, state_size, action_size, episodes, environment):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=500)
        self.gamma = 0.9   # discount rate
        self.epsilon = 0.10  # initial exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = exp((log(self.epsilon_min) - log(self.epsilon))/(0.8*episodes)) # reaches epsilon_min after 80% of iterations
        self.model = self._build_model()
        self.env = environment  # ConnectFour or TicTacToe environment

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(20, input_dim=self.state_size, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr = 0.00001))
        return model

    def memorize(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        valid_moves = self.env.get_valid_moves(state)  # Get valid moves for current state
        if np.random.rand() <= self.epsilon: # Exploration
            return choice([c for c in np.where(valid_moves == 1)[0]])
        act_values = self.model.predict(state) # Exploitation
        # Remove invalid actions by setting their value to negative infinity
        act_values[0][np.where(valid_moves == 0)[0]] = -np.inf
        action = np.argmax(act_values[0])
        return action

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)
    
    def save(self, name):
        self.model.save_weights(name)

#### Training

In [32]:
# # initialize gym environment and the agent
# env = ConnectFour()
# state_size = env.row_count * env.column_count * 3
# action_size = env.action_size
# episodes = 40000
# agent = DQNAgent(state_size, action_size, episodes)
# # agent.load("./connectX-weights_deep.pth") # commented out
# batch_size = 40

# # Monitoring devices
# all_total_rewards = np.empty(episodes)
# all_avg_rewards = np.empty(episodes)

# # Iterate the game
# for e in range(episodes):
#     # reset state in the beginning of each game
#     done = False
#     state = env.get_initial_state()
#     total_rewards = 0
#     while not done:
#         # Decide action
#         action = int(agent.act(state))
#         next_state = env.get_next_state(state, action, 1)
#         value, done = env.get_value_and_terminated(next_state, action)
#         reward = value if done else 0.0
        
#         # invalid move: hard penalization
#         if env.get_valid_moves(state)[action] == 0:
#             reward = -10
#         agent.memorize(state, action, reward, next_state, done)

#         # make next_state the new current state for the next frame.
#         state = next_state
#         total_rewards += reward

#     # experience replay
#     if len(agent.memory) > batch_size:
#         agent.replay(batch_size)
        
#     all_total_rewards[e] = total_rewards
#     avg_reward = all_total_rewards[max(0, e - 100):e].mean()
#     all_avg_rewards[e] = avg_reward
#     if e % 100 == 0 :
#         agent.save("./connectX-weights_deep.pth")
#         print("episode: {}/{}, epsilon: {:.2f}, average: {:.2f}".format(e, episodes, agent.epsilon, avg_reward))

# initialize gym environment and the agent
env = ConnectFour()
state_size = env.row_count * env.column_count * 3  # state representation is 3 channels of the board
action_size = env.column_count
episodes = 40000
agent = DQNAgent(state_size, action_size, episodes, env)
# agent.load("./connectX-weights_deep.h5") # load prelearned weights
batch_size = 40 # Don't know if this number makes sense

# Monitoring devices
all_total_rewards = np.empty(episodes)
all_avg_rewards = np.empty(episodes)

# Iterate the game
for e in range(episodes):
    # reset state in the beginning of each game
    state = env.get_initial_state()
    state = env.get_encoded_state(state)
    state = np.expand_dims(state, axis=0)  # add a dimension for batch compatibility
    total_rewards = 0
    player = 1
    done = False
    while not done:
        # Decide action
        action = agent.act(state)
        next_state = env.get_next_state(np.copy(state), action, player)
        reward, done = env.get_value_and_terminated(next_state, action)
        if env.get_valid_moves(state)[action] == 0:  # invalid move
            reward = -10
        agent.memorize(state, action, reward, next_state, done)
        # make next_state the new current state for the next frame.
        state = next_state
        player = env.get_opponent(player)  # switch player
        total_rewards += reward
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)
        all_total_rewards[e] = total_rewards
        avg_reward = all_total_rewards[max(0, e - 100):e].mean()
        all_avg_rewards[e] = avg_reward
        if e % 100 == 0 :
            agent.save("./connectX-weights_deep.h5")
            print("episode: {}/{}, epsilon: {:.2f}, average: {:.2f}".format(e, episodes, agent.epsilon, avg_reward))




ValueError: in user code:

    File "c:\Users\Felix\.virtualenvs\diy_alphazero-3BngPyJJ\lib\site-packages\keras\src\engine\training.py", line 2341, in predict_function  *
        return step_function(self, iterator)
    File "c:\Users\Felix\.virtualenvs\diy_alphazero-3BngPyJJ\lib\site-packages\keras\src\engine\training.py", line 2327, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Felix\.virtualenvs\diy_alphazero-3BngPyJJ\lib\site-packages\keras\src\engine\training.py", line 2315, in run_step  **
        outputs = model.predict_step(data)
    File "c:\Users\Felix\.virtualenvs\diy_alphazero-3BngPyJJ\lib\site-packages\keras\src\engine\training.py", line 2283, in predict_step
        return self(x, training=False)
    File "c:\Users\Felix\.virtualenvs\diy_alphazero-3BngPyJJ\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\Felix\.virtualenvs\diy_alphazero-3BngPyJJ\lib\site-packages\keras\src\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_2" is incompatible with the layer: expected shape=(None, 126), found shape=(None, 3, 6, 7)
