In [1]:
import random
import chess
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import os
# Create the chess environment (a python-chess board, reused across all episodes)
env = chess.Board()
print(env)

# Define the state size and action size for the DQN
state_size = 64  # One network input per square of the flattened 8x8 board
# NOTE: the action space is fixed to the number of legal moves available in the
# position the board is in right now (the freshly created board). The training
# loop interprets an action as an index into the current legal-move list.
action_size = len(list(env.legal_moves))

# Hyperparameters
batch_size = 32  # Number of transitions sampled from replay memory per training call
n_episodes = 300 #1000
output_dir = "model_output/chess/"  # Directory where model weights are written

# exist_ok=True replaces the check-then-create pattern: it is a no-op when the
# directory already exists and cannot fail if it appears between check and create.
os.makedirs(output_dir, exist_ok=True)

# Define the DQNAgent class
class DQNAgent:
    """Deep Q-Network agent with replay memory and an epsilon-greedy policy.

    The agent owns a small fully connected Keras model that maps a state
    vector of length ``state_size`` to ``action_size`` Q-values.
    """

    def __init__(self, state_size, action_size):
        """Store the problem dimensions, set hyperparameters and build the model.

        Args:
            state_size: Length of the state vector fed to the network.
            action_size: Number of discrete actions (network outputs).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # Replay buffer; oldest transitions are dropped first
        self.gamma = 0.95  # Discount factor for future rewards
        self.epsilon = 1.0  # Exploration rate, starting from fully exploring
        self.epsilon_decay = 0.995  # Multiplicative decay applied after each training call
        self.epsilon_min = 0.01  # Decay stops once epsilon is no longer above this value
        self.learning_rate = 0.001  # Learning rate for the Adam optimizer
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network (two ReLU layers of 32 units, linear output)."""
        model = Sequential()
        model.add(Dense(32, activation="relu", input_dim=self.state_size))
        model.add(Dense(32, activation="relu"))
        # Linear output: one unbounded Q-value estimate per action.
        model.add(Dense(self.action_size, activation="linear"))
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        model.compile(loss="mse", optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition to memory."""
        self.memory.append((state, action, reward, next_state, done))

    def train(self, batch_size):
        """Fit the model on a random minibatch of remembered transitions.

        Does nothing while fewer than ``batch_size`` transitions are stored.
        After a training pass, epsilon is decayed once if it is still above
        ``epsilon_min``.

        Args:
            batch_size: Number of transitions to sample from memory.
        """
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                # Bellman target: immediate reward plus discounted best next Q-value.
                next_q_values = self.model.predict(next_state, verbose=0)[0]
                target = reward + self.gamma * np.amax(next_q_values)
            # Only the taken action's Q-value is moved toward the target; the
            # other outputs keep their current predictions (zero error).
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def act(self, state):
        """Pick an action index with epsilon-greedy exploration.

        Args:
            state: Model input for the current state (a single-row batch).

        Returns:
            An int in ``range(action_size)``: random with probability
            ``epsilon``, otherwise the index of the highest predicted Q-value.
        """
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        # Cast so both branches return a plain Python int.
        return int(np.argmax(act_values[0]))

    def save(self, name):
        """Write the model weights to ``name`` inside the module-level ``output_dir``."""
        self.model.save_weights(os.path.join(output_dir, name))

# Function to convert the board to the model's input format
def board_to_input(board):
    """Encode *board* as a flat array of signed piece codes.

    Each occupied square contributes one integer: the uppercase symbols
    P, N, B, R, Q, K map to 1..6 and the lowercase symbols to -1..-6.
    Empty squares (and any unrecognised symbol) are 0.

    The row order is flipped relative to the square number, so squares
    56..63 fill the first eight entries and squares 0..7 fill the last eight.

    Args:
        board: Object exposing ``piece_map()`` that returns a mapping from
            square number to a piece with a ``symbol()`` method.

    Returns:
        A 1D NumPy array of length ``state_size``.
    """
    symbol_codes = {
        'P': 1, 'N': 2, 'B': 3, 'R': 4, 'Q': 5, 'K': 6,
        'p': -1, 'n': -2, 'b': -3, 'r': -4, 'q': -5, 'k': -6,
    }
    encoded = [0] * state_size

    for square, piece in board.piece_map().items():
        row, column = divmod(square, 8)
        encoded[8 * (7 - row) + column] = symbol_codes.get(piece.symbol(), 0)

    return np.array(encoded)

# Reward function
def reward_function(board):
    """Return the reward for the position reached after a move.

    Every way the game can end without checkmate is scored as a draw. This
    includes fivefold repetition, which ``is_game_over()`` reports but which
    would otherwise be charged the per-move penalty.

    Args:
        board: Board exposing ``is_checkmate()`` and ``is_game_over()``
            (a ``chess.Board`` in this script).

    Returns:
        1.0 if the position is checkmate, 0.0 if the game is over for any
        other reason, and -0.1 otherwise (a small per-move penalty to
        encourage faster wins).
    """
    if board.is_checkmate():
        return 1.0  # Reward for winning
    if board.is_game_over():
        return 0.0  # Reward for draw
    return -0.1  # Small penalty for each move to encourage faster wins

# Initialize the agent
agent = DQNAgent(state_size, action_size)

# Training loop in environment: the agent plays both sides of each game,
# stores every transition, and trains once per finished game.
for episode in range(n_episodes):
    env.reset()  # Reset the board for a new episode
    done = False
    total_reward = 0

    while not done:
        # Encode the current position as a single-row batch for the network
        state = board_to_input(env).reshape(1, state_size)

        # Ask the agent for an action index for the current state
        action = agent.act(state)

        # The game is not over here, so there is at least one legal move
        legal_moves = list(env.legal_moves)

        if action >= len(legal_moves):
            # The chosen index has no matching legal move in this position.
            # Fall back to a random legal move and record the index actually
            # played, so the stored transition credits the right action.
            action = random.randrange(len(legal_moves))
        move = legal_moves[action]

        env.push(move)  # Apply the selected move to the board

        # Encode the position reached after the move
        next_state = board_to_input(env).reshape(1, state_size)

        reward = reward_function(env)  # Reward for the position just reached

        # Check if the game is over after the move
        done = env.is_game_over()

        # Remember the experience in the agent's memory buffer
        agent.remember(state, action, reward, next_state, done)
        total_reward += reward

    agent.train(batch_size)  # Train the agent at the end of each episode

    # Print episode results
    print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Epsilon: {agent.epsilon}")

    if (episode + 1) % 100 == 0:
        agent.save(f"chess_dqn_episode_{episode + 1}.h5")  # Save the model every 100 episodes

agent.save("final_chess_dqn_model.h5")  # Save the final trained model
        #print(next_state)

        # Calculate the reward and check if the game is done
        # You may need to implement your own logic to calculate the reward
        # and check if the game is done based on the board state.


        # reward = 0
        # def calculate_reward(board, action):
        #     # Check if the action results in a capture
        #     if board.is_capture(action):
        #         return 1  # Positive reward for capturing an opponent's piece
        #     elif board.is_checkmate():
        #         return 10  # Higher reward for checkmate, indicating a winning move
        #     elif board.is_stalemate() or board.is_insufficient_material():
        #         return -5  # Negative reward for stalemate or insufficient material, indicating a draw or unclear outcome
        #     else:
        #         return 0  # No reward for other moves  # add reward function here
        

        # # Function to check if the game is done (termination condition)
        # def is_game_done(board):
        #     return board.is_game_over()

        # # Remember the experience in the agent's replay memory
        # agent.remember(state, action, reward, next_state, done)

        # # Update the current state
        # state = next_state

2024-09-10 15:45:18.233834: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


r n b q k b n r
p p p p p p p p
. . . . . . . .
. . . . . . . .
. . . . . . . .
. . . . . . . .
P P P P P P P P
R N B Q K B N R
Episode: 1, Total Reward: -33.1000000000002, Epsilon: 0.995
Episode: 2, Total Reward: -27.000000000000114, Epsilon: 0.990025
Episode: 3, Total Reward: -33.2000000000002, Epsilon: 0.985074875
Episode: 4, Total Reward: -6.799999999999988, Epsilon: 0.9801495006250001
Episode: 5, Total Reward: -60.50000000000059, Epsilon: 0.9752487531218751
