In [69]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from scipy.signal import convolve2d
import matplotlib.pyplot as plt
from enum import Enum

In [61]:
def visualize(board):
    """Render ``board`` with matplotlib as a blue panel holding one disc per cell.

    A cell equal to 0 is drawn white, a cell equal to 1 red, and any other
    value yellow. Row 0 is drawn at the top and each following row one unit
    lower. The figure is shown immediately via ``plt.show()``.

    Args:
        board: 2-D sequence (list of rows or array) of cell values.
    """
    def disc_colour(cell):
        # Same three-way mapping for every cell: 0 -> white, 1 -> red, rest -> yellow.
        if cell == 0:
            return 'white'
        if cell == 1:
            return 'red'
        return 'yellow'

    ax = plt.axes()

    # Backdrop spanning the whole grid; its lower-left corner sits half a cell
    # outside the bottom-left disc centre.
    n_rows = len(board)
    backdrop = plt.Rectangle((-0.5, 0.5 - n_rows), len(board[0]), n_rows, fc='blue')

    # One disc per cell, centred on (column, -row).
    discs = [
        plt.Circle((col_idx, -row_idx), 0.4, fc=disc_colour(cell))
        for row_idx, row in enumerate(board)
        for col_idx, cell in enumerate(row)
    ]

    # The backdrop goes in first so the discs are drawn on top of it.
    ax.add_patch(backdrop)
    for disc in discs:
        ax.add_patch(disc)

    plt.axis('scaled')
    plt.show()

## Player Classes

### Base Player class
This class is meant to be subclassed, not used directly: treat it as an abstract (virtual) base class whose methods every concrete player must override.

In [91]:
class BasePlayer:
    """Common interface for all players; subclass it and override every hook.

    Attributes:
        player_symbol: Marker this player places on the board.
        num_rows: Board height; starts at 0 and is overwritten by ``Board.__init__``.
        num_cols: Board width; starts at 0 and is overwritten by ``Board.__init__``.
    """

    # Single source for the "hook not overridden" message so the three
    # methods cannot drift apart.
    _ABSTRACT_MESSAGE = "Player class is virtual, please inherit from it"

    def __init__(self,player_symbol) -> None:
        self.player_symbol = player_symbol
        self.num_rows = 0
        self.num_cols = 0

    def choose_action(self,valid_actions=None,board=None):
        """Return the column to play, given the legal columns and the current board.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError(self._ABSTRACT_MESSAGE)

    def give_reward(self,reward):
        """Receive a reward signal.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError(self._ABSTRACT_MESSAGE)

    def reset(self):
        """Clear any per-game state.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError(self._ABSTRACT_MESSAGE)
    


### Random Player

In [53]:
class RandomPlayer(BasePlayer):
    """Baseline player that picks among the offered actions at random and never learns.

    Construction is inherited unchanged from ``BasePlayer``.
    """

    def choose_action(self, valid_actions=None, board=None):
        """Return one entry of ``valid_actions`` drawn with ``np.random.choice``.

        ``board`` is accepted for interface compatibility and ignored.
        """
        chosen = np.random.choice(valid_actions)
        return chosen

    def give_reward(self, reward):
        """Ignore the reward; this player keeps no learning state."""
        return None

    def reset(self):
        """Do nothing; there is no per-game state to clear."""
        return None

In [86]:
class ACTION_REASONING(Enum):
    """Tag stored next to each logged move describing how it was selected."""

    # Move drawn by the exploration (epsilon) branch of the player.
    RANDOM = 1
    # Move attributed to the learned-policy branch of the player.
    POLICY = 2

class QPlayer(BasePlayer):
    """Epsilon-greedy player backed by a tabular Q-function.

    ``q_table`` maps a board hash (see ``get_board_hash``) to a list holding
    one value per column. Every move made during a game is logged together
    with a snapshot of the board it was made on; ``give_reward`` later walks
    that log backwards and folds the reward into the table.

    Args:
        player_symbol: Marker this player places on the board.
        epsilon: Probability of playing a random action instead of the greedy one.
        prop_factor: Factor applied once per propagated step as the reward is
            pushed back to earlier moves.
        alpha: Learning rate used when blending a new target into an existing value.
    """

    def __init__(self, player_symbol,epsilon = 0.5,prop_factor=0.5,alpha=0.4) -> None:
        super().__init__(player_symbol)
        self.initial_epsilon = epsilon
        self.epsilon = epsilon
        self.action_history = []
        self.action_reasoning = []
        self.board_history = []
        self.alpha = alpha
        self.prop_factor = prop_factor
        self.q_table = {}

    def get_board_hash(self,board):
        """Return the ``q_table`` key for ``board`` (its ``str()`` representation)."""
        return str(board)

    def _greedy_action(self, q_values, valid_actions):
        """Pick at random among the legal columns that share the highest Q-value.

        Returns ``None`` when no legal column is covered by ``q_values``.
        """
        # Table rows are plain lists; convert so indexing and the comparison
        # below are element-wise.
        q_values = np.asarray(q_values, dtype=float)
        if valid_actions is None:
            candidates = np.arange(len(q_values))
        else:
            candidates = np.asarray(valid_actions, dtype=int)
            candidates = candidates[(candidates >= 0) & (candidates < len(q_values))]
        if candidates.size == 0:
            return None
        candidate_values = q_values[candidates]
        # Keep every tied maximum so ties are broken randomly, not by column order.
        best = candidates[candidate_values == candidate_values.max()]
        return np.random.choice(best)

    def choose_action(self, valid_actions=None, board=None):
        """Choose a column epsilon-greedily and log the decision.

        With probability ``epsilon`` a random legal column is returned.
        Otherwise the best known legal column for this board is returned,
        falling back to a random legal column when the board has no table
        entry yet.

        Args:
            valid_actions: Columns that may currently be played.
            board: Current board state.

        Returns:
            The selected column.
        """
        action = None
        reason = ACTION_REASONING.POLICY

        if np.random.random() < self.epsilon:
            reason = ACTION_REASONING.RANDOM
            action = np.random.choice(valid_actions)
        else:
            # Rows are lists, so truthiness means "present and non-empty".
            st_reward = self.q_table.get(self.get_board_hash(board),None)
            if st_reward:
                action = self._greedy_action(st_reward, valid_actions)
            if action is None:
                # Unseen state: play randomly but keep the POLICY tag so that
                # give_reward (which skips RANDOM moves) creates an entry for it.
                action = np.random.choice(valid_actions)

        self.action_history.append(action)
        self.action_reasoning.append(reason)
        # Store a snapshot: the caller keeps mutating its board in place, and a
        # shared reference would make every logged state hash to the final position.
        self.board_history.append(None if board is None else np.array(board, copy=True))
        return action

    def reset(self):
        """Restore the initial epsilon and clear the per-game move log."""
        self.epsilon = self.initial_epsilon
        self.action_history = []
        self.action_reasoning = []
        self.board_history = []

    def give_reward(self, reward):
        """Propagate ``reward`` backwards through this game's non-random moves.

        The most recent non-random move receives ``reward``; each earlier one
        receives the previous target multiplied by ``prop_factor``. Moves tagged
        RANDOM are skipped and do not advance the decay.

        Args:
            reward: Scalar outcome signal for the finished game.
        """
        # Counted by hand: rebinding an enumerate() index inside the loop body
        # does not carry over to the next iteration.
        steps_back = 0
        logged_moves = zip(reversed(self.action_history),reversed(self.action_reasoning),reversed(self.board_history))
        for action,reason,board in logged_moves:
            if reason == ACTION_REASONING.RANDOM:
                continue

            target = reward * self.prop_factor ** steps_back
            steps_back += 1

            board_key = self.get_board_hash(board)
            st_reward = self.q_table.get(board_key)
            if st_reward:
                # Blend the new target into the stored value at learning rate alpha.
                st_reward[action] = st_reward[action] * (1-self.alpha) + self.alpha * target
            else:
                # First visit: start the row at zero except for the action taken.
                self.q_table[board_key] = [target if j == action else 0 for j in range(self.num_cols)]






In [93]:
# Scratch check: does rebinding the index variable inside an enumerate() loop
# affect the following iterations? The printed indices stay 0, 1, 2, so it does not.
a = [1,2,3]
b = ['a','b','c']  # not used anywhere in this cell
for j,x in enumerate(a):
    print(j,x)
    if x == 2:
        j = j -1  # only rebinds the local name; enumerate() hands out the next index regardless


0 1
1 2
2 3


In [85]:
# Scratch check: how to get every index that ties for the maximum value.
tmp = np.array([0.9,0.9,0.7,0.5])
# Result is discarded (not the last expression of the cell); argmax would report a single index.
np.argmax(tmp)
# Boolean mask + where keeps all tied positions; this is the value the cell displays.
np.where(tmp == max(tmp))[0]

array([0, 1])

## Board Class

In [92]:
class Board:
    """Connect-four style grid plus the loop that lets two players play on it.

    Cells hold 0 when empty and a player's ``player_symbol`` otherwise. Win
    detection sums every window of four cells and looks for +4 / -4, so it
    assumes the two players use the symbols 1 and -1.

    Args:
        p1: Player that moves first.
        p2: Player that moves second.
        num_rows: Board height.
        num_cols: Board width.
    """

    # Four-in-a-row windows: horizontal, vertical and both diagonals.
    _ROW_KERNEL = np.ones((1, 4), dtype=int)
    _WIN_KERNELS = (
        _ROW_KERNEL,
        _ROW_KERNEL.T,
        np.eye(4, dtype=int),
        np.fliplr(np.eye(4, dtype=int)),
    )

    def __init__(self,p1:"BasePlayer",p2:"BasePlayer",num_rows=7,num_cols=7):
        self.num_rows= num_rows
        self.num_cols = num_cols
        self.p1 = p1
        self.p2 = p2
        # Tell both players how large the board is.
        for player in (p1, p2):
            player.num_rows = num_rows
            player.num_cols = num_cols
        self.board = np.zeros(shape=(num_rows,num_cols),dtype=int)

    def check_win(self):
        """Report the state of the game.

        Returns:
            1 or -1 when that symbol has four in a row, ``None`` when nobody
            has won and no move is left (draw), and 0 while the game is still
            open.
        """
        n_rows, n_cols = self.board.shape
        for kernel in self._WIN_KERNELS:
            k_rows, k_cols = kernel.shape
            # A window that does not fit on the board cannot contain a win, and
            # convolve2d(mode='valid') rejects inputs where neither array
            # contains the other.
            if k_rows > n_rows or k_cols > n_cols:
                continue
            window_sums = convolve2d(self.board,kernel,mode='valid')
            if (window_sums == 4).any():
                return 1
            if (window_sums == -4).any():
                return -1

        # Only call it a draw after looking for a winner, so a winning move
        # that also fills the board is still reported as a win.
        if len(self.get_valid_moves()) == 0:
            return None
        return 0

    def get_valid_moves(self):
        """Return the indices of the columns whose top cell is still empty."""
        return [i for i,j in enumerate(self.board[0]) if j == 0]

    def reset(self):
        """Empty the board, keeping the integer dtype it was created with."""
        # Same dtype as in __init__: a float board prints differently, which
        # would change any key derived from str(board).
        self.board = np.zeros(shape=(self.num_rows,self.num_cols),dtype=int)

    def __str__(self) -> str:
        """Draw the board with ``visualize`` and return an empty string.

        ``print(board)`` therefore shows the figure followed by a blank line.
        """
        visualize(self.board)
        return ""

    def place(self,move,player_symbol=1):
        """Drop ``player_symbol`` into column ``move``.

        Raises:
            RuntimeError: If ``move`` is not one of the currently valid columns.
        """
        if move not in self.get_valid_moves():
            raise RuntimeError(f"Invalid Action {move} with board state: \n{str(self.board)}")
        col = self.board[:,move]  # view, so the write below lands in self.board
        idx = int(np.where(col == 0)[0][-1])  # last empty index = lowest free cell
        col[idx] = player_symbol

    def play_agents(self,verbose=True):
        """Let ``p1`` and ``p2`` alternate moves until the game is decided.

        Args:
            verbose: When true, draw the board after every move and announce
                the winner.

        Returns:
            The winning player's symbol, or ``None`` when the board fills up
            without a winner.
        """
        # NOTE(review): this loop never calls give_reward() or reset() on the
        # players; confirm the caller is expected to do so.
        turn_order = (self.p1, self.p2)
        turn = 0
        result = self.check_win()
        while result == 0:
            player = turn_order[turn % 2]
            action = player.choose_action(valid_actions=self.get_valid_moves(),board=self.board)
            self.place(action,player_symbol=player.player_symbol)

            if verbose:
                print(self)

            # Re-evaluate after every single move: a draw reached on this move
            # must end the game before the opponent is asked to play on a
            # board with no valid actions.
            result = self.check_win()
            if result == player.player_symbol:
                if verbose:
                    print(f"Player {player.player_symbol} Wins!")
                return player.player_symbol
            turn += 1

        return result

        