In [17]:
import numpy as np
import pandas as pd
import random

In [18]:
class GridBoard:
    """Rectangular grid world holding rewards, walls and a printable board.

    Cells are addressed as ``[x, y]`` pairs: ``x`` is the column label and
    ``y`` the row label, with ``[0, 0]`` in the top-left corner. Both frames
    have one frame row per grid row, so cell ``[x, y]`` lives at
    ``frame.at[y, x]``.

    Attributes:
        rows, cols: Grid dimensions.
        rewards: Float DataFrame of shape ``(rows, cols)``, initially zeros.
        walls: List of ``[x, y]`` cells registered through ``set_walls``.
        terminals: List of ``[x, y]`` cells registered through
            ``assignRewards``.
        show_board: DataFrame of strings; ``'.'`` for a free cell and ``'X'``
            for a wall.
    """

    def __init__(self, cols, rows):
        """Create an empty ``cols`` x ``rows`` board (note the argument order)."""
        self.rows = rows
        self.cols = cols
        self.rewards = pd.DataFrame(np.zeros((rows, cols)))
        self.walls = []
        self.terminals = []
        # Build the printable board last, through the same code path that is
        # used for every later redraw.
        self.reset_showboard()

    def assignRewards(self, row, col, reward):
        """Give a cell a reward and register it as a terminal state.

        Despite their names, ``row`` is the x coordinate (column label) and
        ``col`` is the y coordinate (row label); the names are kept for
        backward compatibility with existing callers.
        """
        # Single-step .at write: chained ``df[x][y] = v`` assigns into a
        # temporary object and is not guaranteed to reach the frame.
        self.rewards.at[col, row] = reward
        self.terminals.append([row, col])

    def reset_showboard(self):
        """Rebuild ``show_board`` with only free cells and walls on it."""
        self.show_board = pd.DataFrame('.', index=range(self.rows), columns=range(self.cols))
        self._draw_walls(self.walls)

    def set_walls(self, walls):
        """Add ``[x, y]`` wall cells and draw them on the current board."""
        added = []
        for w in walls:
            # Store as a list so a wall given as a tuple still compares equal
            # to the ``[x, y]`` list states used for wall lookups.
            added.append(list(w))
        self.walls.extend(added)
        self._draw_walls(added)

    def _draw_walls(self, walls):
        """Mark each wall that lies inside the grid with 'X' on ``show_board``."""
        for w in walls:
            x, y = w[0], w[1]
            # Skip out-of-range cells: .at with an unknown label would
            # silently enlarge the board.
            if 0 <= x < self.cols and 0 <= y < self.rows:
                self.show_board.at[y, x] = "X"




In [19]:
class Player:
    """Agent that occupies one cell of a grid board and steps N/E/S/W.

    States are ``[x, y]`` lists: ``x`` is the column, ``y`` the row, and
    ``[0, 0]`` is the top-left cell, so north means ``y - 1``.

    The board object must provide ``cols``, ``rows``, ``walls`` (a list of
    ``[x, y]`` cells), ``show_board`` (a DataFrame) and ``reset_showboard()``.
    """

    # Order matches the [North, East, South, West] layout of a q_curr cell.
    DIRECTIONS = ('N', 'E', 'S', 'W')

    # (dx, dy) offset applied to an [x, y] state for each direction.
    _STEPS = {'N': (0, -1), 'E': (1, 0), 'S': (0, 1), 'W': (-1, 0)}

    def __init__(self, grid, cs):
        """Place the player on ``grid`` at start state ``cs`` and draw it."""
        # Keep our own reference to the board rather than relying on a
        # module-level variable that happens to be named ``grid``.
        self.grid = grid
        # Copy so that moving the player never mutates the caller's list.
        self.curr_state = list(cs)
        self._mark_position()

    def _mark_position(self):
        """Draw '*' on the board at the player's current cell."""
        x, y = self.curr_state[0], self.curr_state[1]
        self.grid.show_board.at[y, x] = '*'

    def move(self, direct):
        """Step in direction ``direct`` if allowed, then redraw the board.

        A move that would leave the grid or enter a wall leaves the player in
        place. An unknown direction prints "Invalid option" and also leaves
        the player in place.
        """
        next_state = self.detNextState(self.curr_state, direct)
        # Update in place so existing references to ``curr_state`` stay valid.
        self.curr_state[0], self.curr_state[1] = next_state[0], next_state[1]
        self.grid.reset_showboard()
        self._mark_position()

    def detNextState(self, curr_state, direct):
        """Return the state reached from ``curr_state`` by going ``direct``.

        Args:
            curr_state: ``[x, y]`` state to start from (not modified).
            direct: One of 'N', 'E', 'S', 'W'.

        Returns:
            A new ``[x, y]`` list: the neighbouring cell when it is inside the
            grid and not a wall, otherwise a copy of ``curr_state``. An
            unknown direction prints "Invalid option" and returns a copy of
            ``curr_state``.
        """
        x, y = curr_state[0], curr_state[1]
        step = self._STEPS.get(direct)
        if step is None:
            print("Invalid option")
            return [x, y]

        candidate = [x + step[0], y + step[1]]
        inside = (0 <= candidate[0] < self.grid.cols
                  and 0 <= candidate[1] < self.grid.rows)
        if inside and candidate not in self.grid.walls:
            return candidate
        return [x, y]

    def play(self):
        """Allocate fresh, zero-initialised learning tables for this board.

        ``state_count`` is a ``rows`` x ``cols`` frame of zeros (presumably a
        per-cell visit counter; nothing in this class updates it).
        ``q_curr`` is a ``rows`` x ``cols`` frame whose cells are independent
        four-element lists ordered [North, East, South, West].
        """
        rows, cols = self.grid.rows, self.grid.cols
        self.state_count = pd.DataFrame(np.zeros((rows, cols)))
        self.q_curr = pd.DataFrame(
            [[[0, 0, 0, 0] for j in range(cols)] for i in range(rows)],
            index=range(rows),
            columns=range(cols),
        )

    def findRandDirection(self):
        """Return one of 'N', 'E', 'S', 'W' chosen uniformly at random."""
        return random.choice(self.DIRECTIONS)



            


In [20]:
class QLearn:
    """Q-learning update rule for a grid whose tables are indexed ``[x][y]``.

    Attributes:
        gamma: Factor applied to the best Q-value of the next state.
        learningrate: Weight given to a new estimate when blending it with
            the old one; forced to 1 in deterministic mode.
    """

    def __init__(self, gamma, learningrate, deterministic):
        """Store ``gamma`` and pick the learning rate.

        Args:
            gamma: Factor applied to the best Q-value of the next state.
            learningrate: Blend weight used when ``deterministic`` is false.
            deterministic: If true, ``learningrate`` is ignored and 1 is used.
        """
        self.gamma = gamma
        self.learningrate = 1 if deterministic else learningrate

    def findQhat(self, rewards, q_curr, cs, ns, action=None):
        """Compute the new Q estimate for a transition from ``cs`` to ``ns``.

        Args:
            rewards: Table indexed ``rewards[x][y]`` with the reward for
                entering each cell.
            q_curr: Table indexed ``q_curr[x][y]`` whose cells hold the four
                action values [North, East, South, West].
            cs: Current ``[x, y]`` state; only read when ``action`` is given.
            ns: Next ``[x, y]`` state.
            action: Optional index (0-3) of the action taken in ``cs``. When
                omitted, the plain target ``r + gamma * max Q(ns, .)`` is
                returned, exactly as before this parameter existed. When
                given, the target is blended with the stored value:
                ``(1 - lr) * Q(cs, action) + lr * target``.

        Returns:
            The new Q estimate.
        """
        next_values = q_curr[ns[0]][ns[1]]
        best_next = max(next_values[i] for i in range(4))
        target = rewards[ns[0]][ns[1]] + self.gamma * best_next

        if action is None:
            return target

        # With learningrate == 1 (deterministic mode) this reduces to target.
        old_value = q_curr[cs[0]][cs[1]][action]
        return (1 - self.learningrate) * old_value + self.learningrate * target

In [21]:
# --- Experiment configuration -------------------------------------------
# Exploration below is a purely random walk, so pin the generator to make the
# printed Q-tables repeatable after a kernel restart.
SEED = 0
random.seed(SEED)

GRID_COLS, GRID_ROWS = 3, 2
START_STATE = [0, 1]        # [x, y] cell where every episode begins
GOAL_CELL = [2, 0]          # [x, y] cell that ends an episode
GOAL_REWARD = 100
GAMMA = 0.9
N_EPISODES = 10

# Position of each direction inside a q_curr cell: [North, East, South, West].
ACTION_INDEX = {'N': 0, 'E': 1, 'S': 2, 'W': 3}

# --- Build the world ----------------------------------------------------
grid = GridBoard(GRID_COLS, GRID_ROWS)
# Optional obstacles, e.g.:
#grid.set_walls([[2, 0], [2, 1]])

# Draw any walls on the printable board. A single .at write is used because
# chained ``df[x][y] = v`` is not guaranteed to reach the frame.
for w in grid.walls:
    grid.show_board.at[w[1], w[0]] = "X"

grid.assignRewards(GOAL_CELL[0], GOAL_CELL[1], GOAL_REWARD)
print(grid.terminals)

# Pass a copy so the START_STATE constant can never be mutated by the player.
player = Player(grid, list(START_STATE))
player.play()
print(player.curr_state)
print(grid.show_board)

qlearn = QLearn(GAMMA, None, True)

# --- Training: random-walk episodes until the terminal cell is reached -----
for episode in range(N_EPISODES):
    while player.curr_state not in grid.terminals:
        # Remember where we were: the Q-value being updated belongs to the
        # state the action was taken from, not the one we land in.
        x = player.curr_state[0]
        y = player.curr_state[1]
        direct = player.findRandDirection()
        next_state = player.detNextState(player.curr_state, direct)

        qhat = qlearn.findQhat(grid.rewards, player.q_curr, player.curr_state, next_state)
        player.q_curr[x][y][ACTION_INDEX[direct]] = qhat

        player.move(direct)

    # Start the next episode from the same cell, keeping the learned Q-table.
    player.curr_state = list(START_STATE)
    print(player.q_curr)


# tests all directional walls
test_path = ['W', 'N', 'E', 'S', 'S', 'S', 'S', 'S', 'W', 'W', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'N', 'N', 'N', 'N', 'N', 'N']

#for p in test_path:
    #player.move(p)

    #print(player.curr_state)

    #print(grid.show_board)


[[2, 0]]
[0, 1]
   0  1  2
0  .  .  .
1  *  .  .
                  0                 1             2
0      [0, 0, 0, 0]  [0, 100.0, 0, 0]  [0, 0, 0, 0]
1  [0, 0.0, 0, 0.0]    [0.0, 0, 0, 0]  [0, 0, 0, 0]
                    0                   1                   2
0        [0, 0, 0, 0]    [0, 100.0, 0, 0]        [0, 0, 0, 0]
1  [0, 0.0, 0.0, 0.0]  [0.0, 0.0, 0, 0.0]  [100.0, 0, 0, 0.0]
                      0                       1                   2
0       [0, 90.0, 0, 0]  [90.0, 100.0, 81.0, 0]        [0, 0, 0, 0]
1  [0.0, 0.0, 0.0, 0.0]   [90.0, 0.0, 0.0, 0.0]  [100.0, 0, 0, 0.0]
                          0                       1                         2
0  [81.0, 90.0, 72.9, 81.0]  [90.0, 100.0, 81.0, 0]              [0, 0, 0, 0]
1   [81.0, 81.0, 72.9, 0.0]  [90.0, 90.0, 0.0, 0.0]  [100.0, 90.0, 90.0, 0.0]
                          0                          1  \
0  [81.0, 90.0, 72.9, 81.0]  [90.0, 100.0, 81.0, 81.0]   
1  [81.0, 81.0, 72.9, 72.9]     [90.0, 90.0, 0.0, 0.0] 