In [1]:
import numpy as np
import random
import gym # Requires Box2D
import math
from collections import defaultdict, deque
import matplotlib.pyplot as graph

In [2]:
# Environment used for the exploratory cell that follows. The agent's goal is to
# walk to the right side of the environment while maintaining its balance.
env = gym.make("BipedalWalker-v3")

In [3]:
# Inspect the environment's interface.
# reset() returns an (observation, info) pair.
print(env.reset())
# step() expects an action, not an observation, so draw a valid one from the
# environment's own action space. It returns
# (observation, reward, terminated, truncated, info).
print(env.step(env.action_space.sample()))

(array([ 2.7473201e-03, -2.2742097e-05,  1.7691043e-03, -1.5999848e-02,
        9.1838166e-02, -2.3346161e-03,  8.6033911e-01,  3.1278164e-03,
        1.0000000e+00,  3.2243077e-02, -2.3344560e-03,  8.5389018e-01,
        1.6510626e-03,  1.0000000e+00,  4.4081411e-01,  4.4582021e-01,
        4.6142289e-01,  4.8955029e-01,  5.3410292e-01,  6.0246116e-01,
        7.0914906e-01,  8.8593203e-01,  1.0000000e+00,  1.0000000e+00],
      dtype=float32), {})
(array([-5.38658490e-03, -2.60649918e-04, -4.11777664e-03, -1.81089994e-03,
        4.37281638e-01,  8.00934061e-02,  1.12154365e-01, -9.84324217e-02,
        1.00000000e+00,  3.40151846e-01,  1.19122259e-01,  1.13815606e-01,
       -1.50700167e-01,  1.00000000e+00,  4.49641615e-01,  4.54747975e-01,
        4.70663071e-01,  4.99353766e-01,  5.44798553e-01,  6.14525735e-01,
        7.23350048e-01,  9.03673232e-01,  1.00000000e+00,  1.00000000e+00],
      dtype=float32), -0.37288930814394744, False, False, {})


In [4]:
# Q-learning hyperparameters
EPISODES = 1000   # number of training episodes the agent gets
GAMMA = 0.99      # discount factor applied to the best next-state Q-value
ALPHA = 0.01      # learning rate: how far each update moves toward the target
HIGHSCORE = -200  # best episode score seen so far (starting threshold)

In [5]:
def updateQTable(Qtable, state, action, reward, nextState=None):
    """Compute the new Q-value for ``(state, action)`` after one transition.

    The table itself is not modified here; the caller is expected to store
    the returned value.

    Args:
        Qtable: mapping from state to an array of action values.
        state: key of the state the action was taken in.
        action: index (or index tuple) into ``Qtable[state]``.
        reward: reward received for the transition.
        nextState: key of the resulting state, or ``None`` when there is no
            successor to bootstrap from.

    Returns:
        The old value moved a fraction ``ALPHA`` of the way toward
        ``reward + GAMMA * max(Qtable[nextState])``.
    """
    oldValue = Qtable[state][action]

    # With no successor state there is no future value to add.
    if nextState is None:
        bestNext = 0
    else:
        bestNext = np.max(Qtable[nextState])

    tdTarget = reward + (GAMMA * bestNext)
    return oldValue + (ALPHA * (tdTarget - oldValue))

# (low, high) range assumed for each of the first 14 observation components,
# listed in observation order. discretizeState scales values with these.
# NOTE(review): sampled observations can fall outside these ranges (for example
# a negative first component), so verify them against the environment's
# observation space.
stateBounds = [
    (0, math.pi),
    (-2, 2),
    (-1, 1),
    (-1, 1),
    (0, math.pi),
    (-2, 2),
    (0, math.pi),
    (-2, 2),
    (0, 1),
    (0, math.pi),
    (-2, 2),
    (0, math.pi),
    (-2, 2),
    (0, 1),
]

# (low, high) range of every action component.
actionBounds = (-1, 1)

In [6]:
def getNextAction(qTable, epsilon, state):
    """Choose a discretized action for ``state`` with an epsilon-greedy rule.

    Args:
        qTable: mapping from state to an array of action values.
        epsilon: probability threshold for exploring; a uniform random draw
            below it triggers a random action.
        state: key of the current state in ``qTable``.

    Returns:
        A 4-tuple of random integers in 0..9 when exploring, otherwise the
        multi-dimensional index of the largest entry of ``qTable[state]``.
    """
    exploring = random.random() < epsilon

    if not exploring:
        values = qTable[state]
        return np.unravel_index(np.argmax(values), values.shape)

    return tuple(random.randint(0, 9) for _ in range(4))

def convertNextAction(nextAction):
    """Map discretized action indices onto continuous action values.

    Each index ``k`` becomes ``k / 9 * 2 - 1``, so index 0 maps to -1 and
    index 9 maps to 1.

    Args:
        nextAction: sequence of action indices.

    Returns:
        A tuple with one converted value per index.
    """
    return tuple(index / 9 * 2 - 1 for index in nextAction)


In [7]:
def discretizeState(state, bounds=None, bins=20):
    """Convert continuous observation values into discrete bin indices.

    Each value is scaled linearly so that its lower bound maps to bin 0 and
    its upper bound maps to bin ``bins - 1``. The result is clipped to that
    range and rounded to the nearest integer. Integer, bounded indices are
    needed so that nearby observations share a Q-table key; unrounded floats
    would make almost every key unique.

    Args:
        state: sequence of continuous values, one per bounded component.
        bounds: sequence of ``(low, high)`` pairs, at least as long as
            ``state``. Defaults to the module-level ``stateBounds``.
        bins: number of bins per component (default 20, i.e. indices 0..19).

    Returns:
        A tuple of ``int`` bin indices, one per element of ``state``.

    Raises:
        IndexError: if ``state`` has more elements than ``bounds``.
    """
    if bounds is None:
        bounds = stateBounds

    topBin = bins - 1
    discreteState = []
    for i, value in enumerate(state):
        low, high = bounds[i]
        scaled = (float(value) - low) / (high - low) * topBin
        # Clip before rounding so out-of-range (even infinite) values land in
        # the first or last bin instead of producing an invalid index.
        scaled = min(max(scaled, 0), topBin)
        discreteState.append(int(round(scaled)))

    return tuple(discreteState)


In [8]:
def runAlgorithmStep(env, i, qTable):
    """Run one training episode and update ``qTable`` after every step.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``, where
            ``step`` returns ``(observation, reward, terminated, truncated, info)``.
        i: 1-based episode number; it sets the exploration rate and must be
            non-zero.
        qTable: mapping from discretized state to an array of action values;
            modified in place.

    Returns:
        The total reward collected during the episode. The module-level
        ``HIGHSCORE`` is raised when this total exceeds it.
    """
    global HIGHSCORE

    print("Episode #: ", i)
    # Only the first 14 observation components are discretized and used as state.
    state = discretizeState(env.reset()[0][0:14])
    total_reward = 0
    # Equals 250 / i, so every action is random until episode 250 and the
    # exploration rate decays afterwards.
    epsilon = 1.0 / (i * .004)
    limit = 0  # Step counter; caps the episode in case the agent gets stuck
    while True:
        # Choose the action once so the action executed in the environment is
        # the same one whose Q-value gets updated.
        actionIndex = getNextAction(qTable, epsilon, state)
        observation, reward, terminated, truncated, _ = env.step(convertNextAction(actionIndex))
        nextState = discretizeState(observation[0:14])
        total_reward += reward
        # A terminated episode has no future value, so pass None as the
        # successor to stop bootstrapping from it.
        qTable[state][actionIndex] = updateQTable(
            qTable, state, actionIndex, reward, None if terminated else nextState
        )
        state = nextState
        limit += 1
        # Limit is 1600 since the goal from the documentation is to get
        # 300 points in 1600 time steps.
        if terminated or truncated or limit == 1600:
            break

    if total_reward > HIGHSCORE:
        HIGHSCORE = total_reward

    return total_reward

In [9]:
def main():
    """Train the agent for ``EPISODES`` episodes and report the best score.

    Creates a rendering BipedalWalker environment and an empty Q-table, runs
    one episode per iteration, and prints each episode's score. The
    environment is always closed afterwards, including when training is
    interrupted.
    """
    env = gym.make("BipedalWalker-v3", render_mode="human")

    # Unseen states start with all-zero values; one axis of size 10 per
    # action component.
    qTable = defaultdict(lambda: np.zeros((10, 10, 10, 10)))

    try:
        for i in range(1, EPISODES + 1):
            epScore = runAlgorithmStep(env, i, qTable)
            print(f"Episode finished. Score: {epScore}")
    finally:
        # Release the render window and simulation resources.
        env.close()

    print("All episodes finished. Highest score achieved: " + str(HIGHSCORE))


main()

Episode #:  1
Episode finished. Score: -120.00108777377444
Episode #:  2
Episode finished. Score: -99.96713639126304
Episode #:  3
Episode finished. Score: -99.57437657953054
Episode #:  4
Episode finished. Score: -118.21505306320596
Episode #:  5
Episode finished. Score: -103.16031370065615
Episode #:  6
Episode finished. Score: -88.1951659387008
Episode #:  7
Episode finished. Score: -108.32191707584718
Episode #:  8
Episode finished. Score: -89.51439491052822
Episode #:  9
Episode finished. Score: -116.84239633246987
Episode #:  10
Episode finished. Score: -111.09113412226489
Episode #:  11
Episode finished. Score: -99.81426778354205
Episode #:  12
Episode finished. Score: -102.57265456239598
Episode #:  13
Episode finished. Score: -87.42911926080289
Episode #:  14
Episode finished. Score: -106.56925113045942
Episode #:  15
Episode finished. Score: -101.71552227229252
Episode #:  16
Episode finished. Score: -98.54858874953082
Episode #:  17
Episode finished. Score: -102.139416558929