In [1]:
import gym
import numpy as np

In [2]:
#create the Acrobot-v1 environment; the cell ends with env.reset() so the
#initial observation it returns is displayed as the cell output
env = gym.make('Acrobot-v1')
env.reset()

array([ 0.99833826, -0.05762561,  0.99957907,  0.02901174,  0.08938744,
       -0.05122174])

In [3]:
#how many actions we can take: read the size of the environment's action
#space and echo it as the cell output
action_size = env.action_space.n
action_size

3

In [4]:
#discretize the obs space: every observation dimension is split into the
#same number of equal-width buckets
buckets = 15
discrete_obs_size = len(env.observation_space.low) * [buckets]

#width of one bucket per dimension = (upper bound - lower bound) / bucket count
bucket_size = (
    env.observation_space.high - env.observation_space.low
) / discrete_obs_size

#q table with one axis per observation dimension plus a trailing action axis,
#filled with random starting values drawn from [0, 3)
#NOTE(review): the table has buckets**n_dims * n_actions float entries, which
#grows very quickly with `buckets` - check memory before raising it
q_table = np.random.uniform(
    low=0,
    high=3,
    size=discrete_obs_size + [env.action_space.n],
)

In [None]:
#used to make continuous state outputs discrete
def discretize(state, bucket_size, max_size = 14, low = None):
    """Map a continuous observation to a tuple of integer bucket indices.

    Parameters
    ----------
    state : array-like
        Continuous observation, one value per dimension.
    bucket_size : array-like
        Width of one bucket for each dimension (same length as ``state``).
    max_size : int, optional
        Largest bucket index allowed; larger indices are clipped to it.
        Defaults to 14.
    low : array-like, optional
        Lower bound of each dimension. When omitted, the global
        ``env.observation_space.low`` is used, as before.

    Returns
    -------
    tuple of int
        One index per dimension, each within ``[0, max_size]``, usable
        directly as an index into the q table.
    """
    if low is None:
        low = env.observation_space.low

    scaled = (np.asarray(state, dtype=float) - np.asarray(low, dtype=float)) / bucket_size

    #the builtin int replaces np.int, an alias that newer NumPy releases removed;
    #the lower bound of 0 stops a below-range value from producing a negative
    #index, which would silently wrap around to the other end of the q table
    indices = np.clip(scaled.astype(int), 0, max_size)

    return tuple(int(i) for i in indices)

In [None]:
#set hyperparameters
episodes = 25000            #number of training episodes
lr = 0.1                    #step size of each q table update
discount_factor = 0.9       #weight applied to the next state's best q value
#NOTE(review): discount_rate is not read by the training loop in this notebook,
#which uses discount_factor - confirm which value is intended
discount_rate = 0.95

#exploration vs exploitation factor: eps is the probability of a random action
eps = 1
#eps is lowered once per episode for episode indices in
#[eps_start_decay, eps_end_decay), by a fixed step sized to bring it to ~0
eps_start_decay = 1
eps_end_decay = (3*episodes)//4
eps_decay_rate = eps/(eps_end_decay - eps_start_decay)

In [None]:
#learning loop: epsilon-greedy tabular q-learning over `episodes` episodes
for i in range(episodes):

    #reset the environment and start from its discretized initial observation
    state = discretize(env.reset(), bucket_size)
    done = False

    #run an episode until completion
    while not done:

        if np.random.uniform(0,1) < eps:
            #get random action for exploration
            action = env.action_space.sample()
        else:
            #get action based on current q_table
            action = np.argmax(q_table[state])

        #take action and receive new state
        new_state, reward, done, info = env.step(action)
        new_state = discretize(new_state, bucket_size)

        #gym's TimeLimit wrapper ends an episode with done=True when the step
        #limit is hit and marks that case in info['TimeLimit.truncated'].
        #Such a cut-off is not a real terminal state, so the update must still
        #bootstrap from new_state. If the key is absent, every done is treated
        #as a true terminal state.
        truncated = info.get('TimeLimit.truncated', False)

        if done and not truncated:
            #true terminal state: there is no future return to bootstrap from
            q_table[state][action] = reward
        else:
            #standard q-learning update towards reward + discounted best next value
            q_table[state][action] = q_table[state][action] + lr*(reward + discount_factor*np.max(q_table[new_state]) - q_table[state][action])

        #update state
        state = new_state

    #slowly decrease exploration factor
    if eps_start_decay<= i <eps_end_decay:
        eps -= eps_decay_rate

    #progress report every 50 episodes
    if i%50 == 0:
        print("Episodes: {}/{}     Epsilon:{}".format(i, episodes,eps))


env.close()


Episodes: 0/25000     Epsilon:1
Episodes: 50/25000     Epsilon:0.9973331911035268
Episodes: 100/25000     Epsilon:0.9946663822070536
Episodes: 150/25000     Epsilon:0.9919995733105804
Episodes: 200/25000     Epsilon:0.9893327644141072
Episodes: 250/25000     Epsilon:0.986665955517634
Episodes: 300/25000     Epsilon:0.9839991466211608
Episodes: 350/25000     Epsilon:0.9813323377246876
Episodes: 400/25000     Epsilon:0.9786655288282144
Episodes: 450/25000     Epsilon:0.9759987199317413
Episodes: 500/25000     Epsilon:0.9733319110352681
Episodes: 550/25000     Epsilon:0.9706651021387949
Episodes: 600/25000     Epsilon:0.9679982932423217
Episodes: 650/25000     Epsilon:0.9653314843458485
Episodes: 700/25000     Epsilon:0.9626646754493753
Episodes: 750/25000     Epsilon:0.9599978665529021
Episodes: 800/25000     Epsilon:0.9573310576564289
Episodes: 850/25000     Epsilon:0.9546642487599557
Episodes: 900/25000     Epsilon:0.9519974398634825
Episodes: 950/25000     Epsilon:0.9493306309670093
E

Episodes: 7950/25000     Epsilon:0.5759773854607622
Episodes: 8000/25000     Epsilon:0.573310576564289
Episodes: 8050/25000     Epsilon:0.5706437676678158
Episodes: 8100/25000     Epsilon:0.5679769587713426
Episodes: 8150/25000     Epsilon:0.5653101498748694
Episodes: 8200/25000     Epsilon:0.5626433409783962
Episodes: 8250/25000     Epsilon:0.559976532081923
Episodes: 8300/25000     Epsilon:0.5573097231854498
Episodes: 8350/25000     Epsilon:0.5546429142889766
Episodes: 8400/25000     Epsilon:0.5519761053925034
Episodes: 8450/25000     Epsilon:0.5493092964960302
Episodes: 8500/25000     Epsilon:0.546642487599557
Episodes: 8550/25000     Epsilon:0.5439756787030838
Episodes: 8600/25000     Epsilon:0.5413088698066106
Episodes: 8650/25000     Epsilon:0.5386420609101374
Episodes: 8700/25000     Epsilon:0.5359752520136642
Episodes: 8750/25000     Epsilon:0.5333084431171911
Episodes: 8800/25000     Epsilon:0.5306416342207179
Episodes: 8850/25000     Epsilon:0.5279748253242447
Episodes: 8900/

Episodes: 15700/25000     Epsilon:0.1626220065073716
Episodes: 15750/25000     Epsilon:0.15995519761089702
Episodes: 15800/25000     Epsilon:0.15728838871442244
Episodes: 15850/25000     Epsilon:0.15462157981794786
Episodes: 15900/25000     Epsilon:0.15195477092147328
Episodes: 15950/25000     Epsilon:0.1492879620249987
Episodes: 16000/25000     Epsilon:0.1466211531285241
Episodes: 16050/25000     Epsilon:0.14395434423204953
Episodes: 16100/25000     Epsilon:0.14128753533557495
Episodes: 16150/25000     Epsilon:0.13862072643910037
Episodes: 16200/25000     Epsilon:0.13595391754262579
Episodes: 16250/25000     Epsilon:0.1332871086461512
Episodes: 16300/25000     Epsilon:0.13062029974967662
Episodes: 16350/25000     Epsilon:0.12795349085320204
Episodes: 16400/25000     Epsilon:0.12528668195672746
Episodes: 16450/25000     Epsilon:0.12261987306025288
Episodes: 16500/25000     Epsilon:0.1199530641637783
Episodes: 16550/25000     Epsilon:0.11728625526730371
Episodes: 16600/25000     Epsilon

Episodes: 23050/25000     Epsilon:3.5316248623434854e-13
Episodes: 23100/25000     Epsilon:3.5316248623434854e-13
Episodes: 23150/25000     Epsilon:3.5316248623434854e-13
Episodes: 23200/25000     Epsilon:3.5316248623434854e-13
Episodes: 23250/25000     Epsilon:3.5316248623434854e-13
Episodes: 23300/25000     Epsilon:3.5316248623434854e-13
Episodes: 23350/25000     Epsilon:3.5316248623434854e-13
Episodes: 23400/25000     Epsilon:3.5316248623434854e-13
Episodes: 23450/25000     Epsilon:3.5316248623434854e-13
Episodes: 23500/25000     Epsilon:3.5316248623434854e-13
Episodes: 23550/25000     Epsilon:3.5316248623434854e-13
Episodes: 23600/25000     Epsilon:3.5316248623434854e-13
Episodes: 23650/25000     Epsilon:3.5316248623434854e-13
Episodes: 23700/25000     Epsilon:3.5316248623434854e-13
Episodes: 23750/25000     Epsilon:3.5316248623434854e-13
Episodes: 23800/25000     Epsilon:3.5316248623434854e-13
Episodes: 23850/25000     Epsilon:3.5316248623434854e-13
Episodes: 23900/25000     Epsil

In [None]:
#test environment with q table: render one episode while always taking the
#action with the highest learned q value (no exploration)
state = discretize(env.reset(), bucket_size)
done = False
try:
    while not done:
        env.render()
        action = np.argmax(q_table[state])
        new_state, reward, done, _ = env.step(action)
        state = discretize(new_state, bucket_size)
finally:
    #always close the environment, even if rendering fails or the cell is
    #interrupted, so the render window is not left open
    env.close()