In [1]:
import random
import math
import os.path

import numpy as np
import pandas as pd
import gym
import time

env = gym.make('CartPole-v0')
smart_actions = [0,1]

NUM_BUCKETS = (1, 1, 6, 3)  # (x, x', theta, theta')
NUM_ACTIONS = env.action_space.n # (left, right)
STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high))
STATE_BOUNDS[1] = [-0.5, 0.5]
STATE_BOUNDS[3] = [-math.radians(50), math.radians(50)]

## Learning related constants
MIN_EXPLORE_RATE = 0.01
MIN_LEARNING_RATE = 0.1

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [1]:
def state_to_bucket(state):
    bucket_indice = []
    for i in range(len(state)):
        if state[i] <= STATE_BOUNDS[i][0]:
            bucket_index = 0
        elif state[i] >= STATE_BOUNDS[i][1]:
            bucket_index = NUM_BUCKETS[i] - 1
        else:
            # Mapping the state bounds to the bucket array
            bound_width = STATE_BOUNDS[i][1] - STATE_BOUNDS[i][0]
            offset = (NUM_BUCKETS[i]-1)*STATE_BOUNDS[i][0]/bound_width
            scaling = (NUM_BUCKETS[i]-1)/bound_width
            bucket_index = int(round(scaling*state[i] - offset))
        bucket_indice.append(bucket_index)
    return tuple(bucket_indice)
def get_explore_rate(t):
    return max(MIN_EXPLORE_RATE, min(1, 1.0 - math.log10((t+1)/10)))

def get_learning_rate(t):
    return max(MIN_LEARNING_RATE, min(0.5, 1.0 - math.log10((t+1)/10)))

In [2]:
class QLearningTable:
    def __init__(self, actions,reward_decay=0.99):
        self.actions = actions  # a list
        self.gamma = reward_decay
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def choose_action(self, observation, explore_rate):
        self.check_state_exist(observation)       
        if np.random.uniform() > explore_rate:
            # choose best action
            state_action = self.q_table.loc[observation, :]              
            action = state_action.idxmax()
        else:
            # choose random action
            action = np.random.choice(self.actions)          
        return action

    def learn(self, s, a, r, s_,lr):
        self.check_state_exist(s_)
        self.check_state_exist(s)       
        q_predict = self.q_table.loc[s, a]
        
        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = -1 #si la partie se fini, on donne un reward négatif
            
        # update
        self.q_table.loc[s, a] += lr * (q_target - q_predict)

    def check_state_exist(self, state):
        if state not in self.q_table.index:
            # append new state to q table
            self.q_table = self.q_table.append(pd.Series([0] * len(self.actions), index=self.q_table.columns, name=state))


    


In [4]:
qlearn = QLearningTable(actions=list(range(len(smart_actions))))
## Instantiating the learning related parameters
learning_rate = get_learning_rate(0)
explore_rate = get_explore_rate(0)


for episode in range(200):
    # Reset the environment
    obv = env.reset()
    
    # the initial state
    previous_state = state_to_bucket(obv)
    z=0
    
    for t in range(200):
        env.render()
          
        #select an action
        rl_action = qlearn.choose_action(str(previous_state),explore_rate) 
        
        #take an action 
        observation, reward, done, info = env.step(rl_action) 
        
        #observe the result
        bucket=state_to_bucket(observation) 
        
            
            
        if z==0:
        #learn from previous action
            qlearn.learn(str(previous_state), rl_action, reward,str(bucket),learning_rate) #learn from previous action
        elif z==20:
            print("Episode ",episode," finished after {} timesteps".format(t+1))
            break
        
        previous_state=bucket
        
        if done:
            z+=1
            #qlearn.learn(str(previous_state), rl_action, reward,str('terminal'),learning_rate)
            
    # Update parameters
    explore_rate = get_explore_rate(episode)
    learning_rate = get_learning_rate(episode)    
        

env.close()

Episode  0  finished after 19 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  0  finished after 20 timesteps
Episode  0  finished after 21 timesteps
Episode  0  finished after 22 timesteps
Episode  0  finished after 23 timesteps
Episode  0  finished after 24 timesteps
Episode  0  finished after 25 timesteps
Episode  0  finished after 26 timesteps
Episode  0  finished after 27 timesteps
Episode  0  finished after 28 timesteps
Episode  0  finished after 29 timesteps
Episode  0  finished after 30 timesteps
Episode  0  finished after 31 timesteps
Episode  0  finished after 32 timesteps
Episode  0  finished after 33 timesteps
Episode  0  finished after 34 timesteps
Episode  0  finished after 35 timesteps
Episode  0  finished after 36 timesteps
Episode  0  finished after 37 timesteps
Episode  0  finished after 3

Episode  8  finished after 9 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  8  finished after 10 timesteps
Episode  8  finished after 11 timesteps
Episode  8  finished after 12 timesteps
Episode  8  finished after 13 timesteps
Episode  8  finished after 14 timesteps
Episode  8  finished after 15 timesteps
Episode  8  finished after 16 timesteps
Episode  8  finished after 17 timesteps
Episode  8  finished after 18 timesteps
Episode  8  finished after 19 timesteps
Episode  8  finished after 20 timesteps
Episode  8  finished after 21 timesteps
Episode  8  finished after 22 timesteps
Episode  8  finished after 23 timesteps
Episode  8  finished after 24 timesteps
Episode  8  finished after 25 timesteps
Episode  8  finished after 26 timesteps
Episode  8  finished after 27 timesteps
Episode  8  finished after 28

Episode  16  finished after 16 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  16  finished after 17 timesteps
Episode  16  finished after 18 timesteps
Episode  16  finished after 19 timesteps
Episode  16  finished after 20 timesteps
Episode  16  finished after 21 timesteps
Episode  16  finished after 22 timesteps
Episode  16  finished after 23 timesteps
Episode  16  finished after 24 timesteps
Episode  16  finished after 25 timesteps
Episode  16  finished after 26 timesteps
Episode  16  finished after 27 timesteps
Episode  16  finished after 28 timesteps
Episode  16  finished after 29 timesteps
Episode  16  finished after 30 timesteps
Episode  16  finished after 31 timesteps
Episode  16  finished after 32 timesteps
Episode  16  finished after 33 timesteps
Episode  16  finished after 34 timesteps
Episode  

Episode  23  finished after 36 timesteps
Episode  23  finished after 37 timesteps
Episode  23  finished after 38 timesteps
Episode  23  finished after 39 timesteps
Episode  23  finished after 40 timesteps
Episode  23  finished after 41 timesteps
Episode  23  finished after 42 timesteps
Episode  23  finished after 43 timesteps
Episode  23  finished after 44 timesteps
Episode  24  finished after 27 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  24  finished after 28 timesteps
Episode  24  finished after 29 timesteps
Episode  24  finished after 30 timesteps
Episode  24  finished after 31 timesteps
Episode  24  finished after 32 timesteps
Episode  24  finished after 33 timesteps
Episode  24  finished after 34 timesteps
Episode  24  finished after 35 timesteps
Episode  24  finished after 36 timesteps
Episode  

Episode  31  finished after 25 timesteps
Episode  31  finished after 26 timesteps
Episode  31  finished after 27 timesteps
Episode  31  finished after 28 timesteps
Episode  31  finished after 29 timesteps
Episode  31  finished after 30 timesteps
Episode  31  finished after 31 timesteps
Episode  31  finished after 32 timesteps
Episode  31  finished after 33 timesteps
Episode  31  finished after 34 timesteps
Episode  32  finished after 31 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  32  finished after 32 timesteps
Episode  32  finished after 33 timesteps
Episode  32  finished after 34 timesteps
Episode  32  finished after 35 timesteps
Episode  32  finished after 36 timesteps
Episode  32  finished after 37 timesteps
Episode  32  finished after 38 timesteps
Episode  32  finished after 39 timesteps
Episode  

Episode  39  finished after 36 timesteps
Episode  39  finished after 37 timesteps
Episode  39  finished after 38 timesteps
Episode  39  finished after 39 timesteps
Episode  39  finished after 40 timesteps
Episode  39  finished after 41 timesteps
Episode  39  finished after 42 timesteps
Episode  39  finished after 43 timesteps
Episode  40  finished after 47 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  40  finished after 48 timesteps
Episode  40  finished after 49 timesteps
Episode  40  finished after 50 timesteps
Episode  40  finished after 51 timesteps
Episode  40  finished after 52 timesteps
Episode  40  finished after 53 timesteps
Episode  40  finished after 54 timesteps
Episode  40  finished after 55 timesteps
Episode  40  finished after 56 timesteps
Episode  40  finished after 57 timesteps
Episode  

Episode  47  finished after 38 timesteps
Episode  47  finished after 39 timesteps
Episode  47  finished after 40 timesteps
Episode  47  finished after 41 timesteps
Episode  47  finished after 42 timesteps
Episode  47  finished after 43 timesteps
Episode  47  finished after 44 timesteps
Episode  47  finished after 45 timesteps
Episode  47  finished after 46 timesteps
Episode  48  finished after 74 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  48  finished after 75 timesteps
Episode  48  finished after 76 timesteps
Episode  48  finished after 77 timesteps
Episode  48  finished after 78 timesteps
Episode  48  finished after 79 timesteps
Episode  48  finished after 80 timesteps
Episode  48  finished after 81 timesteps
Episode  48  finished after 82 timesteps
Episode  48  finished after 83 timesteps
Episode  

Episode  55  finished after 21 timesteps
Episode  55  finished after 22 timesteps
Episode  55  finished after 23 timesteps
Episode  55  finished after 24 timesteps
Episode  55  finished after 25 timesteps
Episode  55  finished after 26 timesteps
Episode  55  finished after 27 timesteps
Episode  55  finished after 28 timesteps
Episode  55  finished after 29 timesteps
Episode  56  finished after 16 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  56  finished after 17 timesteps
Episode  56  finished after 18 timesteps
Episode  56  finished after 19 timesteps
Episode  56  finished after 20 timesteps
Episode  56  finished after 21 timesteps
Episode  56  finished after 22 timesteps
Episode  56  finished after 23 timesteps
Episode  56  finished after 24 timesteps
Episode  56  finished after 25 timesteps
Episode  

Episode  63  finished after 22 timesteps
Episode  63  finished after 23 timesteps
Episode  63  finished after 24 timesteps
Episode  63  finished after 25 timesteps
Episode  63  finished after 26 timesteps
Episode  63  finished after 27 timesteps
Episode  63  finished after 28 timesteps
Episode  63  finished after 29 timesteps
Episode  63  finished after 30 timesteps
Episode  64  finished after 25 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  64  finished after 26 timesteps
Episode  64  finished after 27 timesteps
Episode  64  finished after 28 timesteps
Episode  64  finished after 29 timesteps
Episode  64  finished after 30 timesteps
Episode  64  finished after 31 timesteps
Episode  64  finished after 32 timesteps
Episode  64  finished after 33 timesteps
Episode  64  finished after 34 timesteps
Episode  

Episode  71  finished after 35 timesteps
Episode  71  finished after 36 timesteps
Episode  71  finished after 37 timesteps
Episode  71  finished after 38 timesteps
Episode  71  finished after 39 timesteps
Episode  71  finished after 40 timesteps
Episode  71  finished after 41 timesteps
Episode  71  finished after 42 timesteps
Episode  71  finished after 43 timesteps
Episode  72  finished after 8 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  72  finished after 9 timesteps
Episode  72  finished after 10 timesteps
Episode  72  finished after 11 timesteps
Episode  72  finished after 12 timesteps
Episode  72  finished after 13 timesteps
Episode  72  finished after 14 timesteps
Episode  72  finished after 15 timesteps
Episode  72  finished after 16 timesteps
Episode  72  finished after 17 timesteps
Episode  72

Episode  79  finished after 25 timesteps
Episode  79  finished after 26 timesteps
Episode  79  finished after 27 timesteps
Episode  79  finished after 28 timesteps
Episode  79  finished after 29 timesteps
Episode  79  finished after 30 timesteps
Episode  79  finished after 31 timesteps
Episode  79  finished after 32 timesteps
Episode  79  finished after 33 timesteps
Episode  79  finished after 34 timesteps
Episode  80  finished after 15 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  80  finished after 16 timesteps
Episode  80  finished after 17 timesteps
Episode  80  finished after 18 timesteps
Episode  80  finished after 19 timesteps
Episode  80  finished after 20 timesteps
Episode  80  finished after 21 timesteps
Episode  80  finished after 22 timesteps
Episode  80  finished after 23 timesteps
Episode  

Episode  87  finished after 24 timesteps
Episode  87  finished after 25 timesteps
Episode  87  finished after 26 timesteps
Episode  87  finished after 27 timesteps
Episode  87  finished after 28 timesteps
Episode  87  finished after 29 timesteps
Episode  87  finished after 30 timesteps
Episode  87  finished after 31 timesteps
Episode  87  finished after 32 timesteps
Episode  87  finished after 33 timesteps
Episode  88  finished after 136 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  88  finished after 137 timesteps
Episode  88  finished after 138 timesteps
Episode  88  finished after 139 timesteps
Episode  88  finished after 140 timesteps
Episode  88  finished after 141 timesteps
Episode  88  finished after 142 timesteps
Episode  88  finished after 143 timesteps
Episode  88  finished after 144 timesteps


Episode  95  finished after 128 timesteps
Episode  95  finished after 129 timesteps
Episode  95  finished after 130 timesteps
Episode  95  finished after 131 timesteps
Episode  95  finished after 132 timesteps
Episode  95  finished after 133 timesteps
Episode  95  finished after 134 timesteps
Episode  95  finished after 135 timesteps
Episode  96  finished after 60 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  96  finished after 61 timesteps
Episode  96  finished after 62 timesteps
Episode  96  finished after 63 timesteps
Episode  96  finished after 64 timesteps
Episode  96  finished after 65 timesteps
Episode  96  finished after 66 timesteps
Episode  96  finished after 67 timesteps
Episode  96  finished after 68 timesteps
Episode  96  finished after 69 timesteps
Episode  96  finished after 70 timesteps
E

Episode  104  finished after 157 timesteps
Episode  104  finished after 158 timesteps
Episode  104  finished after 159 timesteps
Episode  104  finished after 160 timesteps
Episode  104  finished after 161 timesteps
Episode  104  finished after 162 timesteps
Episode  104  finished after 163 timesteps
Episode  104  finished after 164 timesteps
Episode  104  finished after 165 timesteps
Episode  105  finished after 29 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  105  finished after 30 timesteps
Episode  105  finished after 31 timesteps
Episode  105  finished after 32 timesteps
Episode  105  finished after 33 timesteps
Episode  105  finished after 34 timesteps
Episode  105  finished after 35 timesteps
Episode  105  finished after 36 timesteps
Episode  105  finished after 37 timesteps
Episode  105  finished 

Episode  112  finished after 25 timesteps
Episode  112  finished after 26 timesteps
Episode  112  finished after 27 timesteps
Episode  112  finished after 28 timesteps
Episode  112  finished after 29 timesteps
Episode  112  finished after 30 timesteps
Episode  112  finished after 31 timesteps
Episode  112  finished after 32 timesteps
Episode  112  finished after 33 timesteps
Episode  113  finished after 53 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  113  finished after 54 timesteps
Episode  113  finished after 55 timesteps
Episode  113  finished after 56 timesteps
Episode  113  finished after 57 timesteps
Episode  113  finished after 58 timesteps
Episode  113  finished after 59 timesteps
Episode  113  finished after 60 timesteps
Episode  113  finished after 61 timesteps
Episode  113  finished after 62 

Episode  120  finished after 180 timesteps
Episode  120  finished after 181 timesteps
Episode  120  finished after 182 timesteps
Episode  120  finished after 183 timesteps
Episode  120  finished after 184 timesteps
Episode  120  finished after 185 timesteps
Episode  120  finished after 186 timesteps
Episode  120  finished after 187 timesteps
Episode  120  finished after 188 timesteps
Episode  120  finished after 189 timesteps
Episode  121  finished after 17 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  121  finished after 18 timesteps
Episode  121  finished after 19 timesteps
Episode  121  finished after 20 timesteps
Episode  121  finished after 21 timesteps
Episode  121  finished after 22 timesteps
Episode  121  finished after 23 timesteps
Episode  121  finished after 24 timesteps
Episode  121  finished

Episode  129  finished after 67 timesteps
Episode  129  finished after 68 timesteps
Episode  129  finished after 69 timesteps
Episode  129  finished after 70 timesteps
Episode  129  finished after 71 timesteps
Episode  129  finished after 72 timesteps
Episode  129  finished after 73 timesteps
Episode  129  finished after 74 timesteps
Episode  129  finished after 75 timesteps
Episode  130  finished after 20 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  130  finished after 21 timesteps
Episode  130  finished after 22 timesteps
Episode  130  finished after 23 timesteps
Episode  130  finished after 24 timesteps
Episode  130  finished after 25 timesteps
Episode  130  finished after 26 timesteps
Episode  130  finished after 27 timesteps
Episode  130  finished after 28 timesteps
Episode  130  finished after 29 

Episode  137  finished after 50 timesteps
Episode  137  finished after 51 timesteps
Episode  137  finished after 52 timesteps
Episode  137  finished after 53 timesteps
Episode  137  finished after 54 timesteps
Episode  137  finished after 55 timesteps
Episode  137  finished after 56 timesteps
Episode  137  finished after 57 timesteps
Episode  137  finished after 58 timesteps
Episode  138  finished after 20 timesteps
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
Episode  138  finished after 21 timesteps
Episode  138  finished after 22 timesteps
Episode  138  finished after 23 timesteps
Episode  138  finished after 24 timesteps
Episode  138  finished after 25 timesteps
Episode  138  finished after 26 timesteps
Episode  138  finished after 27 timesteps
Episode  138  finished after 28 timesteps
Episode  138  finished after 29 

KeyboardInterrupt: 

In [None]:
q_table = np.zeros(NUM_BUCKETS + (NUM_ACTIONS,))
state_0=(0,0,1,0)
action=1
q_table[state_0 + (action,)]=1

q_table[(0,0,1,0)]