In [33]:
import gym
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
import math, random
from typing import Tuple
import matplotlib.pyplot as plt
import time

In [34]:
# Create the CartPole environment that every later cell interacts with.
env = gym.make('CartPole-v0')

In [35]:
# Number of discrete actions exposed by the environment's action space.
n_actions = env.action_space.n
# Length of the first axis of the observation space (size of one observation).
n_states = env.observation_space.shape[0]

In [36]:
#observation indices run from 0 to 3
#0 cart position
#1 cart velocity
#2 pole angle
#3 pole velocity at tip

In [37]:
# Number of buckets per discretised feature: (pole angle, pole velocity).
n_bins = (6, 12)

# Lower and upper bounds of the two discretised features. The angle limits
# come from observation index 2; the velocity is clamped to +/- 50 degrees
# expressed in radians.
lower_bounds = [env.observation_space.low[2], -math.radians(50)]
upper_bounds = [env.observation_space.high[2], math.radians(50)]

# Q-table with one axis per discretised feature plus a trailing action axis.
Q = np.zeros((*n_bins, n_actions))

In [38]:
# Holds the most recently fitted estimator, keyed by the configuration it was
# fitted with, so the (loop-invariant) fit is not repeated on every step.
_discretizer_cache = {}


def discretizer( _ , __ , angle, pole_velocity ) -> Tuple[int,...]:
    """Map a continuous observation onto discrete Q-table indices.

    The first two positional arguments are accepted so that a full
    observation can be unpacked into the call, but they are ignored; only
    ``angle`` and ``pole_velocity`` are discretised.

    Args:
        _: ignored.
        __: ignored.
        angle: pole angle, bucketed into ``n_bins[0]`` uniform bins.
        pole_velocity: pole velocity, bucketed into ``n_bins[1]`` uniform bins.

    Returns:
        A tuple of two ints usable as an index into the leading axes of ``Q``.
    """
    # Re-fit only when the bin configuration changed (e.g. the setup cell was
    # edited and re-run); otherwise reuse the already fitted estimator.
    config = (tuple(n_bins), tuple(lower_bounds), tuple(upper_bounds))
    est = _discretizer_cache.get(config)
    if est is None:
        est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
        est.fit([lower_bounds, upper_bounds ])
        _discretizer_cache.clear()
        _discretizer_cache[config] = est
    return tuple(map(int,est.transform([[angle, pole_velocity]])[0]))

In [39]:
def policy (state : tuple):
    """Return the greedy action: the index of the largest Q-value stored for ``state``."""
    action_values = Q[state]
    return action_values.argmax()

In [40]:
def new_Q_value( reward : float ,  new_state : tuple , discount_factor=1 ) -> float:
    """Compute the temporal-difference target for a transition.

    Args:
        reward: reward received for the transition.
        new_state: discretised state reached after the transition.
        discount_factor: weight applied to the best Q-value of ``new_state``.

    Returns:
        ``reward`` plus the discounted maximum Q-value stored for ``new_state``.
    """
    best_next_value = Q[new_state].max()
    return reward + discount_factor * best_next_value

In [41]:
def learning_rate(n : int , min_rate=0.01 ) -> float  :
    """Return the learning rate for episode ``n``.

    The rate is ``1 - log10((n + 1) / 25)``, capped at 1.0 from above and at
    ``min_rate`` from below.
    """
    decayed = 1.0 - math.log10((n + 1) / 25)
    capped = decayed if decayed < 1.0 else 1.0
    if capped > min_rate:
        return capped
    return min_rate

def exploration_rate(n : int, min_rate=0.15 ) -> float :
    """Return the probability of taking a random action in episode ``n``.

    The rate is ``1 - log10((n + 1) / 25)``, capped at 1.0 from above and at
    ``min_rate`` from below.
    """
    decayed = 1.0 - math.log10((n  + 1) / 25)
    capped = decayed if decayed < 1.0 else 1.0
    if capped > min_rate:
        return capped
    return min_rate

In [42]:
n_episodes = 10000
for e in range(n_episodes):

    # Start a fresh episode from the discretised initial observation.
    done = False
    current_state = discretizer(*env.reset())

    while not done:

        # Epsilon-greedy selection: explore with probability
        # exploration_rate(e), otherwise take the greedy action.
        if np.random.random() < exploration_rate(e):
            action = env.action_space.sample()
        else:
            action = policy(current_state)

        # Advance the environment by one step and discretise the result.
        obs, reward, done, _ = env.step(action)
        new_state = discretizer(*obs)

        # Blend the previous Q-value with the newly learnt target.
        lr = learning_rate(e)
        learnt_value = new_Q_value(reward, new_state)
        old_value = Q[current_state + (action,)]
        Q[current_state + (action,)] = (1-lr)*old_value + lr*learnt_value

        current_state = new_state

        # Rendering is intentionally left out of this training run.



    

Episode 0 Average time 0.03360300064086914
Episode 10 Average time 0.04330003261566162
Episode 20 Average time 0.0799968957901001
Episode 30 Average time 0.06669747829437256
Episode 40 Average time 0.045004081726074216
Episode 50 Average time 0.036699867248535155
Episode 60 Average time 0.04989752769470215
Episode 70 Average time 0.041599822044372556
Episode 80 Average time 0.11840009689331055
Episode 90 Average time 0.038300704956054685
Episode 100 Average time 0.04000000953674317
Episode 110 Average time 0.036601018905639646
Episode 120 Average time 0.03000009059906006
Episode 130 Average time 0.033298373222351074
Episode 140 Average time 0.058300137519836426
Episode 150 Average time 0.17520005702972413


KeyboardInterrupt: 

In [None]:
n_episodes = 100

# Wall-clock time and episode count accumulated since the last report, so the
# printed figure is a real average over the reporting window.
window_time = 0.0
window_episodes = 0

for e in range(n_episodes):

    # Discretize the initial observation into buckets
    current_state, done = discretizer(*env.reset()), False

    start_time = time.time()

    while not done:

        # policy action
        action = policy(current_state) # exploit

        # insert random action
        if np.random.random() < exploration_rate(e):
            action = env.action_space.sample() # explore

        # increment environment
        obs, reward, done, _ = env.step(action)
        new_state = discretizer(*obs)

        # Update Q-Table
        lr = learning_rate(e)
        learnt_value = new_Q_value(reward , new_state )
        old_value = Q[current_state][action]
        Q[current_state][action] = (1-lr)*old_value + lr*learnt_value

        current_state = new_state

        # Render the cartpole environment (included in the measured time)
        env.render()

    # The while loop only exits once the episode is done, so the episode
    # duration can be recorded unconditionally.
    episode_time = time.time() - start_time
    window_time += episode_time
    window_episodes += 1

    # Report every 10th episode. The previous condition `(e%10) == 0 & done`
    # parsed as `(e%10) == (0 & done)` and divided a single episode's time
    # by 10 instead of averaging.
    if e % 10 == 0:
        print(f"Episode {e} Average time {window_time/window_episodes}")
        window_time = 0.0
        window_episodes = 0


In [None]:
# Close the environment once training and rendering are finished.
env.close()