In [12]:
import gym
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
import math, random
from typing import Tuple
import matplotlib.pyplot as plt
import time

In [13]:
# Create the CartPole environment that the rest of the notebook trains on.
env = gym.make("CartPole-v0")

In [14]:
# Length of the observation vector and number of discrete actions,
# both read straight from the environment's space definitions.
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n

In [15]:
# The observation vector has four components (indices 0-3):
# 0 cart position
# 1 cart velocity
# 2 pole angle
# 3 pole velocity at tip

In [16]:
# Number of buckets per discretised feature: (pole angle, pole velocity).
n_bins = (6, 12)

# Lower and upper bounds of the two discretised features.
# The pole angle limits come from the environment's observation space
# (index 2); the pole velocity is limited to +/- 50 degrees expressed in radians.
lower_bounds = [
    env.observation_space.low[2],
    -math.radians(50),
]
upper_bounds = [
    env.observation_space.high[2],
    math.radians(50),
]

# Q-table with one axis per discretised feature plus a final axis for actions,
# initialised to zero.
Q = np.zeros((*n_bins, n_actions))

In [17]:
def discretizer( _ , __ , angle, pole_velocity ) -> Tuple[int,...]:
    """Map a continuous observation to a tuple of integer bucket indices.

    Only the last two observation components are used; the first two
    positional arguments are accepted and ignored so the function can be
    called as ``discretizer(*obs)``.

    The bucket edges depend solely on the module-level ``n_bins``,
    ``lower_bounds`` and ``upper_bounds``. The fitted ``KBinsDiscretizer`` is
    therefore cached on the function object and rebuilt only when that
    configuration changes, instead of being re-fitted on every call.

    Args:
        _: Ignored.
        __: Ignored.
        angle: Pole angle.
        pole_velocity: Pole velocity.

    Returns:
        Tuple of ints, one bucket index per discretised feature.
    """
    # Key the cache on the current configuration so that re-running the
    # configuration cell with new values is picked up automatically.
    config = (tuple(n_bins), tuple(lower_bounds), tuple(upper_bounds))
    cached = getattr(discretizer, "_fitted", None)
    if cached is None or cached[0] != config:
        est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
        est.fit([lower_bounds, upper_bounds ])
        discretizer._fitted = (config, est)
    else:
        est = cached[1]
    return tuple(map(int,est.transform([[angle, pole_velocity]])[0]))

In [18]:
def policy (state : tuple):
    """Greedy policy: return the index of the highest-valued action in ``Q[state]``.

    Args:
        state: Tuple of bucket indices used to index the Q-table.

    Returns:
        The result of ``np.argmax`` over the action values for ``state``.
    """
    action_values = Q[state]
    return np.argmax(action_values)

In [19]:
def new_Q_value( reward : float ,  new_state : tuple , discount_factor=1 ) -> float:
    """Compute the learning target for one transition.

    The target is the immediate reward plus the discounted value of the best
    action available in ``new_state`` according to the current Q-table.

    Args:
        reward: Reward received for the transition.
        new_state: Tuple of bucket indices of the state that was reached.
        discount_factor: Weight applied to the future value (default 1).

    Returns:
        ``reward + discount_factor * max(Q[new_state])``.
    """
    best_next_value = np.max(Q[new_state])
    return reward + discount_factor * best_next_value

In [20]:
def learning_rate(n : int , min_rate=0.01 ) -> float  :
    """Logarithmically decaying learning rate, clamped to ``[min_rate, 1.0]``.

    The raw value is ``1 - log10((n + 1) / 25)``; it is at least 1.0 while
    ``n <= 24`` (so the result is capped at 1.0) and shrinks as ``n`` grows
    until it reaches ``min_rate``.

    Args:
        n: Episode index.
        min_rate: Floor below which the rate never drops.

    Returns:
        The clamped learning rate.
    """
    decayed = 1.0 - math.log10((n + 1) / 25)
    # Cap at 1.0 first, then apply the floor.
    capped = decayed if decayed < 1.0 else 1.0
    return capped if capped > min_rate else min_rate

def exploration_rate(n : int, min_rate=0.15 ) -> float :
    """Logarithmically decaying exploration probability, clamped to ``[min_rate, 1.0]``.

    The raw value is ``1 - log10((n + 1) / 25)``; it is capped at 1.0 and
    never drops below ``min_rate``.

    Args:
        n: Episode index.
        min_rate: Floor below which the rate never drops.

    Returns:
        The clamped exploration rate.
    """
    epsilon = 1.0 - math.log10((n + 1) / 25)
    if not epsilon < 1.0:
        # Upper clamp.
        epsilon = 1.0
    if epsilon > min_rate:
        return epsilon
    # Lower clamp.
    return min_rate

In [21]:
# Tabular Q-learning training loop with an epsilon-greedy behaviour policy.
n_episodes = 300
report_every = 10      # print a timing summary every this many episodes
window_time = 0.0      # seconds accumulated since the last report
window_episodes = 0    # episodes accumulated since the last report

for e in range(n_episodes):

    # Discretize the initial observation into a Q-table index.
    # NOTE(review): this unpacking assumes env.reset() returns the bare
    # observation and env.step() returns a 4-tuple -- confirm against the
    # installed gym version.
    current_state, done = discretizer(*env.reset()), False

    # Both rates depend only on the episode index, so compute them once per
    # episode instead of once per step.
    lr = learning_rate(e)
    epsilon = exploration_rate(e)

    start_time = time.perf_counter()

    while not done:

        # Exploit: greedy action from the current Q-table ...
        action = policy(current_state)

        # ... or explore: replace it with a random action with probability epsilon.
        if np.random.random() < epsilon:
            action = env.action_space.sample()

        # Advance the environment by one step.
        obs, reward, done, _ = env.step(action)
        new_state = discretizer(*obs)

        # Update the Q-table: blend the old estimate with the new target.
        learnt_value = new_Q_value(reward , new_state )
        old_value = Q[current_state][action]
        Q[current_state][action] = (1-lr)*old_value + lr*learnt_value

        current_state = new_state

        # Render the cartpole environment.
        env.render()

    # Wall-clock duration of this episode (rendering included).
    episode_time = time.perf_counter() - start_time
    window_time += episode_time
    window_episodes += 1

    # Report the mean episode duration since the previous report, then
    # start a fresh window.
    if e % report_every == 0:
        print(f"Episode {e} Average time {window_time/window_episodes}")
        window_time, window_episodes = 0.0, 0



    

Episode 0 Average time 0.02650015354156494
Episode 10 Average time 0.026600122451782227
Episode 20 Average time 0.08319942951202393
Episode 30 Average time 0.023296928405761717
Episode 40 Average time 0.031603527069091794
Episode 50 Average time 0.030000519752502442
Episode 60 Average time 0.03990323543548584
Episode 70 Average time 0.03329977989196777
Episode 80 Average time 0.1468001127243042
Episode 90 Average time 0.029938125610351564
Episode 100 Average time 0.04539999961853027
Episode 110 Average time 0.06499791145324707
Episode 120 Average time 0.02830023765563965
Episode 130 Average time 0.10654630661010742
Episode 140 Average time 0.0433002233505249
Episode 150 Average time 0.23860445022583007
Episode 160 Average time 0.22839994430541993
Episode 170 Average time 0.3119048595428467
Episode 180 Average time 0.27689993381500244
Episode 190 Average time 0.218403959274292
Episode 200 Average time 0.21519882678985597
Episode 210 Average time 0.05499997138977051
Episode 220 Average t

In [22]:
# Shut the environment down now that training has finished.
env.close()