In [1]:
import gym
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
import math, random
from typing import Tuple
import matplotlib.pyplot as plt
import time

In [2]:
env = gym.make('CartPole-v0')

In [3]:
n_actions = env.action_space.n
n_states = env.observation_space.shape[0]

In [4]:
#observation går fra 0-3
#0 cart position
#1 cart velocity
#2 pole angle
#3 pole velocity at tip

In [5]:
n_bins = (6,12)

#upper og lower bounds
upper_bounds = [
    env.observation_space.high[2], math.radians(50)
    ]
lower_bounds = [
    env.observation_space.low[2], -math.radians(50)
    ]
Q = np.zeros(n_bins + (n_actions,))

In [6]:
def discretizer( _ , __ , angle, pole_velocity ) -> Tuple[int,...]:
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    est.fit([lower_bounds, upper_bounds ])
    return tuple(map(int,est.transform([[angle, pole_velocity]])[0]))

In [7]:
n_episodes = 100
n_steps = 10
ada_divisor = 25

In [8]:
def policy (state : tuple):
    return np.argmax(Q[state])


In [9]:
def new_Q_value( reward : float ,  new_state : tuple , discount_factor=1 ) -> float:
    future_optimal_value = np.max(Q[new_state])
    learned_value = reward + discount_factor * future_optimal_value
    return learned_value

In [10]:
def learning_rate(n : int , min_rate=0.01 ) -> float  :
    return max(min_rate, min(1.0, 1.0 - math.log10((n + 1) / ada_divisor)))

def exploration_rate(n : int, min_rate= 0.1 ) -> float :
    return max(min_rate, min(1, 1.0 - math.log10((n  + 1) / ada_divisor)))

In [11]:

for i in range (n_episodes):
    for e in range(n_steps):
        # Siscretize state into buckets
        current_state, done = discretizer(*env.reset()), False
        episode_time_total = 0
        start_time = time.time()
    
        # policy action 
        while done==False:
            action = policy(current_state) # exploit

            # insert random action
            if np.random.random() < exploration_rate(e) : 
                action = env.action_space.sample() # explore 

            # increment enviroment
            obs, reward, done, __ = env.step(action)
            new_state = discretizer(*obs)

            # Update Q-Table
            lr = learning_rate(e)
            learnt_value = new_Q_value(reward , new_state )
            old_value = Q[current_state][action]
            Q[current_state][action] = (1-lr)*old_value + lr*learnt_value

            current_state = new_state

            # Render the cartpole environment
            env.render()

        episode_time_total += (time.time()-start_time)
    steps_time_average = episode_time_total/n_steps    
    print(f"Episode: {i+1} Time: {steps_time_average}")
    

Episode: 0 Time: 0.03837025165557861
Episode: 1 Time: 0.11340823173522949
Episode: 2 Time: 0.03163573741912842
Episode: 3 Time: 0.04171321392059326
Episode: 4 Time: 0.06335659027099609
Episode: 5 Time: 0.020002198219299317
Episode: 6 Time: 0.08842146396636963
Episode: 7 Time: 0.07506017684936524
Episode: 8 Time: 0.03841292858123779
Episode: 9 Time: 0.07151436805725098
Episode: 10 Time: 0.021608424186706544
Episode: 11 Time: 0.01829988956451416
Episode: 12 Time: 0.03159120082855225
Episode: 13 Time: 0.03170020580291748
Episode: 14 Time: 0.04006791114807129
Episode: 15 Time: 0.024875450134277343
Episode: 16 Time: 0.03489968776702881
Episode: 17 Time: 0.04519529342651367
Episode: 18 Time: 0.025100016593933107
Episode: 19 Time: 0.02989962100982666
Episode: 20 Time: 0.02821199893951416
Episode: 21 Time: 0.040077590942382814
Episode: 22 Time: 0.01979959011077881
Episode: 23 Time: 0.028222250938415527
Episode: 24 Time: 0.02167985439300537
Episode: 25 Time: 0.020053291320800783
Episode: 26 Tim

In [12]:
env.close()