In [11]:
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np 
import time, math, random
from typing import Tuple

# import gym 
import gym

In [12]:
env = gym.make('CartPole-v0')

In [13]:
policy = lambda _,__,___, tip_velocity : int( tip_velocity > 0 )

In [14]:
n_bins = ( 6 , 12 )
lower_bounds = [ env.observation_space.low[2], -math.radians(50) ]
upper_bounds = [ env.observation_space.high[2], math.radians(50) ]

def discretizer( _ , __ , angle, pole_velocity ) -> Tuple[int,...]:
    """Convert continues state intro a discrete state"""
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    est.fit([lower_bounds, upper_bounds ])
    return tuple(map(int,est.transform([[angle, pole_velocity]])[0]))

In [15]:
Q_table = np.zeros(n_bins + (env.action_space.n,))
Q_table.shape

(6, 12, 2)

In [16]:
def policy( state : tuple ):
    """Choosing action based on epsilon-greedy policy"""
    return np.argmax(Q_table[state])

In [17]:
def new_Q_value( reward : float ,  new_state : tuple , discount_factor=1 ) -> float:
    """Temperal diffrence for updating Q-value of state-action pair"""
    future_optimal_value = np.max(Q_table[new_state])
    learned_value = reward + discount_factor * future_optimal_value
    return learned_value

In [18]:
# Adaptive learning of Learning Rate
def learning_rate(n : int , min_rate=0.01 ) -> float  :
    """Decaying learning rate"""
    return max(min_rate, min(1.0, 1.0 - math.log10((n + 1) / 25)))

In [19]:
def exploration_rate(n : int, min_rate= 0.1 ) -> float :
    """Decaying exploration rate"""
    return max(min_rate, min(1, 1.0 - math.log10((n  + 1) / 25)))

In [20]:
n_episodes = 100
n_steps = 10
for i in range (n_episodes):
    for e in range(n_steps):
        # Siscretize state into buckets
        current_state, done = discretizer(*env.reset()), False
        episode_time_total = 0
        start_time = time.time()
    
        # policy action 
        while done==False:
            action = policy(current_state) # exploit

            # insert random action
            if np.random.random() < exploration_rate(e) : 
                action = env.action_space.sample() # explore 

            # increment enviroment
            obs, reward, done, __ = env.step(action)
            new_state = discretizer(*obs)

            # Update Q-Table
            lr = learning_rate(e)
            learnt_value = new_Q_value(reward , new_state )
            old_value = Q_table[current_state][action]
            Q_table[current_state][action] = (1-lr)*old_value + lr*learnt_value

            current_state = new_state

            # Render the cartpole environment
            env.render()

        episode_time_total += (time.time()-start_time)
    steps_time_average = episode_time_total/n_steps    
    print(f"Episode: {i+1} Time: {steps_time_average}")

Episode: 1 Time: 0.03680012226104736
Episode: 2 Time: 0.02330038547515869
Episode: 3 Time: 0.03650012016296387
Episode: 4 Time: 0.028299999237060548
Episode: 5 Time: 0.02649989128112793
Episode: 6 Time: 0.014899682998657227
Episode: 7 Time: 0.030000019073486327
Episode: 8 Time: 0.023799991607666014
Episode: 9 Time: 0.07169952392578124
Episode: 10 Time: 0.031599569320678714
Episode: 11 Time: 0.026599955558776856
Episode: 12 Time: 0.04330039024353027
Episode: 13 Time: 0.026599931716918945
Episode: 14 Time: 0.02160184383392334
Episode: 15 Time: 0.029900145530700684
Episode: 16 Time: 0.018199753761291505
Episode: 17 Time: 0.02319467067718506
Episode: 18 Time: 0.04679961204528808
Episode: 19 Time: 0.06689972877502441
Episode: 20 Time: 0.026499843597412108
Episode: 21 Time: 0.06489968299865723
Episode: 22 Time: 0.025098133087158202
Episode: 23 Time: 0.08169996738433838
Episode: 24 Time: 0.05209977626800537
Episode: 25 Time: 0.03670015335083008
Episode: 26 Time: 0.061499810218811034
Episode: 

In [21]:
env.close()