In [1]:
# !pip install gym
# !pip install pygame
# ! pip install matplotlib


# https://www.gymlibrary.ml/environments/classic_control/cart_pole/?highlight=cart+pole

In [2]:
import gym
import numpy as np 
import matplotlib.pyplot as plt
import time

In [3]:
env = gym.make('CartPole-v1')
print(env.observation_space.low,"\n",env.observation_space.high)

[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38] 
 [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]


In [4]:
action_space = 2 # number of possible actions
bin_size = 15

overe0 = float(4)
overe1 = float(4)/2**1
overe2 = float(4)/2**2
overe3 = float(4)/2**3
overe4 = float(4)/2**4
overe5 = float(4)/2**5
overe6 = float(4)/2**6

lin_space = [-overe0, -overe1, -overe2, -overe3, -overe4, -overe5, -overe6, 0, overe6, overe5, overe4, overe3, overe2, overe1, overe0]

def Qtable(state_space,action_space,bin_size=15 ):
    bins = [np.linspace(-2.4,2.4,bin_size),
            lin_space,
            np.linspace(-0.2095,0.2095,bin_size),
            lin_space]
   
    q_table = np.random.uniform(low=-1,high=1,size=([len(bins[0]), len(bins[1]), len(bins[2]), len(bins[3])] + [action_space]))
    return q_table, bins

def Discrete(state, bins):
    index = []
    for i in range(len(state)): index.append(np.digitize(state[i],bins[i]) - 1)
    return tuple(index)

In [5]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = env.action_space.sample()
    else:
        action = np.argmax(Q[state])
   
    return action

In [6]:
def plotLearning(data, episodes, timestep):
    ep = [i for i in range(0,episodes + 1,timestep)]
    # if len(ep) == len(data['max']):
    print (ep)
    print(data['max'])
    plt.plot(ep, data['max'], label = 'Max')
    plt.plot(ep, data['avg'], label = 'Avg')
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.legend(loc = "upper left") 

In [7]:
def updateQtable(action, reward, current_state,next_state, gamma, lr):
    max_future_q = np.max(q_table[next_state])
    current_q = q_table[current_state+(action,)]
    new_q = (1-lr)*current_q + lr*(reward + gamma*max_future_q)
    q_table[current_state+(action,)] = new_q

In [14]:
    
def Q_learning(q_table, bins, episodes = 5000, gamma = 0.95, lr = 0.1, timestep = 5000, epsilon = 0.05):
    rewards = 0
    runs = [0]
    data = {'max' : [0], 'avg' : [0]}
    solved = {'episodes' : [], 'timeMinutes': []}
    timesWon = 0 
    episodes_to_solve = 0
    process_time = time.time()

    for episode in range(1,episodes+1):
        current_state = Discrete(env.reset(),bins) # initial observation
        score = 0
        done = False
        
        while not done:
            ep_start = time.time()
            # if timesWon > 0:
            #     env.render()
    
            action = epsilon_greedy_policy(current_state, q_table, epsilon)
            obs, reward, done, info = env.step(action)
            next_state = Discrete(obs,bins)
            score += reward
            
            updateQtable(action, reward, current_state,next_state, gamma, lr)
            current_state = next_state
            
        # End of the loop update
        else:
            if score >= 500:
                epsilon = 0.01
            else:
                epsilon = 0.05
            env.reset()
            rewards += score
            runs.append(score)
        
        # Timestep value update
        if episode%timestep == 0:
            print('Episode : {} | Avg. Rewards -> {} | Max reward : {} | Time : {}'.format(episode,rewards/timestep, max(runs), time.time() - ep_start))
            data['max'].append(max(runs))
            data['avg'].append(rewards/timestep)
            episodes_to_solve = episode
            if rewards/timestep >= 475:
                timesWon += 1 
                print('Solved in episode : {}'.format(episode))
                solved['episodes'].append(episode)
                # obtained minutes elapsed between procces_time and time.time()
                solved['timeMinutes'].append(round((time.time() - process_time)/60,2))
                solved['timeMinutes'].append(minutes)
                epsilon = 0

            rewards, runs= 0, [0]
            
        if timesWon == 5:
            episodes_to_solve = episode
            break

    plotLearning(data, episodes_to_solve, timestep)
    print('Solved ' + str(timesWon) + ' times in ' + str(solved))
    env.close()

In [15]:
# TRANING
q_table, bins = Qtable(len(env.observation_space.low), env.action_space.n)
Q_learning(q_table, bins, lr = 0.14, gamma = 0.995, episodes = 2*10**5, timestep = 1000, epsilon = 0.06)

Episode : 1000 | Avg. Rewards -> 28.61 | Max reward : 131.0 | Time : 0.0001342296600341797
Episode : 2000 | Avg. Rewards -> 28.672 | Max reward : 118.0 | Time : 0.00013208389282226562
Episode : 3000 | Avg. Rewards -> 36.627 | Max reward : 188.0 | Time : 0.00015807151794433594
Episode : 4000 | Avg. Rewards -> 46.555 | Max reward : 229.0 | Time : 0.00013518333435058594
Episode : 5000 | Avg. Rewards -> 59.775 | Max reward : 367.0 | Time : 0.00014138221740722656
Episode : 6000 | Avg. Rewards -> 95.93 | Max reward : 479.0 | Time : 0.00011110305786132812
Episode : 7000 | Avg. Rewards -> 137.86 | Max reward : 500.0 | Time : 0.00013566017150878906
Episode : 8000 | Avg. Rewards -> 209.804 | Max reward : 500.0 | Time : 0.00017571449279785156
Episode : 9000 | Avg. Rewards -> 231.278 | Max reward : 500.0 | Time : 0.00014662742614746094
Episode : 10000 | Avg. Rewards -> 283.422 | Max reward : 500.0 | Time : 9.202957153320312e-05
Episode : 11000 | Avg. Rewards -> 288.543 | Max reward : 500.0 | Time 

AttributeError: 'float' object has no attribute 'total_seconds'