In [None]:
# !pip install gym
# !pip install pygame

In [None]:
# ! pip install matplotlib

In [1]:
import gym
import numpy as np 
import matplotlib.pyplot as plt
import time

In [2]:
env = gym.make('CartPole-v1')
print(env.observation_space.low,"\n",env.observation_space.high)

[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38] 
 [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]


In [3]:
state_space = 4 # number of states
action_space = 2 # number of possible actions
infinity_linspace_limit = 4


def Qtable(state_space,action_space,bin_size = 30):
    
    bins = [np.linspace(-4.8,4.8,bin_size),
            np.linspace(-infinity_linspace_limit,infinity_linspace_limit,bin_size),
            np.linspace(-0.418,0.418,bin_size),
            np.linspace(-infinity_linspace_limit,infinity_linspace_limit,bin_size)]
    
    q_table = np.random.uniform(low=-1,high=1,size=([bin_size] * state_space + [action_space]))
    return q_table, bins

def Discrete(state, bins):
    index = []
    for i in range(len(state)): index.append(np.digitize(state[i],bins[i]) - 1)
    return tuple(index)

In [4]:
def Q_learning(q_table, bins, episodes = 5000, gamma = 0.95, lr = 0.1, timestep = 5000, epsilon = 0.2):
    rewards = 0
    solved = False 
    steps = 0 
    runs = [0]
    data = {'max' : [0], 'avg' : [0]}
    start = time.time()
    ep = [i for i in range(0,episodes + 1,timestep)] 

      
    
    for episode in range(1,episodes+1):
        
        current_state = Discrete(env.reset(),bins) # initial observation
        score = 0
        done = False
        temp_start = time.time()
        
        while not done:
            steps += 1 
            ep_start = time.time()
            # if episode%timestep == 0:
            #     env.render()
                
            if np.random.uniform(0,1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[current_state])
            
            observation, reward, done, info = env.step(action)
            next_state = Discrete(observation,bins)

            score += reward
            

            if not done:
                max_future_q = np.max(q_table[next_state])
                current_q = q_table[current_state+(action,)]
                new_q = (1-lr)*current_q + lr*(reward + gamma*max_future_q)
                q_table[current_state+(action,)] = new_q

            current_state = next_state
            
        # End of the loop update
        else:
            rewards += score
            runs.append(score)
            #if score > 475 and steps >= 100 and solved == False: # considered as a solved:
            if score > 475 and solved == False: # considered as a solved:
                solved = True
                print('Solved in episode : {} in time {}'.format(episode, (time.time()-ep_start)))
        
        # Timestep value update
        if episode%timestep == 0:
            print('Episode : {} | Reward -> {} | Max reward : {} | Time : {}'.format(episode,rewards/timestep, max(runs), time.time() - ep_start))
            data['max'].append(max(runs))
            data['avg'].append(rewards/timestep)
            if rewards/timestep >= 475: 
                print('Solved in episode : {}'.format(episode))
            rewards, runs= 0, [0] 
            
    if len(ep) == len(data['max']):
        plt.plot(ep, data['max'], label = 'Max')
        plt.plot(ep, data['avg'], label = 'Avg')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.legend(loc = "upper left")
        
    env.close()

In [5]:
# TRANING
q_table, bins = Qtable(len(env.observation_space.low), env.action_space.n)

Q_learning(q_table, bins, lr = 0.15, gamma = 0.995, episodes = 5*10**5, timestep = 1000)



Episode : 1000 | Reward -> 91.802 | Max reward : 366.0 | Time : 6.461143493652344e-05
Episode : 2000 | Reward -> 122.973 | Max reward : 437.0 | Time : 6.842613220214844e-05
Solved in episode : 2968 in time 4.2438507080078125e-05
Episode : 3000 | Reward -> 130.081 | Max reward : 499.0 | Time : 7.581710815429688e-05
Episode : 4000 | Reward -> 142.419 | Max reward : 500.0 | Time : 5.7220458984375e-05
Episode : 5000 | Reward -> 167.913 | Max reward : 500.0 | Time : 5.7220458984375e-05
Episode : 6000 | Reward -> 200.252 | Max reward : 500.0 | Time : 6.461143493652344e-05
Episode : 7000 | Reward -> 246.518 | Max reward : 500.0 | Time : 7.319450378417969e-05
Episode : 8000 | Reward -> 290.174 | Max reward : 500.0 | Time : 7.05718994140625e-05
Episode : 9000 | Reward -> 303.829 | Max reward : 500.0 | Time : 7.295608520507812e-05
Episode : 10000 | Reward -> 274.817 | Max reward : 500.0 | Time : 6.461143493652344e-05
Episode : 11000 | Reward -> 332.725 | Max reward : 500.0 | Time : 6.72340393066