<a href="https://colab.research.google.com/github/BlackSparrow-43/deep-rl-class/blob/main/My_Projects/Q-Learning/CartPole/Cartpole_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
!pip install gym[all]  -q

import time  
import gym
import numpy as np  

In [21]:
from gym.envs.registration import register

# Register a CartPole environment under the id 'CartPole-v1' with a much longer
# episode limit (100000 steps) so a good policy is not cut off early.
# NOTE(review): gym ships its own 'CartPole-v1' registration; this call presumably
# overrides (or, depending on the gym version, conflicts with) it — confirm against
# the installed gym release.
register(
    id='CartPole-v1',
    entry_point='gym.envs.classic_control:CartPoleEnv',
    max_episode_steps=100000,
    reward_threshold=195.0,
)

In [22]:
# Probe rollout: take up to 10 random actions and record the two velocity
# components of each observation (index 1 = cart velocity, index 3 = pole
# angular velocity) to get a feel for their magnitude.
env = gym.make("CartPole-v1")
env.reset()
obs_cart_vel = []
obs_pole_vel = []
for _step in range(10):
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    # No per-step sleep: nothing is rendered here, so pausing only slows "Run All".
    obs_cart_vel.append(obs[1])
    obs_pole_vel.append(obs[3])
    if done:
        break
env.close()

In [23]:
# Hand-picked limits for the two velocity dimensions; bin_creation() reads these
# module-level values when it builds the discretisation grid.
# Cart velocity range.
obs_cart_vel_max = 5
obs_cart_vel_min = -5
# Despite the "cart_pole" name, bin_creation() uses this pair for the pole
# (angular) velocity row of the grid.
obs_cart_pole_max = 5
obs_cart_pole_min = -5

In [24]:
def bin_creation(no_of_bins, cart_vel_range=None, pole_vel_range=None):
    """Build the bin edges used to discretise a CartPole observation.

    Parameters
    ----------
    no_of_bins : int
        Number of evenly spaced edges generated per observation dimension.
    cart_vel_range : tuple of two numbers, optional
        Limits of the cart-velocity edges. Defaults to the module-level
        ``(obs_cart_vel_min, obs_cart_vel_max)``.
    pole_vel_range : tuple of two numbers, optional
        Limits of the pole-velocity edges. Defaults to the module-level
        ``(obs_cart_pole_min, obs_cart_pole_max)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(4, no_of_bins)``; rows are cart position, cart
        velocity, pole angle and pole velocity, each in ascending order.

    Notes
    -----
    All four rows are ascending, so bin index 0 always means "most negative".
    A Q-table trained while the pole-velocity row ran from max to min has its
    fourth axis mirrored relative to this layout.
    """
    if cart_vel_range is None:
        cart_vel_range = (obs_cart_vel_min, obs_cart_vel_max)
    if pole_vel_range is None:
        pole_vel_range = (obs_cart_pole_min, obs_cart_pole_max)

    bin_cart_pos = np.linspace(-2.4, 2.4, no_of_bins)
    # min()/max() keep the edges ascending even if a range is passed reversed.
    bin_cart_vel = np.linspace(min(cart_vel_range), max(cart_vel_range), no_of_bins)
    bin_pole_pos = np.linspace(-0.10472, 0.10472, no_of_bins)
    bin_pole_vel = np.linspace(min(pole_vel_range), max(pole_vel_range), no_of_bins)
    return np.array([bin_cart_pos, bin_cart_vel, bin_pole_pos, bin_pole_vel])

In [25]:
# Number of bin edges per observation dimension; also the size of each state
# axis of the Q-table built further down.
bins_no = 48
# Grid of bin edges, one row per observation dimension.
bins_all = bin_creation(bins_no)

In [26]:
# Show the generated bin edges (one row per observation dimension).
bins_all

array([[-2.40000000e+00, -2.29787234e+00, -2.19574468e+00,
        -2.09361702e+00, -1.99148936e+00, -1.88936170e+00,
        -1.78723404e+00, -1.68510638e+00, -1.58297872e+00,
        -1.48085106e+00, -1.37872340e+00, -1.27659574e+00,
        -1.17446809e+00, -1.07234043e+00, -9.70212766e-01,
        -8.68085106e-01, -7.65957447e-01, -6.63829787e-01,
        -5.61702128e-01, -4.59574468e-01, -3.57446809e-01,
        -2.55319149e-01, -1.53191489e-01, -5.10638298e-02,
         5.10638298e-02,  1.53191489e-01,  2.55319149e-01,
         3.57446809e-01,  4.59574468e-01,  5.61702128e-01,
         6.63829787e-01,  7.65957447e-01,  8.68085106e-01,
         9.70212766e-01,  1.07234043e+00,  1.17446809e+00,
         1.27659574e+00,  1.37872340e+00,  1.48085106e+00,
         1.58297872e+00,  1.68510638e+00,  1.78723404e+00,
         1.88936170e+00,  1.99148936e+00,  2.09361702e+00,
         2.19574468e+00,  2.29787234e+00,  2.40000000e+00],
       [-5.00000000e+00, -4.78723404e+00, -4.57446809e+

In [27]:
def cont_to_dis(observation,bins):
    """Convert a continuous observation into a tuple of bin indices.

    Parameters
    ----------
    observation : sequence of float
        One value per observation dimension.
    bins : sequence of 1-D arrays
        ``bins[i]`` holds the monotonic bin edges for ``observation[i]``.

    Returns
    -------
    tuple of int
        One index per dimension, each within ``0 .. len(bins[i]) - 1`` so the
        tuple can index a Q-table directly.

    Notes
    -----
    Uses the ``bins`` argument rather than a module-level grid. Values that
    fall outside the edges are clamped to the first/last bin; without the
    clamp a value below the first edge yields -1, which numpy indexing would
    silently wrap to the last bin.
    """
    digitised_obs = []
    for i, value in enumerate(observation):
        edges = bins[i]
        index = int(np.digitize(value, edges)) - 1
        digitised_obs.append(min(max(index, 0), len(edges) - 1))
    return tuple(digitised_obs)

In [28]:
# Environment handle; its action-space size fixes the last Q-table axis, and the
# global `env` is also what epsilon_greedy() samples random actions from.
env = gym.make("CartPole-v1")
# One axis per discretised observation dimension (cart position, cart velocity,
# pole angle, pole velocity) plus a final axis over the available actions.
q_table_shape = (bins_no,bins_no,bins_no,bins_no,env.action_space.n)
# Every state-action value starts at zero.
q_table = np.zeros(q_table_shape)

In [29]:
# Show the Q-table dimensions: four state axes followed by the action axis.
q_table_shape

(48, 48, 48, 48, 2)

In [30]:
# Training hyper-parameters (read as module-level globals by the functions below).
epoch = 50000  # number of training episodes run by the training loop
alpha = 0.8  # step size used in new_q_value_system()
gamma = .95  # discount factor used in new_q_value_system()
epsilon = 1  # current exploration probability; overwritten after every episode
max_epsilon = 1  # starting value of the exponential epsilon schedule
min_epsilon = .01  # value the exponential epsilon schedule decays towards
epsilon_end = 10000  # last epoch at which epsilon_update_linear() still decays
decay_rate = .0001  # exponent rate (exponential schedule) / per-call decrement (linear schedule)

In [31]:
def epsilon_update_linear(epsilon,epoch):
    """Linearly decay epsilon by ``decay_rate`` per call during the decay window.

    Parameters
    ----------
    epsilon : float
        Current exploration probability.
    epoch : int
        Index of the current training episode.

    Returns
    -------
    float
        ``epsilon`` reduced by the module-level ``decay_rate`` (never below
        ``min_epsilon``) while ``1 <= epoch <= epsilon_end``; otherwise
        ``epsilon`` unchanged.
    """
    # The window starts at epoch 1. Comparing the epoch against max_epsilon (an
    # exploration probability) only worked because that value happens to be 1.
    if 1 <= epoch <= epsilon_end:
        # Floor at min_epsilon so repeated decrements (and float rounding)
        # cannot push the probability to zero or below.
        epsilon = max(min_epsilon, epsilon - decay_rate)
    return epsilon

In [32]:
def epsilon_update_greedy(Gen):
    """Return the exploration probability for episode ``Gen``.

    Exponential schedule: starts at the module-level ``max_epsilon`` for
    ``Gen == 0`` and decays towards ``min_epsilon`` at rate ``decay_rate``.
    """
    span = max_epsilon - min_epsilon
    decay = np.exp(-decay_rate * Gen)
    return min_epsilon + span * decay

In [33]:
def epsilon_greedy(epsilon,q_table,state):
    """Pick an action with an epsilon-greedy rule.

    A uniform draw in [0, 1) is compared with ``epsilon``: when the draw is
    larger, the action with the highest Q-value for ``state`` is returned;
    otherwise a random action is sampled from the module-level ``env``.

    Returns
    -------
    tuple
        ``(action, origin)`` where ``origin`` is ``"from_table"`` for a greedy
        choice and ``"random"`` for an exploratory one.
    """
    draw = np.random.random()
    if draw > epsilon:
        # Exploit: best known action for this discretised state.
        return np.argmax(q_table[state]), "from_table"
    # Explore: uniformly random action from the environment's action space.
    return env.action_space.sample(), "random"

In [34]:
def new_q_value_system(old_q_value,reward,next_q_value,learning_rate=None,discount=None):
    """Apply the one-step Q-learning update to a single state-action value.

    Computes ``Q + lr * (reward + discount * next_q - Q)``.

    Parameters
    ----------
    old_q_value : float
        Current estimate Q(s, a).
    reward : float
        Reward observed for the transition.
    next_q_value : float
        Best estimated value of the next state, max_a' Q(s', a').
    learning_rate : float, optional
        Step size; defaults to the module-level ``alpha``.
    discount : float, optional
        Discount factor; defaults to the module-level ``gamma``.

    Returns
    -------
    float
        The updated estimate for Q(s, a).
    """
    if learning_rate is None:
        learning_rate = alpha
    if discount is None:
        discount = gamma
    # The discount scales only the bootstrap term; the old estimate is
    # subtracted from the whole target, not from next_q_value alone.
    td_target = reward + discount * next_q_value
    return old_q_value + learning_rate * (td_target - old_q_value)

In [35]:
# Shaping tables for reward_system(); they assume the 48-bin discretisation
# (bin indices 0..47) used for the Q-table.

# (inclusive upper bin index, reward) for the pole-angle bin, scanned in order;
# applies to indices 1..47. Central bins are rewarded, outer bins penalised.
_ANGLE_REWARDS = (
    (3, -60), (5, -48), (7, -36), (9, -24), (11, -16), (13, -8),
    (15, -4), (17, -2), (19, 0), (21, 5), (23, 10), (25, 5),
    (27, 0), (29, -2), (31, -4), (33, -8), (35, -12), (37, -18),
    (39, -26), (41, -36), (43, -44), (45, -56), (47, -64),
)
# Any angle index outside 1..47 (including 0 and -1).
_ANGLE_OUT_OF_RANGE_REWARD = -100

# (exclusive upper bin index, reward) for the cart-position bin, scanned in
# order; applies to indices 0..47.
_POSITION_REWARDS = (
    (4, -80), (8, -40), (12, -10), (16, 0), (20, 2), (24, 5),
    (28, 5), (32, 2), (36, 0), (40, -10), (44, -40), (48, -80),
)
# Any position index outside 0..47; equal to the penalty of the outermost bins.
_POSITION_OUT_OF_RANGE_REWARD = -80


def _angle_reward(angle):
    """Return the shaping reward for a pole-angle bin index."""
    if 1 <= angle <= 47:
        for upper, value in _ANGLE_REWARDS:
            if angle <= upper:
                return value
    return _ANGLE_OUT_OF_RANGE_REWARD


def _position_reward(pos):
    """Return the shaping reward for a cart-position bin index."""
    if 0 <= pos < 48:
        for upper, value in _POSITION_REWARDS:
            if pos < upper:
                return value
    # A cart left of the first edge (index -1) is penalised like one at the
    # right edge instead of silently scoring 0.
    return _POSITION_OUT_OF_RANGE_REWARD


def reward_system(points,reward_obs,discreted_obs,done):
    """Compute the shaped reward for one transition.

    Parameters
    ----------
    points : int
        Steps survived so far in the episode (currently unused, see Notes).
    reward_obs : float
        Reward returned by the environment for the step.
    discreted_obs : sequence of int
        Discretised observation; index 0 is the cart-position bin and index 2
        the pole-angle bin.
    done : bool
        Whether the episode ended on this step (currently unused, see Notes).

    Returns
    -------
    float
        ``angle reward + position reward + reward_obs + 1``.

    Notes
    -----
    ``points`` and ``done`` are accepted for interface compatibility but do
    not influence the result. An early-termination penalty (-300 when ``done``
    with ``points < 300``) was sketched for this function but never added to
    the returned value.
    NOTE(review): decide whether that penalty should be part of the reward.
    """
    angle = discreted_obs[2]
    pos = discreted_obs[0]
    return _angle_reward(angle) + _position_reward(pos) + reward_obs + 1

In [36]:
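# Optional: uncomment to resume from a Q-table saved by an earlier run instead of starting from zeros.
#q_table = np.load("Cartpole_q-table.npy")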

In [None]:

# Tabular Q-learning training loop.
env = gym.make("CartPole-v1")
log_interval = 1000   # episodes between progress lines
rewards = []          # shaped return of each finished episode
rewards_interval = 0  # shaped return accumulated since the last progress line
total_rewards = 0     # shaped return accumulated over the whole run
total_steps = 0       # environment steps taken over the whole run
table_nos = 0         # actions chosen greedily from the Q-table
random_nos = 0        # actions chosen at random (exploration)
steps = 0             # steps of the most recent episode


def _log_progress(generation, last_episode_steps):
  """Print one progress line built from the module-level training counters."""
  choices = table_nos + random_nos
  table_per = 100 * (table_nos / choices) if choices else 0
  random_per = 100 * (random_nos / choices) if choices else 0
  print("Gen="+str(generation),"table_choice="+str(int(table_per)),"random_choice="+str(int(random_per)),"Last_epsisode_steps="+str(last_episode_steps),"Interval_steps="+str(total_steps),"total="+str(rewards_interval),end=" ")
  # The value printed here is the exploration rate, not the learning rate.
  print("sum="+str(np.sum(rewards)),"epsilon="+str(epsilon))


for Gen in range(epoch):

  state = env.reset()
  discreted_obs = cont_to_dis(state,bins_all)
  done = False
  points = 0
  steps = 0
  episode_reward = 0  # reset per episode so `rewards` holds per-episode returns

  while not done:
    steps += 1
    action,select = epsilon_greedy(epsilon,q_table,discreted_obs)
    next_state,reward,done,info = env.step(action)
    next_discreted_obs = cont_to_dis(next_state,bins_all)

    old_q_value = q_table[discreted_obs+(action,)]
    # A genuinely terminal transition has no future value to bootstrap from.
    # An episode merely cut off by the step limit is not terminal, so it still
    # bootstraps (the gym TimeLimit wrapper flags it via this info key).
    terminal = done and not info.get("TimeLimit.truncated", False)
    next_q_estim_value = 0.0 if terminal else np.max(q_table[next_discreted_obs])
    reward = reward_system(points, reward, next_discreted_obs, done)
    episode_reward += reward
    q_table[discreted_obs+(action,)] = new_q_value_system(old_q_value, reward, next_q_estim_value)

    discreted_obs = next_discreted_obs
    points += 1
    if select == "from_table":
      table_nos += 1
    elif select == "random":
      random_nos += 1

  total_steps += steps
  total_rewards += episode_reward
  epsilon = epsilon_update_greedy(Gen)
  rewards.append(episode_reward)
  rewards_interval += episode_reward

  if Gen % log_interval == 0:
    _log_progress(Gen, steps)
    rewards_interval = 0

# Final summary line, unless no episode ran or the last episode was just logged.
if epoch > 0 and (epoch - 1) % log_interval != 0:
  _log_progress(epoch - 1, steps)
  rewards_interval = 0
env.close()

In [None]:
images = []  # frames captured by show_render_4(), in capture order


def show_render_4(env):
  """Pause for 0.1 s, render `env` as an RGB array and append the frame to `images`."""
  # NOTE(review): the pause presumably paces an on-screen viewer; it is not
  # needed for frame capture alone — confirm before removing.
  time.sleep(.1)
  images.append(env.render(mode='rgb_array'))

In [None]:

# Greedy evaluation rollout: follow the learned Q-table for one episode and
# capture a frame of every state visited.
env = gym.make("CartPole-v1")  # fresh environment; the training one was closed
rewards = 0  # accumulated shaped reward
points = 0   # accumulated environment reward (one point per step survived)
steps = 0
observation = env.reset()
show_render_4(env)  # frame of the initial state
done = False
while not done:
    steps += 1
    discreted_obs = cont_to_dis(observation,bins_all)  # get bins
    action = np.argmax(q_table[discreted_obs])  # and choose the action from the Q-table
    observation, reward, done, info = env.step(action)  # finally perform the action
    show_render_4(env)  # frame of the state just reached (includes the final one)
    points = points+1
    # Score the state reached by the action, as the training loop does.
    next_discreted_obs = cont_to_dis(observation,bins_all)
    rewards += reward_system(points,reward,next_discreted_obs,done)
env.close()
# Report once at the end instead of printing a line for every step.
print("total_reward=",rewards)
print("Steps="+str(steps),"Reward="+str(points))

In [None]:
# Re-print the summary of the last evaluation rollout (steps taken and points scored).
print("Steps="+str(steps),"Reward="+str(points))

In [None]:
!pip install imageio imageio_ffmpeg -q

import imageio
# Encode the captured frames, in capture order, as a 25 fps video file.
imageio.mimsave("cartpole.mp4", list(map(np.array, images)), fps=25)

In [None]:
# Persist the learned Q-table; np.save appends ".npy" to the name, which matches
# the file name used by the commented-out np.load earlier in the notebook.
np.save("Cartpole_q-table",q_table)

In [None]:
%%capture
!pip install pyglet==1.5.1 
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay

# Virtual display: start an off-screen display so rendering can work on a
# machine without a physical screen (e.g. a hosted notebook).
# NOTE(review): this cell sits at the end of the notebook, after the cells that
# call env.render(); presumably it has to be run before them — confirm and
# consider moving it to the top.
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()