In [1]:
import numpy as np
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_
import gym
import torch.nn as nn
import torch
from collections import deque
import json
import os
import pygame
import math
import random
import matplotlib.pyplot as plt
from gym.utils.play import play

  from pkg_resources import resource_stream, resource_exists
  logger.warn("Matplotlib is not installed, run `pip install gym[other]`")


In [2]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('runs/cart_pole_dqn')

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
class VfApproxModel(nn.Module):
    """
    Neural Network for Value Function Approximation\n
    Contains Three layers 4->30->30->2
    """
    def __init__(self):
        super().__init__()
        self.layer_1 = nn.Linear(in_features=4,out_features=120)
        self.layer_2 = nn.Linear(in_features=120,out_features=120)
        self.layer_3 = nn.Linear(in_features=120,out_features=2) # 2 actions as output
        self.relu = nn.ReLU()
    
    def forward(self,features):
        out = self.relu(self.layer_1(features))
        out = self.relu(self.layer_2(out))
        out = self.relu(self.layer_3(out))
        # using softmax as action-selection policy
        # out = torch.softmax(out,-1)
        return out

In [5]:
target_policy = VfApproxModel().to(device)
learning_policy = VfApproxModel().to(device)

In [6]:
# Loading weights of learning policy
target_policy.load_state_dict(learning_policy.state_dict())

<All keys matched successfully>

In [7]:
def update_target_policy(TAU):
        """
        Soft update of the target network's weights\n
        θ′ ← τ θ + (1 −τ )θ
        """
        target_net_state_dict = target_policy.state_dict()
        learning_state_dict = learning_policy.state_dict()
        for key in learning_state_dict:
                target_net_state_dict[key] = learning_state_dict[key]*TAU + target_net_state_dict[key] *(1-TAU)
        
        target_policy.load_state_dict(target_net_state_dict)

In [8]:
EPISODES = 100*100
GAMMA = 0.99 # discount factor

ALPHA = 0.002 # Learning rate
EPSILON = 1 # e
EPSILON_DECAY = 0.0001
MIN_EXP_RATE = 0.15
MAX_EXP_RATE = 1

REPLAY_LENGTH = 1000
REPLAY_BATCH = 120
TAU = 0.09

In [9]:
criterion = nn.SmoothL1Loss().to(device)
opt = torch.optim.Adam(learning_policy.parameters(), lr=ALPHA)

In [10]:
def batch_state(replay_memory,batch_size):
    """
    Creates Batch of data\n
    (new_state[0],reward[1],running[2],state[3],action[4])
    """
    batch = random.sample(replay_memory,batch_size)
    new_states = []
    states = []
    rewards = []
    done = []
    actions = []
    
    for value in batch:
        new_states.append(value[0])
        rewards.append(value[1])
        done.append(value[2])
        states.append(value[3])
        actions.append(value[4])

    new_states = tuple(new_states)
    rewards = tuple(rewards)
    done = tuple(done)
    states = tuple(states)
    actions = tuple(actions)
    
    new_states = torch.cat(new_states)
    new_states = new_states.reshape(-1,4)

    states = torch.cat(states)
    states = states.reshape(-1,4)

    rewards = torch.cat(rewards)
    rewards = rewards.reshape(-1,1)

    done = torch.cat(done)
    done = done.reshape(-1,1)

    actions = torch.cat(actions)
    actions = actions.reshape(-1,1)
    

    return new_states,rewards,done,states,actions


In [11]:
def q_update(state,new_state,reward,done,actions):
    
    # current q values by action
    state_action_value = learning_policy(state).gather(1,actions)
    # max values (expected)
    with torch.no_grad():
        next_state_values = target_policy(new_state).max(1)[0].unsqueeze(1)
    expected_state_action_values = reward + GAMMA * next_state_values * done
    
    # expected_state_action_values = (next_state_values * GAMMA) * done + reward
    
    loss = criterion(state_action_value,expected_state_action_values)
    
    opt.zero_grad()
    loss.backward()
    # torch.nn.utils.clip_grad_value_(learning_policy.parameters(), 100)
    opt.step()

    return loss,expected_state_action_values,state_action_value

In [12]:
# new_states,rewards,done,states,actions = batch_state(mem,REPLAY_BATCH)
# loss,exp_s_a,s_a_v = q_update(states,new_states,rewards,done,actions)

In [13]:
# exp_s_a,s_a_v

In [14]:
class ReplayMemory:
    """
        Replay Memory for string Experience
    """
    def __init__(self,length,batch_size):
        self.replay_memory = deque(maxlen=length)
        self.batch_size = batch_size
    
    def add_experience(self,new_state,reward,running,state,action):
        """
            Adds Experience into replay_memory\n
            new_state and state both are torch tensors
        """
        self.replay_memory.append((new_state,reward,running,state,action))
    
    def sample(self,batch_size):
        pass
        
    
    def train_on_replay(self):
        """
            Training on Replay memory
        """
        batch = random.sample(self.replay_memory,self.batch_size)
        
        for new_state,reward,running,state,action in batch:

            loss = q_update(state,new_state,reward,running)
        
        return loss # final loss of replay batch


In [15]:
replay_memory = ReplayMemory(REPLAY_LENGTH,batch_size=REPLAY_BATCH)

In [16]:
state = torch.tensor([[0,0,4,0],[0,0,1,2],[1,2,3,4]],dtype=torch.float32,device=device)
state

tensor([[0., 0., 4., 0.],
        [0., 0., 1., 2.],
        [1., 2., 3., 4.]], device='cuda:0')

In [17]:
steps_done = 0
def select_action(state,env):
    global steps_done
    sample = random.random()
    eps_threshold = MIN_EXP_RATE + (EPSILON - MIN_EXP_RATE) * \
        math.exp(-1. * steps_done / EPSILON_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            return learning_policy(state).max(1)[1].view(1, 1)
    else:
        return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)


In [18]:
env = gym.make("CartPole-v1")
for episode in range(EPISODES):
    done = False
    state,info = env.reset()
    reward_per_episode = 0
    old_reward = 0
    state = torch.tensor(state,dtype=torch.float32,device=device).unsqueeze(0)
    print(f"-------Episode:[{episode+1}/{EPISODES}]--------")
    step = 1
    while not done:
        # action = learning_policy(state).max(1)[1].view(1, 1)
        action = select_action(state, env)
        
        new_state,reward,terminated,truncated,info = env.step(action.item())
        
        # if step % 10 == 0:
        #     reward += 1
        new_state = torch.tensor(new_state,dtype=torch.float32,device=device).unsqueeze(0)
        
        done = terminated or truncated
        # if done:
        #     new_state = torch.tensor([0,0,0,0],dtype=torch.float32,requires_grad=True,device=device).unsqueeze(0)
        running_data = torch.tensor([float(not done)], dtype=torch.float32, device=device)

        reward = torch.tensor([reward], dtype=torch.float32, device=device)
        action = torch.tensor([action],dtype=torch.int64,device=device)

        replay_memory.add_experience(new_state,reward,running_data,state,action)
        mem = replay_memory.replay_memory
        
        reward_per_episode += reward.item()
        state = new_state
        step +=1
        done = terminated or truncated
        
    if len(mem) > REPLAY_BATCH:
        # print("Training")
        new_states,rewards,done,states,actions = batch_state(mem,REPLAY_BATCH)
        loss,exp,sa = q_update(states,new_states,rewards,done,actions)
        
    # saving weights with high reward
    # print(reward_per_episode,old_reward)
    
    
    # q_updating after batch sampling

    print("Reward per Episode: ",reward_per_episode)
    writer.add_scalar("Exploration Rate",EPSILON,episode)
    EPSILON = (MAX_EXP_RATE-MIN_EXP_RATE) * np.exp(-EPSILON_DECAY*episode) + MIN_EXP_RATE
    writer.add_scalar("Reward per Episode",reward_per_episode,episode)
    if len(mem) > REPLAY_BATCH:
        writer.add_scalar("Loss",loss.item(),episode)
        print("Loss per Episode",loss.item())
    
    
    if episode % 30 == 0:
        print("[Updating Policy]")
        update_target_policy(TAU)
if reward_per_episode > old_reward:
    print("Saving Weights")
    torch.save(learning_policy.state_dict(),f"weights/weights_{reward_per_episode}.pth")
    old_reward = reward_per_episode
env.close()    

-------Episode:[1/10000]--------
Reward per Episode:  13.0
[Updating Policy]
-------Episode:[2/10000]--------
Reward per Episode:  14.0
-------Episode:[3/10000]--------
Reward per Episode:  9.0
-------Episode:[4/10000]--------
Reward per Episode:  11.0
-------Episode:[5/10000]--------
Reward per Episode:  9.0
-------Episode:[6/10000]--------
Reward per Episode:  9.0
-------Episode:[7/10000]--------
Reward per Episode:  11.0
-------Episode:[8/10000]--------
Reward per Episode:  10.0
-------Episode:[9/10000]--------
Reward per Episode:  17.0
-------Episode:[10/10000]--------
Reward per Episode:  10.0
-------Episode:[11/10000]--------
Reward per Episode:  14.0
Loss per Episode 0.5128738284111023
-------Episode:[12/10000]--------
Reward per Episode:  9.0
Loss per Episode 0.3739233911037445
-------Episode:[13/10000]--------
Reward per Episode:  9.0
Loss per Episode 0.2565479278564453
-------Episode:[14/10000]--------
Reward per Episode:  10.0
Loss per Episode 0.18680904805660248
-------Epis

In [19]:
torch.save(learning_policy.state_dict(),"weights_test.pth")

In [20]:
# exp_s_a,s_a_v

In [21]:
# learning_policy.load_state_dict(torch.load("weights/weights_200.pth"))

In [34]:
env = gym.make("CartPole-v1",render_mode="human")
done = False
state,info = env.reset()
reward_per_episode = 0
state = torch.tensor(state,dtype=torch.float32,requires_grad=True,device=device)
while not done:
    action = learning_policy(state).max(0)[1]
        
    new_state,reward,terminated,truncated,info = env.step(action.item())
    new_state = torch.tensor(new_state,dtype=torch.float32,requires_grad=True,device=device)
    done = terminated or truncated
    if done:
        print(new_state)
    reward_per_episode+=1
    state = new_state

print(reward_per_episode)
env.close()

tensor([-2.4387, -2.5317, -0.1940,  0.2438], device='cuda:0',
       requires_grad=True)
158


In [23]:
# exp_s_a,s_a_v

In [24]:
# state_action_value = learning_policy(state).max(0)[0]
# state_action_value 

In [25]:
# mem = replay_memory.replay_memory
# new_states,rewards,done,states,actions = batch_state(mem,120)
# print(new_states.shape)
# print(rewards.shape)
# print(done.shape)
# print(states.shape)
# print(actions.shape)

In [26]:
# states = learning_policy(states).gather(1,actions)
# with torch.no_grad():
#     next_state_values = target_policy(new_states).max(1)[0]
# next_state_values = next_state_values.reshape(-1,1)
# print(next_state_values.shape)
# expected_state_action_values = ((next_state_values * GAMMA) + rewards) * done
# print(rewards.shape)
# print(states.shape)
# print(expected_state_action_values.shape)

In [27]:
# loss = criterion(states,expected_state_action_values)
# print(loss)
# opt.zero_grad()
# loss.backward()
# opt.step()
# loss = criterion(states,expected_state_action_values)
# print(loss)

In [28]:
pygame.display.quit() 