In [1]:
import numpy as np
from PIL import Image

In [2]:

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

import argparse
import time
import numpy as np
import collections

from tensorboardX import SummaryWriter
from IPython.display import clear_output

In [3]:
import gym

# Single source of truth for the environment id; the training cell below
# builds its own environment from the same constant.
DEFAULT_ENV_NAME = "FetchReach-v1"

# Exploratory instance used by the inspection cells that follow.
env = gym.make(DEFAULT_ENV_NAME)

  result = entry_point.load(False)


In [4]:
# Show the action space of the exploratory environment, then one randomly
# sampled action from it (the recorded output below is a 4-component Box).
print(env.action_space)
print(env.action_space.sample())

Box(4,)
[0.09762701 0.43037874 0.20552675 0.08976637]


In [5]:
# Show the observation space. The recorded output below lists three entries
# (achieved_goal: 3, desired_goal: 3, observation: 10), i.e. 16 values in
# total once concatenated, which is the input width used by the DQN class.
print(env.observation_space)

Dict(achieved_goal:Box(3,), desired_goal:Box(3,), observation:Box(10,))


In [6]:
class DQN(nn.Module):
    """Small fully connected network: flat state vector in, one value per action out.

    The architecture is Linear(input_size, 128) -> ReLU -> Linear(128, 256)
    -> ReLU -> Linear(256, n_actions). There are no convolutions; the input
    is a flat vector, not an image.

    Args:
        input_size: Width of the flattened state vector. The default of 16
            matches the concatenated dictionary observation used in this
            notebook (10 + 3 + 3 values).
        n_actions: Number of outputs. The default of 4 matches the
            4-component action space printed earlier in the notebook.
    """

    def __init__(self, input_size=16, n_actions=4):
        super(DQN, self).__init__()
        # The layers stay inside one Sequential called ``pipe`` so the
        # state_dict keys ("pipe.0.weight", ...) are unchanged and previously
        # saved checkpoints remain loadable.
        self.pipe = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, n_actions)
        )

    def forward(self, x):
        """Run the network on ``x`` (shape ``(batch, input_size)``).

        The input is cast to float32 first, so float64 tensors built from
        NumPy observations can be passed in directly.
        """
        x = x.float()
        return self.pipe(x)


In [7]:
# --- DQN hyper-parameters -------------------------------------------------
GAMMA = 0.99                   # discount factor applied to next-state values
BATCH_SIZE = 32                # transitions drawn from the replay buffer per update
REPLAY_SIZE = 10 ** 4          # capacity of the replay buffer
REPLAY_START_SIZE = 10 ** 4    # buffer fill level required before training starts
LEARNING_RATE = 0.0001         # Adam step size
SYNC_TARGET_FRAMES = 10 ** 3   # frame interval for copying weights into the target net

In [8]:
# Epsilon-greedy exploration schedule: epsilon starts at EPSILON_START and is
# lowered linearly with the frame counter (one full unit per
# EPSILON_DECAY_LAST_FRAME frames), never dropping below EPSILON_FINAL.
EPSILON_START = 1.0
EPSILON_FINAL = 0.02
EPSILON_DECAY_LAST_FRAME = 100000

# Training stops once the mean episode reward exceeds this bound.
MEAN_REWARD_BOUND = 1000

In [9]:
#define what experience is
# One replay-buffer entry: the transition observed for a single environment step.
_EXPERIENCE_FIELDS = ['state', 'action', 'reward', 'done', 'new_state']
Experience = collections.namedtuple("Experience", _EXPERIENCE_FIELDS)

class ExperienceBuffer:
    """Fixed-capacity replay buffer of transitions.

    Backed by a ``collections.deque`` with ``maxlen=capacity``: once full,
    appending a new transition silently discards the oldest one.
    """

    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)  # rotating buffer

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        """Store one transition (a 5-tuple: state, action, reward, done, new_state)."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Five NumPy arrays ``(states, actions, rewards, dones, next_states)``
            whose first axis has length ``batch_size``. ``rewards`` and
            ``dones`` are float32.

        Raises:
            ValueError: from ``np.random.choice`` when ``batch_size`` exceeds
                the number of stored transitions (sampling is without
                replacement).
        """
        indices = np.random.choice(
            len(self.buffer), batch_size, replace=False)

        states, actions, rewards, dones, next_states = \
            zip(*[self.buffer[idx] for idx in indices])

        # Every field is stacked into an array. Previously next_states was
        # returned as a raw tuple of arrays, unlike the other four fields,
        # which forced callers to tensorise a Python sequence of arrays.
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.float32),
            np.array(next_states),
        )

In [10]:
#make our agent that acts and learns in the environment
class Agent:
    """Plays one environment step at a time and records transitions.

    The environment is expected to return dictionary observations containing
    at least ``'achieved_goal'`` and ``'desired_goal'`` entries. The flat
    state fed to the network is the concatenation of all dictionary values
    in the order the environment returns them.

    The reward recorded for each step is a shaped one: the reduction in the
    Euclidean distance between achieved and desired goal caused by the step
    (positive when the step moved closer). The environment's own reward is
    ignored.

    Args:
        env: Environment providing ``reset``, ``step``, ``render`` and
            ``action_space.sample``.
        exp_buffer: Object with an ``append`` method that receives one
            ``Experience`` per step.
        render: When true (the default, matching the previous behaviour)
            ``env.render()`` is called on every step.
    """

    def __init__(self, env, exp_buffer, render=True):
        self.env = env
        self.exp_buffer = exp_buffer
        self.render_each_step = render
        self._reset()

    @staticmethod
    def _flatten(obs):
        """Concatenate all entries of a dictionary observation into one 1-D array."""
        return np.concatenate(list(obs.values()))

    @staticmethod
    def _goal_distance(obs):
        """Euclidean distance between the achieved and the desired goal."""
        gap = np.asarray(obs['achieved_goal']) - np.asarray(obs['desired_goal'])
        return float(np.linalg.norm(gap))

    def _reset(self):
        """Start a new episode on the agent's own environment."""
        # Use self.env, not a notebook-level global, so the agent keeps
        # working when several environments exist in the kernel.
        obs = self.env.reset()
        self.state = self._flatten(obs)
        self.goal_distance = self._goal_distance(obs)
        self.total_reward = 0.0

    def play_step(self, net, epsilong=0.0, device='cpu'):
        """Take one epsilon-greedy step and store the transition.

        Args:
            net: Network called on a ``(1, state_size)`` tensor; its first
                output row is used as the action when not exploring.
            epsilong: Probability of taking a random action. The misspelt
                name is kept so existing callers keep working; previously
                this argument was ignored and a global ``epsilon`` was read.
            device: Torch device on which the state tensor is placed.

        Returns:
            The accumulated shaped reward of the episode (a float) when this
            step ended it, otherwise ``None``.
        """
        epsilon = epsilong
        done_reward = None

        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            # Add the batch axis by indexing rather than
            # np.array([...], copy=False), which raises on NumPy >= 2.
            state_v = torch.as_tensor(self.state[np.newaxis, :]).to(device)
            with torch.no_grad():
                # .cpu() keeps this working when the network is on a GPU.
                action = net(state_v)[0].cpu().numpy()

        if self.render_each_step:
            self.env.render()

        # Take the action; the environment's own reward is replaced below.
        obs, _, is_done, _ = self.env.step(action)
        new_state = self._flatten(obs)

        # Shaped reward: how much closer to the goal this step brought us.
        # Distances are read from the observation dictionary by key, so they
        # do not depend on where the goals land in the flattened vector, and
        # the result is a plain float (safe for NumPy, the replay buffer and
        # TensorBoard).
        new_distance = self._goal_distance(obs)
        reward = self.goal_distance - new_distance
        self.goal_distance = new_distance

        self.total_reward += reward

        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state

        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward
        
def calc_loss(batch, net, target_net, device='cpu', gamma=None):
    """Mean-squared Bellman error of ``net`` on one replay batch.

    Args:
        batch: ``(states, actions, rewards, dones, next_states)``; each item
            is an array (or sequence of arrays) whose first axis is the
            batch. ``actions`` must hold one integer action index per
            transition.
        net: Network being trained; maps states to one value per action.
        target_net: Network used to evaluate the next states; no gradient
            flows through it.
        device: Torch device the batch is moved to.
        gamma: Discount factor; defaults to the module-level ``GAMMA``.

    Returns:
        Scalar tensor: MSE between ``Q(s, a)`` and
        ``r + gamma * max_a' Q_target(s', a')``, where the second term is
        zero for transitions that ended an episode.

    Raises:
        ValueError: if ``actions`` is not a 1-D batch of integer indices.
            Continuous action vectors cannot be used to index Q-values.
    """
    if gamma is None:
        gamma = GAMMA
    states, actions, rewards, dones, next_states = batch

    # np.asarray first: building a tensor from a tuple of arrays is slow and
    # next_states may arrive as such a tuple.
    states_v = torch.as_tensor(np.asarray(states)).to(device)
    next_states_v = torch.as_tensor(np.asarray(next_states)).to(device)
    actions_v = torch.as_tensor(np.asarray(actions)).to(device)
    rewards_v = torch.as_tensor(np.asarray(rewards, dtype=np.float32)).to(device)
    # Boolean mask; indexing with a uint8 (Byte) tensor is deprecated.
    done_mask = torch.as_tensor(np.asarray(dones)).to(device).bool()

    if actions_v.is_floating_point() or actions_v.dim() != 1:
        raise ValueError(
            "calc_loss expects one integer action index per transition, got "
            "actions with dtype %s and shape %s"
            % (actions_v.dtype, tuple(actions_v.shape)))

    # Q(s, a) for the action actually taken in each transition.
    state_action_values = net(states_v).gather(
        1, actions_v.long().unsqueeze(-1)).squeeze(-1)

    # The target side is a constant with respect to the optimiser.
    with torch.no_grad():
        # Best achievable value from the state the action led to.
        next_state_values = target_net(next_states_v).max(1)[0]
        # Nothing can be earned after the final step of an episode;
        # without this the targets do not converge.
        next_state_values[done_mask] = 0.0

    # ------------- Bellman equation ------------------
    expected_state_action_values = next_state_values * gamma + rewards_v

    return nn.MSELoss()(state_action_values, expected_state_action_values)

In [11]:
if __name__ == "__main__":
    # Training driver. Plays the environment one frame at a time with an
    # epsilon-greedy agent, logs per-episode statistics to TensorBoard, saves
    # the weights whenever the 100-episode mean reward improves, and runs one
    # optimisation step per frame once the replay buffer holds
    # REPLAY_START_SIZE transitions. The loop ends when the mean reward
    # exceeds MEAN_REWARD_BOUND.
    device = 'cpu'

    env = gym.make(DEFAULT_ENV_NAME)
    net = DQN().to(device)
    target_net = DQN().to(device)

    writer = SummaryWriter(comment='-' + DEFAULT_ENV_NAME)
    print(net)

    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)

    epsilon = EPSILON_START

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    total_rewards = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()

    best_mean_reward = None

    try:
        while True:
            frame_idx += 1
            epsilon = max(EPSILON_FINAL,
                          EPSILON_START - (frame_idx / EPSILON_DECAY_LAST_FRAME))

            reward = agent.play_step(net, epsilon, device=device)
            if reward is not None:
                # An episode just finished. float() accepts both a plain
                # number and a 0-d tensor.
                reward = float(reward)
                total_rewards.append(reward)
                speed = (frame_idx - ts_frame) / (time.time() - ts)
                ts_frame = frame_idx
                ts = time.time()

                mean_reward = np.mean(total_rewards[-100:])

                writer.add_scalar("epsilon", epsilon, frame_idx)
                writer.add_scalar("speed", speed, frame_idx)
                writer.add_scalar("reward_100", mean_reward, frame_idx)
                writer.add_scalar("reward", reward, frame_idx)

                if best_mean_reward is None or best_mean_reward < mean_reward:
                    torch.save(net.state_dict(), DEFAULT_ENV_NAME + "-best.dat")
                    if best_mean_reward is not None:
                        print('best mean reward updated %.3f -> %.3f, model saved' %
                              (best_mean_reward, mean_reward))
                    # Record the new best unconditionally. This used to sit
                    # inside the `is not None` branch, so the value stayed
                    # None forever and every episode overwrote the checkpoint.
                    best_mean_reward = mean_reward

                if mean_reward > MEAN_REWARD_BOUND:
                    print("solved in %d frames!" % frame_idx)
                    break

            # Everything below runs on every frame, not only at episode ends.
            if len(buffer) < REPLAY_START_SIZE:
                continue

            if frame_idx % SYNC_TARGET_FRAMES == 0:
                target_net.load_state_dict(net.state_dict())

            optimizer.zero_grad()
            batch = buffer.sample(BATCH_SIZE)
            loss_t = calc_loss(batch, net, target_net, device=device)

            loss_t.backward()
            optimizer.step()
    finally:
        # Flush pending TensorBoard events even when the loop is interrupted
        # (kernel interrupt, viewer window closed, exception in a step).
        writer.close()

DQN(
  (pipe): Sequential(
    (0): Linear(in_features=16, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=4, bias=True)
  )
)
Creating window glfw
Pressed ESC
Quitting.


NameError: name 'exit' is not defined