In [1]:
%load_ext autoreload 
%autoreload 1

In [2]:
import functools
import gym
from Config import Config
# from util import train
from Models import ActorCritic
from Networks import cnn_head_model, actor_model, critic_model, head_model
from Memory import Memory
from baselines.common.cmd_util import make_env
from baselines.common.atari_wrappers import wrap_deepmind, make_atari


import matplotlib.pyplot as plt

# Build the Atari environment and apply the DeepMind-style wrappers in one step.
env_id = "BreakoutNoFrameskip-v4"
env = wrap_deepmind(
    make_atari(env_id),
    episode_life=True,
    clip_rewards=True,
    frame_stack=True,
    scale=False,
)

# Alternative environment kept for reference: Config(gym.make('CartPole-v1'))
config = Config(env)

# Training schedule read by train():
#   update_every  - environment steps collected between calls to agent.learn
#   num_learn     - argument forwarded to agent.learn on each update
#   win_condition - running-average score above which training stops early
#   n_episodes    - maximum number of episodes
#   max_t         - maximum number of steps per episode
config.update_every = 500
config.num_learn = 4
config.win_condition = 230
config.n_episodes = 1000
config.max_t = 700

# Components plugged into the agent.
config.Memory = Memory
config.Model = ActorCritic

# Every network builder is bound to this config up front.
# For the non-convolutional head, use `head_model` in place of `cnn_head_model`.
config.head_model, config.actor_model, config.critic_model = (
    functools.partial(builder, config)
    for builder in (cnn_head_model, actor_model, critic_model)
)


In [3]:
import torch
import copy

# Smoke test: reset a private copy of the configured environment and print the
# shape of the first observation once it is converted to a float tensor.
# Note: this rebinds the notebook-level names `env` and `state`.
env = copy.deepcopy(config.env)
state = env.reset()
print(torch.FloatTensor(state).shape)

# Disabled agent check (PPO is only imported in a later cell). The original
# notebook carried this snippet twice; one copy is kept for reference.
# agent = PPO(config)
# action, log_prob = agent.act(torch.FloatTensor(state))
# print("[unsqueezed] Action space shape: {}".format(action.shape))
# print("[unsqueezed] Log Probabilities: {}".format(log_prob))

torch.Size([84, 84, 4])


In [6]:
import copy
import gym
import torch
import numpy as np
from collections import deque
from PPO import PPO
from Config import Config
import pdb

def get_state(obs):
    """Convert an observation into a batched float tensor for the agent.

    The observation is copied into a NumPy array, its last axis is moved to
    the front (axes reordered as (2, 0, 1)), and the result is wrapped in a
    ``torch.FloatTensor`` with a new leading dimension of size 1.

    Args:
        obs: A 3-D array-like observation (anything ``np.array`` accepts).

    Returns:
        A float tensor of shape ``(1, obs.shape[2], obs.shape[0], obs.shape[1])``.
    """
    channels_first = np.transpose(np.array(obs), (2, 0, 1))
    return torch.FloatTensor(channels_first).unsqueeze(0)

def get_save_state(obs):
    """Convert an observation into an unbatched float tensor for storage.

    Applies the same conversion as ``get_state`` (copy to a NumPy array, axes
    reordered as (2, 0, 1), wrapped in a ``torch.FloatTensor``) but does not
    add a leading batch dimension.

    Args:
        obs: A 3-D array-like observation (anything ``np.array`` accepts).

    Returns:
        A float tensor of shape ``(obs.shape[2], obs.shape[0], obs.shape[1])``.
    """
    channels_first = np.transpose(np.array(obs), (2, 0, 1))
    return torch.FloatTensor(channels_first)

def train(config):
    """Train a PPO agent on a private copy of ``config.env``.

    Transitions are collected step by step into ``agent.mem``. Every
    ``config.update_every`` environment steps (counted across episode
    boundaries) the agent learns from the buffer, which is then cleared.
    Training stops after ``config.n_episodes`` episodes, or earlier once the
    mean score over the last 100 episodes exceeds ``config.win_condition``.

    Args:
        config: Object providing ``env``, ``n_episodes``, ``max_t``,
            ``update_every``, ``num_learn`` and ``win_condition``, plus
            whatever ``PPO(config)`` itself requires.

    Returns:
        A ``(scores, average_scores)`` tuple of lists with one entry per
        completed episode: the episode score and the running mean of the
        last (up to) 100 episode scores.
    """
    # Work on a copy so the environment stored on the config is left untouched.
    env = copy.deepcopy(config.env)
    agent = PPO(config)

    steps_since_update = 0
    recent_scores = deque(maxlen=100)
    scores = []
    average_scores = []

    for i_episode in range(1, config.n_episodes + 1):
        state = env.reset()
        score = 0
        for _ in range(config.max_t):
            steps_since_update += 1

            action, log_prob = agent.act(get_state(state))
            # NOTE(review): `action` is passed to the environment exactly as
            # returned by the agent; if agent.act returns a tensor, the env may
            # need `action.item()` instead - confirm against PPO.act.
            next_state, reward, done, _ = env.step(action)

            # Store the state the action was taken from, without a batch dim.
            agent.mem.add(get_save_state(state), action, reward, log_prob, done)

            state = next_state
            score += reward

            # The step counter is not reset per episode, so a learning batch
            # can contain transitions from several episodes.
            if steps_since_update >= config.update_every:
                agent.learn(config.num_learn)
                agent.mem.clear()
                steps_since_update = 0

            if done:
                break

        # Book keeping: compute the running mean once and reuse it below.
        recent_scores.append(score)
        scores.append(score)
        running_mean = np.mean(recent_scores)
        average_scores.append(running_mean)

        if i_episode % 10 == 0:
            print("\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}".format(i_episode, running_mean, score), end="")
        if i_episode % 100 == 0:
            print("\rEpisode {}\tAverage Score: {:.2f}".format(i_episode, running_mean))

        if running_mean > config.win_condition:
            print("\nEnvironment Solved!")
            break

    return scores, average_scores


In [7]:
# Run training, then plot the per-episode score against its running average.
scores, average_score = train(config)

# Explicit figure/axes with labels and a legend so the two curves can be told
# apart and the figure is readable on its own.
fig, ax = plt.subplots()
ax.plot(scores, label="Episode score")
ax.plot(average_score, label="Average score (last 100 episodes)")
ax.set_title("PPO training scores")
ax.set_xlabel("Episode")
ax.set_ylabel("Score")
ax.legend()
plt.show()

Episode 10	Average Score: 0.10	Score: 0.00> [0;32m/Users/darylrodrigo/Documents/projects/rl_lib/PPO/PPO.py[0m(60)[0;36mlearn[0;34m()[0m
[0;32m     58 [0;31m[0;34m[0m[0m
[0m[0;32m     59 [0;31m      [0;31m# calculate advantage[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 60 [0;31m      [0madvantage[0m [0;34m=[0m [0mdiscounted_returns[0m [0;34m-[0m [0mvalues[0m[0;34m.[0m[0mcpu[0m[0;34m([0m[0;34m)[0m[0;34m.[0m[0mdetach[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     61 [0;31m[0;34m[0m[0m
[0m[0;32m     62 [0;31m      [0;31m# calculate surrogates[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> actions.shape
torch.Size([500, 1])
ipdb> exit


BdbQuit: 

In [None]:
# def get_state(obs):
#     state = np.array(obs)
#     state = state.transpose((2, 0, 1))
#     state = torch.FloatTensor(state)
#     return state.unsqueeze(0)

# def get_state_save(obs):
#     state = np.array(obs)
#     state = state.transpose((2, 0, 1))
#     state = torch.FloatTensor(state)
#     return state

# env = copy.deepcopy(config.env)
# steps = 0
# scores_deque = deque(maxlen=100)
# scores = []
# average_scores = []
# max_score = -np.Inf

# agent = PPO(config)

# for i_episode in range(1, config.n_episodes+1):
#     state = env.reset()
#     score = 0
#     for t in range(config.max_t):
#         steps += 1

#         action, log_prob = agent.act(get_state(state))
# #             print("Action space shape: {}".format(action.shape))
# #             print("Action: {}".format(action))
# #             print("Log Probabilities: {}".format(log_prob))
# #             print("Action item: {}".format(action.item()))
#         next_state, reward, done, _ = env.step(action.item())

#         agent.mem.add(get_state_save(state), action, reward, log_prob, done)

#         # Update 
#         state = next_state
#         score += reward


#         if steps >= config.update_every:
#             break

In [None]:
# prev_states = torch.stack(agent.mem.states).to(agent.device).detach()
# prev_states.shape

In [None]:
# action, log_prob = agent.act(prev_states)
# action.shape

In [None]:
# state = env.reset()
# state = get_state(state)
# print(state.shape)
# action, log_prob = agent.act(state)
# action.shape
