In [None]:
# Important: this code is mostly reused from https://github.com/moduIo/Deep-Q-network/blob/master/DQN.ipynb
# It may still change as it is cleaned up and more explanation is added.
import gym
from collections import deque

# Agent

In [None]:
from dqn_agent import DQN_Agent

# Preprocessing

In [None]:
from utils import blend_images, process_frame

# Environment

In [None]:
# --- Hyperparameters -------------------------------------------------------
episodes = 50      # Number of games played by the training loop
batch_size = 64    # Replay memory must hold more than this many entries before training
blend = 4          # Number of images to blend
# Number of no-op steps taken at the start of every game.
# NOTE(review): the original note said "Breakout-v0 waits for 90 actions before
# the episode begins"; the environment created below is Breakout-v4 — confirm
# the same delay applies.
skip_start = 90

# --- Running counters ------------------------------------------------------
total_time = 0     # Counter for total number of steps taken
all_rewards = 0    # Used to compute avg reward over time
done = False

# --- Environment and agent -------------------------------------------------
state_size = (105, 80, 1)
env = gym.make('Breakout-v4', render_mode="rgb_array")
action_size = env.action_space.n
agent = DQN_Agent(state_size, action_size)

In [None]:
# Training loop: play `episodes` games, each capped at max_steps_per_episode
# agent steps, and log the score of every game.
blend = 4
max_steps_per_episode = 2000
for e in range(episodes):
    total_reward = 0
    game_score = 0
    # env.reset() returns a tuple here; its first element is the observation.
    state_reset = env.reset()
    state = process_frame(state_reset[0])
    images = deque(maxlen=blend)  # Most recent frames to be blended
    images.append(state)

    for skip in range(skip_start):  # skip the start of each game
        env.step(0)

    # generate an episode
    # The loop variable is `step` rather than `time` so it does not share a
    # name with the `time` module imported by the playback cell.
    step = 0
    for step in range(max_steps_per_episode):
        total_time += 1

        # Every update_rate timesteps we update the target network parameters
        if total_time % agent.update_rate == 0:
            # TODO: Update the target model by copying weights from Qnetwork
            agent.update_target_model()

        # Blend the frames currently buffered in `images` into one state
        state = blend_images(images, state_size, blend)

        # TODO: Choose and apply action
        action = 0
        next_state, reward, terminated, truncated, _ = env.step(action)
        # An episode ends either when the game is over (terminated) or when
        # the environment cuts it short (truncated); honour both signals.
        done = terminated or truncated

        # TODO: Process the frame and save it to memory

        # Update state, and rewards
        state = next_state
        game_score += reward
        total_reward += reward

        if done:
            break

    # Record and log every episode, including ones that ran into
    # max_steps_per_episode without finishing; otherwise the average below
    # would divide by episodes whose score was never added.
    all_rewards += game_score
    print("episode: {}/{}, game score: {}, reward: {}, avg reward: {}, time: {}, total time: {}"
          .format(e+1, episodes, game_score, total_reward, all_rewards/(e+1), step, total_time))

    if len(agent.memory) > batch_size:
        # TODO train
        pass

    # TODO: Save model every n (try 10 or 25) episodes



In [None]:
# agent.save('models/5k-memory_100-games')

In [None]:
import time
!pip install IPython
!pip install matplotlib
from IPython import display
import matplotlib.pyplot as plt
%matplotlib inline
def show_state(env, step=0, info=""):
    """Draw the environment's current rendered frame in the notebook output.

    Args:
        env: Environment whose ``render()`` result is passed to ``plt.imshow``.
        step: Step number shown in the figure title.
        info: Extra text appended to the title.

    The previous cell output is cleared (once the new one is ready) so that
    successive calls replace the displayed frame instead of stacking figures.
    """
    # Reuse figure number 3 on every call and wipe whatever it held before.
    plt.figure(3)
    plt.clf()

    frame = env.render()
    caption = "Step: %d %s" % (step, info)
    plt.imshow(frame)
    plt.title(caption)
    plt.axis('off')

    current_figure = plt.gcf()
    display.clear_output(wait=True)
    display.display(current_figure)

# Playback: run one game with the agent's greedy policy, rendering each step.
game_score = 0
total_reward = 0  # reset so the last training episode's total does not leak in
done = False
reward = 0

# Start from a fresh frame buffer built from this game's first observation,
# instead of reusing frames left over from the training cell.
state_reset = env.reset()
images = deque(maxlen=blend)
images.append(process_frame(state_reset[0]))

# Skip the start of the game once, before playing (not on every timestep).
for skip in range(skip_start):
    env.step(0)

# Training counters (total_time, all_rewards) are intentionally not modified
# here so that playback does not distort the training statistics.
for t in range(2000):
    show_state(env, t)

    # Blend the frames currently buffered in `images` into one state
    state = blend_images(images, state_size, blend)

    # Transition Dynamics
    action = agent.greedy_act(state)
    next_state, reward, terminated, truncated, _ = env.step(action)
    # Stop on either end-of-game (terminated) or an environment cut-off (truncated).
    done = terminated or truncated

    # Buffer the new frame; it is blended at the top of the next iteration.
    images.append(process_frame(next_state))

    game_score += reward
    reward -= 1  # Punish behavior which does not accumulate reward
    total_reward += reward
    time.sleep(0.05)  # slow playback down so the rendered frames are watchable
    if done:
        print("game score: {}, reward: {}"
                .format(game_score, total_reward))

        break