In [1]:
# Important: Code is mostly re-used from https://github.com/moduIo/Deep-Q-network/blob/master/DQN.ipynb
# Code may change with more clean-ups and explanation
import gym
from collections import deque

# Agent

In [2]:
from dqn_agent import DQN_Agent




# Preprocessing

In [3]:
from utils import blend_images, process_rgb, process_frame

# Environment

In [4]:
env = gym.make('PongDeterministic-v4', render_mode="rgb_array")
state_size = (105, 80, 1)
action_size = env.action_space.n
agent = DQN_Agent(state_size, action_size)

load_model_name = ""
if load_model_name != "":
    agent.load(load_model_name)

episodes = 200
batch_size = 8
skip_start = 90  # Breakout-v0 waits for 90 actions before the episode begins
total_time = 0   # Counter for total number of steps taken
all_rewards = 0  # Used to compute avg reward over time
blend = 4        # Number of images to blend
done = False







Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 27, 20, 32)        2080      
                                                                 
 activation (Activation)     (None, 27, 20, 32)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 14, 10, 64)        32832     
                                                                 
 activation_1 (Activation)   (None, 14, 10, 64)        0         
                                                                 
 conv2d_2 (Conv2D)           (None, 14, 10, 64)        36928     
                                                                 
 activation_2 (Activation)   (None, 14, 10, 64)        0         
                                                                 
 flatten (Flatten)           (None, 8960)              0

In [5]:
blend = 4
for e in range(episodes):
    total_reward = 0
    game_score = 0
    sum_10_games = 0
    state_reset = env.reset()
    state = process_frame(state_reset[0])
    images = deque(maxlen=blend)  # Array of images to be blended
    images.append(state)
    
    for skip in range(skip_start): # skip the start of each game
        env.step(0)
    
    for time in range(2500):
        # env.render()
        total_time += 1
        
        
        # Return the avg of the last 4 frames
        state = blend_images(images, state_size, blend)
        
        # Transition Dynamics
        action = agent.act(state)
        next_state, reward, done, _, _ = env.step(action)
        
        # Return the avg of the last 4 frames
        next_state = process_frame(next_state)
        images.append(next_state)
        next_state = blend_images(images, state_size, blend)
        
        # Store sequence in replay memory
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        game_score += reward
        total_reward += reward
        if done:
            all_rewards += game_score
            sum_10_games += game_score
            avg_10_games = sum_10_games/(e%10 + 1)
            
            print("episode: {}/{}, game score: {}, avg_10: {} reward: {}, avg reward: {}, time: {}, total time: {}"
                  .format(e+1, episodes, game_score, avg_10_games, total_reward, all_rewards/(e+1), time, total_time))
            
            break

        if len(agent.memory) > 2000 and total_time % 25  == 0:
            agent.replay(batch_size)
            
    # Every update_rate timesteps we update the target network parameters
    # if total_time % agent.update_rate == 0:
    if total_time % 10:
        agent.update_target_model()

    
    if e % 10 == 0:
        fname = f'models/10k-memory_{e}-games'
        print(f'Saving: {fname}')
        agent.save(fname)
        sum_10_games = 0
    if game_score > 15.0:
        break

  if not isinstance(terminated, (bool, np.bool8)):


episode: 1/200, game score: -20.0, avg_10: -20.0 reward: -20.0, avg reward: -20.0, time: 707, total time: 708
Saving: models/10k-memory_0-games
episode: 2/200, game score: -19.0, avg_10: -9.5 reward: -19.0, avg reward: -19.5, time: 856, total time: 1565






episode: 3/200, game score: -20.0, avg_10: -6.666666666666667 reward: -20.0, avg reward: -19.666666666666668, time: 673, total time: 2239
episode: 4/200, game score: -19.0, avg_10: -4.75 reward: -19.0, avg reward: -19.5, time: 858, total time: 3098
episode: 5/200, game score: -18.0, avg_10: -3.6 reward: -18.0, avg reward: -19.2, time: 956, total time: 4055
episode: 6/200, game score: -17.0, avg_10: -2.8333333333333335 reward: -17.0, avg reward: -18.833333333333332, time: 971, total time: 5027
episode: 7/200, game score: -20.0, avg_10: -2.857142857142857 reward: -20.0, avg reward: -19.0, time: 673, total time: 5701
episode: 8/200, game score: -19.0, avg_10: -2.375 reward: -19.0, avg reward: -19.0, time: 808, total time: 6510
episode: 9/200, game score: -19.0, avg_10: -2.111111111111111 reward: -19.0, avg reward: -19.0, time: 747, total time: 7258
episode: 10/200, game score: -20.0, avg_10: -2.0 reward: -20.0, avg reward: -19.1, time: 733, total time: 7992


KeyboardInterrupt: 

In [None]:
agent.save('models/5k-memory_100-games')

In [None]:
import time
!pip install IPython
!pip install matplotlib
from IPython import display
import matplotlib.pyplot as plt
%matplotlib inline
def show_state(env, step=0, info=""):
    plt.figure(3)
    plt.clf()
    plt.imshow(env.render())
    plt.title("Step: %d %s" % (step, info))
    plt.axis('off')

    display.clear_output(wait=True)
    display.display(plt.gcf())

game_score = 0
done = False
reward = 0
env.reset()
for t in range(2000):
    for skip in range(skip_start): # skip the start of each game
        env.step(0)
    show_state(env, t)
    total_time += 1
    
    # Return the avg of the last 4 frames
    state = blend_images(images, state_size, blend)
    
    # Transition Dynamics
    action = agent.greedy_act(state)
    next_state, reward, done, _, _ = env.step(action)
    
    # Return the avg of the last 4 frames
    next_state = process_frame(next_state)
    images.append(next_state)
    next_state = blend_images(images, state_size, blend)
        
    state = next_state
    game_score += reward
    total_reward += reward
    time.sleep(0.05)
    if done:
        all_rewards += game_score
        
        print("game score: {}, reward: {}"
                .format(game_score, total_reward))
        
        break