In [1]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

env = gym.make('LunarLander-v2')
env.seed(0)
print('State shape: ', env.observation_space.shape)
print('Number of actions: ', env.action_space.n)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
State shape:  (8,)
Number of actions:  4


In [2]:
from ddqn_dual_agent import ddqn_dual_Agent

In [3]:
agent = ddqn_dual_Agent(state_size=8, action_size=4, seed=0)

In [4]:
def ddqn_dual(n_episodes=4000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores_mean_w=[]
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    scores_window_10 = deque(maxlen=10)
   
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
   
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        eps = max(eps_end, eps_decay*eps) # decrease epsilon
        scores_window_10.append(score)      
        
 #       print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
 #       somme=agent.wQ1+agent.wQ2+0.0000001
 #       print("qQ1=",agent.wQ1/somme," qQ2=",agent.wQ2/somme)
 #       print()
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if i_episode % 10 == 0:            
            scores_mean_w.append(np.mean(scores_window))
            print("score",np.mean(scores_window_10))
        if np.mean(scores_window)>=200.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
            torch.save(agent.qnetwork_Qa.state_dict(), 'ddqn_checkpoint_test_a.pth')
            torch.save(agent.qnetwork_Qb.state_dict(), 'ddqn_checkpoint_test_b.pth')            
            break
    return scores,scores_mean_w

scores,scores_mean_w = ddqn_dual()

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len( scores)),  scores)
plt.ylabel('ddqn_dual_Score')
plt.xlabel('Episode #')
plt.show()

ax = fig.add_subplot(111)
plt.plot(np.arange(len( scores_mean_w))*10, scores_mean_w)
plt.ylabel('ddqn_dual Score mean over 100 every 10')
plt.xlabel('Episode #')
plt.show()



score -248.91274311381358
score -289.42021504839005
score -191.27386037520677
score -284.9531104307577
score -163.840330535614
score -225.9568580447395
score -161.21265389055924
score -238.85707642922162
score -171.877690800205
Episode 100	Average Score: -215.46
score -178.2686392682866
score -228.15227822375635
score -186.77722100817604
score -194.89210391870563
score -160.41340179358676
score -208.7711025102629
score -187.705863853837
score -121.66793543953747
score -181.9810048576639
score -137.38825949935236
Episode 200	Average Score: -175.32
score -145.46766603986984
score -154.66287026367544
score -149.40696438379922
score -179.29991356746382
score -151.6507035173332
score -164.52795673838256
score -167.04877190440044
score -181.35890078122242
score -150.7030981255447
score -164.54186432516627
Episode 300	Average Score: -162.07
score -157.4490107038776
score -156.37257848791722
score -186.75558773148455
score -129.29015910377285
score -166.04702955449542
score -161.79920482378415

score 25.68124027969501
score 68.35922266684102
score 53.24831548769764
score 161.55309851885212
score 181.53333365339637
score 89.25687026367181
score 146.4697279022676
Episode 2900	Average Score: 102.03
score 162.45261510329448
score 160.591900321196
score 215.3296644036813
score 170.4653339166914
score 190.0660807563218
score 121.74317448739323
score 196.77891123931357
score 90.08086763976947
score 183.6347484652339
score 202.95866459287964
Episode 3000	Average Score: 167.95
score 147.84204777154204
score 193.98427412457994
score 202.26804665039026
score 170.10507733367098
score 176.75280707153462
score 196.79266536937297
score 197.93780661060003
score 166.7162317233932
score 136.8250518098665
score 147.24798992197339
Episode 3100	Average Score: 175.41
score 165.48571632626175
score 188.3566154134489
score 174.43512672956396
score 141.58860788237695
score 153.91555386393048
score 202.16644715354172
score 215.05946925424797
score 221.75780592565008
score 221.6205564269896
score 197.8

AttributeError: 'ddqn_dual_Agent' object has no attribute 'qnetwork_Qa'