In [1]:
import gym
import numpy as np
# Gym environment id; passed to gym.make() when the environment is created.
rm='Pong-v0'

In [2]:
%matplotlib inline
from matplotlib import animation
import matplotlib.pyplot as plt
from IPython.display import display, HTML

def display_frames_as_gif(frames):
    """Show a sequence of image frames as an inline JavaScript animation.

    Args:
        frames: non-empty sequence of image arrays accepted by ``plt.imshow``.
            The figure size is derived from ``frames[0].shape``.

    Returns:
        None. The animation is emitted into the notebook output through
        ``display(HTML(anim.to_jshtml()))``.
    """
    height, width = frames[0].shape[0], frames[0].shape[1]
    # Keep an explicit handle on the figure rather than relying on plt.gcf()
    # and on the animation object's private ``_fig`` attribute.
    fig = plt.figure(figsize=(width / 72.0, height / 72.0), dpi = 144)
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        # Reuse the single image artist and only swap its pixel data.
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(fig, animate, frames = len(frames), interval=50)
    # Close the figure before embedding; presumably this keeps the inline
    # backend from also rendering it as a static image.
    plt.close(fig)
    display(HTML(anim.to_jshtml()))

In [3]:
from gym.wrappers import AtariPreprocessing
# NOTE(review): this only assigns an attribute on the gym module. The
# deprecation warnings captured in this cell's output still report the old
# step API, and the functions below unpack four values from env.step() --
# confirm whether this line has any effect.
gym.new_step_api=True
env = gym.make(rm) # environment instance handed to train_model below
# model initialization
H = 200 # number of hidden layer neurons
D = 80 * 80 # input dimensionality: 80x80 grid
model = {} # policy network weights, keyed by layer name ('W1', 'W2')
model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization
model['W2'] = np.random.randn(H) / np.sqrt(H) # hidden -> output weights, scaled by sqrt(H)
# hyperparameters
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-3 # step size used in the RMSProp update inside train_model
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
# Both dictionaries mirror the keys and shapes of `model` and start at zero.
grad_buffer = { k : np.zeros_like(v) for k,v in model.items() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.items() } # rmsprop memory

def sigmoid(x):
  """Logistic "squashing" function mapping x into the interval [0, 1].

  Works on scalars and elementwise on arrays. The value is computed as
  exp(-log(1 + exp(-x))) through np.logaddexp, so large negative inputs
  underflow cleanly to 0 instead of overflowing inside np.exp(-x) and
  emitting a RuntimeWarning.
  """
  return np.exp(-np.logaddexp(0.0, -x))

def prepro(I):
  """Convert a raw game frame into a flat binary float vector.

  The frame is cropped to rows 35..194, downsampled by a factor of 2 in both
  spatial axes, and reduced to its first colour channel. Pixels equal to 144
  or 109 (the two background values) or already 0 become 0.0; every other
  pixel (paddles, ball) becomes 1.0.

  Args:
    I: array-like frame indexed as [row, column, channel]. Assumed to be a
      210x160x3 Atari frame so the result has 80*80 entries -- the function
      itself does not check the shape.

  Returns:
    1-D float array of 0.0/1.0 values. The input frame is left unmodified.
  """
  frame = np.asarray(I)
  # Crop and downsample in one slice; channel 0 only.
  frame = frame[35:195:2, ::2, 0]
  # The slice above is a view of the caller's array, so assigning into it
  # would overwrite the observation. Build the mask as a new array instead.
  foreground = (frame != 0) & (frame != 144) & (frame != 109)
  return foreground.astype(float).ravel()

def discount_rewards(r, discount=None):
  """Compute discounted returns backwards through time.

  The running sum is reset whenever a non-zero reward is encountered, since
  in Pong a non-zero reward marks the end of a rally (game boundary).

  Args:
    r: array-like of per-step rewards; any shape is processed in flattened
      order (e.g. 1-D, or the (T, 1) column produced by np.vstack).
    discount: discount factor. When None, the module-level `gamma` is used.

  Returns:
    Float array with the same shape as `r` holding the discounted returns.
  """
  if discount is None:
    discount = gamma
  rewards = np.asarray(r)
  flat = rewards.reshape(-1)
  # Always accumulate in floating point: allocating with zeros_like on an
  # integer reward array would silently truncate every discounted value.
  discounted = np.zeros(flat.shape, dtype=float)
  running_add = 0.0
  for t in reversed(range(flat.size)):
    if flat[t] != 0:
      running_add = 0.0 # reset the sum, since this was a game boundary (pong specific!)
    running_add = running_add * discount + flat[t]
    discounted[t] = running_add
  return discounted.reshape(rewards.shape)

def policy_forward(x):
  """Forward pass of the two-layer policy stored in the module-level `model`.

  Returns a pair: the probability of taking action 2, and the hidden-layer
  activations (kept by the caller for the backward pass).
  """
  pre_activation = model['W1'].dot(x)
  # ReLU nonlinearity: negative pre-activations are replaced by zero.
  hidden = np.where(pre_activation < 0, 0, pre_activation)
  logit = model['W2'].dot(hidden)
  return sigmoid(logit), hidden

def policy_backward(epx, eph, epdlogp):
  """Backward pass over one episode; returns gradients for 'W1' and 'W2'.

  epx holds the stacked network inputs, eph the stacked hidden states and
  epdlogp the stacked output gradients. 'W2' is read from the module-level
  `model`.
  """
  grad_w2 = eph.T.dot(epdlogp).ravel()
  grad_hidden = np.outer(epdlogp, model['W2'])
  # Backprop through the ReLU: no gradient where the hidden unit was inactive.
  grad_hidden = np.where(eph <= 0, 0, grad_hidden)
  grad_w1 = grad_hidden.T.dot(epx)
  return {'W1': grad_w1, 'W2': grad_w2}

def model_step(model, observation, prev_x):
  """Pick the greedy action for one observation.

  Args:
    model: dict with weight arrays under 'W1' and 'W2'. The forward pass is
      evaluated on this argument rather than on the module-level model, so
      the caller's choice of weights is honoured.
    observation: raw frame, forwarded to prepro().
    prev_x: preprocessed previous frame, or None on the first step.

  Returns:
    (action, cur_x): action is 2 when the policy's probability for action 2
    is at least 0.5 and 3 otherwise; cur_x is the preprocessed current frame,
    to be passed back as `prev_x` on the next call.
  """
  # preprocess the observation, set input to network to be difference image
  cur_x = prepro(observation)
  x = cur_x - prev_x if prev_x is not None else np.zeros_like(cur_x)

  # forward the policy network using the weights that were passed in
  hidden = np.dot(model['W1'], x)
  hidden[hidden < 0] = 0 # ReLU nonlinearity
  aprob = sigmoid(np.dot(model['W2'], hidden))

  # deterministic (greedy) choice -- no sampling during playback
  action = 2 if aprob >= 0.5 else 3

  return action, cur_x

def play_game(env, model, max_steps = 1000):
  """Play one episode greedily, print the outcome and show it as an animation.

  Args:
    env: environment exposing reset(), render(mode='rgb_array'), step() with
      a 4-tuple result, and close().
    model: policy weights, forwarded to model_step().
    max_steps: upper bound on the number of environment steps to play.

  The environment is closed before returning, even if a step raises.
  """
  frames = []
  cumulated_reward = 0
  prev_x = None # used in computing the difference frame

  try:
    observation = env.reset()
    for t in range(max_steps):
      frames.append(env.render(mode = 'rgb_array'))
      action, prev_x = model_step(model, observation, prev_x)
      observation, reward, done, info = env.step(action)
      cumulated_reward += reward
      if done:
        print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
        break
    else:
      # Reached only when the step budget ran out without `done`; previously
      # this message was printed after a finished episode as well.
      print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))
  finally:
    env.close()

  if frames: # nothing to animate when max_steps is 0
    display_frames_as_gif(frames)

def train_model(env, model, total_episodes = 100):
  """Train the policy with policy gradients and RMSProp for a number of episodes.

  Args:
    env: environment exposing reset() and step() with a 4-tuple result.
    model: dict of weight arrays, updated in place. policy_forward() and
      policy_backward() read the module-level `model`, so this should be that
      same object.
    total_episodes: number of episodes to play before returning.

  Returns:
    List of (episode_number, reward_sum, running_reward) tuples, one per
    finished episode. Empty when total_episodes is not positive.

  Side effects: mutates `model` and the module-level `grad_buffer` and
  `rmsprop_cache`, and prints one line per finished episode.
  """
  hist = []
  if total_episodes <= 0:
    # Nothing to train; without this guard the loop below would never end.
    return hist

  observation = env.reset()

  prev_x = None # used in computing the difference frame
  xs,hs,dlogps,drs = [],[],[],[]
  running_reward = None
  reward_sum = 0
  episode_number = 0

  while True:
    # preprocess the observation, set input to network to be difference image
    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward
    drs.append(reward) # recorded after step() so it is the reward for the action just taken

    if not done:
      continue

    # --- an episode finished ---
    episode_number += 1

    # stack together all inputs, hidden states, action gradients, and rewards for this episode
    epx = np.vstack(xs)
    eph = np.vstack(hs)
    epdlogp = np.vstack(dlogps)
    # Force float so the in-place normalisation below also works if the
    # environment hands back integer rewards.
    epr = np.vstack(drs).astype(float)
    xs,hs,dlogps,drs = [],[],[],[] # reset array memory

    # compute the discounted reward backwards through time
    discounted_epr = discount_rewards(epr)
    # standardize the rewards to be unit normal (helps control the gradient estimator variance)
    discounted_epr -= np.mean(discounted_epr)
    reward_std = np.std(discounted_epr)
    if reward_std > 0: # all-equal returns would otherwise divide by zero
      discounted_epr /= reward_std

    epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
    grad = policy_backward(epx, eph, epdlogp)
    for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

    # perform rmsprop parameter update every batch_size episodes
    if episode_number % batch_size == 0:
      for k,v in model.items():
        g = grad_buffer[k] # gradient
        rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
        model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
        grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

    # boring book-keeping
    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
    hist.append((episode_number, reward_sum, running_reward))
    # episode_number is an integer counter, so format it with %d rather than %f
    print ('episode %d, reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
    reward_sum = 0
    observation = env.reset() # reset env
    prev_x = None
    if episode_number >= total_episodes:
      return hist

  f"The environment {id} is out of date. You should consider "
  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."


In [4]:
# Train for 6000 episodes and keep the returned per-episode history in hist1;
# the %time magic reports how long the call took.
%time hist1 = train_model(env, model, total_episodes=6000)

  "Core environment is written in old step API which returns one bool instead of two. "


episode 1.000000, reward total was -19.000000. running mean: -19.000000
episode 2.000000, reward total was -21.000000. running mean: -19.020000
episode 3.000000, reward total was -21.000000. running mean: -19.039800
episode 4.000000, reward total was -20.000000. running mean: -19.049402
episode 5.000000, reward total was -21.000000. running mean: -19.068908
episode 6.000000, reward total was -20.000000. running mean: -19.078219
episode 7.000000, reward total was -21.000000. running mean: -19.097437
episode 8.000000, reward total was -19.000000. running mean: -19.096462
episode 9.000000, reward total was -19.000000. running mean: -19.095498
episode 10.000000, reward total was -19.000000. running mean: -19.094543
episode 11.000000, reward total was -20.000000. running mean: -19.103597
episode 12.000000, reward total was -19.000000. running mean: -19.102561
episode 13.000000, reward total was -21.000000. running mean: -19.121536
episode 14.000000, reward total was -20.000000. running mean

episode 114.000000, reward total was -21.000000. running mean: -19.895326
episode 115.000000, reward total was -21.000000. running mean: -19.906372
episode 116.000000, reward total was -21.000000. running mean: -19.917309
episode 117.000000, reward total was -20.000000. running mean: -19.918135
episode 118.000000, reward total was -19.000000. running mean: -19.908954
episode 119.000000, reward total was -19.000000. running mean: -19.899865
episode 120.000000, reward total was -20.000000. running mean: -19.900866
episode 121.000000, reward total was -18.000000. running mean: -19.881857
episode 122.000000, reward total was -18.000000. running mean: -19.863039
episode 123.000000, reward total was -20.000000. running mean: -19.864408
episode 124.000000, reward total was -20.000000. running mean: -19.865764
episode 125.000000, reward total was -20.000000. running mean: -19.867107
episode 126.000000, reward total was -21.000000. running mean: -19.878436
episode 127.000000, reward total was -

episode 225.000000, reward total was -21.000000. running mean: -20.076111
episode 226.000000, reward total was -20.000000. running mean: -20.075350
episode 227.000000, reward total was -21.000000. running mean: -20.084596
episode 228.000000, reward total was -20.000000. running mean: -20.083750
episode 229.000000, reward total was -20.000000. running mean: -20.082913
episode 230.000000, reward total was -20.000000. running mean: -20.082084
episode 231.000000, reward total was -16.000000. running mean: -20.041263
episode 232.000000, reward total was -20.000000. running mean: -20.040850
episode 233.000000, reward total was -20.000000. running mean: -20.040442
episode 234.000000, reward total was -17.000000. running mean: -20.010037
episode 235.000000, reward total was -17.000000. running mean: -19.979937
episode 236.000000, reward total was -19.000000. running mean: -19.970137
episode 237.000000, reward total was -19.000000. running mean: -19.960436
episode 238.000000, reward total was -

episode 336.000000, reward total was -15.000000. running mean: -19.858853
episode 337.000000, reward total was -19.000000. running mean: -19.850265
episode 338.000000, reward total was -20.000000. running mean: -19.851762
episode 339.000000, reward total was -19.000000. running mean: -19.843244
episode 340.000000, reward total was -18.000000. running mean: -19.824812
episode 341.000000, reward total was -20.000000. running mean: -19.826564
episode 342.000000, reward total was -21.000000. running mean: -19.838298
episode 343.000000, reward total was -21.000000. running mean: -19.849915
episode 344.000000, reward total was -21.000000. running mean: -19.861416
episode 345.000000, reward total was -20.000000. running mean: -19.862802
episode 346.000000, reward total was -21.000000. running mean: -19.874174
episode 347.000000, reward total was -17.000000. running mean: -19.845432
episode 348.000000, reward total was -20.000000. running mean: -19.846978
episode 349.000000, reward total was -

episode 447.000000, reward total was -21.000000. running mean: -19.686736
episode 448.000000, reward total was -20.000000. running mean: -19.689869
episode 449.000000, reward total was -17.000000. running mean: -19.662970
episode 450.000000, reward total was -18.000000. running mean: -19.646340
episode 451.000000, reward total was -19.000000. running mean: -19.639877
episode 452.000000, reward total was -19.000000. running mean: -19.633478
episode 453.000000, reward total was -20.000000. running mean: -19.637143
episode 454.000000, reward total was -20.000000. running mean: -19.640772
episode 455.000000, reward total was -21.000000. running mean: -19.654364
episode 456.000000, reward total was -20.000000. running mean: -19.657820
episode 457.000000, reward total was -19.000000. running mean: -19.651242
episode 458.000000, reward total was -18.000000. running mean: -19.634730
episode 459.000000, reward total was -18.000000. running mean: -19.618383
episode 460.000000, reward total was -

episode 558.000000, reward total was -18.000000. running mean: -19.461539
episode 559.000000, reward total was -21.000000. running mean: -19.476924
episode 560.000000, reward total was -20.000000. running mean: -19.482155
episode 561.000000, reward total was -18.000000. running mean: -19.467333
episode 562.000000, reward total was -20.000000. running mean: -19.472660
episode 563.000000, reward total was -20.000000. running mean: -19.477933
episode 564.000000, reward total was -18.000000. running mean: -19.463154
episode 565.000000, reward total was -19.000000. running mean: -19.458522
episode 566.000000, reward total was -17.000000. running mean: -19.433937
episode 567.000000, reward total was -18.000000. running mean: -19.419598
episode 568.000000, reward total was -19.000000. running mean: -19.415402
episode 569.000000, reward total was -18.000000. running mean: -19.401248
episode 570.000000, reward total was -20.000000. running mean: -19.407235
episode 571.000000, reward total was -

episode 669.000000, reward total was -18.000000. running mean: -19.150249
episode 670.000000, reward total was -20.000000. running mean: -19.158747
episode 671.000000, reward total was -20.000000. running mean: -19.167159
episode 672.000000, reward total was -17.000000. running mean: -19.145488
episode 673.000000, reward total was -19.000000. running mean: -19.144033
episode 674.000000, reward total was -20.000000. running mean: -19.152592
episode 675.000000, reward total was -20.000000. running mean: -19.161066
episode 676.000000, reward total was -19.000000. running mean: -19.159456
episode 677.000000, reward total was -21.000000. running mean: -19.177861
episode 678.000000, reward total was -18.000000. running mean: -19.166083
episode 679.000000, reward total was -18.000000. running mean: -19.154422
episode 680.000000, reward total was -20.000000. running mean: -19.162878
episode 681.000000, reward total was -17.000000. running mean: -19.141249
episode 682.000000, reward total was -

episode 780.000000, reward total was -21.000000. running mean: -18.620221
episode 781.000000, reward total was -21.000000. running mean: -18.644019
episode 782.000000, reward total was -18.000000. running mean: -18.637578
episode 783.000000, reward total was -16.000000. running mean: -18.611203
episode 784.000000, reward total was -20.000000. running mean: -18.625091
episode 785.000000, reward total was -17.000000. running mean: -18.608840
episode 786.000000, reward total was -21.000000. running mean: -18.632751
episode 787.000000, reward total was -20.000000. running mean: -18.646424
episode 788.000000, reward total was -18.000000. running mean: -18.639960
episode 789.000000, reward total was -15.000000. running mean: -18.603560
episode 790.000000, reward total was -17.000000. running mean: -18.587524
episode 791.000000, reward total was -20.000000. running mean: -18.601649
episode 792.000000, reward total was -21.000000. running mean: -18.625633
episode 793.000000, reward total was -

episode 891.000000, reward total was -19.000000. running mean: -18.187291
episode 892.000000, reward total was -16.000000. running mean: -18.165418
episode 893.000000, reward total was -21.000000. running mean: -18.193764
episode 894.000000, reward total was -16.000000. running mean: -18.171827
episode 895.000000, reward total was -13.000000. running mean: -18.120108
episode 896.000000, reward total was -19.000000. running mean: -18.128907
episode 897.000000, reward total was -18.000000. running mean: -18.127618
episode 898.000000, reward total was -16.000000. running mean: -18.106342
episode 899.000000, reward total was -16.000000. running mean: -18.085279
episode 900.000000, reward total was -18.000000. running mean: -18.084426
episode 901.000000, reward total was -18.000000. running mean: -18.083581
episode 902.000000, reward total was -20.000000. running mean: -18.102746
episode 903.000000, reward total was -19.000000. running mean: -18.111718
episode 904.000000, reward total was -

episode 1002.000000, reward total was -15.000000. running mean: -17.716129
episode 1003.000000, reward total was -17.000000. running mean: -17.708967
episode 1004.000000, reward total was -15.000000. running mean: -17.681878
episode 1005.000000, reward total was -20.000000. running mean: -17.705059
episode 1006.000000, reward total was -14.000000. running mean: -17.668008
episode 1007.000000, reward total was -19.000000. running mean: -17.681328
episode 1008.000000, reward total was -16.000000. running mean: -17.664515
episode 1009.000000, reward total was -20.000000. running mean: -17.687870
episode 1010.000000, reward total was -18.000000. running mean: -17.690991
episode 1011.000000, reward total was -17.000000. running mean: -17.684081
episode 1012.000000, reward total was -16.000000. running mean: -17.667240
episode 1013.000000, reward total was -14.000000. running mean: -17.630568
episode 1014.000000, reward total was -18.000000. running mean: -17.634262
episode 1015.000000, rewa

episode 1112.000000, reward total was -17.000000. running mean: -17.064726
episode 1113.000000, reward total was -10.000000. running mean: -16.994078
episode 1114.000000, reward total was -16.000000. running mean: -16.984137
episode 1115.000000, reward total was -17.000000. running mean: -16.984296
episode 1116.000000, reward total was -12.000000. running mean: -16.934453
episode 1117.000000, reward total was -19.000000. running mean: -16.955109
episode 1118.000000, reward total was -19.000000. running mean: -16.975558
episode 1119.000000, reward total was -15.000000. running mean: -16.955802
episode 1120.000000, reward total was -17.000000. running mean: -16.956244
episode 1121.000000, reward total was -16.000000. running mean: -16.946681
episode 1122.000000, reward total was -18.000000. running mean: -16.957215
episode 1123.000000, reward total was -13.000000. running mean: -16.917643
episode 1124.000000, reward total was -16.000000. running mean: -16.908466
episode 1125.000000, rewa

episode 1222.000000, reward total was -19.000000. running mean: -16.407609
episode 1223.000000, reward total was -20.000000. running mean: -16.443533
episode 1224.000000, reward total was -16.000000. running mean: -16.439097
episode 1225.000000, reward total was -15.000000. running mean: -16.424706
episode 1226.000000, reward total was -19.000000. running mean: -16.450459
episode 1227.000000, reward total was -17.000000. running mean: -16.455955
episode 1228.000000, reward total was -17.000000. running mean: -16.461395
episode 1229.000000, reward total was -18.000000. running mean: -16.476781
episode 1230.000000, reward total was -15.000000. running mean: -16.462013
episode 1231.000000, reward total was -18.000000. running mean: -16.477393
episode 1232.000000, reward total was -21.000000. running mean: -16.522619
episode 1233.000000, reward total was -13.000000. running mean: -16.487393
episode 1234.000000, reward total was -21.000000. running mean: -16.532519
episode 1235.000000, rewa

episode 1332.000000, reward total was -14.000000. running mean: -15.793651
episode 1333.000000, reward total was -16.000000. running mean: -15.795714
episode 1334.000000, reward total was -17.000000. running mean: -15.807757
episode 1335.000000, reward total was -13.000000. running mean: -15.779679
episode 1336.000000, reward total was -17.000000. running mean: -15.791883
episode 1337.000000, reward total was -12.000000. running mean: -15.753964
episode 1338.000000, reward total was -19.000000. running mean: -15.786424
episode 1339.000000, reward total was -11.000000. running mean: -15.738560
episode 1340.000000, reward total was -16.000000. running mean: -15.741174
episode 1341.000000, reward total was -19.000000. running mean: -15.773763
episode 1342.000000, reward total was -17.000000. running mean: -15.786025
episode 1343.000000, reward total was -12.000000. running mean: -15.748165
episode 1344.000000, reward total was -19.000000. running mean: -15.780683
episode 1345.000000, rewa

episode 1442.000000, reward total was -8.000000. running mean: -15.267857
episode 1443.000000, reward total was -13.000000. running mean: -15.245179
episode 1444.000000, reward total was -15.000000. running mean: -15.242727
episode 1445.000000, reward total was -19.000000. running mean: -15.280300
episode 1446.000000, reward total was -12.000000. running mean: -15.247497
episode 1447.000000, reward total was -13.000000. running mean: -15.225022
episode 1448.000000, reward total was -14.000000. running mean: -15.212772
episode 1449.000000, reward total was -9.000000. running mean: -15.150644
episode 1450.000000, reward total was -12.000000. running mean: -15.119137
episode 1451.000000, reward total was -10.000000. running mean: -15.067946
episode 1452.000000, reward total was -14.000000. running mean: -15.057267
episode 1453.000000, reward total was -17.000000. running mean: -15.076694
episode 1454.000000, reward total was -20.000000. running mean: -15.125927
episode 1455.000000, reward

episode 1552.000000, reward total was -7.000000. running mean: -14.765682
episode 1553.000000, reward total was -13.000000. running mean: -14.748025
episode 1554.000000, reward total was -16.000000. running mean: -14.760545
episode 1555.000000, reward total was -11.000000. running mean: -14.722940
episode 1556.000000, reward total was -18.000000. running mean: -14.755710
episode 1557.000000, reward total was -15.000000. running mean: -14.758153
episode 1558.000000, reward total was -14.000000. running mean: -14.750572
episode 1559.000000, reward total was -8.000000. running mean: -14.683066
episode 1560.000000, reward total was -17.000000. running mean: -14.706235
episode 1561.000000, reward total was -9.000000. running mean: -14.649173
episode 1562.000000, reward total was -18.000000. running mean: -14.682681
episode 1563.000000, reward total was -9.000000. running mean: -14.625854
episode 1564.000000, reward total was -20.000000. running mean: -14.679596
episode 1565.000000, reward t

episode 1662.000000, reward total was -12.000000. running mean: -14.297564
episode 1663.000000, reward total was -17.000000. running mean: -14.324588
episode 1664.000000, reward total was -12.000000. running mean: -14.301342
episode 1665.000000, reward total was -9.000000. running mean: -14.248329
episode 1666.000000, reward total was -15.000000. running mean: -14.255845
episode 1667.000000, reward total was -9.000000. running mean: -14.203287
episode 1668.000000, reward total was -14.000000. running mean: -14.201254
episode 1669.000000, reward total was -17.000000. running mean: -14.229242
episode 1670.000000, reward total was -15.000000. running mean: -14.236949
episode 1671.000000, reward total was -10.000000. running mean: -14.194580
episode 1672.000000, reward total was -12.000000. running mean: -14.172634
episode 1673.000000, reward total was -8.000000. running mean: -14.110908
episode 1674.000000, reward total was -19.000000. running mean: -14.159798
episode 1675.000000, reward 

episode 1772.000000, reward total was -13.000000. running mean: -13.434444
episode 1773.000000, reward total was -11.000000. running mean: -13.410100
episode 1774.000000, reward total was -15.000000. running mean: -13.425999
episode 1775.000000, reward total was -19.000000. running mean: -13.481739
episode 1776.000000, reward total was -8.000000. running mean: -13.426921
episode 1777.000000, reward total was -15.000000. running mean: -13.442652
episode 1778.000000, reward total was -13.000000. running mean: -13.438225
episode 1779.000000, reward total was -13.000000. running mean: -13.433843
episode 1780.000000, reward total was -18.000000. running mean: -13.479505
episode 1781.000000, reward total was -10.000000. running mean: -13.444710
episode 1782.000000, reward total was -18.000000. running mean: -13.490263
episode 1783.000000, reward total was -16.000000. running mean: -13.515360
episode 1784.000000, reward total was -13.000000. running mean: -13.510206
episode 1785.000000, rewar

episode 1882.000000, reward total was -13.000000. running mean: -13.019871
episode 1883.000000, reward total was -19.000000. running mean: -13.079672
episode 1884.000000, reward total was -13.000000. running mean: -13.078875
episode 1885.000000, reward total was -9.000000. running mean: -13.038086
episode 1886.000000, reward total was -12.000000. running mean: -13.027706
episode 1887.000000, reward total was -13.000000. running mean: -13.027429
episode 1888.000000, reward total was -15.000000. running mean: -13.047154
episode 1889.000000, reward total was -11.000000. running mean: -13.026683
episode 1890.000000, reward total was -18.000000. running mean: -13.076416
episode 1891.000000, reward total was -11.000000. running mean: -13.055652
episode 1892.000000, reward total was -11.000000. running mean: -13.035095
episode 1893.000000, reward total was -7.000000. running mean: -12.974744
episode 1894.000000, reward total was -17.000000. running mean: -13.014997
episode 1895.000000, reward

episode 1992.000000, reward total was -16.000000. running mean: -12.015324
episode 1993.000000, reward total was -5.000000. running mean: -11.945171
episode 1994.000000, reward total was -5.000000. running mean: -11.875719
episode 1995.000000, reward total was -9.000000. running mean: -11.846962
episode 1996.000000, reward total was -12.000000. running mean: -11.848492
episode 1997.000000, reward total was -17.000000. running mean: -11.900007
episode 1998.000000, reward total was -20.000000. running mean: -11.981007
episode 1999.000000, reward total was -2.000000. running mean: -11.881197
episode 2000.000000, reward total was -3.000000. running mean: -11.792385
episode 2001.000000, reward total was -14.000000. running mean: -11.814461
episode 2002.000000, reward total was -9.000000. running mean: -11.786317
episode 2003.000000, reward total was -10.000000. running mean: -11.768454
episode 2004.000000, reward total was -8.000000. running mean: -11.730769
episode 2005.000000, reward tota

episode 2102.000000, reward total was -1.000000. running mean: -11.240212
episode 2103.000000, reward total was -20.000000. running mean: -11.327810
episode 2104.000000, reward total was -16.000000. running mean: -11.374532
episode 2105.000000, reward total was -8.000000. running mean: -11.340787
episode 2106.000000, reward total was -12.000000. running mean: -11.347379
episode 2107.000000, reward total was -10.000000. running mean: -11.333905
episode 2108.000000, reward total was -13.000000. running mean: -11.350566
episode 2109.000000, reward total was -14.000000. running mean: -11.377060
episode 2110.000000, reward total was -14.000000. running mean: -11.403290
episode 2111.000000, reward total was -7.000000. running mean: -11.359257
episode 2112.000000, reward total was -15.000000. running mean: -11.395664
episode 2113.000000, reward total was -8.000000. running mean: -11.361708
episode 2114.000000, reward total was -14.000000. running mean: -11.388091
episode 2115.000000, reward t

episode 2212.000000, reward total was -13.000000. running mean: -11.088341
episode 2213.000000, reward total was -10.000000. running mean: -11.077458
episode 2214.000000, reward total was -16.000000. running mean: -11.126683
episode 2215.000000, reward total was -14.000000. running mean: -11.155416
episode 2216.000000, reward total was -6.000000. running mean: -11.103862
episode 2217.000000, reward total was -15.000000. running mean: -11.142823
episode 2218.000000, reward total was -7.000000. running mean: -11.101395
episode 2219.000000, reward total was -10.000000. running mean: -11.090381
episode 2220.000000, reward total was -6.000000. running mean: -11.039477
episode 2221.000000, reward total was -13.000000. running mean: -11.059083
episode 2222.000000, reward total was -8.000000. running mean: -11.028492
episode 2223.000000, reward total was -12.000000. running mean: -11.038207
episode 2224.000000, reward total was -7.000000. running mean: -10.997825
episode 2225.000000, reward to

episode 2322.000000, reward total was -17.000000. running mean: -10.541691
episode 2323.000000, reward total was -15.000000. running mean: -10.586274
episode 2324.000000, reward total was -10.000000. running mean: -10.580411
episode 2325.000000, reward total was -7.000000. running mean: -10.544607
episode 2326.000000, reward total was -12.000000. running mean: -10.559161
episode 2327.000000, reward total was -1.000000. running mean: -10.463569
episode 2328.000000, reward total was -9.000000. running mean: -10.448934
episode 2329.000000, reward total was -9.000000. running mean: -10.434444
episode 2330.000000, reward total was -3.000000. running mean: -10.360100
episode 2331.000000, reward total was -9.000000. running mean: -10.346499
episode 2332.000000, reward total was -11.000000. running mean: -10.353034
episode 2333.000000, reward total was -11.000000. running mean: -10.359503
episode 2334.000000, reward total was -8.000000. running mean: -10.335908
episode 2335.000000, reward tota

episode 2433.000000, reward total was -9.000000. running mean: -9.418493
episode 2434.000000, reward total was -14.000000. running mean: -9.464308
episode 2435.000000, reward total was -2.000000. running mean: -9.389665
episode 2436.000000, reward total was -15.000000. running mean: -9.445768
episode 2437.000000, reward total was -11.000000. running mean: -9.461311
episode 2438.000000, reward total was -7.000000. running mean: -9.436698
episode 2439.000000, reward total was -17.000000. running mean: -9.512331
episode 2440.000000, reward total was -8.000000. running mean: -9.497207
episode 2441.000000, reward total was -7.000000. running mean: -9.472235
episode 2442.000000, reward total was -16.000000. running mean: -9.537513
episode 2443.000000, reward total was -9.000000. running mean: -9.532138
episode 2444.000000, reward total was -11.000000. running mean: -9.546816
episode 2445.000000, reward total was -13.000000. running mean: -9.581348
episode 2446.000000, reward total was -5.000

episode 2545.000000, reward total was -7.000000. running mean: -9.385756
episode 2546.000000, reward total was -2.000000. running mean: -9.311898
episode 2547.000000, reward total was -11.000000. running mean: -9.328779
episode 2548.000000, reward total was -8.000000. running mean: -9.315492
episode 2549.000000, reward total was -12.000000. running mean: -9.342337
episode 2550.000000, reward total was -10.000000. running mean: -9.348913
episode 2551.000000, reward total was -17.000000. running mean: -9.425424
episode 2552.000000, reward total was -11.000000. running mean: -9.441170
episode 2553.000000, reward total was -11.000000. running mean: -9.456758
episode 2554.000000, reward total was -15.000000. running mean: -9.512191
episode 2555.000000, reward total was -8.000000. running mean: -9.497069
episode 2556.000000, reward total was -9.000000. running mean: -9.492098
episode 2557.000000, reward total was -2.000000. running mean: -9.417177
episode 2558.000000, reward total was -10.00

episode 2657.000000, reward total was -14.000000. running mean: -9.283272
episode 2658.000000, reward total was -6.000000. running mean: -9.250439
episode 2659.000000, reward total was -5.000000. running mean: -9.207935
episode 2660.000000, reward total was -14.000000. running mean: -9.255855
episode 2661.000000, reward total was -7.000000. running mean: -9.233297
episode 2662.000000, reward total was -10.000000. running mean: -9.240964
episode 2663.000000, reward total was -14.000000. running mean: -9.288554
episode 2664.000000, reward total was -7.000000. running mean: -9.265669
episode 2665.000000, reward total was -12.000000. running mean: -9.293012
episode 2666.000000, reward total was -11.000000. running mean: -9.310082
episode 2667.000000, reward total was -12.000000. running mean: -9.336981
episode 2668.000000, reward total was -11.000000. running mean: -9.353611
episode 2669.000000, reward total was -9.000000. running mean: -9.350075
episode 2670.000000, reward total was -8.00

episode 2769.000000, reward total was -13.000000. running mean: -9.254058
episode 2770.000000, reward total was -7.000000. running mean: -9.231518
episode 2771.000000, reward total was -11.000000. running mean: -9.249202
episode 2772.000000, reward total was 6.000000. running mean: -9.096710
episode 2773.000000, reward total was -13.000000. running mean: -9.135743
episode 2774.000000, reward total was -14.000000. running mean: -9.184386
episode 2775.000000, reward total was -12.000000. running mean: -9.212542
episode 2776.000000, reward total was -2.000000. running mean: -9.140417
episode 2777.000000, reward total was -10.000000. running mean: -9.149012
episode 2778.000000, reward total was -11.000000. running mean: -9.167522
episode 2779.000000, reward total was -3.000000. running mean: -9.105847
episode 2780.000000, reward total was -16.000000. running mean: -9.174789
episode 2781.000000, reward total was -13.000000. running mean: -9.213041
episode 2782.000000, reward total was -13.0

episode 2881.000000, reward total was -13.000000. running mean: -9.062506
episode 2882.000000, reward total was -10.000000. running mean: -9.071881
episode 2883.000000, reward total was -6.000000. running mean: -9.041163
episode 2884.000000, reward total was -4.000000. running mean: -8.990751
episode 2885.000000, reward total was -3.000000. running mean: -8.930843
episode 2886.000000, reward total was -9.000000. running mean: -8.931535
episode 2887.000000, reward total was 3.000000. running mean: -8.812220
episode 2888.000000, reward total was -2.000000. running mean: -8.744097
episode 2889.000000, reward total was -11.000000. running mean: -8.766656
episode 2890.000000, reward total was -2.000000. running mean: -8.698990
episode 2891.000000, reward total was -9.000000. running mean: -8.702000
episode 2892.000000, reward total was -10.000000. running mean: -8.714980
episode 2893.000000, reward total was -12.000000. running mean: -8.747830
episode 2894.000000, reward total was -6.000000

episode 2993.000000, reward total was -7.000000. running mean: -7.722380
episode 2994.000000, reward total was -12.000000. running mean: -7.765156
episode 2995.000000, reward total was -3.000000. running mean: -7.717504
episode 2996.000000, reward total was -13.000000. running mean: -7.770329
episode 2997.000000, reward total was -14.000000. running mean: -7.832626
episode 2998.000000, reward total was -13.000000. running mean: -7.884300
episode 2999.000000, reward total was -11.000000. running mean: -7.915457
episode 3000.000000, reward total was -9.000000. running mean: -7.926302
episode 3001.000000, reward total was 5.000000. running mean: -7.797039
episode 3002.000000, reward total was 3.000000. running mean: -7.689069
episode 3003.000000, reward total was -7.000000. running mean: -7.682178
episode 3004.000000, reward total was -7.000000. running mean: -7.675356
episode 3005.000000, reward total was -15.000000. running mean: -7.748603
episode 3006.000000, reward total was -7.000000

episode 3105.000000, reward total was -9.000000. running mean: -8.245038
episode 3106.000000, reward total was -11.000000. running mean: -8.272587
episode 3107.000000, reward total was -8.000000. running mean: -8.269861
episode 3108.000000, reward total was -9.000000. running mean: -8.277163
episode 3109.000000, reward total was -9.000000. running mean: -8.284391
episode 3110.000000, reward total was -9.000000. running mean: -8.291547
episode 3111.000000, reward total was -10.000000. running mean: -8.308632
episode 3112.000000, reward total was 5.000000. running mean: -8.175545
episode 3113.000000, reward total was -11.000000. running mean: -8.203790
episode 3114.000000, reward total was -7.000000. running mean: -8.191752
episode 3115.000000, reward total was -13.000000. running mean: -8.239834
episode 3116.000000, reward total was 1.000000. running mean: -8.147436
episode 3117.000000, reward total was -5.000000. running mean: -8.115962
episode 3118.000000, reward total was -16.000000.

episode 3217.000000, reward total was -7.000000. running mean: -7.588606
episode 3218.000000, reward total was -9.000000. running mean: -7.602720
episode 3219.000000, reward total was -3.000000. running mean: -7.556692
episode 3220.000000, reward total was -5.000000. running mean: -7.531126
episode 3221.000000, reward total was -11.000000. running mean: -7.565814
episode 3222.000000, reward total was -12.000000. running mean: -7.610156
episode 3223.000000, reward total was -1.000000. running mean: -7.544055
episode 3224.000000, reward total was -11.000000. running mean: -7.578614
episode 3225.000000, reward total was -3.000000. running mean: -7.532828
episode 3226.000000, reward total was -2.000000. running mean: -7.477500
episode 3227.000000, reward total was -9.000000. running mean: -7.492725
episode 3228.000000, reward total was -7.000000. running mean: -7.487797
episode 3229.000000, reward total was -16.000000. running mean: -7.572919
episode 3230.000000, reward total was -9.000000

episode 3330.000000, reward total was -11.000000. running mean: -6.495928
episode 3331.000000, reward total was -5.000000. running mean: -6.480969
episode 3332.000000, reward total was -3.000000. running mean: -6.446159
episode 3333.000000, reward total was -14.000000. running mean: -6.521698
episode 3334.000000, reward total was -7.000000. running mean: -6.526481
episode 3335.000000, reward total was 3.000000. running mean: -6.431216
episode 3336.000000, reward total was -5.000000. running mean: -6.416904
episode 3337.000000, reward total was -8.000000. running mean: -6.432735
episode 3338.000000, reward total was -11.000000. running mean: -6.478407
episode 3339.000000, reward total was -8.000000. running mean: -6.493623
episode 3340.000000, reward total was -9.000000. running mean: -6.518687
episode 3341.000000, reward total was -3.000000. running mean: -6.483500
episode 3342.000000, reward total was -3.000000. running mean: -6.448665
episode 3343.000000, reward total was -5.000000. 

episode 3443.000000, reward total was -1.000000. running mean: -5.899052
episode 3444.000000, reward total was -5.000000. running mean: -5.890061
episode 3445.000000, reward total was -10.000000. running mean: -5.931160
episode 3446.000000, reward total was -9.000000. running mean: -5.961849
episode 3447.000000, reward total was -11.000000. running mean: -6.012230
episode 3448.000000, reward total was -11.000000. running mean: -6.062108
episode 3449.000000, reward total was -8.000000. running mean: -6.081487
episode 3450.000000, reward total was -11.000000. running mean: -6.130672
episode 3451.000000, reward total was -7.000000. running mean: -6.139365
episode 3452.000000, reward total was -13.000000. running mean: -6.207972
episode 3453.000000, reward total was -4.000000. running mean: -6.185892
episode 3454.000000, reward total was -3.000000. running mean: -6.154033
episode 3455.000000, reward total was -6.000000. running mean: -6.152493
episode 3456.000000, reward total was 5.000000

episode 3555.000000, reward total was 6.000000. running mean: -6.328152
episode 3556.000000, reward total was -8.000000. running mean: -6.344871
episode 3557.000000, reward total was 2.000000. running mean: -6.261422
episode 3558.000000, reward total was -9.000000. running mean: -6.288808
episode 3559.000000, reward total was 3.000000. running mean: -6.195920
episode 3560.000000, reward total was -15.000000. running mean: -6.283960
episode 3561.000000, reward total was -7.000000. running mean: -6.291121
episode 3562.000000, reward total was 7.000000. running mean: -6.158210
episode 3563.000000, reward total was -11.000000. running mean: -6.206627
episode 3564.000000, reward total was -8.000000. running mean: -6.224561
episode 3565.000000, reward total was -7.000000. running mean: -6.232316
episode 3566.000000, reward total was -13.000000. running mean: -6.299992
episode 3567.000000, reward total was -9.000000. running mean: -6.326993
episode 3568.000000, reward total was -2.000000. run

episode 3667.000000, reward total was -3.000000. running mean: -6.678996
episode 3668.000000, reward total was -8.000000. running mean: -6.692206
episode 3669.000000, reward total was -8.000000. running mean: -6.705284
episode 3670.000000, reward total was -11.000000. running mean: -6.748231
episode 3671.000000, reward total was -9.000000. running mean: -6.770748
episode 3672.000000, reward total was -3.000000. running mean: -6.733041
episode 3673.000000, reward total was -10.000000. running mean: -6.765711
episode 3674.000000, reward total was -9.000000. running mean: -6.788053
episode 3675.000000, reward total was -12.000000. running mean: -6.840173
episode 3676.000000, reward total was 1.000000. running mean: -6.761771
episode 3677.000000, reward total was -4.000000. running mean: -6.734153
episode 3678.000000, reward total was -1.000000. running mean: -6.676812
episode 3679.000000, reward total was -11.000000. running mean: -6.720044
episode 3680.000000, reward total was -3.000000.

episode 3780.000000, reward total was -5.000000. running mean: -5.621590
episode 3781.000000, reward total was -15.000000. running mean: -5.715374
episode 3782.000000, reward total was 8.000000. running mean: -5.578221
episode 3783.000000, reward total was -8.000000. running mean: -5.602438
episode 3784.000000, reward total was -10.000000. running mean: -5.646414
episode 3785.000000, reward total was -9.000000. running mean: -5.679950
episode 3786.000000, reward total was -9.000000. running mean: -5.713150
episode 3787.000000, reward total was 5.000000. running mean: -5.606019
episode 3788.000000, reward total was 7.000000. running mean: -5.479959
episode 3789.000000, reward total was -2.000000. running mean: -5.445159
episode 3790.000000, reward total was -5.000000. running mean: -5.440707
episode 3791.000000, reward total was 3.000000. running mean: -5.356300
episode 3792.000000, reward total was -8.000000. running mean: -5.382737
episode 3793.000000, reward total was -4.000000. runn

episode 3893.000000, reward total was -11.000000. running mean: -5.687503
episode 3894.000000, reward total was 10.000000. running mean: -5.530628
episode 3895.000000, reward total was -11.000000. running mean: -5.585322
episode 3896.000000, reward total was -8.000000. running mean: -5.609468
episode 3897.000000, reward total was 5.000000. running mean: -5.503374
episode 3898.000000, reward total was -11.000000. running mean: -5.558340
episode 3899.000000, reward total was -9.000000. running mean: -5.592757
episode 3900.000000, reward total was -9.000000. running mean: -5.626829
episode 3901.000000, reward total was -4.000000. running mean: -5.610561
episode 3902.000000, reward total was -11.000000. running mean: -5.664455
episode 3903.000000, reward total was 1.000000. running mean: -5.597811
episode 3904.000000, reward total was -3.000000. running mean: -5.571832
episode 3905.000000, reward total was -3.000000. running mean: -5.546114
episode 3906.000000, reward total was -5.000000. 

episode 4006.000000, reward total was -5.000000. running mean: -5.583866
episode 4007.000000, reward total was -13.000000. running mean: -5.658028
episode 4008.000000, reward total was -5.000000. running mean: -5.651448
episode 4009.000000, reward total was -14.000000. running mean: -5.734933
episode 4010.000000, reward total was -7.000000. running mean: -5.747584
episode 4011.000000, reward total was 3.000000. running mean: -5.660108
episode 4012.000000, reward total was -1.000000. running mean: -5.613507
episode 4013.000000, reward total was -5.000000. running mean: -5.607372
episode 4014.000000, reward total was -13.000000. running mean: -5.681298
episode 4015.000000, reward total was -3.000000. running mean: -5.654485
episode 4016.000000, reward total was -15.000000. running mean: -5.747940
episode 4017.000000, reward total was -6.000000. running mean: -5.750461
episode 4018.000000, reward total was -11.000000. running mean: -5.802956
episode 4019.000000, reward total was 1.000000.

episode 4119.000000, reward total was -14.000000. running mean: -5.674077
episode 4120.000000, reward total was -7.000000. running mean: -5.687336
episode 4121.000000, reward total was -1.000000. running mean: -5.640463
episode 4122.000000, reward total was 1.000000. running mean: -5.574058
episode 4123.000000, reward total was 5.000000. running mean: -5.468317
episode 4124.000000, reward total was 5.000000. running mean: -5.363634
episode 4125.000000, reward total was -7.000000. running mean: -5.379998
episode 4126.000000, reward total was -4.000000. running mean: -5.366198
episode 4127.000000, reward total was -4.000000. running mean: -5.352536
episode 4128.000000, reward total was -2.000000. running mean: -5.319011
episode 4129.000000, reward total was -9.000000. running mean: -5.355820
episode 4130.000000, reward total was -8.000000. running mean: -5.382262
episode 4131.000000, reward total was -5.000000. running mean: -5.378440
episode 4132.000000, reward total was 1.000000. runni

episode 4232.000000, reward total was 5.000000. running mean: -4.171542
episode 4233.000000, reward total was -9.000000. running mean: -4.219827
episode 4234.000000, reward total was 8.000000. running mean: -4.097629
episode 4235.000000, reward total was -4.000000. running mean: -4.096652
episode 4236.000000, reward total was 7.000000. running mean: -3.985686
episode 4237.000000, reward total was -7.000000. running mean: -4.015829
episode 4238.000000, reward total was -9.000000. running mean: -4.065671
episode 4239.000000, reward total was -10.000000. running mean: -4.125014
episode 4240.000000, reward total was -10.000000. running mean: -4.183764
episode 4241.000000, reward total was -5.000000. running mean: -4.191926
episode 4242.000000, reward total was -8.000000. running mean: -4.230007
episode 4243.000000, reward total was -7.000000. running mean: -4.257707
episode 4244.000000, reward total was -9.000000. running mean: -4.305130
episode 4245.000000, reward total was -3.000000. run

episode 4345.000000, reward total was -9.000000. running mean: -4.697634
episode 4346.000000, reward total was -3.000000. running mean: -4.680657
episode 4347.000000, reward total was 2.000000. running mean: -4.613851
episode 4348.000000, reward total was -15.000000. running mean: -4.717712
episode 4349.000000, reward total was -12.000000. running mean: -4.790535
episode 4350.000000, reward total was -4.000000. running mean: -4.782630
episode 4351.000000, reward total was -7.000000. running mean: -4.804803
episode 4352.000000, reward total was -12.000000. running mean: -4.876755
episode 4353.000000, reward total was -16.000000. running mean: -4.987988
episode 4354.000000, reward total was -10.000000. running mean: -5.038108
episode 4355.000000, reward total was -9.000000. running mean: -5.077727
episode 4356.000000, reward total was -9.000000. running mean: -5.116950
episode 4357.000000, reward total was -2.000000. running mean: -5.085780
episode 4358.000000, reward total was 1.000000.

episode 4458.000000, reward total was -11.000000. running mean: -5.272429
episode 4459.000000, reward total was -7.000000. running mean: -5.289705
episode 4460.000000, reward total was -3.000000. running mean: -5.266808
episode 4461.000000, reward total was -11.000000. running mean: -5.324140
episode 4462.000000, reward total was -11.000000. running mean: -5.380898
episode 4463.000000, reward total was -8.000000. running mean: -5.407089
episode 4464.000000, reward total was -10.000000. running mean: -5.453019
episode 4465.000000, reward total was 9.000000. running mean: -5.308488
episode 4466.000000, reward total was -10.000000. running mean: -5.355404
episode 4467.000000, reward total was -2.000000. running mean: -5.321849
episode 4468.000000, reward total was -7.000000. running mean: -5.338631
episode 4469.000000, reward total was -1.000000. running mean: -5.295245
episode 4470.000000, reward total was -4.000000. running mean: -5.282292
episode 4471.000000, reward total was -9.000000

episode 4571.000000, reward total was -11.000000. running mean: -4.095061
episode 4572.000000, reward total was 1.000000. running mean: -4.044110
episode 4573.000000, reward total was -3.000000. running mean: -4.033669
episode 4574.000000, reward total was -1.000000. running mean: -4.003332
episode 4575.000000, reward total was 6.000000. running mean: -3.903299
episode 4576.000000, reward total was -5.000000. running mean: -3.914266
episode 4577.000000, reward total was -4.000000. running mean: -3.915123
episode 4578.000000, reward total was -9.000000. running mean: -3.965972
episode 4579.000000, reward total was 6.000000. running mean: -3.866313
episode 4580.000000, reward total was -3.000000. running mean: -3.857649
episode 4581.000000, reward total was -4.000000. running mean: -3.859073
episode 4582.000000, reward total was -1.000000. running mean: -3.830482
episode 4583.000000, reward total was -7.000000. running mean: -3.862177
episode 4584.000000, reward total was -11.000000. run

episode 4684.000000, reward total was -3.000000. running mean: -4.080534
episode 4685.000000, reward total was -2.000000. running mean: -4.059729
episode 4686.000000, reward total was -4.000000. running mean: -4.059132
episode 4687.000000, reward total was -8.000000. running mean: -4.098540
episode 4688.000000, reward total was -6.000000. running mean: -4.117555
episode 4689.000000, reward total was -7.000000. running mean: -4.146379
episode 4690.000000, reward total was -13.000000. running mean: -4.234916
episode 4691.000000, reward total was 3.000000. running mean: -4.162566
episode 4692.000000, reward total was -14.000000. running mean: -4.260941
episode 4693.000000, reward total was 2.000000. running mean: -4.198331
episode 4694.000000, reward total was -6.000000. running mean: -4.216348
episode 4695.000000, reward total was -9.000000. running mean: -4.264185
episode 4696.000000, reward total was -3.000000. running mean: -4.251543
episode 4697.000000, reward total was -15.000000. r

episode 4797.000000, reward total was -1.000000. running mean: -4.461964
episode 4798.000000, reward total was -5.000000. running mean: -4.467344
episode 4799.000000, reward total was -7.000000. running mean: -4.492671
episode 4800.000000, reward total was -3.000000. running mean: -4.477744
episode 4801.000000, reward total was 5.000000. running mean: -4.382966
episode 4802.000000, reward total was -1.000000. running mean: -4.349137
episode 4803.000000, reward total was 4.000000. running mean: -4.265645
episode 4804.000000, reward total was -2.000000. running mean: -4.242989
episode 4805.000000, reward total was -1.000000. running mean: -4.210559
episode 4806.000000, reward total was -11.000000. running mean: -4.278453
episode 4807.000000, reward total was -8.000000. running mean: -4.315669
episode 4808.000000, reward total was -1.000000. running mean: -4.282512
episode 4809.000000, reward total was -7.000000. running mean: -4.309687
episode 4810.000000, reward total was -3.000000. run

episode 4910.000000, reward total was -5.000000. running mean: -4.298689
episode 4911.000000, reward total was -3.000000. running mean: -4.285702
episode 4912.000000, reward total was 9.000000. running mean: -4.152845
episode 4913.000000, reward total was -7.000000. running mean: -4.181317
episode 4914.000000, reward total was -7.000000. running mean: -4.209504
episode 4915.000000, reward total was 3.000000. running mean: -4.137409
episode 4916.000000, reward total was 2.000000. running mean: -4.076035
episode 4917.000000, reward total was -15.000000. running mean: -4.185274
episode 4918.000000, reward total was -4.000000. running mean: -4.183422
episode 4919.000000, reward total was -9.000000. running mean: -4.231587
episode 4920.000000, reward total was -3.000000. running mean: -4.219271
episode 4921.000000, reward total was -2.000000. running mean: -4.197079
episode 4922.000000, reward total was -8.000000. running mean: -4.235108
episode 4923.000000, reward total was -4.000000. runn

episode 5023.000000, reward total was -9.000000. running mean: -4.431442
episode 5024.000000, reward total was 7.000000. running mean: -4.317127
episode 5025.000000, reward total was 4.000000. running mean: -4.233956
episode 5026.000000, reward total was -2.000000. running mean: -4.211617
episode 5027.000000, reward total was -4.000000. running mean: -4.209500
episode 5028.000000, reward total was -15.000000. running mean: -4.317405
episode 5029.000000, reward total was -11.000000. running mean: -4.384231
episode 5030.000000, reward total was 6.000000. running mean: -4.280389
episode 5031.000000, reward total was -3.000000. running mean: -4.267585
episode 5032.000000, reward total was -4.000000. running mean: -4.264909
episode 5033.000000, reward total was -9.000000. running mean: -4.312260
episode 5034.000000, reward total was -7.000000. running mean: -4.339138
episode 5035.000000, reward total was -5.000000. running mean: -4.345746
episode 5036.000000, reward total was -10.000000. ru

episode 5136.000000, reward total was -1.000000. running mean: -4.319959
episode 5137.000000, reward total was -9.000000. running mean: -4.366759
episode 5138.000000, reward total was -3.000000. running mean: -4.353092
episode 5139.000000, reward total was -7.000000. running mean: -4.379561
episode 5140.000000, reward total was -2.000000. running mean: -4.355765
episode 5141.000000, reward total was -11.000000. running mean: -4.422207
episode 5142.000000, reward total was 1.000000. running mean: -4.367985
episode 5143.000000, reward total was 7.000000. running mean: -4.254305
episode 5144.000000, reward total was -8.000000. running mean: -4.291762
episode 5145.000000, reward total was -9.000000. running mean: -4.338845
episode 5146.000000, reward total was -6.000000. running mean: -4.355456
episode 5147.000000, reward total was 7.000000. running mean: -4.241902
episode 5148.000000, reward total was -6.000000. running mean: -4.259483
episode 5149.000000, reward total was -6.000000. runn

episode 5249.000000, reward total was -4.000000. running mean: -4.201580
episode 5250.000000, reward total was 4.000000. running mean: -4.119565
episode 5251.000000, reward total was -9.000000. running mean: -4.168369
episode 5252.000000, reward total was -9.000000. running mean: -4.216685
episode 5253.000000, reward total was -13.000000. running mean: -4.304518
episode 5254.000000, reward total was -15.000000. running mean: -4.411473
episode 5255.000000, reward total was -2.000000. running mean: -4.387358
episode 5256.000000, reward total was -8.000000. running mean: -4.423485
episode 5257.000000, reward total was 5.000000. running mean: -4.329250
episode 5258.000000, reward total was -8.000000. running mean: -4.365958
episode 5259.000000, reward total was -1.000000. running mean: -4.332298
episode 5260.000000, reward total was -5.000000. running mean: -4.338975
episode 5261.000000, reward total was -8.000000. running mean: -4.375585
episode 5262.000000, reward total was 8.000000. run

episode 5362.000000, reward total was -10.000000. running mean: -4.873963
episode 5363.000000, reward total was -12.000000. running mean: -4.945223
episode 5364.000000, reward total was -8.000000. running mean: -4.975771
episode 5365.000000, reward total was -11.000000. running mean: -5.036013
episode 5366.000000, reward total was -9.000000. running mean: -5.075653
episode 5367.000000, reward total was -13.000000. running mean: -5.154896
episode 5368.000000, reward total was -9.000000. running mean: -5.193347
episode 5369.000000, reward total was 9.000000. running mean: -5.051414
episode 5370.000000, reward total was -11.000000. running mean: -5.110900
episode 5371.000000, reward total was 3.000000. running mean: -5.029791
episode 5372.000000, reward total was -8.000000. running mean: -5.059493
episode 5373.000000, reward total was -12.000000. running mean: -5.128898
episode 5374.000000, reward total was -7.000000. running mean: -5.147609
episode 5375.000000, reward total was 3.000000.

episode 5475.000000, reward total was -3.000000. running mean: -5.026117
episode 5476.000000, reward total was -16.000000. running mean: -5.135856
episode 5477.000000, reward total was -7.000000. running mean: -5.154497
episode 5478.000000, reward total was 5.000000. running mean: -5.052952
episode 5479.000000, reward total was -5.000000. running mean: -5.052422
episode 5480.000000, reward total was -7.000000. running mean: -5.071898
episode 5481.000000, reward total was -1.000000. running mean: -5.031179
episode 5482.000000, reward total was -3.000000. running mean: -5.010867
episode 5483.000000, reward total was -7.000000. running mean: -5.030759
episode 5484.000000, reward total was -5.000000. running mean: -5.030451
episode 5485.000000, reward total was 7.000000. running mean: -4.910147
episode 5486.000000, reward total was -9.000000. running mean: -4.951045
episode 5487.000000, reward total was -11.000000. running mean: -5.011535
episode 5488.000000, reward total was 7.000000. run

episode 5588.000000, reward total was -11.000000. running mean: -4.261945
episode 5589.000000, reward total was -15.000000. running mean: -4.369326
episode 5590.000000, reward total was -5.000000. running mean: -4.375633
episode 5591.000000, reward total was -11.000000. running mean: -4.441876
episode 5592.000000, reward total was 5.000000. running mean: -4.347458
episode 5593.000000, reward total was -9.000000. running mean: -4.393983
episode 5594.000000, reward total was -5.000000. running mean: -4.400043
episode 5595.000000, reward total was -8.000000. running mean: -4.436043
episode 5596.000000, reward total was 4.000000. running mean: -4.351682
episode 5597.000000, reward total was -11.000000. running mean: -4.418166
episode 5598.000000, reward total was -7.000000. running mean: -4.443984
episode 5599.000000, reward total was -2.000000. running mean: -4.419544
episode 5600.000000, reward total was -9.000000. running mean: -4.465349
episode 5601.000000, reward total was -9.000000. 

episode 5701.000000, reward total was -9.000000. running mean: -3.834414
episode 5702.000000, reward total was -2.000000. running mean: -3.816070
episode 5703.000000, reward total was 1.000000. running mean: -3.767909
episode 5704.000000, reward total was 4.000000. running mean: -3.690230
episode 5705.000000, reward total was -2.000000. running mean: -3.673328
episode 5706.000000, reward total was 7.000000. running mean: -3.566594
episode 5707.000000, reward total was -2.000000. running mean: -3.550928
episode 5708.000000, reward total was -14.000000. running mean: -3.655419
episode 5709.000000, reward total was -2.000000. running mean: -3.638865
episode 5710.000000, reward total was -9.000000. running mean: -3.692476
episode 5711.000000, reward total was -13.000000. running mean: -3.785552
episode 5712.000000, reward total was 2.000000. running mean: -3.727696
episode 5713.000000, reward total was -6.000000. running mean: -3.750419
episode 5714.000000, reward total was 9.000000. runni

episode 5814.000000, reward total was -5.000000. running mean: -4.043714
episode 5815.000000, reward total was -7.000000. running mean: -4.073277
episode 5816.000000, reward total was 2.000000. running mean: -4.012544
episode 5817.000000, reward total was -11.000000. running mean: -4.082419
episode 5818.000000, reward total was -8.000000. running mean: -4.121594
episode 5819.000000, reward total was 4.000000. running mean: -4.040378
episode 5820.000000, reward total was -2.000000. running mean: -4.019975
episode 5821.000000, reward total was -1.000000. running mean: -3.989775
episode 5822.000000, reward total was -4.000000. running mean: -3.989877
episode 5823.000000, reward total was 6.000000. running mean: -3.889978
episode 5824.000000, reward total was -1.000000. running mean: -3.861079
episode 5825.000000, reward total was -9.000000. running mean: -3.912468
episode 5826.000000, reward total was 3.000000. running mean: -3.843343
episode 5827.000000, reward total was -7.000000. runni

episode 5927.000000, reward total was -16.000000. running mean: -3.273561
episode 5928.000000, reward total was -6.000000. running mean: -3.300825
episode 5929.000000, reward total was 7.000000. running mean: -3.197817
episode 5930.000000, reward total was -1.000000. running mean: -3.175839
episode 5931.000000, reward total was 2.000000. running mean: -3.124080
episode 5932.000000, reward total was 9.000000. running mean: -3.002840
episode 5933.000000, reward total was -4.000000. running mean: -3.012811
episode 5934.000000, reward total was -13.000000. running mean: -3.112683
episode 5935.000000, reward total was -8.000000. running mean: -3.161556
episode 5936.000000, reward total was -14.000000. running mean: -3.269941
episode 5937.000000, reward total was -15.000000. running mean: -3.387241
episode 5938.000000, reward total was 10.000000. running mean: -3.253369
episode 5939.000000, reward total was -7.000000. running mean: -3.290835
episode 5940.000000, reward total was -2.000000. r

In [5]:
# Run play_game on the Pong environment (`env`) with the policy weights dict (`model`, keys 'W1'/'W2').
# NOTE(review): play_game is defined outside this view — presumably it rolls out and renders one episode; confirm.
# NOTE(review): the recorded output below reports an accumulated reward of 0.0, which looks suspicious for a
# finished Pong episode — verify that the rollout loop actually steps the environment to completion.
play_game(env, model)

See here for more information: https://www.gymlibrary.ml/content/api/
  "The argument mode in render method is deprecated; "
  "No render fps was declared in the environment (env.metadata['render_fps'] is None or not defined), rendering may occur at inconsistent fps."


Episode finished without success, accumulated reward = 0.0
