In [1]:
import gym
import numpy as np
# Gym environment id; passed to gym.make() when the environment is created.
rm = "Pong-v0"

In [2]:
%matplotlib inline
from matplotlib import animation
import matplotlib.pyplot as plt
from IPython.display import display, HTML

def display_frames_as_gif(frames, interval=50):
    """Show a sequence of image frames inline as a JavaScript animation.

    Args:
        frames: Non-empty sequence of image arrays that ``imshow`` accepts.
            The first frame's ``shape[1]`` / ``shape[0]`` (divided by 72)
            give the figure width / height; the figure uses dpi=144.
        interval: Delay between frames in milliseconds. Defaults to 50, the
            value that used to be hard-coded.
    """
    # Hold an explicit handle on the figure rather than going through
    # plt.gcf() and the private ``anim._fig`` attribute.
    fig = plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi=144)
    ax = fig.gca()
    patch = ax.imshow(frames[0])
    ax.axis('off')

    def animate(i):
        # Swap the pixel data of the single image artist for frame i.
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(fig, animate, frames=len(frames), interval=interval)
    # Close the figure before rendering; presumably this keeps the inline
    # backend from also emitting a static copy of the first frame.
    plt.close(fig)
    display(HTML(anim.to_jshtml()))

from gym.wrappers import AtariPreprocessing
# NOTE(review): this assigns an attribute named `new_step_api` on the gym
# module itself; it is not passed to gym.make() below, and the rest of this
# file still unpacks env.step() as a 4-tuple -- confirm whether this line has
# any effect at all.
gym.new_step_api=True
# Build the environment from the id stored in `rm`.
env = gym.make(rm)

# --- network dimensions ---
H = 200      # width of the hidden layer
D = 80 * 80  # length of the flattened 80x80 input frame

# --- policy weights, scaled by 1/sqrt(fan-in) ("Xavier" initialization) ---
# W1 is drawn before W2, so the random stream is consumed in the same order.
model = {
    'W1': np.random.randn(H, D) / np.sqrt(D),
    'W2': np.random.randn(H) / np.sqrt(H),
}

# --- hyperparameters ---
batch_size = 10       # episodes accumulated between parameter updates
learning_rate = 1e-2
gamma = 0.99          # reward discount factor
decay_rate = 0.99     # RMSProp decay for the leaky sum of grad^2

# --- per-parameter accumulators, zero-initialised with the weights' shapes ---
grad_buffer = {name: np.zeros_like(weights) for name, weights in model.items()}    # gradients summed over a batch
rmsprop_cache = {name: np.zeros_like(weights) for name, weights in model.items()}  # RMSProp memory

def sigmoid(x):
  """Logistic "squashing" function mapping real inputs to the interval [0, 1].

  Evaluated as exp(-log(1 + exp(-x))) through np.logaddexp. This equals
  1 / (1 + exp(-x)) mathematically, but never computes exp() of a large
  positive number, so very negative inputs no longer trigger an overflow.

  Args:
    x: scalar or numpy array of real values.

  Returns:
    Value(s) in [0, 1] with the same shape as `x`.
  """
  return np.exp(-np.logaddexp(0.0, -x))

def prepro(I):
  """Turn a raw frame into a flat binary float vector without modifying it.

  Rows 35..194 are kept, then every second row and column of channel 0.
  Pixels equal to 144 or 109 (the two background values) or already 0 become
  0.0; every other pixel (paddles, ball) becomes 1.0.

  The previous version wrote the 0/1 values through a view of the input, so
  the caller's frame was overwritten; the mask is now built in a new array.

  Args:
    I: frame indexable as [row, column, channel]. A frame with 160 columns
      and at least 195 rows yields 80 * 80 = 6400 elements.

  Returns:
    1-D float array of 0.0 / 1.0 values.
  """
  I = np.asarray(I)[35:195:2, ::2, 0] # crop, downsample by factor of 2, keep channel 0
  # Foreground is anything that is neither background colour nor already zero.
  foreground = (I != 144) & (I != 109) & (I != 0)
  return foreground.astype(float).ravel()

def discount_rewards(r, discount=None):
  """Compute discounted returns, restarting the sum at every nonzero reward.

  Walking backwards through `r`, each entry becomes
  `running * discount + r[t]`, where `running` is reset to 0 whenever
  `r[t]` is nonzero (a game boundary -- Pong specific).

  The result is always floating point. Previously an integer reward array
  produced an integer output and silently truncated the discounted values.

  Args:
    r: array of per-step rewards, 1-D or a (T, 1) column.
    discount: discount factor; when omitted, the module-level `gamma` is used.

  Returns:
    Float array with the same shape as `r`.
  """
  if discount is None:
    discount = gamma
  r = np.asarray(r)
  # Keep an existing float dtype; promote anything else (ints, bools) to float64.
  out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
  discounted_r = np.zeros_like(r, dtype=out_dtype)
  running_add = 0
  for t in reversed(range(0, r.size)):
    if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
    running_add = running_add * discount + r[t]
    discounted_r[t] = running_add
  return discounted_r

def policy_forward(x):
  """Forward pass of the two-layer policy read from the module-level `model`.

  Args:
    x: input vector multiplied by `model['W1']`.

  Returns:
    (p, h): `p` is the sigmoid of the output score, i.e. the probability of
    taking action 2; `h` is the hidden state after the ReLU.
  """
  pre_activation = np.dot(model['W1'], x)
  h = np.where(pre_activation < 0, 0, pre_activation) # ReLU nonlinearity
  score = np.dot(model['W2'], h)
  return sigmoid(score), h

def policy_backward(epx, eph, epdlogp):
  """Backward pass over one episode, using `model['W2']` from module scope.

  Args:
    epx: stacked inputs, one row per step.
    eph: stacked intermediate hidden states, one row per step.
    epdlogp: stacked per-step output gradients.

  Returns:
    Dict with the gradients for 'W1' and 'W2'.
  """
  w2 = model['W2']
  grad_w2 = np.dot(eph.T, epdlogp).ravel()
  # Outer product of the flattened output gradients with the output weights.
  grad_hidden = np.ravel(epdlogp)[:, np.newaxis] * np.ravel(w2)[np.newaxis, :]
  grad_hidden[eph <= 0] = 0 # backprop through the ReLU: no gradient where the unit was off
  grad_w1 = np.dot(grad_hidden.T, epx)
  return {'W1': grad_w1, 'W2': grad_w2}

def model_step(model, observation, prev_x):
  """Pick the greedy action for one observation.

  The network input is the difference between the current preprocessed frame
  and `prev_x`, or an all-zero vector of length `D` when `prev_x` is None.

  Note: `model` is accepted but not read here; `policy_forward(x)` is called
  without it.

  Returns:
    (action, cur_x): action 2 when the predicted probability is >= 0.5,
    otherwise 3, plus the preprocessed frame to pass back as the next `prev_x`.
  """
  cur_x = prepro(observation)
  if prev_x is None:
    x = np.zeros(D)
  else:
    x = cur_x - prev_x

  # forward the policy network and take the more likely action (no sampling)
  aprob, _ = policy_forward(x)
  if aprob >= 0.5:
    action = 2
  else:
    action = 3

  return action, cur_x

def play_game(env, model):
  """Play one episode (at most 1000 steps) greedily and show it as an animation.

  Each step records `env.render(mode='rgb_array')`, picks an action with
  `model_step`, and accumulates the reward. The environment is closed
  afterwards, even if a step raises, and the frames are displayed.

  Args:
    env: environment exposing `reset()`, `render(mode=...)`, a 4-tuple
      `step(action)` and `close()`.
    model: forwarded to `model_step`.
  """
  observation = env.reset()

  frames = []
  cumulated_reward = 0

  prev_x = None # used in computing the difference frame

  try:
    for t in range(1000):
      frames.append(env.render(mode = 'rgb_array'))
      action, prev_x = model_step(model, observation, prev_x)
      observation, reward, done, info = env.step(action)
      cumulated_reward += reward
      if done:
        print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
        break
    else:
      # Reached only when the step budget ran out without `done`. This used to
      # print unconditionally, even right after the success message above.
      print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))
  finally:
    env.close()
  display_frames_as_gif(frames)

def train_model(env, model, total_episodes = 100):
  """Train the policy network with batched policy-gradient updates (RMSProp).

  Plays episodes in `env`, sampling action 2 with probability `aprob` and
  action 3 otherwise. When an episode ends, the per-step gradients are
  weighted by the standardized discounted rewards and added to the
  module-level `grad_buffer`. Every `batch_size` episodes the accumulated
  gradient is applied to `model` with an RMSProp step, whose state lives in
  the module-level `rmsprop_cache`.

  Note: `policy_forward` / `policy_backward` read the module-level `model`,
  while the parameter update is applied to the `model` argument, so callers
  are expected to pass that same dict.

  Args:
    env: environment exposing `reset()` and a 4-tuple `step(action)`.
    model: dict of weight arrays with the same keys as `grad_buffer`.
    total_episodes: number of episodes to play; values <= 0 return an empty
      history without stepping the environment.

  Returns:
    List of `(episode_number, reward_sum, running_reward)` tuples, one per
    finished episode.
  """
  hist = []
  observation = env.reset()

  prev_x = None # used in computing the difference frame
  xs,hs,dlogps,drs = [],[],[],[]
  running_reward = None
  reward_sum = 0
  episode_number = 0

  # Loop on the episode counter: the old `== total_episodes` exit test never
  # matched for total_episodes <= 0 and looped forever.
  while episode_number < total_episodes:

    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done: # an episode finished
      episode_number += 1

      # stack together all inputs, hidden states, action gradients, and rewards for this episode
      epx = np.vstack(xs)
      eph = np.vstack(hs)
      epdlogp = np.vstack(dlogps)
      # force float so the in-place standardization below is valid for integer rewards too
      epr = np.vstack(drs).astype(np.float64)
      xs,hs,dlogps,drs = [],[],[],[] # reset array memory

      # compute the discounted reward backwards through time
      discounted_epr = discount_rewards(epr)
      # standardize the rewards to be unit normal (helps control the gradient estimator variance)
      discounted_epr -= np.mean(discounted_epr)
      reward_std = np.std(discounted_epr)
      if reward_std > 0:
        # A zero spread would turn every advantage into 0/0 = NaN and poison
        # the weights; the centred values are already all zero in that case.
        discounted_epr /= reward_std

      epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
      grad = policy_backward(epx, eph, epdlogp)
      for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

      # perform rmsprop parameter update every batch_size episodes
      if episode_number % batch_size == 0:
        for k,v in model.items():
          g = grad_buffer[k] # gradient
          rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
          model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
          grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

      # boring book-keeping
      running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
      hist.append((episode_number, reward_sum, running_reward))
      print ('episode %f, reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
      reward_sum = 0
      observation = env.reset() # reset env
      prev_x = None

  return hist

  logger.warn(
  deprecation(
  deprecation(


In [3]:
# Train for 6000 episodes and time the whole run. hist1 receives the list of
# (episode_number, reward_sum, running_reward) tuples returned by train_model.
%time hist1 = train_model(env, model, total_episodes=6000)

  logger.deprecation(


episode 1.000000, reward total was -21.000000. running mean: -21.000000
episode 2.000000, reward total was -21.000000. running mean: -21.000000
episode 3.000000, reward total was -19.000000. running mean: -20.980000
episode 4.000000, reward total was -19.000000. running mean: -20.960200
episode 5.000000, reward total was -21.000000. running mean: -20.960598
episode 6.000000, reward total was -21.000000. running mean: -20.960992
episode 7.000000, reward total was -20.000000. running mean: -20.951382
episode 8.000000, reward total was -19.000000. running mean: -20.931868
episode 9.000000, reward total was -20.000000. running mean: -20.922550
episode 10.000000, reward total was -21.000000. running mean: -20.923324
episode 11.000000, reward total was -21.000000. running mean: -20.924091
episode 12.000000, reward total was -21.000000. running mean: -20.924850
episode 13.000000, reward total was -21.000000. running mean: -20.925601
episode 14.000000, reward total was -21.000000. running mean

episode 114.000000, reward total was -21.000000. running mean: -20.819865
episode 115.000000, reward total was -21.000000. running mean: -20.821666
episode 116.000000, reward total was -21.000000. running mean: -20.823450
episode 117.000000, reward total was -21.000000. running mean: -20.825215
episode 118.000000, reward total was -21.000000. running mean: -20.826963
episode 119.000000, reward total was -21.000000. running mean: -20.828693
episode 120.000000, reward total was -21.000000. running mean: -20.830406
episode 121.000000, reward total was -20.000000. running mean: -20.822102
episode 122.000000, reward total was -21.000000. running mean: -20.823881
episode 123.000000, reward total was -21.000000. running mean: -20.825642
episode 124.000000, reward total was -20.000000. running mean: -20.817386
episode 125.000000, reward total was -21.000000. running mean: -20.819212
episode 126.000000, reward total was -20.000000. running mean: -20.811020
episode 127.000000, reward total was -

episode 225.000000, reward total was -20.000000. running mean: -20.552760
episode 226.000000, reward total was -20.000000. running mean: -20.547232
episode 227.000000, reward total was -20.000000. running mean: -20.541760
episode 228.000000, reward total was -21.000000. running mean: -20.546342
episode 229.000000, reward total was -21.000000. running mean: -20.550879
episode 230.000000, reward total was -21.000000. running mean: -20.555370
episode 231.000000, reward total was -20.000000. running mean: -20.549817
episode 232.000000, reward total was -19.000000. running mean: -20.534318
episode 233.000000, reward total was -20.000000. running mean: -20.528975
episode 234.000000, reward total was -21.000000. running mean: -20.533685
episode 235.000000, reward total was -18.000000. running mean: -20.508349
episode 236.000000, reward total was -20.000000. running mean: -20.503265
episode 237.000000, reward total was -21.000000. running mean: -20.508232
episode 238.000000, reward total was -

episode 336.000000, reward total was -18.000000. running mean: -20.092621
episode 337.000000, reward total was -21.000000. running mean: -20.101695
episode 338.000000, reward total was -20.000000. running mean: -20.100678
episode 339.000000, reward total was -21.000000. running mean: -20.109671
episode 340.000000, reward total was -21.000000. running mean: -20.118574
episode 341.000000, reward total was -21.000000. running mean: -20.127389
episode 342.000000, reward total was -19.000000. running mean: -20.116115
episode 343.000000, reward total was -19.000000. running mean: -20.104954
episode 344.000000, reward total was -18.000000. running mean: -20.083904
episode 345.000000, reward total was -18.000000. running mean: -20.063065
episode 346.000000, reward total was -21.000000. running mean: -20.072434
episode 347.000000, reward total was -20.000000. running mean: -20.071710
episode 348.000000, reward total was -19.000000. running mean: -20.060993
episode 349.000000, reward total was -

episode 447.000000, reward total was -17.000000. running mean: -19.459202
episode 448.000000, reward total was -18.000000. running mean: -19.444610
episode 449.000000, reward total was -21.000000. running mean: -19.460164
episode 450.000000, reward total was -19.000000. running mean: -19.455562
episode 451.000000, reward total was -19.000000. running mean: -19.451007
episode 452.000000, reward total was -19.000000. running mean: -19.446497
episode 453.000000, reward total was -17.000000. running mean: -19.422032
episode 454.000000, reward total was -17.000000. running mean: -19.397811
episode 455.000000, reward total was -16.000000. running mean: -19.363833
episode 456.000000, reward total was -21.000000. running mean: -19.380195
episode 457.000000, reward total was -19.000000. running mean: -19.376393
episode 458.000000, reward total was -17.000000. running mean: -19.352629
episode 459.000000, reward total was -21.000000. running mean: -19.369103
episode 460.000000, reward total was -

episode 558.000000, reward total was -21.000000. running mean: -18.949084
episode 559.000000, reward total was -21.000000. running mean: -18.969593
episode 560.000000, reward total was -19.000000. running mean: -18.969897
episode 561.000000, reward total was -19.000000. running mean: -18.970198
episode 562.000000, reward total was -21.000000. running mean: -18.990496
episode 563.000000, reward total was -19.000000. running mean: -18.990591
episode 564.000000, reward total was -20.000000. running mean: -19.000685
episode 565.000000, reward total was -20.000000. running mean: -19.010678
episode 566.000000, reward total was -20.000000. running mean: -19.020571
episode 567.000000, reward total was -19.000000. running mean: -19.020366
episode 568.000000, reward total was -19.000000. running mean: -19.020162
episode 569.000000, reward total was -19.000000. running mean: -19.019960
episode 570.000000, reward total was -21.000000. running mean: -19.039761
episode 571.000000, reward total was -

episode 669.000000, reward total was -17.000000. running mean: -19.139708
episode 670.000000, reward total was -18.000000. running mean: -19.128311
episode 671.000000, reward total was -17.000000. running mean: -19.107028
episode 672.000000, reward total was -19.000000. running mean: -19.105958
episode 673.000000, reward total was -19.000000. running mean: -19.104898
episode 674.000000, reward total was -20.000000. running mean: -19.113849
episode 675.000000, reward total was -21.000000. running mean: -19.132711
episode 676.000000, reward total was -15.000000. running mean: -19.091384
episode 677.000000, reward total was -21.000000. running mean: -19.110470
episode 678.000000, reward total was -17.000000. running mean: -19.089365
episode 679.000000, reward total was -18.000000. running mean: -19.078471
episode 680.000000, reward total was -19.000000. running mean: -19.077687
episode 681.000000, reward total was -19.000000. running mean: -19.076910
episode 682.000000, reward total was -

episode 780.000000, reward total was -21.000000. running mean: -19.182210
episode 781.000000, reward total was -20.000000. running mean: -19.190388
episode 782.000000, reward total was -20.000000. running mean: -19.198484
episode 783.000000, reward total was -19.000000. running mean: -19.196499
episode 784.000000, reward total was -20.000000. running mean: -19.204534
episode 785.000000, reward total was -19.000000. running mean: -19.202488
episode 786.000000, reward total was -17.000000. running mean: -19.180464
episode 787.000000, reward total was -21.000000. running mean: -19.198659
episode 788.000000, reward total was -20.000000. running mean: -19.206672
episode 789.000000, reward total was -20.000000. running mean: -19.214606
episode 790.000000, reward total was -21.000000. running mean: -19.232460
episode 791.000000, reward total was -19.000000. running mean: -19.230135
episode 792.000000, reward total was -20.000000. running mean: -19.237834
episode 793.000000, reward total was -

episode 891.000000, reward total was -21.000000. running mean: -19.099495
episode 892.000000, reward total was -21.000000. running mean: -19.118501
episode 893.000000, reward total was -20.000000. running mean: -19.127316
episode 894.000000, reward total was -17.000000. running mean: -19.106042
episode 895.000000, reward total was -17.000000. running mean: -19.084982
episode 896.000000, reward total was -19.000000. running mean: -19.084132
episode 897.000000, reward total was -18.000000. running mean: -19.073291
episode 898.000000, reward total was -17.000000. running mean: -19.052558
episode 899.000000, reward total was -16.000000. running mean: -19.022032
episode 900.000000, reward total was -14.000000. running mean: -18.971812
episode 901.000000, reward total was -18.000000. running mean: -18.962094
episode 902.000000, reward total was -20.000000. running mean: -18.972473
episode 903.000000, reward total was -13.000000. running mean: -18.912748
episode 904.000000, reward total was -

episode 1002.000000, reward total was -16.000000. running mean: -18.208797
episode 1003.000000, reward total was -20.000000. running mean: -18.226709
episode 1004.000000, reward total was -16.000000. running mean: -18.204442
episode 1005.000000, reward total was -21.000000. running mean: -18.232398
episode 1006.000000, reward total was -20.000000. running mean: -18.250074
episode 1007.000000, reward total was -17.000000. running mean: -18.237573
episode 1008.000000, reward total was -17.000000. running mean: -18.225197
episode 1009.000000, reward total was -14.000000. running mean: -18.182945
episode 1010.000000, reward total was -17.000000. running mean: -18.171116
episode 1011.000000, reward total was -15.000000. running mean: -18.139405
episode 1012.000000, reward total was -13.000000. running mean: -18.088011
episode 1013.000000, reward total was -20.000000. running mean: -18.107131
episode 1014.000000, reward total was -19.000000. running mean: -18.116059
episode 1015.000000, rewa

episode 1112.000000, reward total was -16.000000. running mean: -18.069563
episode 1113.000000, reward total was -17.000000. running mean: -18.058868
episode 1114.000000, reward total was -19.000000. running mean: -18.068279
episode 1115.000000, reward total was -17.000000. running mean: -18.057596
episode 1116.000000, reward total was -17.000000. running mean: -18.047020
episode 1117.000000, reward total was -20.000000. running mean: -18.066550
episode 1118.000000, reward total was -19.000000. running mean: -18.075885
episode 1119.000000, reward total was -20.000000. running mean: -18.095126
episode 1120.000000, reward total was -18.000000. running mean: -18.094174
episode 1121.000000, reward total was -19.000000. running mean: -18.103233
episode 1122.000000, reward total was -16.000000. running mean: -18.082200
episode 1123.000000, reward total was -16.000000. running mean: -18.061378
episode 1124.000000, reward total was -20.000000. running mean: -18.080765
episode 1125.000000, rewa

episode 1222.000000, reward total was -18.000000. running mean: -17.838142
episode 1223.000000, reward total was -13.000000. running mean: -17.789760
episode 1224.000000, reward total was -16.000000. running mean: -17.771863
episode 1225.000000, reward total was -20.000000. running mean: -17.794144
episode 1226.000000, reward total was -20.000000. running mean: -17.816203
episode 1227.000000, reward total was -16.000000. running mean: -17.798041
episode 1228.000000, reward total was -15.000000. running mean: -17.770060
episode 1229.000000, reward total was -19.000000. running mean: -17.782360
episode 1230.000000, reward total was -19.000000. running mean: -17.794536
episode 1231.000000, reward total was -18.000000. running mean: -17.796591
episode 1232.000000, reward total was -17.000000. running mean: -17.788625
episode 1233.000000, reward total was -19.000000. running mean: -17.800739
episode 1234.000000, reward total was -19.000000. running mean: -17.812731
episode 1235.000000, rewa

episode 1332.000000, reward total was -19.000000. running mean: -17.883845
episode 1333.000000, reward total was -17.000000. running mean: -17.875007
episode 1334.000000, reward total was -17.000000. running mean: -17.866257
episode 1335.000000, reward total was -19.000000. running mean: -17.877594
episode 1336.000000, reward total was -18.000000. running mean: -17.878818
episode 1337.000000, reward total was -19.000000. running mean: -17.890030
episode 1338.000000, reward total was -20.000000. running mean: -17.911130
episode 1339.000000, reward total was -20.000000. running mean: -17.932019
episode 1340.000000, reward total was -18.000000. running mean: -17.932698
episode 1341.000000, reward total was -21.000000. running mean: -17.963371
episode 1342.000000, reward total was -20.000000. running mean: -17.983738
episode 1343.000000, reward total was -21.000000. running mean: -18.013900
episode 1344.000000, reward total was -18.000000. running mean: -18.013761
episode 1345.000000, rewa

episode 1442.000000, reward total was -13.000000. running mean: -17.874977
episode 1443.000000, reward total was -13.000000. running mean: -17.826227
episode 1444.000000, reward total was -19.000000. running mean: -17.837965
episode 1445.000000, reward total was -16.000000. running mean: -17.819585
episode 1446.000000, reward total was -13.000000. running mean: -17.771389
episode 1447.000000, reward total was -18.000000. running mean: -17.773675
episode 1448.000000, reward total was -20.000000. running mean: -17.795938
episode 1449.000000, reward total was -20.000000. running mean: -17.817979
episode 1450.000000, reward total was -15.000000. running mean: -17.789799
episode 1451.000000, reward total was -12.000000. running mean: -17.731901
episode 1452.000000, reward total was -21.000000. running mean: -17.764582
episode 1453.000000, reward total was -19.000000. running mean: -17.776936
episode 1454.000000, reward total was -19.000000. running mean: -17.789167
episode 1455.000000, rewa

episode 1552.000000, reward total was -19.000000. running mean: -18.762668
episode 1553.000000, reward total was -19.000000. running mean: -18.765042
episode 1554.000000, reward total was -21.000000. running mean: -18.787391
episode 1555.000000, reward total was -16.000000. running mean: -18.759517
episode 1556.000000, reward total was -21.000000. running mean: -18.781922
episode 1557.000000, reward total was -21.000000. running mean: -18.804103
episode 1558.000000, reward total was -21.000000. running mean: -18.826062
episode 1559.000000, reward total was -21.000000. running mean: -18.847801
episode 1560.000000, reward total was -21.000000. running mean: -18.869323
episode 1561.000000, reward total was -17.000000. running mean: -18.850630
episode 1562.000000, reward total was -18.000000. running mean: -18.842124
episode 1563.000000, reward total was -19.000000. running mean: -18.843702
episode 1564.000000, reward total was -20.000000. running mean: -18.855265
episode 1565.000000, rewa

episode 1662.000000, reward total was -20.000000. running mean: -18.546487
episode 1663.000000, reward total was -18.000000. running mean: -18.541022
episode 1664.000000, reward total was -15.000000. running mean: -18.505612
episode 1665.000000, reward total was -16.000000. running mean: -18.480556
episode 1666.000000, reward total was -16.000000. running mean: -18.455751
episode 1667.000000, reward total was -21.000000. running mean: -18.481193
episode 1668.000000, reward total was -19.000000. running mean: -18.486381
episode 1669.000000, reward total was -19.000000. running mean: -18.491517
episode 1670.000000, reward total was -15.000000. running mean: -18.456602
episode 1671.000000, reward total was -18.000000. running mean: -18.452036
episode 1672.000000, reward total was -19.000000. running mean: -18.457516
episode 1673.000000, reward total was -20.000000. running mean: -18.472941
episode 1674.000000, reward total was -21.000000. running mean: -18.498211
episode 1675.000000, rewa

episode 1772.000000, reward total was -20.000000. running mean: -18.483679
episode 1773.000000, reward total was -15.000000. running mean: -18.448842
episode 1774.000000, reward total was -16.000000. running mean: -18.424354
episode 1775.000000, reward total was -19.000000. running mean: -18.430110
episode 1776.000000, reward total was -16.000000. running mean: -18.405809
episode 1777.000000, reward total was -19.000000. running mean: -18.411751
episode 1778.000000, reward total was -19.000000. running mean: -18.417633
episode 1779.000000, reward total was -16.000000. running mean: -18.393457
episode 1780.000000, reward total was -18.000000. running mean: -18.389523
episode 1781.000000, reward total was -21.000000. running mean: -18.415627
episode 1782.000000, reward total was -17.000000. running mean: -18.401471
episode 1783.000000, reward total was -19.000000. running mean: -18.407456
episode 1784.000000, reward total was -19.000000. running mean: -18.413382
episode 1785.000000, rewa

episode 1882.000000, reward total was -18.000000. running mean: -18.187474
episode 1883.000000, reward total was -18.000000. running mean: -18.185599
episode 1884.000000, reward total was -21.000000. running mean: -18.213743
episode 1885.000000, reward total was -16.000000. running mean: -18.191606
episode 1886.000000, reward total was -16.000000. running mean: -18.169690
episode 1887.000000, reward total was -17.000000. running mean: -18.157993
episode 1888.000000, reward total was -18.000000. running mean: -18.156413
episode 1889.000000, reward total was -11.000000. running mean: -18.084849
episode 1890.000000, reward total was -19.000000. running mean: -18.094000
episode 1891.000000, reward total was -16.000000. running mean: -18.073060
episode 1892.000000, reward total was -16.000000. running mean: -18.052330
episode 1893.000000, reward total was -15.000000. running mean: -18.021807
episode 1894.000000, reward total was -11.000000. running mean: -17.951588
episode 1895.000000, rewa

episode 1992.000000, reward total was -19.000000. running mean: -17.602592
episode 1993.000000, reward total was -17.000000. running mean: -17.596566
episode 1994.000000, reward total was -18.000000. running mean: -17.600601
episode 1995.000000, reward total was -16.000000. running mean: -17.584595
episode 1996.000000, reward total was -21.000000. running mean: -17.618749
episode 1997.000000, reward total was -17.000000. running mean: -17.612561
episode 1998.000000, reward total was -15.000000. running mean: -17.586436
episode 1999.000000, reward total was -18.000000. running mean: -17.590571
episode 2000.000000, reward total was -14.000000. running mean: -17.554666
episode 2001.000000, reward total was -14.000000. running mean: -17.519119
episode 2002.000000, reward total was -17.000000. running mean: -17.513928
episode 2003.000000, reward total was -16.000000. running mean: -17.498788
episode 2004.000000, reward total was -16.000000. running mean: -17.483801
episode 2005.000000, rewa

episode 2102.000000, reward total was -19.000000. running mean: -17.522986
episode 2103.000000, reward total was -19.000000. running mean: -17.537756
episode 2104.000000, reward total was -18.000000. running mean: -17.542379
episode 2105.000000, reward total was -16.000000. running mean: -17.526955
episode 2106.000000, reward total was -16.000000. running mean: -17.511685
episode 2107.000000, reward total was -15.000000. running mean: -17.486569
episode 2108.000000, reward total was -21.000000. running mean: -17.521703
episode 2109.000000, reward total was -17.000000. running mean: -17.516486
episode 2110.000000, reward total was -16.000000. running mean: -17.501321
episode 2111.000000, reward total was -18.000000. running mean: -17.506308
episode 2112.000000, reward total was -17.000000. running mean: -17.501245
episode 2113.000000, reward total was -19.000000. running mean: -17.516232
episode 2114.000000, reward total was -17.000000. running mean: -17.511070
episode 2115.000000, rewa

episode 2212.000000, reward total was -18.000000. running mean: -17.200650
episode 2213.000000, reward total was -17.000000. running mean: -17.198644
episode 2214.000000, reward total was -17.000000. running mean: -17.196657
episode 2215.000000, reward total was -15.000000. running mean: -17.174691
episode 2216.000000, reward total was -14.000000. running mean: -17.142944
episode 2217.000000, reward total was -14.000000. running mean: -17.111514
episode 2218.000000, reward total was -15.000000. running mean: -17.090399
episode 2219.000000, reward total was -16.000000. running mean: -17.079495
episode 2220.000000, reward total was -17.000000. running mean: -17.078700
episode 2221.000000, reward total was -17.000000. running mean: -17.077913
episode 2222.000000, reward total was -17.000000. running mean: -17.077134
episode 2223.000000, reward total was -21.000000. running mean: -17.116363
episode 2224.000000, reward total was -19.000000. running mean: -17.135199
episode 2225.000000, rewa

episode 2322.000000, reward total was -13.000000. running mean: -16.620859
episode 2323.000000, reward total was -19.000000. running mean: -16.644651
episode 2324.000000, reward total was -17.000000. running mean: -16.648204
episode 2325.000000, reward total was -18.000000. running mean: -16.661722
episode 2326.000000, reward total was -18.000000. running mean: -16.675105
episode 2327.000000, reward total was -14.000000. running mean: -16.648354
episode 2328.000000, reward total was -14.000000. running mean: -16.621870
episode 2329.000000, reward total was -13.000000. running mean: -16.585652
episode 2330.000000, reward total was -15.000000. running mean: -16.569795
episode 2331.000000, reward total was -17.000000. running mean: -16.574097
episode 2332.000000, reward total was -10.000000. running mean: -16.508356
episode 2333.000000, reward total was -14.000000. running mean: -16.483273
episode 2334.000000, reward total was -20.000000. running mean: -16.518440
episode 2335.000000, rewa

episode 2432.000000, reward total was -19.000000. running mean: -16.451291
episode 2433.000000, reward total was -15.000000. running mean: -16.436778
episode 2434.000000, reward total was -16.000000. running mean: -16.432410
episode 2435.000000, reward total was -18.000000. running mean: -16.448086
episode 2436.000000, reward total was -15.000000. running mean: -16.433605
episode 2437.000000, reward total was -19.000000. running mean: -16.459269
episode 2438.000000, reward total was -19.000000. running mean: -16.484676
episode 2439.000000, reward total was -10.000000. running mean: -16.419830
episode 2440.000000, reward total was -15.000000. running mean: -16.405631
episode 2441.000000, reward total was -10.000000. running mean: -16.341575
episode 2442.000000, reward total was -16.000000. running mean: -16.338159
episode 2443.000000, reward total was -17.000000. running mean: -16.344778
episode 2444.000000, reward total was -19.000000. running mean: -16.371330
episode 2445.000000, rewa

episode 2542.000000, reward total was -16.000000. running mean: -16.550422
episode 2543.000000, reward total was -19.000000. running mean: -16.574917
episode 2544.000000, reward total was -21.000000. running mean: -16.619168
episode 2545.000000, reward total was -18.000000. running mean: -16.632977
episode 2546.000000, reward total was -19.000000. running mean: -16.656647
episode 2547.000000, reward total was -18.000000. running mean: -16.670080
episode 2548.000000, reward total was -17.000000. running mean: -16.673380
episode 2549.000000, reward total was -17.000000. running mean: -16.676646
episode 2550.000000, reward total was -19.000000. running mean: -16.699879
episode 2551.000000, reward total was -18.000000. running mean: -16.712880
episode 2552.000000, reward total was -18.000000. running mean: -16.725752
episode 2553.000000, reward total was -14.000000. running mean: -16.698494
episode 2554.000000, reward total was -21.000000. running mean: -16.741509
episode 2555.000000, rewa

episode 2652.000000, reward total was -18.000000. running mean: -16.458979
episode 2653.000000, reward total was -14.000000. running mean: -16.434389
episode 2654.000000, reward total was -17.000000. running mean: -16.440045
episode 2655.000000, reward total was -19.000000. running mean: -16.465645
episode 2656.000000, reward total was -16.000000. running mean: -16.460988
episode 2657.000000, reward total was -15.000000. running mean: -16.446378
episode 2658.000000, reward total was -14.000000. running mean: -16.421915
episode 2659.000000, reward total was -17.000000. running mean: -16.427696
episode 2660.000000, reward total was -16.000000. running mean: -16.423419
episode 2661.000000, reward total was -20.000000. running mean: -16.459184
episode 2662.000000, reward total was -16.000000. running mean: -16.454593
episode 2663.000000, reward total was -18.000000. running mean: -16.470047
episode 2664.000000, reward total was -19.000000. running mean: -16.495346
episode 2665.000000, rewa

episode 2762.000000, reward total was -15.000000. running mean: -16.481124
episode 2763.000000, reward total was -17.000000. running mean: -16.486313
episode 2764.000000, reward total was -11.000000. running mean: -16.431450
episode 2765.000000, reward total was -20.000000. running mean: -16.467135
episode 2766.000000, reward total was -19.000000. running mean: -16.492464
episode 2767.000000, reward total was -18.000000. running mean: -16.507539
episode 2768.000000, reward total was -19.000000. running mean: -16.532464
episode 2769.000000, reward total was -16.000000. running mean: -16.527139
episode 2770.000000, reward total was -13.000000. running mean: -16.491868
episode 2771.000000, reward total was -19.000000. running mean: -16.516949
episode 2772.000000, reward total was -17.000000. running mean: -16.521779
episode 2773.000000, reward total was -18.000000. running mean: -16.536562
episode 2774.000000, reward total was -15.000000. running mean: -16.521196
episode 2775.000000, rewa

episode 2872.000000, reward total was -15.000000. running mean: -16.309608
episode 2873.000000, reward total was -17.000000. running mean: -16.316512
episode 2874.000000, reward total was -18.000000. running mean: -16.333347
episode 2875.000000, reward total was -13.000000. running mean: -16.300013
episode 2876.000000, reward total was -15.000000. running mean: -16.287013
episode 2877.000000, reward total was -21.000000. running mean: -16.334143
episode 2878.000000, reward total was -9.000000. running mean: -16.260802
episode 2879.000000, reward total was -17.000000. running mean: -16.268194
episode 2880.000000, reward total was -14.000000. running mean: -16.245512
episode 2881.000000, reward total was -11.000000. running mean: -16.193057
episode 2882.000000, reward total was -15.000000. running mean: -16.181126
episode 2883.000000, reward total was -12.000000. running mean: -16.139315
episode 2884.000000, reward total was -16.000000. running mean: -16.137922
episode 2885.000000, rewar

episode 2982.000000, reward total was -17.000000. running mean: -15.819667
episode 2983.000000, reward total was -17.000000. running mean: -15.831470
episode 2984.000000, reward total was -19.000000. running mean: -15.863155
episode 2985.000000, reward total was -17.000000. running mean: -15.874524
episode 2986.000000, reward total was -18.000000. running mean: -15.895779
episode 2987.000000, reward total was -19.000000. running mean: -15.926821
episode 2988.000000, reward total was -10.000000. running mean: -15.867553
episode 2989.000000, reward total was -17.000000. running mean: -15.878877
episode 2990.000000, reward total was -13.000000. running mean: -15.850088
episode 2991.000000, reward total was -18.000000. running mean: -15.871587
episode 2992.000000, reward total was -13.000000. running mean: -15.842872
episode 2993.000000, reward total was -16.000000. running mean: -15.844443
episode 2994.000000, reward total was -14.000000. running mean: -15.825998
episode 2995.000000, rewa

episode 3092.000000, reward total was -18.000000. running mean: -15.437677
episode 3093.000000, reward total was -13.000000. running mean: -15.413300
episode 3094.000000, reward total was -16.000000. running mean: -15.419167
episode 3095.000000, reward total was -14.000000. running mean: -15.404975
episode 3096.000000, reward total was -19.000000. running mean: -15.440926
episode 3097.000000, reward total was -16.000000. running mean: -15.446516
episode 3098.000000, reward total was -15.000000. running mean: -15.442051
episode 3099.000000, reward total was -11.000000. running mean: -15.397631
episode 3100.000000, reward total was -17.000000. running mean: -15.413654
episode 3101.000000, reward total was -14.000000. running mean: -15.399518
episode 3102.000000, reward total was -13.000000. running mean: -15.375523
episode 3103.000000, reward total was -18.000000. running mean: -15.401767
episode 3104.000000, reward total was -17.000000. running mean: -15.417750
episode 3105.000000, rewa

episode 3202.000000, reward total was -11.000000. running mean: -15.396558
episode 3203.000000, reward total was -14.000000. running mean: -15.382592
episode 3204.000000, reward total was -14.000000. running mean: -15.368766
episode 3205.000000, reward total was -11.000000. running mean: -15.325079
episode 3206.000000, reward total was -17.000000. running mean: -15.341828
episode 3207.000000, reward total was -17.000000. running mean: -15.358410
episode 3208.000000, reward total was -19.000000. running mean: -15.394826
episode 3209.000000, reward total was -15.000000. running mean: -15.390877
episode 3210.000000, reward total was -17.000000. running mean: -15.406969
episode 3211.000000, reward total was -12.000000. running mean: -15.372899
episode 3212.000000, reward total was -14.000000. running mean: -15.359170
episode 3213.000000, reward total was -13.000000. running mean: -15.335578
episode 3214.000000, reward total was -16.000000. running mean: -15.342222
episode 3215.000000, rewa

episode 3312.000000, reward total was -15.000000. running mean: -15.036221
episode 3313.000000, reward total was -14.000000. running mean: -15.025859
episode 3314.000000, reward total was -14.000000. running mean: -15.015600
episode 3315.000000, reward total was -17.000000. running mean: -15.035444
episode 3316.000000, reward total was -16.000000. running mean: -15.045090
episode 3317.000000, reward total was -14.000000. running mean: -15.034639
episode 3318.000000, reward total was -17.000000. running mean: -15.054292
episode 3319.000000, reward total was -17.000000. running mean: -15.073749
episode 3320.000000, reward total was -15.000000. running mean: -15.073012
episode 3321.000000, reward total was -13.000000. running mean: -15.052282
episode 3322.000000, reward total was -11.000000. running mean: -15.011759
episode 3323.000000, reward total was -10.000000. running mean: -14.961641
episode 3324.000000, reward total was -15.000000. running mean: -14.962025
episode 3325.000000, rewa

episode 3422.000000, reward total was -15.000000. running mean: -14.125975
episode 3423.000000, reward total was -13.000000. running mean: -14.114715
episode 3424.000000, reward total was -11.000000. running mean: -14.083568
episode 3425.000000, reward total was -13.000000. running mean: -14.072732
episode 3426.000000, reward total was -14.000000. running mean: -14.072005
episode 3427.000000, reward total was -11.000000. running mean: -14.041285
episode 3428.000000, reward total was -12.000000. running mean: -14.020872
episode 3429.000000, reward total was -15.000000. running mean: -14.030663
episode 3430.000000, reward total was -13.000000. running mean: -14.020357
episode 3431.000000, reward total was -12.000000. running mean: -14.000153
episode 3432.000000, reward total was -19.000000. running mean: -14.050152
episode 3433.000000, reward total was -13.000000. running mean: -14.039650
episode 3434.000000, reward total was -16.000000. running mean: -14.059254
episode 3435.000000, rewa

episode 3532.000000, reward total was -11.000000. running mean: -13.865391
episode 3533.000000, reward total was -13.000000. running mean: -13.856737
episode 3534.000000, reward total was -11.000000. running mean: -13.828170
episode 3535.000000, reward total was -20.000000. running mean: -13.889888
episode 3536.000000, reward total was -9.000000. running mean: -13.840989
episode 3537.000000, reward total was -17.000000. running mean: -13.872579
episode 3538.000000, reward total was -14.000000. running mean: -13.873853
episode 3539.000000, reward total was -16.000000. running mean: -13.895115
episode 3540.000000, reward total was -15.000000. running mean: -13.906164
episode 3541.000000, reward total was -15.000000. running mean: -13.917102
episode 3542.000000, reward total was -11.000000. running mean: -13.887931
episode 3543.000000, reward total was -17.000000. running mean: -13.919052
episode 3544.000000, reward total was -12.000000. running mean: -13.899861
episode 3545.000000, rewar

episode 3642.000000, reward total was -19.000000. running mean: -13.833363
episode 3643.000000, reward total was -12.000000. running mean: -13.815030
episode 3644.000000, reward total was -15.000000. running mean: -13.826879
episode 3645.000000, reward total was -17.000000. running mean: -13.858611
episode 3646.000000, reward total was -15.000000. running mean: -13.870024
episode 3647.000000, reward total was -14.000000. running mean: -13.871324
episode 3648.000000, reward total was -13.000000. running mean: -13.862611
episode 3649.000000, reward total was -16.000000. running mean: -13.883985
episode 3650.000000, reward total was -20.000000. running mean: -13.945145
episode 3651.000000, reward total was -15.000000. running mean: -13.955694
episode 3652.000000, reward total was -12.000000. running mean: -13.936137
episode 3653.000000, reward total was -14.000000. running mean: -13.936775
episode 3654.000000, reward total was -15.000000. running mean: -13.947407
episode 3655.000000, rewa

episode 3752.000000, reward total was -9.000000. running mean: -13.963642
episode 3753.000000, reward total was -15.000000. running mean: -13.974006
episode 3754.000000, reward total was -15.000000. running mean: -13.984266
episode 3755.000000, reward total was -15.000000. running mean: -13.994423
episode 3756.000000, reward total was -6.000000. running mean: -13.914479
episode 3757.000000, reward total was -13.000000. running mean: -13.905334
episode 3758.000000, reward total was -11.000000. running mean: -13.876281
episode 3759.000000, reward total was -12.000000. running mean: -13.857518
episode 3760.000000, reward total was -11.000000. running mean: -13.828943
episode 3761.000000, reward total was -17.000000. running mean: -13.860653
episode 3762.000000, reward total was -15.000000. running mean: -13.872047
episode 3763.000000, reward total was -15.000000. running mean: -13.883326
episode 3764.000000, reward total was -11.000000. running mean: -13.854493
episode 3765.000000, reward

episode 3862.000000, reward total was -6.000000. running mean: -13.851673
episode 3863.000000, reward total was -7.000000. running mean: -13.783157
episode 3864.000000, reward total was -14.000000. running mean: -13.785325
episode 3865.000000, reward total was -11.000000. running mean: -13.757472
episode 3866.000000, reward total was -15.000000. running mean: -13.769897
episode 3867.000000, reward total was -15.000000. running mean: -13.782198
episode 3868.000000, reward total was -11.000000. running mean: -13.754376
episode 3869.000000, reward total was -18.000000. running mean: -13.796832
episode 3870.000000, reward total was -17.000000. running mean: -13.828864
episode 3871.000000, reward total was -12.000000. running mean: -13.810575
episode 3872.000000, reward total was -15.000000. running mean: -13.822470
episode 3873.000000, reward total was -8.000000. running mean: -13.764245
episode 3874.000000, reward total was -14.000000. running mean: -13.766603
episode 3875.000000, reward 

episode 3972.000000, reward total was -18.000000. running mean: -13.086695
episode 3973.000000, reward total was -9.000000. running mean: -13.045828
episode 3974.000000, reward total was -10.000000. running mean: -13.015370
episode 3975.000000, reward total was -12.000000. running mean: -13.005216
episode 3976.000000, reward total was -15.000000. running mean: -13.025164
episode 3977.000000, reward total was -13.000000. running mean: -13.024912
episode 3978.000000, reward total was -8.000000. running mean: -12.974663
episode 3979.000000, reward total was -9.000000. running mean: -12.934917
episode 3980.000000, reward total was -14.000000. running mean: -12.945568
episode 3981.000000, reward total was -15.000000. running mean: -12.966112
episode 3982.000000, reward total was -7.000000. running mean: -12.906451
episode 3983.000000, reward total was -11.000000. running mean: -12.887386
episode 3984.000000, reward total was -12.000000. running mean: -12.878512
episode 3985.000000, reward t

episode 4082.000000, reward total was -16.000000. running mean: -12.337457
episode 4083.000000, reward total was -3.000000. running mean: -12.244083
episode 4084.000000, reward total was -15.000000. running mean: -12.271642
episode 4085.000000, reward total was -2.000000. running mean: -12.168925
episode 4086.000000, reward total was -11.000000. running mean: -12.157236
episode 4087.000000, reward total was -7.000000. running mean: -12.105664
episode 4088.000000, reward total was -3.000000. running mean: -12.014607
episode 4089.000000, reward total was -8.000000. running mean: -11.974461
episode 4090.000000, reward total was -9.000000. running mean: -11.944717
episode 4091.000000, reward total was -12.000000. running mean: -11.945269
episode 4092.000000, reward total was -5.000000. running mean: -11.875817
episode 4093.000000, reward total was -8.000000. running mean: -11.837059
episode 4094.000000, reward total was -10.000000. running mean: -11.818688
episode 4095.000000, reward total

episode 4192.000000, reward total was -4.000000. running mean: -11.386804
episode 4193.000000, reward total was -11.000000. running mean: -11.382936
episode 4194.000000, reward total was -13.000000. running mean: -11.399106
episode 4195.000000, reward total was -13.000000. running mean: -11.415115
episode 4196.000000, reward total was -14.000000. running mean: -11.440964
episode 4197.000000, reward total was -19.000000. running mean: -11.516555
episode 4198.000000, reward total was -11.000000. running mean: -11.511389
episode 4199.000000, reward total was -12.000000. running mean: -11.516275
episode 4200.000000, reward total was -15.000000. running mean: -11.551112
episode 4201.000000, reward total was -8.000000. running mean: -11.515601
episode 4202.000000, reward total was -14.000000. running mean: -11.540445
episode 4203.000000, reward total was -14.000000. running mean: -11.565041
episode 4204.000000, reward total was -17.000000. running mean: -11.619390
episode 4205.000000, reward

episode 4302.000000, reward total was -9.000000. running mean: -11.684703
episode 4303.000000, reward total was -9.000000. running mean: -11.657856
episode 4304.000000, reward total was -4.000000. running mean: -11.581277
episode 4305.000000, reward total was -15.000000. running mean: -11.615464
episode 4306.000000, reward total was -7.000000. running mean: -11.569310
episode 4307.000000, reward total was -18.000000. running mean: -11.633616
episode 4308.000000, reward total was -16.000000. running mean: -11.677280
episode 4309.000000, reward total was -3.000000. running mean: -11.590507
episode 4310.000000, reward total was -19.000000. running mean: -11.664602
episode 4311.000000, reward total was -9.000000. running mean: -11.637956
episode 4312.000000, reward total was -12.000000. running mean: -11.641577
episode 4313.000000, reward total was -7.000000. running mean: -11.595161
episode 4314.000000, reward total was -13.000000. running mean: -11.609209
episode 4315.000000, reward tota

episode 4412.000000, reward total was -9.000000. running mean: -11.134023
episode 4413.000000, reward total was -17.000000. running mean: -11.192682
episode 4414.000000, reward total was -6.000000. running mean: -11.140756
episode 4415.000000, reward total was -16.000000. running mean: -11.189348
episode 4416.000000, reward total was -5.000000. running mean: -11.127455
episode 4417.000000, reward total was -11.000000. running mean: -11.126180
episode 4418.000000, reward total was -11.000000. running mean: -11.124918
episode 4419.000000, reward total was -13.000000. running mean: -11.143669
episode 4420.000000, reward total was -9.000000. running mean: -11.122232
episode 4421.000000, reward total was -9.000000. running mean: -11.101010
episode 4422.000000, reward total was -7.000000. running mean: -11.060000
episode 4423.000000, reward total was -13.000000. running mean: -11.079400
episode 4424.000000, reward total was -13.000000. running mean: -11.098606
episode 4425.000000, reward tot

episode 4522.000000, reward total was 1.000000. running mean: -10.388305
episode 4523.000000, reward total was -7.000000. running mean: -10.354422
episode 4524.000000, reward total was -17.000000. running mean: -10.420878
episode 4525.000000, reward total was -14.000000. running mean: -10.456669
episode 4526.000000, reward total was -7.000000. running mean: -10.422102
episode 4527.000000, reward total was -7.000000. running mean: -10.387881
episode 4528.000000, reward total was -3.000000. running mean: -10.314003
episode 4529.000000, reward total was -17.000000. running mean: -10.380863
episode 4530.000000, reward total was -10.000000. running mean: -10.377054
episode 4531.000000, reward total was -12.000000. running mean: -10.393283
episode 4532.000000, reward total was -12.000000. running mean: -10.409351
episode 4533.000000, reward total was -15.000000. running mean: -10.455257
episode 4534.000000, reward total was -12.000000. running mean: -10.470705
episode 4535.000000, reward tot

episode 4632.000000, reward total was -8.000000. running mean: -10.497035
episode 4633.000000, reward total was -17.000000. running mean: -10.562065
episode 4634.000000, reward total was -13.000000. running mean: -10.586444
episode 4635.000000, reward total was -11.000000. running mean: -10.590580
episode 4636.000000, reward total was -15.000000. running mean: -10.634674
episode 4637.000000, reward total was -5.000000. running mean: -10.578327
episode 4638.000000, reward total was -9.000000. running mean: -10.562544
episode 4639.000000, reward total was -1.000000. running mean: -10.466918
episode 4640.000000, reward total was -18.000000. running mean: -10.542249
episode 4641.000000, reward total was -10.000000. running mean: -10.536827
episode 4642.000000, reward total was -4.000000. running mean: -10.471459
episode 4643.000000, reward total was -9.000000. running mean: -10.456744
episode 4644.000000, reward total was -7.000000. running mean: -10.422177
episode 4645.000000, reward tota

episode 4742.000000, reward total was -15.000000. running mean: -11.521912
episode 4743.000000, reward total was -11.000000. running mean: -11.516693
episode 4744.000000, reward total was -14.000000. running mean: -11.541526
episode 4745.000000, reward total was -17.000000. running mean: -11.596111
episode 4746.000000, reward total was -14.000000. running mean: -11.620150
episode 4747.000000, reward total was -14.000000. running mean: -11.643948
episode 4748.000000, reward total was -9.000000. running mean: -11.617509
episode 4749.000000, reward total was -16.000000. running mean: -11.661334
episode 4750.000000, reward total was -7.000000. running mean: -11.614720
episode 4751.000000, reward total was -8.000000. running mean: -11.578573
episode 4752.000000, reward total was -15.000000. running mean: -11.612787
episode 4753.000000, reward total was -13.000000. running mean: -11.626659
episode 4754.000000, reward total was -18.000000. running mean: -11.690393
episode 4755.000000, reward 

episode 4852.000000, reward total was -17.000000. running mean: -12.501328
episode 4853.000000, reward total was -17.000000. running mean: -12.546315
episode 4854.000000, reward total was -5.000000. running mean: -12.470852
episode 4855.000000, reward total was -13.000000. running mean: -12.476143
episode 4856.000000, reward total was -18.000000. running mean: -12.531382
episode 4857.000000, reward total was -15.000000. running mean: -12.556068
episode 4858.000000, reward total was -17.000000. running mean: -12.600507
episode 4859.000000, reward total was -9.000000. running mean: -12.564502
episode 4860.000000, reward total was -17.000000. running mean: -12.608857
episode 4861.000000, reward total was -17.000000. running mean: -12.652769
episode 4862.000000, reward total was -11.000000. running mean: -12.636241
episode 4863.000000, reward total was -14.000000. running mean: -12.649879
episode 4864.000000, reward total was -11.000000. running mean: -12.633380
episode 4865.000000, reward

episode 4962.000000, reward total was -4.000000. running mean: -12.564356
episode 4963.000000, reward total was -7.000000. running mean: -12.508712
episode 4964.000000, reward total was -10.000000. running mean: -12.483625
episode 4965.000000, reward total was -18.000000. running mean: -12.538789
episode 4966.000000, reward total was -16.000000. running mean: -12.573401
episode 4967.000000, reward total was -12.000000. running mean: -12.567667
episode 4968.000000, reward total was -15.000000. running mean: -12.591990
episode 4969.000000, reward total was -14.000000. running mean: -12.606070
episode 4970.000000, reward total was -11.000000. running mean: -12.590010
episode 4971.000000, reward total was -12.000000. running mean: -12.584109
episode 4972.000000, reward total was -15.000000. running mean: -12.608268
episode 4973.000000, reward total was 4.000000. running mean: -12.442186
episode 4974.000000, reward total was -8.000000. running mean: -12.397764
episode 4975.000000, reward to

episode 5072.000000, reward total was -8.000000. running mean: -11.224526
episode 5073.000000, reward total was -5.000000. running mean: -11.162281
episode 5074.000000, reward total was -15.000000. running mean: -11.200658
episode 5075.000000, reward total was -7.000000. running mean: -11.158651
episode 5076.000000, reward total was -14.000000. running mean: -11.187065
episode 5077.000000, reward total was -7.000000. running mean: -11.145194
episode 5078.000000, reward total was -10.000000. running mean: -11.133742
episode 5079.000000, reward total was -17.000000. running mean: -11.192405
episode 5080.000000, reward total was -11.000000. running mean: -11.190481
episode 5081.000000, reward total was -6.000000. running mean: -11.138576
episode 5082.000000, reward total was -10.000000. running mean: -11.127190
episode 5083.000000, reward total was -9.000000. running mean: -11.105918
episode 5084.000000, reward total was -10.000000. running mean: -11.094859
episode 5085.000000, reward tot

episode 5182.000000, reward total was -8.000000. running mean: -10.159385
episode 5183.000000, reward total was -10.000000. running mean: -10.157791
episode 5184.000000, reward total was -15.000000. running mean: -10.206213
episode 5185.000000, reward total was -8.000000. running mean: -10.184151
episode 5186.000000, reward total was -10.000000. running mean: -10.182310
episode 5187.000000, reward total was -16.000000. running mean: -10.240487
episode 5188.000000, reward total was -14.000000. running mean: -10.278082
episode 5189.000000, reward total was -18.000000. running mean: -10.355301
episode 5190.000000, reward total was -14.000000. running mean: -10.391748
episode 5191.000000, reward total was -11.000000. running mean: -10.397830
episode 5192.000000, reward total was -9.000000. running mean: -10.383852
episode 5193.000000, reward total was -10.000000. running mean: -10.380014
episode 5194.000000, reward total was 3.000000. running mean: -10.246213
episode 5195.000000, reward to

episode 5292.000000, reward total was -5.000000. running mean: -10.542299
episode 5293.000000, reward total was -14.000000. running mean: -10.576876
episode 5294.000000, reward total was -7.000000. running mean: -10.541107
episode 5295.000000, reward total was -7.000000. running mean: -10.505696
episode 5296.000000, reward total was -10.000000. running mean: -10.500639
episode 5297.000000, reward total was -16.000000. running mean: -10.555633
episode 5298.000000, reward total was -18.000000. running mean: -10.630076
episode 5299.000000, reward total was -15.000000. running mean: -10.673776
episode 5300.000000, reward total was -12.000000. running mean: -10.687038
episode 5301.000000, reward total was -12.000000. running mean: -10.700167
episode 5302.000000, reward total was -5.000000. running mean: -10.643166
episode 5303.000000, reward total was -10.000000. running mean: -10.636734
episode 5304.000000, reward total was -2.000000. running mean: -10.550367
episode 5305.000000, reward to

episode 5403.000000, reward total was -3.000000. running mean: -9.572591
episode 5404.000000, reward total was -15.000000. running mean: -9.626865
episode 5405.000000, reward total was -18.000000. running mean: -9.710597
episode 5406.000000, reward total was -14.000000. running mean: -9.753491
episode 5407.000000, reward total was -10.000000. running mean: -9.755956
episode 5408.000000, reward total was -17.000000. running mean: -9.828396
episode 5409.000000, reward total was -5.000000. running mean: -9.780112
episode 5410.000000, reward total was -9.000000. running mean: -9.772311
episode 5411.000000, reward total was -12.000000. running mean: -9.794588
episode 5412.000000, reward total was -12.000000. running mean: -9.816642
episode 5413.000000, reward total was -14.000000. running mean: -9.858476
episode 5414.000000, reward total was 2.000000. running mean: -9.739891
episode 5415.000000, reward total was -13.000000. running mean: -9.772492
episode 5416.000000, reward total was -5.00

episode 5515.000000, reward total was -6.000000. running mean: -9.354381
episode 5516.000000, reward total was -11.000000. running mean: -9.370837
episode 5517.000000, reward total was -14.000000. running mean: -9.417128
episode 5518.000000, reward total was -2.000000. running mean: -9.342957
episode 5519.000000, reward total was -14.000000. running mean: -9.389528
episode 5520.000000, reward total was -11.000000. running mean: -9.405632
episode 5521.000000, reward total was -8.000000. running mean: -9.391576
episode 5522.000000, reward total was -5.000000. running mean: -9.347660
episode 5523.000000, reward total was -15.000000. running mean: -9.404184
episode 5524.000000, reward total was -12.000000. running mean: -9.430142
episode 5525.000000, reward total was -12.000000. running mean: -9.455840
episode 5526.000000, reward total was -6.000000. running mean: -9.421282
episode 5527.000000, reward total was -15.000000. running mean: -9.477069
episode 5528.000000, reward total was -15.0

episode 5627.000000, reward total was -17.000000. running mean: -9.125619
episode 5628.000000, reward total was -8.000000. running mean: -9.114363
episode 5629.000000, reward total was -10.000000. running mean: -9.123219
episode 5630.000000, reward total was -17.000000. running mean: -9.201987
episode 5631.000000, reward total was -6.000000. running mean: -9.169967
episode 5632.000000, reward total was -15.000000. running mean: -9.228268
episode 5633.000000, reward total was -13.000000. running mean: -9.265985
episode 5634.000000, reward total was -15.000000. running mean: -9.323325
episode 5635.000000, reward total was -17.000000. running mean: -9.400092
episode 5636.000000, reward total was -15.000000. running mean: -9.456091
episode 5637.000000, reward total was -11.000000. running mean: -9.471530
episode 5638.000000, reward total was -15.000000. running mean: -9.526815
episode 5639.000000, reward total was -9.000000. running mean: -9.521547
episode 5640.000000, reward total was -9.

episode 5739.000000, reward total was -5.000000. running mean: -9.428242
episode 5740.000000, reward total was -11.000000. running mean: -9.443960
episode 5741.000000, reward total was -15.000000. running mean: -9.499520
episode 5742.000000, reward total was -7.000000. running mean: -9.474525
episode 5743.000000, reward total was -3.000000. running mean: -9.409780
episode 5744.000000, reward total was -12.000000. running mean: -9.435682
episode 5745.000000, reward total was -4.000000. running mean: -9.381325
episode 5746.000000, reward total was -13.000000. running mean: -9.417512
episode 5747.000000, reward total was -5.000000. running mean: -9.373337
episode 5748.000000, reward total was -7.000000. running mean: -9.349604
episode 5749.000000, reward total was -5.000000. running mean: -9.306108
episode 5750.000000, reward total was -18.000000. running mean: -9.393046
episode 5751.000000, reward total was -8.000000. running mean: -9.379116
episode 5752.000000, reward total was -5.00000

episode 5850.000000, reward total was -1.000000. running mean: -9.980791
episode 5851.000000, reward total was -19.000000. running mean: -10.070983
episode 5852.000000, reward total was -14.000000. running mean: -10.110273
episode 5853.000000, reward total was -12.000000. running mean: -10.129170
episode 5854.000000, reward total was -15.000000. running mean: -10.177879
episode 5855.000000, reward total was -4.000000. running mean: -10.116100
episode 5856.000000, reward total was -12.000000. running mean: -10.134939
episode 5857.000000, reward total was -13.000000. running mean: -10.163589
episode 5858.000000, reward total was -11.000000. running mean: -10.171954
episode 5859.000000, reward total was -15.000000. running mean: -10.220234
episode 5860.000000, reward total was 1.000000. running mean: -10.108032
episode 5861.000000, reward total was -12.000000. running mean: -10.126951
episode 5862.000000, reward total was -10.000000. running mean: -10.125682
episode 5863.000000, reward to

episode 5961.000000, reward total was -12.000000. running mean: -9.966117
episode 5962.000000, reward total was -5.000000. running mean: -9.916456
episode 5963.000000, reward total was -5.000000. running mean: -9.867292
episode 5964.000000, reward total was -4.000000. running mean: -9.808619
episode 5965.000000, reward total was -16.000000. running mean: -9.870533
episode 5966.000000, reward total was -8.000000. running mean: -9.851827
episode 5967.000000, reward total was 4.000000. running mean: -9.713309
episode 5968.000000, reward total was -8.000000. running mean: -9.696176
episode 5969.000000, reward total was -5.000000. running mean: -9.649214
episode 5970.000000, reward total was -11.000000. running mean: -9.662722
episode 5971.000000, reward total was -1.000000. running mean: -9.576095
episode 5972.000000, reward total was -13.000000. running mean: -9.610334
episode 5973.000000, reward total was -7.000000. running mean: -9.584230
episode 5974.000000, reward total was -9.000000.

In [4]:
# Call play_game with the Gym environment (`env`) and the policy-network weights dict (`model`)
# created earlier in the notebook.
# NOTE(review): play_game is defined outside this view; judging by the recorded output it
# presumably plays an episode and prints the accumulated reward — confirm against its definition.
play_game(env, model)

See here for more information: https://www.gymlibrary.ml/content/api/
  deprecation(
  logger.warn(


Episode finished without success, accumulated reward = -3.0
