In [1]:
import gym
import numpy as np
# Gym environment id; it is passed to gym.make() when the environment is created.
rm='Pong-v0'

In [2]:
%matplotlib inline
from matplotlib import animation
import matplotlib.pyplot as plt
from IPython.display import display, HTML

def display_frames_as_gif(frames):
    """Show a sequence of image frames as an inline JavaScript animation.

    frames: indexable sequence of image arrays. The first frame's shape fixes
    the figure size (shape[1] is used as width, shape[0] as height), and every
    frame is drawn into the same image artist, 50 ms apart.
    """
    first = frames[0]
    height, width = first.shape[0], first.shape[1]
    fig = plt.figure(figsize=(width / 72.0, height / 72.0), dpi = 144)
    image = plt.imshow(first)
    plt.axis('off')

    def draw_frame(index):
        # Swap the pixel data of the existing artist instead of re-plotting.
        image.set_data(frames[index])

    anim = animation.FuncAnimation(fig, draw_frame, frames = len(frames), interval=50)
    # Close the figure before emitting the HTML player
    # (presumably so the notebook does not also render a static copy - confirm).
    plt.close(fig)
    display(HTML(anim.to_jshtml()))

In [3]:
from gym.wrappers import AtariPreprocessing
# NOTE(review): this only assigns an attribute on the imported `gym` module;
# confirm whether the installed gym version actually reads it. The code in this
# notebook unpacks env.step() as the 4-tuple (observation, reward, done, info).
gym.new_step_api=True
# Create the environment identified by `rm`.
env = gym.make(rm)
# ---- policy network parameters ----
H = 200      # width of the hidden layer
D = 80 * 80  # length of one flattened 80x80 input frame
# Gaussian weights scaled by 1/sqrt(fan-in) ("Xavier" initialization).
# W1 is drawn before W2, so the order of random draws is fixed.
model = {
    'W1': np.random.randn(H, D) / np.sqrt(D),
    'W2': np.random.randn(H) / np.sqrt(H),
}

# ---- training hyperparameters ----
batch_size = 10       # number of episodes between parameter updates
learning_rate = 1e-3
gamma = 0.99          # reward discount factor
decay_rate = 0.99     # decay of the RMSProp running average of grad^2

# ---- per-parameter accumulators, one zero array per weight matrix ----
# gradients summed over the episodes of the current batch
grad_buffer = {name: np.zeros_like(weights) for name, weights in model.items()}
# RMSProp running average of squared gradients
rmsprop_cache = {name: np.zeros_like(weights) for name, weights in model.items()}

def sigmoid(x):
  """Logistic "squashing" function 1 / (1 + e^-x); accepts scalars or arrays."""
  denominator = 1.0 + np.exp(-x)
  return 1.0 / denominator

def prepro(I):
  """Turn a raw frame into a flat 0/1 float vector.

  The frame is cropped to rows 35..194, every second row and column of
  channel 0 is kept, and each remaining pixel becomes 0.0 if its value is
  0, 144 or 109 (the two background values) and 1.0 otherwise.

  I: array-like image indexed as [row, column, channel].
  Returns a new 1-D float array (6400 elements for a 210x160 frame).

  The input is never modified. The previous implementation wrote the mask
  through slice views and therefore overwrote the caller's observation.
  """
  # Crop and downsample in one slicing step; this is only a read-only view.
  frame = np.asarray(I)[35:195:2, ::2, 0]
  # Foreground = anything that is neither background colour nor already zero.
  foreground = (frame != 144) & (frame != 109) & (frame != 0)
  return foreground.astype(float).ravel()

def discount_rewards(r, discount_factor=None):
  """Compute discounted returns, walking backwards through the rewards.

  r: array-like of per-step rewards (1-D, or a column of shape (n, 1)).
  discount_factor: per-step discount; when None the module-level `gamma`
    is used, which matches the original single-argument behaviour.

  The running sum is reset at every non-zero reward, because a non-zero
  reward marks a game boundary (Pong specific).

  Returns an array with the same shape as `r`. The result is always a
  floating dtype: allocating it with the dtype of integer rewards would
  silently truncate the discounted values.
  """
  if discount_factor is None:
    discount_factor = gamma
  r = np.asarray(r)
  out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
  discounted_r = np.zeros(r.shape, dtype=out_dtype)
  running_add = 0
  for t in reversed(range(r.size)):
    if r[t] != 0: running_add = 0 # game boundary: restart the sum
    running_add = running_add * discount_factor + r[t]
    discounted_r[t] = running_add
  return discounted_r

def policy_forward(x):
  """Forward pass of the two-layer policy network for one input vector.

  Weights come from the module-level `model` dict ('W1', then 'W2').
  Returns (p, h): p is the sigmoid of the output score, used as the
  probability of taking action 2; h is the hidden activation after ReLU.
  """
  hidden = np.dot(model['W1'], x)
  hidden = np.where(hidden < 0, 0, hidden) # ReLU: clamp negative units to zero
  score = np.dot(model['W2'], hidden)
  return sigmoid(score), hidden

def policy_backward(epx, eph, epdlogp):
  """Backward pass over one episode.

  epx: stacked inputs, one row per step.
  eph: stacked hidden activations, one row per step.
  epdlogp: per-step gradient on the output score, one row per step.
  Reads 'W2' from the module-level `model`.
  Returns a dict with the gradients for 'W1' and 'W2'.
  """
  grad_w2 = (eph.T @ epdlogp).ravel()
  grad_hidden = np.outer(epdlogp, model['W2'])
  # ReLU backprop: no gradient flows through units that were not active
  grad_hidden = np.where(eph <= 0, 0, grad_hidden)
  grad_w1 = grad_hidden.T @ epx
  return {'W1': grad_w1, 'W2': grad_w2}

def model_step(model, observation, prev_x):
  """Pick the greedy action for one observation.

  model: dict with weight arrays 'W1' and 'W2'. These weights are the ones
    actually used; the previous version ignored the argument and always
    evaluated the module-level model through policy_forward.
  observation: raw frame, preprocessed with prepro().
  prev_x: preprocessed previous frame, or None on the first step.

  Returns (action, cur_x): action is 2 when the network output is >= 0.5
  and 3 otherwise (deterministic, no sampling); cur_x is the preprocessed
  current frame, to be passed back in as prev_x on the next call.
  """
  # The network input is the difference between consecutive preprocessed
  # frames; with no previous frame the input is all zeros.
  cur_x = prepro(observation)
  x = cur_x - prev_x if prev_x is not None else np.zeros_like(cur_x)

  # Same forward pass as policy_forward, but with the weights passed in.
  h = np.dot(model['W1'], x)
  h[h < 0] = 0 # ReLU nonlinearity
  aprob = sigmoid(np.dot(model['W2'], h))

  action = 2 if aprob >= 0.5 else 3 # greedy choice

  return action, cur_x

def play_game(env, model, max_steps = 1000):
  """Play one episode with the greedy policy and show it as an animation.

  env: environment providing reset(), render(mode='rgb_array'), step(action)
    and close(); step() must return (observation, reward, done, info).
  model: weight dict forwarded to model_step().
  max_steps: upper bound on the number of environment steps (default 1000).

  Prints exactly one summary line: the "finished after N timesteps" message
  when the environment reports done, otherwise the "finished without
  success" message. Previously the second message was printed in both cases.
  The environment is closed even if a step raises.
  """
  frames = []
  cumulated_reward = 0
  prev_x = None # used in computing the difference frame

  try:
    observation = env.reset()
    for t in range(max_steps):
      frames.append(env.render(mode = 'rgb_array'))
      action, prev_x = model_step(model, observation, prev_x)
      observation, reward, done, info = env.step(action)
      cumulated_reward += reward
      if done:
        print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
        break
    else:
      # Runs only when the loop was not left through `break`.
      print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))
  finally:
    env.close()

  if frames: # nothing to animate when max_steps is 0
    display_frames_as_gif(frames)

def train_model(env, model, total_episodes = 100):
  """Train the policy with batched policy-gradient updates.

  env: environment with reset() and step(action) -> (observation, reward,
    done, info).
  model: dict of weight arrays; updated in place by RMSProp every
    `batch_size` episodes.
  total_episodes: number of episodes to play; values <= 0 return an empty
    history immediately (the loop previously never terminated for them).

  Returns a list of (episode_number, reward_sum, running_reward) tuples, one
  per finished episode.

  Notes:
  - The forward and backward passes go through policy_forward /
    policy_backward, which read the module-level `model`; pass that same
    dict here.
  - Gradient sums and RMSProp state live in the module-level `grad_buffer`
    and `rmsprop_cache`, so they persist across calls.
  """
  hist = []
  if total_episodes <= 0:
    return hist

  observation = env.reset()

  prev_x = None # used in computing the difference frame
  xs,hs,dlogps,drs = [],[],[],[]
  running_reward = None
  reward_sum = 0
  episode_number = 0

  while episode_number < total_episodes:
    # preprocess the observation, set input to network to be difference image
    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros_like(cur_x)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward
    drs.append(reward) # recorded after step() so it is the reward for the action just taken

    if not done:
      continue

    # ---- an episode finished ----
    episode_number += 1

    # stack together all inputs, hidden states, action gradients, and rewards for this episode
    epx = np.vstack(xs)
    eph = np.vstack(hs)
    epdlogp = np.vstack(dlogps)
    # float cast: integer rewards would break the in-place normalisation below
    epr = np.vstack(drs).astype(float)
    xs,hs,dlogps,drs = [],[],[],[] # reset array memory

    # compute the discounted reward backwards through time
    discounted_epr = discount_rewards(epr)
    # standardize the rewards to be unit normal (helps control the gradient estimator variance)
    discounted_epr -= np.mean(discounted_epr)
    reward_std = np.std(discounted_epr)
    if reward_std > 0: # a constant return vector would otherwise yield NaN gradients
      discounted_epr /= reward_std

    epdlogp *= discounted_epr # modulate the gradient with advantage
    grad = policy_backward(epx, eph, epdlogp)
    for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

    # perform rmsprop parameter update every batch_size episodes
    if episode_number % batch_size == 0:
      for k,v in model.items():
        g = grad_buffer[k] # gradient
        rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
        model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
        grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

    # book-keeping: exponential running mean of the episode reward
    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
    hist.append((episode_number, reward_sum, running_reward))
    # %d for the counter: it is an integer and used to print as "1.000000"
    print ('episode %d, reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
    reward_sum = 0
    observation = env.reset() # reset env
    prev_x = None

  return hist

  logger.warn(
  deprecation(
  deprecation(


In [4]:
# Train for 6000 episodes (timed with %time); the per-episode history is kept in hist1.
%time hist1 = train_model(env, model, total_episodes=6000)

  logger.deprecation(


episode 1.000000, reward total was -21.000000. running mean: -21.000000
episode 2.000000, reward total was -20.000000. running mean: -20.990000
episode 3.000000, reward total was -21.000000. running mean: -20.990100
episode 4.000000, reward total was -20.000000. running mean: -20.980199
episode 5.000000, reward total was -21.000000. running mean: -20.980397
episode 6.000000, reward total was -21.000000. running mean: -20.980593
episode 7.000000, reward total was -21.000000. running mean: -20.980787
episode 8.000000, reward total was -19.000000. running mean: -20.960979
episode 9.000000, reward total was -21.000000. running mean: -20.961369
episode 10.000000, reward total was -21.000000. running mean: -20.961756
episode 11.000000, reward total was -21.000000. running mean: -20.962138
episode 12.000000, reward total was -18.000000. running mean: -20.932517
episode 13.000000, reward total was -21.000000. running mean: -20.933192
episode 14.000000, reward total was -21.000000. running mean

episode 114.000000, reward total was -21.000000. running mean: -20.659457
episode 115.000000, reward total was -21.000000. running mean: -20.662863
episode 116.000000, reward total was -20.000000. running mean: -20.656234
episode 117.000000, reward total was -19.000000. running mean: -20.639672
episode 118.000000, reward total was -20.000000. running mean: -20.633275
episode 119.000000, reward total was -21.000000. running mean: -20.636942
episode 120.000000, reward total was -21.000000. running mean: -20.640573
episode 121.000000, reward total was -19.000000. running mean: -20.624167
episode 122.000000, reward total was -20.000000. running mean: -20.617925
episode 123.000000, reward total was -20.000000. running mean: -20.611746
episode 124.000000, reward total was -21.000000. running mean: -20.615629
episode 125.000000, reward total was -21.000000. running mean: -20.619472
episode 126.000000, reward total was -21.000000. running mean: -20.623278
episode 127.000000, reward total was -

episode 225.000000, reward total was -21.000000. running mean: -20.328372
episode 226.000000, reward total was -20.000000. running mean: -20.325088
episode 227.000000, reward total was -20.000000. running mean: -20.321837
episode 228.000000, reward total was -19.000000. running mean: -20.308619
episode 229.000000, reward total was -20.000000. running mean: -20.305533
episode 230.000000, reward total was -18.000000. running mean: -20.282477
episode 231.000000, reward total was -21.000000. running mean: -20.289653
episode 232.000000, reward total was -21.000000. running mean: -20.296756
episode 233.000000, reward total was -21.000000. running mean: -20.303789
episode 234.000000, reward total was -20.000000. running mean: -20.300751
episode 235.000000, reward total was -21.000000. running mean: -20.307743
episode 236.000000, reward total was -17.000000. running mean: -20.274666
episode 237.000000, reward total was -18.000000. running mean: -20.251919
episode 238.000000, reward total was -

episode 336.000000, reward total was -21.000000. running mean: -20.010991
episode 337.000000, reward total was -19.000000. running mean: -20.000881
episode 338.000000, reward total was -21.000000. running mean: -20.010872
episode 339.000000, reward total was -20.000000. running mean: -20.010763
episode 340.000000, reward total was -21.000000. running mean: -20.020656
episode 341.000000, reward total was -21.000000. running mean: -20.030449
episode 342.000000, reward total was -20.000000. running mean: -20.030145
episode 343.000000, reward total was -20.000000. running mean: -20.029843
episode 344.000000, reward total was -17.000000. running mean: -19.999545
episode 345.000000, reward total was -20.000000. running mean: -19.999549
episode 346.000000, reward total was -19.000000. running mean: -19.989554
episode 347.000000, reward total was -21.000000. running mean: -19.999658
episode 348.000000, reward total was -17.000000. running mean: -19.969662
episode 349.000000, reward total was -

episode 447.000000, reward total was -18.000000. running mean: -19.628542
episode 448.000000, reward total was -18.000000. running mean: -19.612256
episode 449.000000, reward total was -19.000000. running mean: -19.606134
episode 450.000000, reward total was -20.000000. running mean: -19.610072
episode 451.000000, reward total was -19.000000. running mean: -19.603971
episode 452.000000, reward total was -16.000000. running mean: -19.567932
episode 453.000000, reward total was -19.000000. running mean: -19.562252
episode 454.000000, reward total was -21.000000. running mean: -19.576630
episode 455.000000, reward total was -19.000000. running mean: -19.570864
episode 456.000000, reward total was -21.000000. running mean: -19.585155
episode 457.000000, reward total was -21.000000. running mean: -19.599303
episode 458.000000, reward total was -20.000000. running mean: -19.603310
episode 459.000000, reward total was -20.000000. running mean: -19.607277
episode 460.000000, reward total was -

episode 558.000000, reward total was -19.000000. running mean: -19.356434
episode 559.000000, reward total was -19.000000. running mean: -19.352869
episode 560.000000, reward total was -20.000000. running mean: -19.359341
episode 561.000000, reward total was -19.000000. running mean: -19.355747
episode 562.000000, reward total was -20.000000. running mean: -19.362190
episode 563.000000, reward total was -18.000000. running mean: -19.348568
episode 564.000000, reward total was -20.000000. running mean: -19.355082
episode 565.000000, reward total was -14.000000. running mean: -19.301531
episode 566.000000, reward total was -21.000000. running mean: -19.318516
episode 567.000000, reward total was -19.000000. running mean: -19.315331
episode 568.000000, reward total was -21.000000. running mean: -19.332178
episode 569.000000, reward total was -17.000000. running mean: -19.308856
episode 570.000000, reward total was -19.000000. running mean: -19.305767
episode 571.000000, reward total was -

episode 669.000000, reward total was -15.000000. running mean: -19.015762
episode 670.000000, reward total was -20.000000. running mean: -19.025604
episode 671.000000, reward total was -18.000000. running mean: -19.015348
episode 672.000000, reward total was -17.000000. running mean: -18.995195
episode 673.000000, reward total was -20.000000. running mean: -19.005243
episode 674.000000, reward total was -17.000000. running mean: -18.985191
episode 675.000000, reward total was -17.000000. running mean: -18.965339
episode 676.000000, reward total was -20.000000. running mean: -18.975685
episode 677.000000, reward total was -18.000000. running mean: -18.965928
episode 678.000000, reward total was -18.000000. running mean: -18.956269
episode 679.000000, reward total was -17.000000. running mean: -18.936706
episode 680.000000, reward total was -20.000000. running mean: -18.947339
episode 681.000000, reward total was -18.000000. running mean: -18.937866
episode 682.000000, reward total was -

episode 780.000000, reward total was -17.000000. running mean: -18.679284
episode 781.000000, reward total was -18.000000. running mean: -18.672491
episode 782.000000, reward total was -18.000000. running mean: -18.665766
episode 783.000000, reward total was -13.000000. running mean: -18.609109
episode 784.000000, reward total was -20.000000. running mean: -18.623018
episode 785.000000, reward total was -19.000000. running mean: -18.626787
episode 786.000000, reward total was -20.000000. running mean: -18.640519
episode 787.000000, reward total was -17.000000. running mean: -18.624114
episode 788.000000, reward total was -18.000000. running mean: -18.617873
episode 789.000000, reward total was -18.000000. running mean: -18.611694
episode 790.000000, reward total was -19.000000. running mean: -18.615577
episode 791.000000, reward total was -18.000000. running mean: -18.609422
episode 792.000000, reward total was -19.000000. running mean: -18.613327
episode 793.000000, reward total was -

episode 891.000000, reward total was -19.000000. running mean: -18.340988
episode 892.000000, reward total was -17.000000. running mean: -18.327578
episode 893.000000, reward total was -17.000000. running mean: -18.314302
episode 894.000000, reward total was -12.000000. running mean: -18.251159
episode 895.000000, reward total was -17.000000. running mean: -18.238648
episode 896.000000, reward total was -20.000000. running mean: -18.256261
episode 897.000000, reward total was -17.000000. running mean: -18.243699
episode 898.000000, reward total was -20.000000. running mean: -18.261262
episode 899.000000, reward total was -17.000000. running mean: -18.248649
episode 900.000000, reward total was -14.000000. running mean: -18.206163
episode 901.000000, reward total was -20.000000. running mean: -18.224101
episode 902.000000, reward total was -19.000000. running mean: -18.231860
episode 903.000000, reward total was -20.000000. running mean: -18.249541
episode 904.000000, reward total was -

episode 1002.000000, reward total was -21.000000. running mean: -18.217225
episode 1003.000000, reward total was -15.000000. running mean: -18.185053
episode 1004.000000, reward total was -17.000000. running mean: -18.173202
episode 1005.000000, reward total was -18.000000. running mean: -18.171470
episode 1006.000000, reward total was -18.000000. running mean: -18.169755
episode 1007.000000, reward total was -14.000000. running mean: -18.128058
episode 1008.000000, reward total was -20.000000. running mean: -18.146777
episode 1009.000000, reward total was -17.000000. running mean: -18.135310
episode 1010.000000, reward total was -19.000000. running mean: -18.143956
episode 1011.000000, reward total was -16.000000. running mean: -18.122517
episode 1012.000000, reward total was -20.000000. running mean: -18.141292
episode 1013.000000, reward total was -17.000000. running mean: -18.129879
episode 1014.000000, reward total was -20.000000. running mean: -18.148580
episode 1015.000000, rewa

episode 1112.000000, reward total was -15.000000. running mean: -17.957587
episode 1113.000000, reward total was -21.000000. running mean: -17.988011
episode 1114.000000, reward total was -18.000000. running mean: -17.988131
episode 1115.000000, reward total was -17.000000. running mean: -17.978249
episode 1116.000000, reward total was -17.000000. running mean: -17.968467
episode 1117.000000, reward total was -19.000000. running mean: -17.978782
episode 1118.000000, reward total was -16.000000. running mean: -17.958994
episode 1119.000000, reward total was -16.000000. running mean: -17.939404
episode 1120.000000, reward total was -18.000000. running mean: -17.940010
episode 1121.000000, reward total was -15.000000. running mean: -17.910610
episode 1122.000000, reward total was -15.000000. running mean: -17.881504
episode 1123.000000, reward total was -17.000000. running mean: -17.872689
episode 1124.000000, reward total was -19.000000. running mean: -17.883962
episode 1125.000000, rewa

episode 1222.000000, reward total was -19.000000. running mean: -17.518637
episode 1223.000000, reward total was -19.000000. running mean: -17.533450
episode 1224.000000, reward total was -18.000000. running mean: -17.538116
episode 1225.000000, reward total was -18.000000. running mean: -17.542735
episode 1226.000000, reward total was -18.000000. running mean: -17.547307
episode 1227.000000, reward total was -19.000000. running mean: -17.561834
episode 1228.000000, reward total was -14.000000. running mean: -17.526216
episode 1229.000000, reward total was -14.000000. running mean: -17.490954
episode 1230.000000, reward total was -15.000000. running mean: -17.466044
episode 1231.000000, reward total was -16.000000. running mean: -17.451384
episode 1232.000000, reward total was -21.000000. running mean: -17.486870
episode 1233.000000, reward total was -15.000000. running mean: -17.462001
episode 1234.000000, reward total was -18.000000. running mean: -17.467381
episode 1235.000000, rewa

episode 1332.000000, reward total was -18.000000. running mean: -17.213726
episode 1333.000000, reward total was -14.000000. running mean: -17.181589
episode 1334.000000, reward total was -16.000000. running mean: -17.169773
episode 1335.000000, reward total was -15.000000. running mean: -17.148075
episode 1336.000000, reward total was -20.000000. running mean: -17.176595
episode 1337.000000, reward total was -19.000000. running mean: -17.194829
episode 1338.000000, reward total was -20.000000. running mean: -17.222880
episode 1339.000000, reward total was -16.000000. running mean: -17.210652
episode 1340.000000, reward total was -15.000000. running mean: -17.188545
episode 1341.000000, reward total was -17.000000. running mean: -17.186660
episode 1342.000000, reward total was -12.000000. running mean: -17.134793
episode 1343.000000, reward total was -17.000000. running mean: -17.133445
episode 1344.000000, reward total was -20.000000. running mean: -17.162111
episode 1345.000000, rewa

episode 1442.000000, reward total was -16.000000. running mean: -16.583600
episode 1443.000000, reward total was -21.000000. running mean: -16.627764
episode 1444.000000, reward total was -14.000000. running mean: -16.601487
episode 1445.000000, reward total was -14.000000. running mean: -16.575472
episode 1446.000000, reward total was -17.000000. running mean: -16.579717
episode 1447.000000, reward total was -12.000000. running mean: -16.533920
episode 1448.000000, reward total was -16.000000. running mean: -16.528581
episode 1449.000000, reward total was -12.000000. running mean: -16.483295
episode 1450.000000, reward total was -15.000000. running mean: -16.468462
episode 1451.000000, reward total was -19.000000. running mean: -16.493777
episode 1452.000000, reward total was -17.000000. running mean: -16.498839
episode 1453.000000, reward total was -17.000000. running mean: -16.503851
episode 1454.000000, reward total was -17.000000. running mean: -16.508813
episode 1455.000000, rewa

episode 1552.000000, reward total was -19.000000. running mean: -16.008855
episode 1553.000000, reward total was -15.000000. running mean: -15.998766
episode 1554.000000, reward total was -17.000000. running mean: -16.008778
episode 1555.000000, reward total was -13.000000. running mean: -15.978691
episode 1556.000000, reward total was -13.000000. running mean: -15.948904
episode 1557.000000, reward total was -17.000000. running mean: -15.959415
episode 1558.000000, reward total was -16.000000. running mean: -15.959820
episode 1559.000000, reward total was -18.000000. running mean: -15.980222
episode 1560.000000, reward total was -20.000000. running mean: -16.020420
episode 1561.000000, reward total was -15.000000. running mean: -16.010216
episode 1562.000000, reward total was -17.000000. running mean: -16.020114
episode 1563.000000, reward total was -18.000000. running mean: -16.039913
episode 1564.000000, reward total was -19.000000. running mean: -16.069513
episode 1565.000000, rewa

episode 1662.000000, reward total was -17.000000. running mean: -16.023904
episode 1663.000000, reward total was -13.000000. running mean: -15.993665
episode 1664.000000, reward total was -16.000000. running mean: -15.993729
episode 1665.000000, reward total was -16.000000. running mean: -15.993791
episode 1666.000000, reward total was -16.000000. running mean: -15.993854
episode 1667.000000, reward total was -14.000000. running mean: -15.973915
episode 1668.000000, reward total was -13.000000. running mean: -15.944176
episode 1669.000000, reward total was -14.000000. running mean: -15.924734
episode 1670.000000, reward total was -15.000000. running mean: -15.915487
episode 1671.000000, reward total was -19.000000. running mean: -15.946332
episode 1672.000000, reward total was -14.000000. running mean: -15.926869
episode 1673.000000, reward total was -15.000000. running mean: -15.917600
episode 1674.000000, reward total was -19.000000. running mean: -15.948424
episode 1675.000000, rewa

episode 1772.000000, reward total was -16.000000. running mean: -15.513540
episode 1773.000000, reward total was -9.000000. running mean: -15.448405
episode 1774.000000, reward total was -16.000000. running mean: -15.453920
episode 1775.000000, reward total was -15.000000. running mean: -15.449381
episode 1776.000000, reward total was -8.000000. running mean: -15.374887
episode 1777.000000, reward total was -13.000000. running mean: -15.351139
episode 1778.000000, reward total was -20.000000. running mean: -15.397627
episode 1779.000000, reward total was -14.000000. running mean: -15.383651
episode 1780.000000, reward total was -16.000000. running mean: -15.389814
episode 1781.000000, reward total was -9.000000. running mean: -15.325916
episode 1782.000000, reward total was -18.000000. running mean: -15.352657
episode 1783.000000, reward total was -18.000000. running mean: -15.379131
episode 1784.000000, reward total was -16.000000. running mean: -15.385339
episode 1785.000000, reward 

episode 1882.000000, reward total was -15.000000. running mean: -14.765127
episode 1883.000000, reward total was -12.000000. running mean: -14.737475
episode 1884.000000, reward total was -16.000000. running mean: -14.750101
episode 1885.000000, reward total was -12.000000. running mean: -14.722600
episode 1886.000000, reward total was -17.000000. running mean: -14.745374
episode 1887.000000, reward total was -15.000000. running mean: -14.747920
episode 1888.000000, reward total was -13.000000. running mean: -14.730441
episode 1889.000000, reward total was -11.000000. running mean: -14.693136
episode 1890.000000, reward total was -17.000000. running mean: -14.716205
episode 1891.000000, reward total was -13.000000. running mean: -14.699043
episode 1892.000000, reward total was -11.000000. running mean: -14.662052
episode 1893.000000, reward total was -14.000000. running mean: -14.655432
episode 1894.000000, reward total was -18.000000. running mean: -14.688878
episode 1895.000000, rewa

episode 1992.000000, reward total was -13.000000. running mean: -14.467348
episode 1993.000000, reward total was -18.000000. running mean: -14.502675
episode 1994.000000, reward total was -17.000000. running mean: -14.527648
episode 1995.000000, reward total was -17.000000. running mean: -14.552372
episode 1996.000000, reward total was -11.000000. running mean: -14.516848
episode 1997.000000, reward total was -15.000000. running mean: -14.521679
episode 1998.000000, reward total was -17.000000. running mean: -14.546463
episode 1999.000000, reward total was -8.000000. running mean: -14.480998
episode 2000.000000, reward total was -16.000000. running mean: -14.496188
episode 2001.000000, reward total was -14.000000. running mean: -14.491226
episode 2002.000000, reward total was -12.000000. running mean: -14.466314
episode 2003.000000, reward total was -14.000000. running mean: -14.461651
episode 2004.000000, reward total was -13.000000. running mean: -14.447034
episode 2005.000000, rewar

episode 2102.000000, reward total was -17.000000. running mean: -13.952190
episode 2103.000000, reward total was -17.000000. running mean: -13.982668
episode 2104.000000, reward total was -5.000000. running mean: -13.892841
episode 2105.000000, reward total was -14.000000. running mean: -13.893913
episode 2106.000000, reward total was -17.000000. running mean: -13.924973
episode 2107.000000, reward total was -13.000000. running mean: -13.915724
episode 2108.000000, reward total was -15.000000. running mean: -13.926567
episode 2109.000000, reward total was -12.000000. running mean: -13.907301
episode 2110.000000, reward total was -11.000000. running mean: -13.878228
episode 2111.000000, reward total was -14.000000. running mean: -13.879446
episode 2112.000000, reward total was -7.000000. running mean: -13.810651
episode 2113.000000, reward total was -12.000000. running mean: -13.792545
episode 2114.000000, reward total was -12.000000. running mean: -13.774619
episode 2115.000000, reward

episode 2212.000000, reward total was -8.000000. running mean: -13.566407
episode 2213.000000, reward total was -19.000000. running mean: -13.620743
episode 2214.000000, reward total was -13.000000. running mean: -13.614535
episode 2215.000000, reward total was -14.000000. running mean: -13.618390
episode 2216.000000, reward total was -11.000000. running mean: -13.592206
episode 2217.000000, reward total was -13.000000. running mean: -13.586284
episode 2218.000000, reward total was -16.000000. running mean: -13.610421
episode 2219.000000, reward total was -12.000000. running mean: -13.594317
episode 2220.000000, reward total was -13.000000. running mean: -13.588374
episode 2221.000000, reward total was -9.000000. running mean: -13.542490
episode 2222.000000, reward total was -18.000000. running mean: -13.587065
episode 2223.000000, reward total was -13.000000. running mean: -13.581194
episode 2224.000000, reward total was -12.000000. running mean: -13.565382
episode 2225.000000, reward

episode 2322.000000, reward total was -13.000000. running mean: -13.275009
episode 2323.000000, reward total was -15.000000. running mean: -13.292259
episode 2324.000000, reward total was -15.000000. running mean: -13.309336
episode 2325.000000, reward total was -10.000000. running mean: -13.276243
episode 2326.000000, reward total was -12.000000. running mean: -13.263481
episode 2327.000000, reward total was -14.000000. running mean: -13.270846
episode 2328.000000, reward total was -16.000000. running mean: -13.298137
episode 2329.000000, reward total was -13.000000. running mean: -13.295156
episode 2330.000000, reward total was -11.000000. running mean: -13.272204
episode 2331.000000, reward total was -12.000000. running mean: -13.259482
episode 2332.000000, reward total was -11.000000. running mean: -13.236888
episode 2333.000000, reward total was -9.000000. running mean: -13.194519
episode 2334.000000, reward total was -17.000000. running mean: -13.232574
episode 2335.000000, rewar

episode 2432.000000, reward total was -13.000000. running mean: -13.299421
episode 2433.000000, reward total was -12.000000. running mean: -13.286427
episode 2434.000000, reward total was -10.000000. running mean: -13.253563
episode 2435.000000, reward total was -15.000000. running mean: -13.271027
episode 2436.000000, reward total was -6.000000. running mean: -13.198317
episode 2437.000000, reward total was -9.000000. running mean: -13.156334
episode 2438.000000, reward total was -16.000000. running mean: -13.184770
episode 2439.000000, reward total was -18.000000. running mean: -13.232922
episode 2440.000000, reward total was -16.000000. running mean: -13.260593
episode 2441.000000, reward total was -11.000000. running mean: -13.237987
episode 2442.000000, reward total was -13.000000. running mean: -13.235607
episode 2443.000000, reward total was -12.000000. running mean: -13.223251
episode 2444.000000, reward total was -13.000000. running mean: -13.221019
episode 2445.000000, reward

episode 2542.000000, reward total was -16.000000. running mean: -13.024603
episode 2543.000000, reward total was -14.000000. running mean: -13.034357
episode 2544.000000, reward total was -11.000000. running mean: -13.014014
episode 2545.000000, reward total was -17.000000. running mean: -13.053873
episode 2546.000000, reward total was -9.000000. running mean: -13.013335
episode 2547.000000, reward total was -6.000000. running mean: -12.943201
episode 2548.000000, reward total was -13.000000. running mean: -12.943769
episode 2549.000000, reward total was -16.000000. running mean: -12.974332
episode 2550.000000, reward total was -11.000000. running mean: -12.954588
episode 2551.000000, reward total was -13.000000. running mean: -12.955042
episode 2552.000000, reward total was -11.000000. running mean: -12.935492
episode 2553.000000, reward total was -13.000000. running mean: -12.936137
episode 2554.000000, reward total was -15.000000. running mean: -12.956776
episode 2555.000000, reward

episode 2652.000000, reward total was -16.000000. running mean: -12.709946
episode 2653.000000, reward total was -14.000000. running mean: -12.722847
episode 2654.000000, reward total was -15.000000. running mean: -12.745619
episode 2655.000000, reward total was -15.000000. running mean: -12.768162
episode 2656.000000, reward total was -18.000000. running mean: -12.820481
episode 2657.000000, reward total was -13.000000. running mean: -12.822276
episode 2658.000000, reward total was -12.000000. running mean: -12.814053
episode 2659.000000, reward total was -19.000000. running mean: -12.875913
episode 2660.000000, reward total was -4.000000. running mean: -12.787154
episode 2661.000000, reward total was -10.000000. running mean: -12.759282
episode 2662.000000, reward total was -9.000000. running mean: -12.721689
episode 2663.000000, reward total was -16.000000. running mean: -12.754472
episode 2664.000000, reward total was -10.000000. running mean: -12.726928
episode 2665.000000, reward

episode 2762.000000, reward total was -17.000000. running mean: -12.893806
episode 2763.000000, reward total was -12.000000. running mean: -12.884868
episode 2764.000000, reward total was -8.000000. running mean: -12.836020
episode 2765.000000, reward total was -17.000000. running mean: -12.877659
episode 2766.000000, reward total was -15.000000. running mean: -12.898883
episode 2767.000000, reward total was -15.000000. running mean: -12.919894
episode 2768.000000, reward total was -12.000000. running mean: -12.910695
episode 2769.000000, reward total was -11.000000. running mean: -12.891588
episode 2770.000000, reward total was -12.000000. running mean: -12.882672
episode 2771.000000, reward total was -16.000000. running mean: -12.913845
episode 2772.000000, reward total was -15.000000. running mean: -12.934707
episode 2773.000000, reward total was -15.000000. running mean: -12.955360
episode 2774.000000, reward total was -16.000000. running mean: -12.985806
episode 2775.000000, rewar

episode 2872.000000, reward total was -14.000000. running mean: -12.444790
episode 2873.000000, reward total was -12.000000. running mean: -12.440342
episode 2874.000000, reward total was -12.000000. running mean: -12.435938
episode 2875.000000, reward total was -7.000000. running mean: -12.381579
episode 2876.000000, reward total was -10.000000. running mean: -12.357763
episode 2877.000000, reward total was -12.000000. running mean: -12.354185
episode 2878.000000, reward total was -10.000000. running mean: -12.330644
episode 2879.000000, reward total was -18.000000. running mean: -12.387337
episode 2880.000000, reward total was -6.000000. running mean: -12.323464
episode 2881.000000, reward total was -8.000000. running mean: -12.280229
episode 2882.000000, reward total was -15.000000. running mean: -12.307427
episode 2883.000000, reward total was -11.000000. running mean: -12.294353
episode 2884.000000, reward total was -15.000000. running mean: -12.321409
episode 2885.000000, reward 

episode 2982.000000, reward total was -15.000000. running mean: -12.631347
episode 2983.000000, reward total was -16.000000. running mean: -12.665033
episode 2984.000000, reward total was -12.000000. running mean: -12.658383
episode 2985.000000, reward total was -16.000000. running mean: -12.691799
episode 2986.000000, reward total was -10.000000. running mean: -12.664881
episode 2987.000000, reward total was -16.000000. running mean: -12.698232
episode 2988.000000, reward total was -9.000000. running mean: -12.661250
episode 2989.000000, reward total was -14.000000. running mean: -12.674637
episode 2990.000000, reward total was -12.000000. running mean: -12.667891
episode 2991.000000, reward total was -12.000000. running mean: -12.661212
episode 2992.000000, reward total was -9.000000. running mean: -12.624600
episode 2993.000000, reward total was -13.000000. running mean: -12.628354
episode 2994.000000, reward total was -9.000000. running mean: -12.592070
episode 2995.000000, reward 

episode 3092.000000, reward total was -16.000000. running mean: -12.640429
episode 3093.000000, reward total was -9.000000. running mean: -12.604024
episode 3094.000000, reward total was -11.000000. running mean: -12.587984
episode 3095.000000, reward total was -12.000000. running mean: -12.582104
episode 3096.000000, reward total was -9.000000. running mean: -12.546283
episode 3097.000000, reward total was -12.000000. running mean: -12.540820
episode 3098.000000, reward total was -12.000000. running mean: -12.535412
episode 3099.000000, reward total was -15.000000. running mean: -12.560058
episode 3100.000000, reward total was -12.000000. running mean: -12.554458
episode 3101.000000, reward total was -15.000000. running mean: -12.578913
episode 3102.000000, reward total was -14.000000. running mean: -12.593124
episode 3103.000000, reward total was -11.000000. running mean: -12.577193
episode 3104.000000, reward total was -7.000000. running mean: -12.521421
episode 3105.000000, reward 

episode 3202.000000, reward total was -11.000000. running mean: -12.647916
episode 3203.000000, reward total was -17.000000. running mean: -12.691437
episode 3204.000000, reward total was -11.000000. running mean: -12.674523
episode 3205.000000, reward total was -7.000000. running mean: -12.617778
episode 3206.000000, reward total was -13.000000. running mean: -12.621600
episode 3207.000000, reward total was -9.000000. running mean: -12.585384
episode 3208.000000, reward total was -13.000000. running mean: -12.589530
episode 3209.000000, reward total was -13.000000. running mean: -12.593635
episode 3210.000000, reward total was -15.000000. running mean: -12.617698
episode 3211.000000, reward total was -14.000000. running mean: -12.631521
episode 3212.000000, reward total was -7.000000. running mean: -12.575206
episode 3213.000000, reward total was -8.000000. running mean: -12.529454
episode 3214.000000, reward total was -14.000000. running mean: -12.544160
episode 3215.000000, reward t

episode 3312.000000, reward total was -16.000000. running mean: -11.917965
episode 3313.000000, reward total was -13.000000. running mean: -11.928786
episode 3314.000000, reward total was -14.000000. running mean: -11.949498
episode 3315.000000, reward total was -19.000000. running mean: -12.020003
episode 3316.000000, reward total was -16.000000. running mean: -12.059803
episode 3317.000000, reward total was -14.000000. running mean: -12.079205
episode 3318.000000, reward total was -11.000000. running mean: -12.068413
episode 3319.000000, reward total was -8.000000. running mean: -12.027729
episode 3320.000000, reward total was -17.000000. running mean: -12.077451
episode 3321.000000, reward total was -16.000000. running mean: -12.116677
episode 3322.000000, reward total was -9.000000. running mean: -12.085510
episode 3323.000000, reward total was -10.000000. running mean: -12.064655
episode 3324.000000, reward total was -9.000000. running mean: -12.034008
episode 3325.000000, reward 

episode 3422.000000, reward total was -12.000000. running mean: -11.658607
episode 3423.000000, reward total was -16.000000. running mean: -11.702021
episode 3424.000000, reward total was -13.000000. running mean: -11.715000
episode 3425.000000, reward total was -8.000000. running mean: -11.677850
episode 3426.000000, reward total was -14.000000. running mean: -11.701072
episode 3427.000000, reward total was -11.000000. running mean: -11.694061
episode 3428.000000, reward total was -7.000000. running mean: -11.647121
episode 3429.000000, reward total was -10.000000. running mean: -11.630649
episode 3430.000000, reward total was -13.000000. running mean: -11.644343
episode 3431.000000, reward total was -13.000000. running mean: -11.657899
episode 3432.000000, reward total was -15.000000. running mean: -11.691320
episode 3433.000000, reward total was -12.000000. running mean: -11.694407
episode 3434.000000, reward total was -12.000000. running mean: -11.697463
episode 3435.000000, reward

episode 3532.000000, reward total was -12.000000. running mean: -11.764615
episode 3533.000000, reward total was -14.000000. running mean: -11.786969
episode 3534.000000, reward total was -15.000000. running mean: -11.819099
episode 3535.000000, reward total was -10.000000. running mean: -11.800908
episode 3536.000000, reward total was -14.000000. running mean: -11.822899
episode 3537.000000, reward total was -17.000000. running mean: -11.874670
episode 3538.000000, reward total was -11.000000. running mean: -11.865923
episode 3539.000000, reward total was -13.000000. running mean: -11.877264
episode 3540.000000, reward total was -15.000000. running mean: -11.908491
episode 3541.000000, reward total was -10.000000. running mean: -11.889406
episode 3542.000000, reward total was -7.000000. running mean: -11.840512
episode 3543.000000, reward total was -14.000000. running mean: -11.862107
episode 3544.000000, reward total was -3.000000. running mean: -11.773486
episode 3545.000000, reward

episode 3642.000000, reward total was -12.000000. running mean: -11.533925
episode 3643.000000, reward total was -14.000000. running mean: -11.558586
episode 3644.000000, reward total was -9.000000. running mean: -11.533000
episode 3645.000000, reward total was -15.000000. running mean: -11.567670
episode 3646.000000, reward total was -8.000000. running mean: -11.531993
episode 3647.000000, reward total was -11.000000. running mean: -11.526673
episode 3648.000000, reward total was -15.000000. running mean: -11.561407
episode 3649.000000, reward total was -11.000000. running mean: -11.555793
episode 3650.000000, reward total was -12.000000. running mean: -11.560235
episode 3651.000000, reward total was -11.000000. running mean: -11.554632
episode 3652.000000, reward total was -10.000000. running mean: -11.539086
episode 3653.000000, reward total was -3.000000. running mean: -11.453695
episode 3654.000000, reward total was -10.000000. running mean: -11.439158
episode 3655.000000, reward 

episode 3752.000000, reward total was -10.000000. running mean: -11.301427
episode 3753.000000, reward total was -13.000000. running mean: -11.318413
episode 3754.000000, reward total was -10.000000. running mean: -11.305229
episode 3755.000000, reward total was -14.000000. running mean: -11.332176
episode 3756.000000, reward total was -9.000000. running mean: -11.308854
episode 3757.000000, reward total was -10.000000. running mean: -11.295766
episode 3758.000000, reward total was -9.000000. running mean: -11.272808
episode 3759.000000, reward total was -12.000000. running mean: -11.280080
episode 3760.000000, reward total was -17.000000. running mean: -11.337279
episode 3761.000000, reward total was -16.000000. running mean: -11.383907
episode 3762.000000, reward total was -13.000000. running mean: -11.400068
episode 3763.000000, reward total was -9.000000. running mean: -11.376067
episode 3764.000000, reward total was -13.000000. running mean: -11.392306
episode 3765.000000, reward 

episode 3862.000000, reward total was -6.000000. running mean: -11.129750
episode 3863.000000, reward total was -9.000000. running mean: -11.108453
episode 3864.000000, reward total was -12.000000. running mean: -11.117368
episode 3865.000000, reward total was -11.000000. running mean: -11.116194
episode 3866.000000, reward total was -9.000000. running mean: -11.095032
episode 3867.000000, reward total was -12.000000. running mean: -11.104082
episode 3868.000000, reward total was -17.000000. running mean: -11.163041
episode 3869.000000, reward total was -10.000000. running mean: -11.151411
episode 3870.000000, reward total was -10.000000. running mean: -11.139897
episode 3871.000000, reward total was -13.000000. running mean: -11.158498
episode 3872.000000, reward total was -9.000000. running mean: -11.136913
episode 3873.000000, reward total was -8.000000. running mean: -11.105544
episode 3874.000000, reward total was -8.000000. running mean: -11.074488
episode 3875.000000, reward tot

episode 3972.000000, reward total was -13.000000. running mean: -10.293454
episode 3973.000000, reward total was -9.000000. running mean: -10.280520
episode 3974.000000, reward total was -16.000000. running mean: -10.337714
episode 3975.000000, reward total was -11.000000. running mean: -10.344337
episode 3976.000000, reward total was -13.000000. running mean: -10.370894
episode 3977.000000, reward total was -13.000000. running mean: -10.397185
episode 3978.000000, reward total was -9.000000. running mean: -10.383213
episode 3979.000000, reward total was -9.000000. running mean: -10.369381
episode 3980.000000, reward total was -14.000000. running mean: -10.405687
episode 3981.000000, reward total was -8.000000. running mean: -10.381630
episode 3982.000000, reward total was -9.000000. running mean: -10.367814
episode 3983.000000, reward total was -9.000000. running mean: -10.354136
episode 3984.000000, reward total was -10.000000. running mean: -10.350594
episode 3985.000000, reward tot

episode 4082.000000, reward total was -13.000000. running mean: -10.230252
episode 4083.000000, reward total was -9.000000. running mean: -10.217949
episode 4084.000000, reward total was -13.000000. running mean: -10.245770
episode 4085.000000, reward total was -3.000000. running mean: -10.173312
episode 4086.000000, reward total was -12.000000. running mean: -10.191579
episode 4087.000000, reward total was -13.000000. running mean: -10.219663
episode 4088.000000, reward total was -14.000000. running mean: -10.257466
episode 4089.000000, reward total was -10.000000. running mean: -10.254892
episode 4090.000000, reward total was -11.000000. running mean: -10.262343
episode 4091.000000, reward total was -14.000000. running mean: -10.299719
episode 4092.000000, reward total was -12.000000. running mean: -10.316722
episode 4093.000000, reward total was -13.000000. running mean: -10.343555
episode 4094.000000, reward total was -13.000000. running mean: -10.370119
episode 4095.000000, reward

episode 4192.000000, reward total was -9.000000. running mean: -10.642170
episode 4193.000000, reward total was -8.000000. running mean: -10.615749
episode 4194.000000, reward total was -14.000000. running mean: -10.649591
episode 4195.000000, reward total was -15.000000. running mean: -10.693095
episode 4196.000000, reward total was -7.000000. running mean: -10.656164
episode 4197.000000, reward total was -14.000000. running mean: -10.689603
episode 4198.000000, reward total was -13.000000. running mean: -10.712707
episode 4199.000000, reward total was -8.000000. running mean: -10.685580
episode 4200.000000, reward total was -9.000000. running mean: -10.668724
episode 4201.000000, reward total was -9.000000. running mean: -10.652037
episode 4202.000000, reward total was -11.000000. running mean: -10.655516
episode 4203.000000, reward total was -8.000000. running mean: -10.628961
episode 4204.000000, reward total was -3.000000. running mean: -10.552671
episode 4205.000000, reward total

episode 4302.000000, reward total was -17.000000. running mean: -10.614851
episode 4303.000000, reward total was -9.000000. running mean: -10.598702
episode 4304.000000, reward total was -7.000000. running mean: -10.562715
episode 4305.000000, reward total was -13.000000. running mean: -10.587088
episode 4306.000000, reward total was -9.000000. running mean: -10.571217
episode 4307.000000, reward total was -7.000000. running mean: -10.535505
episode 4308.000000, reward total was -17.000000. running mean: -10.600150
episode 4309.000000, reward total was -14.000000. running mean: -10.634149
episode 4310.000000, reward total was -10.000000. running mean: -10.627807
episode 4311.000000, reward total was -9.000000. running mean: -10.611529
episode 4312.000000, reward total was -5.000000. running mean: -10.555414
episode 4313.000000, reward total was -7.000000. running mean: -10.519860
episode 4314.000000, reward total was -13.000000. running mean: -10.544661
episode 4315.000000, reward tota

episode 4412.000000, reward total was -7.000000. running mean: -10.232458
episode 4413.000000, reward total was -12.000000. running mean: -10.250133
episode 4414.000000, reward total was -15.000000. running mean: -10.297632
episode 4415.000000, reward total was -13.000000. running mean: -10.324655
episode 4416.000000, reward total was -14.000000. running mean: -10.361409
episode 4417.000000, reward total was -16.000000. running mean: -10.417795
episode 4418.000000, reward total was -9.000000. running mean: -10.403617
episode 4419.000000, reward total was -14.000000. running mean: -10.439581
episode 4420.000000, reward total was -15.000000. running mean: -10.485185
episode 4421.000000, reward total was -9.000000. running mean: -10.470333
episode 4422.000000, reward total was -15.000000. running mean: -10.515630
episode 4423.000000, reward total was -13.000000. running mean: -10.540473
episode 4424.000000, reward total was -9.000000. running mean: -10.525069
episode 4425.000000, reward t

episode 4522.000000, reward total was -12.000000. running mean: -10.518505
episode 4523.000000, reward total was -4.000000. running mean: -10.453320
episode 4524.000000, reward total was -3.000000. running mean: -10.378786
episode 4525.000000, reward total was -9.000000. running mean: -10.364998
episode 4526.000000, reward total was -9.000000. running mean: -10.351348
episode 4527.000000, reward total was -9.000000. running mean: -10.337835
episode 4528.000000, reward total was -14.000000. running mean: -10.374457
episode 4529.000000, reward total was -7.000000. running mean: -10.340712
episode 4530.000000, reward total was -12.000000. running mean: -10.357305
episode 4531.000000, reward total was -6.000000. running mean: -10.313732
episode 4532.000000, reward total was -11.000000. running mean: -10.320595
episode 4533.000000, reward total was -14.000000. running mean: -10.357389
episode 4534.000000, reward total was -15.000000. running mean: -10.403815
episode 4535.000000, reward tota

episode 4632.000000, reward total was -5.000000. running mean: -10.056835
episode 4633.000000, reward total was -1.000000. running mean: -9.966267
episode 4634.000000, reward total was -8.000000. running mean: -9.946604
episode 4635.000000, reward total was -8.000000. running mean: -9.927138
episode 4636.000000, reward total was -7.000000. running mean: -9.897867
episode 4637.000000, reward total was -13.000000. running mean: -9.928888
episode 4638.000000, reward total was -13.000000. running mean: -9.959599
episode 4639.000000, reward total was -8.000000. running mean: -9.940003
episode 4640.000000, reward total was -13.000000. running mean: -9.970603
episode 4641.000000, reward total was -11.000000. running mean: -9.980897
episode 4642.000000, reward total was -13.000000. running mean: -10.011088
episode 4643.000000, reward total was -13.000000. running mean: -10.040977
episode 4644.000000, reward total was -8.000000. running mean: -10.020568
episode 4645.000000, reward total was -12

episode 4743.000000, reward total was -10.000000. running mean: -10.065761
episode 4744.000000, reward total was -9.000000. running mean: -10.055104
episode 4745.000000, reward total was -12.000000. running mean: -10.074553
episode 4746.000000, reward total was -13.000000. running mean: -10.103807
episode 4747.000000, reward total was -10.000000. running mean: -10.102769
episode 4748.000000, reward total was -9.000000. running mean: -10.091741
episode 4749.000000, reward total was -11.000000. running mean: -10.100824
episode 4750.000000, reward total was -14.000000. running mean: -10.139816
episode 4751.000000, reward total was -12.000000. running mean: -10.158417
episode 4752.000000, reward total was -8.000000. running mean: -10.136833
episode 4753.000000, reward total was -11.000000. running mean: -10.145465
episode 4754.000000, reward total was -13.000000. running mean: -10.174010
episode 4755.000000, reward total was -8.000000. running mean: -10.152270
episode 4756.000000, reward t

episode 4854.000000, reward total was -10.000000. running mean: -9.870100
episode 4855.000000, reward total was -12.000000. running mean: -9.891399
episode 4856.000000, reward total was -7.000000. running mean: -9.862485
episode 4857.000000, reward total was -11.000000. running mean: -9.873860
episode 4858.000000, reward total was -14.000000. running mean: -9.915121
episode 4859.000000, reward total was -12.000000. running mean: -9.935970
episode 4860.000000, reward total was -11.000000. running mean: -9.946610
episode 4861.000000, reward total was -6.000000. running mean: -9.907144
episode 4862.000000, reward total was 1.000000. running mean: -9.798073
episode 4863.000000, reward total was -7.000000. running mean: -9.770092
episode 4864.000000, reward total was -8.000000. running mean: -9.752391
episode 4865.000000, reward total was -10.000000. running mean: -9.754867
episode 4866.000000, reward total was -13.000000. running mean: -9.787318
episode 4867.000000, reward total was -14.00

episode 4965.000000, reward total was -15.000000. running mean: -9.836051
episode 4966.000000, reward total was -8.000000. running mean: -9.817691
episode 4967.000000, reward total was -15.000000. running mean: -9.869514
episode 4968.000000, reward total was -12.000000. running mean: -9.890819
episode 4969.000000, reward total was -10.000000. running mean: -9.891911
episode 4970.000000, reward total was -16.000000. running mean: -9.952992
episode 4971.000000, reward total was -10.000000. running mean: -9.953462
episode 4972.000000, reward total was -13.000000. running mean: -9.983927
episode 4973.000000, reward total was -11.000000. running mean: -9.994088
episode 4974.000000, reward total was -10.000000. running mean: -9.994147
episode 4975.000000, reward total was -12.000000. running mean: -10.014205
episode 4976.000000, reward total was -9.000000. running mean: -10.004063
episode 4977.000000, reward total was -11.000000. running mean: -10.014023
episode 4978.000000, reward total was

episode 5077.000000, reward total was -8.000000. running mean: -9.576414
episode 5078.000000, reward total was -12.000000. running mean: -9.600650
episode 5079.000000, reward total was -8.000000. running mean: -9.584644
episode 5080.000000, reward total was -13.000000. running mean: -9.618797
episode 5081.000000, reward total was -13.000000. running mean: -9.652609
episode 5082.000000, reward total was -13.000000. running mean: -9.686083
episode 5083.000000, reward total was -9.000000. running mean: -9.679222
episode 5084.000000, reward total was -15.000000. running mean: -9.732430
episode 5085.000000, reward total was -9.000000. running mean: -9.725106
episode 5086.000000, reward total was -11.000000. running mean: -9.737855
episode 5087.000000, reward total was -6.000000. running mean: -9.700476
episode 5088.000000, reward total was -6.000000. running mean: -9.663471
episode 5089.000000, reward total was -8.000000. running mean: -9.646837
episode 5090.000000, reward total was -5.0000

episode 5189.000000, reward total was -12.000000. running mean: -9.703665
episode 5190.000000, reward total was -11.000000. running mean: -9.716628
episode 5191.000000, reward total was -13.000000. running mean: -9.749462
episode 5192.000000, reward total was -9.000000. running mean: -9.741967
episode 5193.000000, reward total was -10.000000. running mean: -9.744548
episode 5194.000000, reward total was -7.000000. running mean: -9.717102
episode 5195.000000, reward total was -11.000000. running mean: -9.729931
episode 5196.000000, reward total was -10.000000. running mean: -9.732632
episode 5197.000000, reward total was -8.000000. running mean: -9.715305
episode 5198.000000, reward total was -15.000000. running mean: -9.768152
episode 5199.000000, reward total was -12.000000. running mean: -9.790471
episode 5200.000000, reward total was -9.000000. running mean: -9.782566
episode 5201.000000, reward total was -17.000000. running mean: -9.854740
episode 5202.000000, reward total was -7.0

episode 5301.000000, reward total was -5.000000. running mean: -9.705876
episode 5302.000000, reward total was -13.000000. running mean: -9.738817
episode 5303.000000, reward total was -13.000000. running mean: -9.771429
episode 5304.000000, reward total was -5.000000. running mean: -9.723714
episode 5305.000000, reward total was -5.000000. running mean: -9.676477
episode 5306.000000, reward total was -5.000000. running mean: -9.629713
episode 5307.000000, reward total was -6.000000. running mean: -9.593415
episode 5308.000000, reward total was -7.000000. running mean: -9.567481
episode 5309.000000, reward total was -8.000000. running mean: -9.551806
episode 5310.000000, reward total was -4.000000. running mean: -9.496288
episode 5311.000000, reward total was -11.000000. running mean: -9.511325
episode 5312.000000, reward total was -3.000000. running mean: -9.446212
episode 5313.000000, reward total was -7.000000. running mean: -9.421750
episode 5314.000000, reward total was -6.000000.

episode 5413.000000, reward total was -7.000000. running mean: -9.222944
episode 5414.000000, reward total was -7.000000. running mean: -9.200714
episode 5415.000000, reward total was -9.000000. running mean: -9.198707
episode 5416.000000, reward total was -14.000000. running mean: -9.246720
episode 5417.000000, reward total was -10.000000. running mean: -9.254253
episode 5418.000000, reward total was -5.000000. running mean: -9.211710
episode 5419.000000, reward total was -9.000000. running mean: -9.209593
episode 5420.000000, reward total was -11.000000. running mean: -9.227497
episode 5421.000000, reward total was -14.000000. running mean: -9.275222
episode 5422.000000, reward total was -9.000000. running mean: -9.272470
episode 5423.000000, reward total was -9.000000. running mean: -9.269745
episode 5424.000000, reward total was -12.000000. running mean: -9.297048
episode 5425.000000, reward total was -7.000000. running mean: -9.274077
episode 5426.000000, reward total was -9.00000

episode 5525.000000, reward total was -12.000000. running mean: -8.978535
episode 5526.000000, reward total was -10.000000. running mean: -8.988750
episode 5527.000000, reward total was -13.000000. running mean: -9.028862
episode 5528.000000, reward total was -7.000000. running mean: -9.008574
episode 5529.000000, reward total was -13.000000. running mean: -9.048488
episode 5530.000000, reward total was -16.000000. running mean: -9.118003
episode 5531.000000, reward total was -13.000000. running mean: -9.156823
episode 5532.000000, reward total was -11.000000. running mean: -9.175255
episode 5533.000000, reward total was -13.000000. running mean: -9.213502
episode 5534.000000, reward total was -8.000000. running mean: -9.201367
episode 5535.000000, reward total was -16.000000. running mean: -9.269353
episode 5536.000000, reward total was -11.000000. running mean: -9.286660
episode 5537.000000, reward total was -7.000000. running mean: -9.263793
episode 5538.000000, reward total was -7.

episode 5637.000000, reward total was -7.000000. running mean: -9.527363
episode 5638.000000, reward total was -10.000000. running mean: -9.532090
episode 5639.000000, reward total was -6.000000. running mean: -9.496769
episode 5640.000000, reward total was -10.000000. running mean: -9.501801
episode 5641.000000, reward total was -7.000000. running mean: -9.476783
episode 5642.000000, reward total was -12.000000. running mean: -9.502015
episode 5643.000000, reward total was -7.000000. running mean: -9.476995
episode 5644.000000, reward total was -10.000000. running mean: -9.482225
episode 5645.000000, reward total was -8.000000. running mean: -9.467403
episode 5646.000000, reward total was -12.000000. running mean: -9.492729
episode 5647.000000, reward total was -14.000000. running mean: -9.537802
episode 5648.000000, reward total was -15.000000. running mean: -9.592424
episode 5649.000000, reward total was -13.000000. running mean: -9.626499
episode 5650.000000, reward total was -7.00

episode 5749.000000, reward total was -6.000000. running mean: -8.926039
episode 5750.000000, reward total was -12.000000. running mean: -8.956779
episode 5751.000000, reward total was -5.000000. running mean: -8.917211
episode 5752.000000, reward total was -6.000000. running mean: -8.888039
episode 5753.000000, reward total was -10.000000. running mean: -8.899159
episode 5754.000000, reward total was -12.000000. running mean: -8.930167
episode 5755.000000, reward total was -17.000000. running mean: -9.010865
episode 5756.000000, reward total was -5.000000. running mean: -8.970757
episode 5757.000000, reward total was -13.000000. running mean: -9.011049
episode 5758.000000, reward total was -9.000000. running mean: -9.010939
episode 5759.000000, reward total was -5.000000. running mean: -8.970829
episode 5760.000000, reward total was -13.000000. running mean: -9.011121
episode 5761.000000, reward total was -13.000000. running mean: -9.051010
episode 5762.000000, reward total was -11.00

episode 5861.000000, reward total was -10.000000. running mean: -8.742926
episode 5862.000000, reward total was -7.000000. running mean: -8.725497
episode 5863.000000, reward total was -15.000000. running mean: -8.788242
episode 5864.000000, reward total was -13.000000. running mean: -8.830360
episode 5865.000000, reward total was -14.000000. running mean: -8.882056
episode 5866.000000, reward total was -12.000000. running mean: -8.913235
episode 5867.000000, reward total was -10.000000. running mean: -8.924103
episode 5868.000000, reward total was -4.000000. running mean: -8.874862
episode 5869.000000, reward total was -13.000000. running mean: -8.916113
episode 5870.000000, reward total was -7.000000. running mean: -8.896952
episode 5871.000000, reward total was -11.000000. running mean: -8.917983
episode 5872.000000, reward total was -5.000000. running mean: -8.878803
episode 5873.000000, reward total was -14.000000. running mean: -8.930015
episode 5874.000000, reward total was -13.

episode 5973.000000, reward total was -12.000000. running mean: -8.967271
episode 5974.000000, reward total was -7.000000. running mean: -8.947599
episode 5975.000000, reward total was -7.000000. running mean: -8.928123
episode 5976.000000, reward total was -4.000000. running mean: -8.878841
episode 5977.000000, reward total was -7.000000. running mean: -8.860053
episode 5978.000000, reward total was -7.000000. running mean: -8.841452
episode 5979.000000, reward total was -7.000000. running mean: -8.823038
episode 5980.000000, reward total was -13.000000. running mean: -8.864808
episode 5981.000000, reward total was -9.000000. running mean: -8.866159
episode 5982.000000, reward total was -10.000000. running mean: -8.877498
episode 5983.000000, reward total was -12.000000. running mean: -8.908723
episode 5984.000000, reward total was -16.000000. running mean: -8.979636
episode 5985.000000, reward total was -9.000000. running mean: -8.979839
episode 5986.000000, reward total was -9.00000

In [5]:
# Call the play_game helper with the gym environment (`env`) and the policy
# weight dict (`model`, holding 'W1' and 'W2') created earlier in the notebook.
# NOTE(review): play_game is defined outside this view — presumably it rolls out
# an episode with the current weights and reports the accumulated reward;
# confirm against its definition.
play_game(env, model)

See here for more information: https://www.gymlibrary.ml/content/api/
  deprecation(
  logger.warn(


Episode finished without success, accumulated reward = -2.0
