In [1]:
import gym
import numpy as np
rm='Pong-v0' # Gym environment id; passed to gym.make() when the environment is created

In [2]:
%matplotlib inline
from matplotlib import animation
import matplotlib.pyplot as plt
from IPython.display import display, HTML

def display_frames_as_gif(frames):
    """Show a sequence of image frames as an inline JavaScript animation.

    Args:
        frames: non-empty sequence of image arrays accepted by ``plt.imshow``;
            the first frame's ``shape[0]`` / ``shape[1]`` determine the figure size.

    The animation is rendered with ``to_jshtml()`` and displayed in the
    notebook; nothing is returned.
    """
    first = frames[0]
    height, width = first.shape[0], first.shape[1]

    # Size the figure from the frame dimensions (same 72.0 divisor / dpi as before).
    fig = plt.figure(figsize=(width / 72.0, height / 72.0), dpi=144)
    image = plt.imshow(first)
    plt.axis('off')

    def show_frame(index):
        # Swap the pixel data of the single image artist for frame `index`.
        image.set_data(frames[index])

    movie = animation.FuncAnimation(fig, show_frame, frames=len(frames), interval=50)
    # Close the figure so only the HTML animation is shown, not a static plot.
    plt.close(fig)
    display(HTML(movie.to_jshtml()))

from gym.wrappers import AtariPreprocessing # NOTE(review): not referenced anywhere in the visible part of this notebook
# NOTE(review): this only assigns an attribute on the imported gym module; whether
# gym reads it anywhere is not visible here - confirm it has the intended effect.
gym.new_step_api=True
env = gym.make(rm) # environment instance handed to train_model in a later cell

# --- network dimensions ---
H = 200      # hidden layer width
D = 80 * 80  # length of the flattened 80x80 input vector

# --- policy network weights, scaled by 1/sqrt(fan-in) ("Xavier" initialization) ---
# W1 is drawn before W2, so the random stream is consumed in the same order as before.
model = {
  'W1': np.random.randn(H, D) / np.sqrt(D),
  'W2': np.random.randn(H) / np.sqrt(H),
}

# --- hyperparameters ---
batch_size = 10       # number of episodes between parameter updates
learning_rate = 1e-3
gamma = 0.99          # reward discount factor
decay_rate = 0.99     # decay of the RMSProp running average of grad^2

# --- optimizer state: one zero array per weight matrix ---
grad_buffer = {name: np.zeros_like(weights) for name, weights in model.items()}    # gradients summed over a batch
rmsprop_cache = {name: np.zeros_like(weights) for name, weights in model.items()}  # RMSProp memory

def sigmoid(x):
  """Logistic "squashing" function mapping any real input into [0, 1].

  Computed as exp(-log(1 + exp(-x))) via np.logaddexp, so large negative
  inputs do not overflow inside exp() (the naive 1 / (1 + exp(-x)) form
  raises a RuntimeWarning there). Works on scalars and numpy arrays.

  Args:
    x: scalar or numpy array of pre-activation values.

  Returns:
    Value(s) of 1 / (1 + exp(-x)), same shape as x.
  """
  # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)), evaluated stably.
  return np.exp(-np.logaddexp(0.0, -x))

def prepro(I):
  """Turn a raw frame into a flat binary float vector without modifying it.

  Keeps rows 35..194 of the frame, takes every second row and column of
  channel 0, and marks a pixel as 1.0 unless its value is 0, 144 or 109
  (the two background colours), in which case it is 0.0. For a frame that
  is 210 rows by 160 columns this yields 80 * 80 = 6400 values.

  The result is built from a boolean mask instead of assigning into the
  sliced array: slices of `np.asarray(I)` are views, so in-place writes
  would overwrite the caller's observation.

  Args:
    I: array-like frame indexed as [row, column, channel].

  Returns:
    1-D float array of 0.0 / 1.0 values.
  """
  frame = np.asarray(I)
  playfield = frame[35:195:2, ::2, 0] # crop, then downsample by factor of 2 (view only, never written to)
  # foreground = everything that is neither already 0 nor one of the two background values
  foreground = (playfield != 0) & (playfield != 144) & (playfield != 109)
  return foreground.astype(float).ravel()

def discount_rewards(r):
  """Compute discounted returns, walking the reward sequence backwards.

  The running sum is reset whenever a non-zero reward is met (in Pong a
  non-zero reward marks a game boundary), then decayed by the module-level
  `gamma` at every step.

  The output is always floating point: an integer reward array would
  otherwise make the result integer as well and silently truncate the
  fractional discounted values.

  Args:
    r: array-like of per-step rewards, e.g. shape (T,) or (T, 1); elements
       are processed in flattened (row-major) order.

  Returns:
    Float array with the same shape as `r` holding the discounted returns.
  """
  r = np.asarray(r)
  # keep an existing float dtype, promote anything else (ints, bools) to float64
  out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
  discounted_r = np.zeros(r.shape, dtype=out_dtype)
  flat_r = r.reshape(-1)
  flat_out = discounted_r.reshape(-1) # view: writes land in discounted_r
  running_add = 0.0
  for t in reversed(range(0, flat_r.size)):
    if flat_r[t] != 0: running_add = 0.0 # reset the sum, since this was a game boundary (pong specific!)
    running_add = running_add * gamma + flat_r[t]
    flat_out[t] = running_add
  return discounted_r

def policy_forward(x):
  """Forward pass of the two-layer policy network stored in the module-level `model`.

  Args:
    x: input vector multiplied by model['W1'].

  Returns:
    (p, h): p is the sigmoid of the output logit (probability of taking
    action 2), h is the hidden activation after the ReLU.
  """
  weights_in, weights_out = model['W1'], model['W2']
  pre_activation = np.dot(weights_in, x)
  # ReLU: negative entries become 0, everything else passes through unchanged
  h = np.where(pre_activation < 0, 0, pre_activation)
  p = sigmoid(np.dot(weights_out, h))
  return p, h

def policy_backward(epx, eph, epdlogp):
  """Backward pass over one episode; returns gradients for both weight matrices.

  Args:
    epx: stacked inputs of the episode, one row per step.
    eph: stacked hidden activations, one row per step.
    epdlogp: stacked (already advantage-weighted) output gradients, one per step.

  Returns:
    dict with keys 'W1' and 'W2' holding the gradients; W2 is read from the
    module-level `model`.
  """
  grad_w2 = eph.T.dot(epdlogp).ravel()
  # propagate the output gradient back through W2, one row per step
  grad_hidden = np.outer(epdlogp, model['W2'])
  # ReLU backward: no gradient flows where the hidden unit was not active
  grad_hidden = np.where(eph <= 0, 0, grad_hidden)
  grad_w1 = grad_hidden.T.dot(epx)
  return {'W1': grad_w1, 'W2': grad_w2}

def model_step(model, observation, prev_x):
  """Pick the greedy action for one observation using the given model.

  The forward pass is evaluated with the `model` argument itself (ReLU
  hidden layer from model['W1'], sigmoid output from model['W2']), so the
  weights passed in are the ones that decide the action rather than the
  module-level `model`.

  Args:
    model: dict with weight arrays under 'W1' and 'W2'.
    observation: raw frame, preprocessed with prepro().
    prev_x: preprocessed previous frame, or None on the first step.

  Returns:
    (action, cur_x): action is 2 when the output probability is >= 0.5,
    otherwise 3; cur_x is the preprocessed current frame, to be passed
    back as `prev_x` on the next call.
  """
  cur_x = prepro(observation)
  # network input is the difference frame; all zeros when there is no previous frame
  x = cur_x - prev_x if prev_x is not None else np.zeros_like(cur_x)

  # forward the policy network with the weights that were passed in
  hidden = np.maximum(np.dot(model['W1'], x), 0) # ReLU nonlinearity
  aprob = sigmoid(np.dot(model['W2'], hidden))
  action = 2 if aprob >= 0.5 else 3 # deterministic (greedy) choice, no sampling

  return action, cur_x

def play_game(env, model):
  """Play one episode (at most 1000 steps) with the model and show it as an animation.

  Each step renders a frame, asks model_step() for an action and advances
  the environment. A single summary line is printed: the "finished after"
  message when the environment reports done, or the "without success"
  message only when the step limit is reached first. The environment is
  closed even if a step raises.

  Args:
    env: environment exposing reset(), render(mode='rgb_array'),
         step(action) -> (observation, reward, done, info) and close().
    model: weights forwarded to model_step().
  """
  frames = []
  cumulated_reward = 0
  prev_x = None # used in computing the difference frame

  try:
    observation = env.reset()
    for t in range(1000):
      frames.append(env.render(mode = 'rgb_array'))
      action, prev_x = model_step(model, observation, prev_x)
      observation, reward, done, info = env.step(action)
      cumulated_reward += reward
      if done:
        print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
        break
    else:
      # only reached when the loop ran out of steps without `done`
      print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))
  finally:
    env.close()
  display_frames_as_gif(frames)

def train_model(env, model, total_episodes = 100):
  """Train the policy with policy gradients and RMSProp for a number of episodes.

  For every step the difference frame is fed through policy_forward(), an
  action (2 or 3) is sampled from the returned probability, and the
  intermediates are recorded. When an episode ends, the discounted and
  standardized rewards weight the recorded gradients, policy_backward()
  turns them into weight gradients, and these are summed into the
  module-level `grad_buffer`. Every `batch_size` episodes an RMSProp step
  (using `rmsprop_cache`, `decay_rate`, `learning_rate`) is applied in
  place to `model`.

  NOTE: policy_forward() and policy_backward() read the module-level
  `model`; pass that same dict here so the updated weights are the ones
  being evaluated.

  Args:
    env: environment exposing reset() and
         step(action) -> (observation, reward, done, info).
    model: dict of weight arrays, updated in place.
    total_episodes: number of episodes to run. Values <= 0 return
         immediately without touching the environment.

  Returns:
    List of (episode_number, reward_sum, running_reward) tuples, one per
    finished episode.
  """
  hist = []
  if total_episodes <= 0:
    # nothing to train; the loop below would otherwise never reach its exit test
    return hist

  observation = env.reset()

  prev_x = None # used in computing the difference frame
  xs,hs,dlogps,drs = [],[],[],[]
  running_reward = None
  reward_sum = 0
  episode_number = 0

  while True:

    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done: # an episode finished
      episode_number += 1

      # stack together all inputs, hidden states, action gradients, and rewards for this episode
      epx = np.vstack(xs)
      eph = np.vstack(hs)
      epdlogp = np.vstack(dlogps)
      epr = np.vstack(drs)
      xs,hs,dlogps,drs = [],[],[],[] # reset array memory

      # compute the discounted reward backwards through time
      discounted_epr = discount_rewards(epr)
      # standardize the rewards to be unit normal (helps control the gradient estimator variance)
      discounted_epr -= np.mean(discounted_epr)
      reward_std = np.std(discounted_epr)
      if reward_std > 0:
        # skip the division when every return is identical: 0/0 would turn
        # the gradients, and then the weights, into NaN
        discounted_epr /= reward_std

      epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
      grad = policy_backward(epx, eph, epdlogp)
      for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

      # perform rmsprop parameter update every batch_size episodes
      if episode_number % batch_size == 0:
        for k,v in model.items():
          g = grad_buffer[k] # gradient
          rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
          model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
          grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

      # boring book-keeping
      running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
      hist.append((episode_number, reward_sum, running_reward))
      print ('episode %f, reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
      reward_sum = 0
      observation = env.reset() # reset env
      prev_x = None
      if episode_number >= total_episodes: # >= so a non-integer target still terminates
        return hist

    # if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
    #   print('ep {}: game finished, reward: {}'.format(episode_number, reward) + ('' if reward == -1 else ' !!!!!!!!'))

  logger.warn(
  deprecation(
  deprecation(


In [3]:
# Train for 6000 episodes; hist1 receives the per-episode history list returned by train_model.
%time hist1 = train_model(env, model, total_episodes=6000)

  logger.deprecation(


episode 1.000000, reward total was -21.000000. running mean: -21.000000
episode 2.000000, reward total was -21.000000. running mean: -21.000000
episode 3.000000, reward total was -21.000000. running mean: -21.000000
episode 4.000000, reward total was -21.000000. running mean: -21.000000
episode 5.000000, reward total was -21.000000. running mean: -21.000000
episode 6.000000, reward total was -20.000000. running mean: -20.990000
episode 7.000000, reward total was -21.000000. running mean: -20.990100
episode 8.000000, reward total was -21.000000. running mean: -20.990199
episode 9.000000, reward total was -18.000000. running mean: -20.960297
episode 10.000000, reward total was -21.000000. running mean: -20.960694
episode 11.000000, reward total was -20.000000. running mean: -20.951087
episode 12.000000, reward total was -19.000000. running mean: -20.931576
episode 13.000000, reward total was -19.000000. running mean: -20.912260
episode 14.000000, reward total was -21.000000. running mean

episode 114.000000, reward total was -20.000000. running mean: -20.350904
episode 115.000000, reward total was -19.000000. running mean: -20.337395
episode 116.000000, reward total was -19.000000. running mean: -20.324021
episode 117.000000, reward total was -21.000000. running mean: -20.330781
episode 118.000000, reward total was -21.000000. running mean: -20.337473
episode 119.000000, reward total was -20.000000. running mean: -20.334098
episode 120.000000, reward total was -19.000000. running mean: -20.320758
episode 121.000000, reward total was -21.000000. running mean: -20.327550
episode 122.000000, reward total was -19.000000. running mean: -20.314274
episode 123.000000, reward total was -20.000000. running mean: -20.311132
episode 124.000000, reward total was -21.000000. running mean: -20.318020
episode 125.000000, reward total was -20.000000. running mean: -20.314840
episode 126.000000, reward total was -19.000000. running mean: -20.301692
episode 127.000000, reward total was -

episode 225.000000, reward total was -21.000000. running mean: -20.074312
episode 226.000000, reward total was -18.000000. running mean: -20.053569
episode 227.000000, reward total was -21.000000. running mean: -20.063033
episode 228.000000, reward total was -19.000000. running mean: -20.052403
episode 229.000000, reward total was -19.000000. running mean: -20.041879
episode 230.000000, reward total was -16.000000. running mean: -20.001460
episode 231.000000, reward total was -19.000000. running mean: -19.991445
episode 232.000000, reward total was -19.000000. running mean: -19.981531
episode 233.000000, reward total was -19.000000. running mean: -19.971716
episode 234.000000, reward total was -20.000000. running mean: -19.971999
episode 235.000000, reward total was -19.000000. running mean: -19.962279
episode 236.000000, reward total was -21.000000. running mean: -19.972656
episode 237.000000, reward total was -20.000000. running mean: -19.972929
episode 238.000000, reward total was -

episode 336.000000, reward total was -18.000000. running mean: -19.627505
episode 337.000000, reward total was -20.000000. running mean: -19.631230
episode 338.000000, reward total was -20.000000. running mean: -19.634917
episode 339.000000, reward total was -14.000000. running mean: -19.578568
episode 340.000000, reward total was -20.000000. running mean: -19.582783
episode 341.000000, reward total was -21.000000. running mean: -19.596955
episode 342.000000, reward total was -20.000000. running mean: -19.600985
episode 343.000000, reward total was -21.000000. running mean: -19.614975
episode 344.000000, reward total was -20.000000. running mean: -19.618826
episode 345.000000, reward total was -20.000000. running mean: -19.622637
episode 346.000000, reward total was -19.000000. running mean: -19.616411
episode 347.000000, reward total was -16.000000. running mean: -19.580247
episode 348.000000, reward total was -21.000000. running mean: -19.594444
episode 349.000000, reward total was -

episode 447.000000, reward total was -18.000000. running mean: -19.189463
episode 448.000000, reward total was -19.000000. running mean: -19.187568
episode 449.000000, reward total was -20.000000. running mean: -19.195693
episode 450.000000, reward total was -17.000000. running mean: -19.173736
episode 451.000000, reward total was -20.000000. running mean: -19.181998
episode 452.000000, reward total was -17.000000. running mean: -19.160178
episode 453.000000, reward total was -19.000000. running mean: -19.158577
episode 454.000000, reward total was -21.000000. running mean: -19.176991
episode 455.000000, reward total was -20.000000. running mean: -19.185221
episode 456.000000, reward total was -21.000000. running mean: -19.203369
episode 457.000000, reward total was -21.000000. running mean: -19.221335
episode 458.000000, reward total was -16.000000. running mean: -19.189122
episode 459.000000, reward total was -19.000000. running mean: -19.187230
episode 460.000000, reward total was -

episode 558.000000, reward total was -17.000000. running mean: -18.636368
episode 559.000000, reward total was -19.000000. running mean: -18.640004
episode 560.000000, reward total was -21.000000. running mean: -18.663604
episode 561.000000, reward total was -19.000000. running mean: -18.666968
episode 562.000000, reward total was -20.000000. running mean: -18.680298
episode 563.000000, reward total was -16.000000. running mean: -18.653495
episode 564.000000, reward total was -20.000000. running mean: -18.666960
episode 565.000000, reward total was -19.000000. running mean: -18.670291
episode 566.000000, reward total was -18.000000. running mean: -18.663588
episode 567.000000, reward total was -21.000000. running mean: -18.686952
episode 568.000000, reward total was -19.000000. running mean: -18.690082
episode 569.000000, reward total was -20.000000. running mean: -18.703182
episode 570.000000, reward total was -19.000000. running mean: -18.706150
episode 571.000000, reward total was -

episode 669.000000, reward total was -17.000000. running mean: -18.239648
episode 670.000000, reward total was -19.000000. running mean: -18.247251
episode 671.000000, reward total was -14.000000. running mean: -18.204779
episode 672.000000, reward total was -18.000000. running mean: -18.202731
episode 673.000000, reward total was -21.000000. running mean: -18.230704
episode 674.000000, reward total was -14.000000. running mean: -18.188397
episode 675.000000, reward total was -16.000000. running mean: -18.166513
episode 676.000000, reward total was -20.000000. running mean: -18.184848
episode 677.000000, reward total was -15.000000. running mean: -18.152999
episode 678.000000, reward total was -16.000000. running mean: -18.131469
episode 679.000000, reward total was -18.000000. running mean: -18.130154
episode 680.000000, reward total was -21.000000. running mean: -18.158853
episode 681.000000, reward total was -14.000000. running mean: -18.117264
episode 682.000000, reward total was -

episode 780.000000, reward total was -15.000000. running mean: -17.583444
episode 781.000000, reward total was -18.000000. running mean: -17.587610
episode 782.000000, reward total was -18.000000. running mean: -17.591734
episode 783.000000, reward total was -15.000000. running mean: -17.565816
episode 784.000000, reward total was -18.000000. running mean: -17.570158
episode 785.000000, reward total was -17.000000. running mean: -17.564457
episode 786.000000, reward total was -17.000000. running mean: -17.558812
episode 787.000000, reward total was -18.000000. running mean: -17.563224
episode 788.000000, reward total was -17.000000. running mean: -17.557592
episode 789.000000, reward total was -19.000000. running mean: -17.572016
episode 790.000000, reward total was -13.000000. running mean: -17.526296
episode 791.000000, reward total was -19.000000. running mean: -17.541033
episode 792.000000, reward total was -19.000000. running mean: -17.555622
episode 793.000000, reward total was -

episode 891.000000, reward total was -17.000000. running mean: -17.186037
episode 892.000000, reward total was -17.000000. running mean: -17.184177
episode 893.000000, reward total was -14.000000. running mean: -17.152335
episode 894.000000, reward total was -16.000000. running mean: -17.140811
episode 895.000000, reward total was -14.000000. running mean: -17.109403
episode 896.000000, reward total was -17.000000. running mean: -17.108309
episode 897.000000, reward total was -17.000000. running mean: -17.107226
episode 898.000000, reward total was -14.000000. running mean: -17.076154
episode 899.000000, reward total was -12.000000. running mean: -17.025392
episode 900.000000, reward total was -18.000000. running mean: -17.035139
episode 901.000000, reward total was -18.000000. running mean: -17.044787
episode 902.000000, reward total was -18.000000. running mean: -17.054339
episode 903.000000, reward total was -16.000000. running mean: -17.043796
episode 904.000000, reward total was -

episode 1002.000000, reward total was -18.000000. running mean: -16.790340
episode 1003.000000, reward total was -15.000000. running mean: -16.772437
episode 1004.000000, reward total was -18.000000. running mean: -16.784713
episode 1005.000000, reward total was -19.000000. running mean: -16.806866
episode 1006.000000, reward total was -16.000000. running mean: -16.798797
episode 1007.000000, reward total was -19.000000. running mean: -16.820809
episode 1008.000000, reward total was -17.000000. running mean: -16.822601
episode 1009.000000, reward total was -16.000000. running mean: -16.814375
episode 1010.000000, reward total was -20.000000. running mean: -16.846231
episode 1011.000000, reward total was -10.000000. running mean: -16.777769
episode 1012.000000, reward total was -18.000000. running mean: -16.789991
episode 1013.000000, reward total was -18.000000. running mean: -16.802091
episode 1014.000000, reward total was -12.000000. running mean: -16.754070
episode 1015.000000, rewa

episode 1112.000000, reward total was -14.000000. running mean: -16.485543
episode 1113.000000, reward total was -17.000000. running mean: -16.490687
episode 1114.000000, reward total was -15.000000. running mean: -16.475780
episode 1115.000000, reward total was -14.000000. running mean: -16.451022
episode 1116.000000, reward total was -15.000000. running mean: -16.436512
episode 1117.000000, reward total was -16.000000. running mean: -16.432147
episode 1118.000000, reward total was -19.000000. running mean: -16.457826
episode 1119.000000, reward total was -14.000000. running mean: -16.433247
episode 1120.000000, reward total was -18.000000. running mean: -16.448915
episode 1121.000000, reward total was -16.000000. running mean: -16.444426
episode 1122.000000, reward total was -16.000000. running mean: -16.439981
episode 1123.000000, reward total was -14.000000. running mean: -16.415582
episode 1124.000000, reward total was -17.000000. running mean: -16.421426
episode 1125.000000, rewa

episode 1222.000000, reward total was -20.000000. running mean: -15.577170
episode 1223.000000, reward total was -18.000000. running mean: -15.601399
episode 1224.000000, reward total was -17.000000. running mean: -15.615385
episode 1225.000000, reward total was -18.000000. running mean: -15.639231
episode 1226.000000, reward total was -20.000000. running mean: -15.682838
episode 1227.000000, reward total was -15.000000. running mean: -15.676010
episode 1228.000000, reward total was -12.000000. running mean: -15.639250
episode 1229.000000, reward total was -13.000000. running mean: -15.612857
episode 1230.000000, reward total was -16.000000. running mean: -15.616729
episode 1231.000000, reward total was -14.000000. running mean: -15.600562
episode 1232.000000, reward total was -17.000000. running mean: -15.614556
episode 1233.000000, reward total was -19.000000. running mean: -15.648410
episode 1234.000000, reward total was -18.000000. running mean: -15.671926
episode 1235.000000, rewa

episode 1332.000000, reward total was -8.000000. running mean: -15.487713
episode 1333.000000, reward total was -18.000000. running mean: -15.512836
episode 1334.000000, reward total was -16.000000. running mean: -15.517708
episode 1335.000000, reward total was -15.000000. running mean: -15.512531
episode 1336.000000, reward total was -18.000000. running mean: -15.537406
episode 1337.000000, reward total was -17.000000. running mean: -15.552032
episode 1338.000000, reward total was -13.000000. running mean: -15.526511
episode 1339.000000, reward total was -15.000000. running mean: -15.521246
episode 1340.000000, reward total was -20.000000. running mean: -15.566034
episode 1341.000000, reward total was -14.000000. running mean: -15.550373
episode 1342.000000, reward total was -16.000000. running mean: -15.554870
episode 1343.000000, reward total was -11.000000. running mean: -15.509321
episode 1344.000000, reward total was -17.000000. running mean: -15.524228
episode 1345.000000, rewar

episode 1442.000000, reward total was -14.000000. running mean: -15.194896
episode 1443.000000, reward total was -13.000000. running mean: -15.172947
episode 1444.000000, reward total was -17.000000. running mean: -15.191218
episode 1445.000000, reward total was -13.000000. running mean: -15.169305
episode 1446.000000, reward total was -15.000000. running mean: -15.167612
episode 1447.000000, reward total was -12.000000. running mean: -15.135936
episode 1448.000000, reward total was -13.000000. running mean: -15.114577
episode 1449.000000, reward total was -19.000000. running mean: -15.153431
episode 1450.000000, reward total was -15.000000. running mean: -15.151897
episode 1451.000000, reward total was -13.000000. running mean: -15.130378
episode 1452.000000, reward total was -14.000000. running mean: -15.119074
episode 1453.000000, reward total was -18.000000. running mean: -15.147883
episode 1454.000000, reward total was -17.000000. running mean: -15.166404
episode 1455.000000, rewa

episode 1552.000000, reward total was -12.000000. running mean: -14.688087
episode 1553.000000, reward total was -10.000000. running mean: -14.641206
episode 1554.000000, reward total was -13.000000. running mean: -14.624794
episode 1555.000000, reward total was -14.000000. running mean: -14.618546
episode 1556.000000, reward total was -18.000000. running mean: -14.652360
episode 1557.000000, reward total was -13.000000. running mean: -14.635837
episode 1558.000000, reward total was -18.000000. running mean: -14.669478
episode 1559.000000, reward total was -18.000000. running mean: -14.702784
episode 1560.000000, reward total was -11.000000. running mean: -14.665756
episode 1561.000000, reward total was -12.000000. running mean: -14.639098
episode 1562.000000, reward total was -19.000000. running mean: -14.682707
episode 1563.000000, reward total was -15.000000. running mean: -14.685880
episode 1564.000000, reward total was -12.000000. running mean: -14.659021
episode 1565.000000, rewa

episode 1662.000000, reward total was -9.000000. running mean: -14.245502
episode 1663.000000, reward total was -19.000000. running mean: -14.293047
episode 1664.000000, reward total was -11.000000. running mean: -14.260116
episode 1665.000000, reward total was -15.000000. running mean: -14.267515
episode 1666.000000, reward total was -14.000000. running mean: -14.264840
episode 1667.000000, reward total was -12.000000. running mean: -14.242191
episode 1668.000000, reward total was -12.000000. running mean: -14.219769
episode 1669.000000, reward total was -15.000000. running mean: -14.227572
episode 1670.000000, reward total was -18.000000. running mean: -14.265296
episode 1671.000000, reward total was -18.000000. running mean: -14.302643
episode 1672.000000, reward total was -16.000000. running mean: -14.319617
episode 1673.000000, reward total was -17.000000. running mean: -14.346420
episode 1674.000000, reward total was -5.000000. running mean: -14.252956
episode 1675.000000, reward

episode 1772.000000, reward total was -16.000000. running mean: -13.595243
episode 1773.000000, reward total was -15.000000. running mean: -13.609290
episode 1774.000000, reward total was -15.000000. running mean: -13.623197
episode 1775.000000, reward total was -14.000000. running mean: -13.626965
episode 1776.000000, reward total was -7.000000. running mean: -13.560696
episode 1777.000000, reward total was -4.000000. running mean: -13.465089
episode 1778.000000, reward total was -14.000000. running mean: -13.470438
episode 1779.000000, reward total was -16.000000. running mean: -13.495734
episode 1780.000000, reward total was -14.000000. running mean: -13.500776
episode 1781.000000, reward total was -14.000000. running mean: -13.505768
episode 1782.000000, reward total was -10.000000. running mean: -13.470711
episode 1783.000000, reward total was -10.000000. running mean: -13.436004
episode 1784.000000, reward total was -16.000000. running mean: -13.461644
episode 1785.000000, reward

episode 1882.000000, reward total was -8.000000. running mean: -12.793849
episode 1883.000000, reward total was -11.000000. running mean: -12.775911
episode 1884.000000, reward total was -2.000000. running mean: -12.668152
episode 1885.000000, reward total was -14.000000. running mean: -12.681470
episode 1886.000000, reward total was -15.000000. running mean: -12.704656
episode 1887.000000, reward total was -17.000000. running mean: -12.747609
episode 1888.000000, reward total was -3.000000. running mean: -12.650133
episode 1889.000000, reward total was -15.000000. running mean: -12.673632
episode 1890.000000, reward total was -14.000000. running mean: -12.686895
episode 1891.000000, reward total was -16.000000. running mean: -12.720026
episode 1892.000000, reward total was -11.000000. running mean: -12.702826
episode 1893.000000, reward total was -19.000000. running mean: -12.765798
episode 1894.000000, reward total was -13.000000. running mean: -12.768140
episode 1895.000000, reward 

episode 1992.000000, reward total was -11.000000. running mean: -12.444610
episode 1993.000000, reward total was -15.000000. running mean: -12.470164
episode 1994.000000, reward total was -13.000000. running mean: -12.475462
episode 1995.000000, reward total was -17.000000. running mean: -12.520708
episode 1996.000000, reward total was -11.000000. running mean: -12.505501
episode 1997.000000, reward total was -7.000000. running mean: -12.450446
episode 1998.000000, reward total was -11.000000. running mean: -12.435941
episode 1999.000000, reward total was -13.000000. running mean: -12.441582
episode 2000.000000, reward total was -12.000000. running mean: -12.437166
episode 2001.000000, reward total was -7.000000. running mean: -12.382794
episode 2002.000000, reward total was -17.000000. running mean: -12.428966
episode 2003.000000, reward total was -12.000000. running mean: -12.424677
episode 2004.000000, reward total was -16.000000. running mean: -12.460430
episode 2005.000000, reward

episode 2102.000000, reward total was -5.000000. running mean: -12.213928
episode 2103.000000, reward total was -16.000000. running mean: -12.251789
episode 2104.000000, reward total was -13.000000. running mean: -12.259271
episode 2105.000000, reward total was -14.000000. running mean: -12.276678
episode 2106.000000, reward total was -14.000000. running mean: -12.293911
episode 2107.000000, reward total was -7.000000. running mean: -12.240972
episode 2108.000000, reward total was -13.000000. running mean: -12.248563
episode 2109.000000, reward total was -6.000000. running mean: -12.186077
episode 2110.000000, reward total was -11.000000. running mean: -12.174216
episode 2111.000000, reward total was -13.000000. running mean: -12.182474
episode 2112.000000, reward total was -11.000000. running mean: -12.170649
episode 2113.000000, reward total was -12.000000. running mean: -12.168943
episode 2114.000000, reward total was -11.000000. running mean: -12.157253
episode 2115.000000, reward 

episode 2212.000000, reward total was -14.000000. running mean: -11.816919
episode 2213.000000, reward total was -12.000000. running mean: -11.818750
episode 2214.000000, reward total was -18.000000. running mean: -11.880563
episode 2215.000000, reward total was -6.000000. running mean: -11.821757
episode 2216.000000, reward total was -7.000000. running mean: -11.773539
episode 2217.000000, reward total was -15.000000. running mean: -11.805804
episode 2218.000000, reward total was -10.000000. running mean: -11.787746
episode 2219.000000, reward total was -11.000000. running mean: -11.779869
episode 2220.000000, reward total was -16.000000. running mean: -11.822070
episode 2221.000000, reward total was -16.000000. running mean: -11.863849
episode 2222.000000, reward total was -4.000000. running mean: -11.785211
episode 2223.000000, reward total was -6.000000. running mean: -11.727359
episode 2224.000000, reward total was -14.000000. running mean: -11.750085
episode 2225.000000, reward t

episode 2322.000000, reward total was -17.000000. running mean: -11.044656
episode 2323.000000, reward total was -15.000000. running mean: -11.084210
episode 2324.000000, reward total was -12.000000. running mean: -11.093368
episode 2325.000000, reward total was -16.000000. running mean: -11.142434
episode 2326.000000, reward total was -16.000000. running mean: -11.191010
episode 2327.000000, reward total was -8.000000. running mean: -11.159100
episode 2328.000000, reward total was -8.000000. running mean: -11.127509
episode 2329.000000, reward total was -17.000000. running mean: -11.186234
episode 2330.000000, reward total was -13.000000. running mean: -11.204371
episode 2331.000000, reward total was -12.000000. running mean: -11.212328
episode 2332.000000, reward total was -10.000000. running mean: -11.200204
episode 2333.000000, reward total was -13.000000. running mean: -11.218202
episode 2334.000000, reward total was -6.000000. running mean: -11.166020
episode 2335.000000, reward 

episode 2432.000000, reward total was -9.000000. running mean: -10.716900
episode 2433.000000, reward total was -2.000000. running mean: -10.629731
episode 2434.000000, reward total was -11.000000. running mean: -10.633434
episode 2435.000000, reward total was -12.000000. running mean: -10.647099
episode 2436.000000, reward total was -12.000000. running mean: -10.660628
episode 2437.000000, reward total was -11.000000. running mean: -10.664022
episode 2438.000000, reward total was -14.000000. running mean: -10.697382
episode 2439.000000, reward total was -11.000000. running mean: -10.700408
episode 2440.000000, reward total was -10.000000. running mean: -10.693404
episode 2441.000000, reward total was -11.000000. running mean: -10.696470
episode 2442.000000, reward total was -15.000000. running mean: -10.739505
episode 2443.000000, reward total was -17.000000. running mean: -10.802110
episode 2444.000000, reward total was -11.000000. running mean: -10.804089
episode 2445.000000, reward

episode 2542.000000, reward total was -10.000000. running mean: -9.933243
episode 2543.000000, reward total was -6.000000. running mean: -9.893911
episode 2544.000000, reward total was -9.000000. running mean: -9.884972
episode 2545.000000, reward total was -9.000000. running mean: -9.876122
episode 2546.000000, reward total was -9.000000. running mean: -9.867361
episode 2547.000000, reward total was -11.000000. running mean: -9.878687
episode 2548.000000, reward total was -11.000000. running mean: -9.889900
episode 2549.000000, reward total was -4.000000. running mean: -9.831001
episode 2550.000000, reward total was -11.000000. running mean: -9.842691
episode 2551.000000, reward total was -16.000000. running mean: -9.904264
episode 2552.000000, reward total was -12.000000. running mean: -9.925222
episode 2553.000000, reward total was 3.000000. running mean: -9.795969
episode 2554.000000, reward total was -12.000000. running mean: -9.818010
episode 2555.000000, reward total was -9.0000

episode 2654.000000, reward total was -9.000000. running mean: -9.870289
episode 2655.000000, reward total was -4.000000. running mean: -9.811586
episode 2656.000000, reward total was -6.000000. running mean: -9.773470
episode 2657.000000, reward total was -11.000000. running mean: -9.785736
episode 2658.000000, reward total was -8.000000. running mean: -9.767878
episode 2659.000000, reward total was -9.000000. running mean: -9.760199
episode 2660.000000, reward total was -12.000000. running mean: -9.782597
episode 2661.000000, reward total was -5.000000. running mean: -9.734771
episode 2662.000000, reward total was -13.000000. running mean: -9.767424
episode 2663.000000, reward total was -5.000000. running mean: -9.719749
episode 2664.000000, reward total was -13.000000. running mean: -9.752552
episode 2665.000000, reward total was -15.000000. running mean: -9.805026
episode 2666.000000, reward total was -13.000000. running mean: -9.836976
episode 2667.000000, reward total was -11.000

episode 2766.000000, reward total was -13.000000. running mean: -9.631104
episode 2767.000000, reward total was -7.000000. running mean: -9.604793
episode 2768.000000, reward total was -15.000000. running mean: -9.658745
episode 2769.000000, reward total was -8.000000. running mean: -9.642158
episode 2770.000000, reward total was 1.000000. running mean: -9.535736
episode 2771.000000, reward total was -15.000000. running mean: -9.590379
episode 2772.000000, reward total was -7.000000. running mean: -9.564475
episode 2773.000000, reward total was -8.000000. running mean: -9.548830
episode 2774.000000, reward total was -10.000000. running mean: -9.553342
episode 2775.000000, reward total was -15.000000. running mean: -9.607809
episode 2776.000000, reward total was -13.000000. running mean: -9.641731
episode 2777.000000, reward total was -8.000000. running mean: -9.625313
episode 2778.000000, reward total was -4.000000. running mean: -9.569060
episode 2779.000000, reward total was -6.00000

episode 2878.000000, reward total was -17.000000. running mean: -9.360420
episode 2879.000000, reward total was -15.000000. running mean: -9.416816
episode 2880.000000, reward total was -15.000000. running mean: -9.472648
episode 2881.000000, reward total was -11.000000. running mean: -9.487921
episode 2882.000000, reward total was -9.000000. running mean: -9.483042
episode 2883.000000, reward total was -8.000000. running mean: -9.468212
episode 2884.000000, reward total was -14.000000. running mean: -9.513530
episode 2885.000000, reward total was -9.000000. running mean: -9.508394
episode 2886.000000, reward total was -11.000000. running mean: -9.523310
episode 2887.000000, reward total was -12.000000. running mean: -9.548077
episode 2888.000000, reward total was -12.000000. running mean: -9.572596
episode 2889.000000, reward total was -12.000000. running mean: -9.596870
episode 2890.000000, reward total was -13.000000. running mean: -9.630902
episode 2891.000000, reward total was -12

episode 2990.000000, reward total was -3.000000. running mean: -9.550732
episode 2991.000000, reward total was -10.000000. running mean: -9.555224
episode 2992.000000, reward total was -6.000000. running mean: -9.519672
episode 2993.000000, reward total was -10.000000. running mean: -9.524475
episode 2994.000000, reward total was -11.000000. running mean: -9.539231
episode 2995.000000, reward total was -15.000000. running mean: -9.593838
episode 2996.000000, reward total was -15.000000. running mean: -9.647900
episode 2997.000000, reward total was -9.000000. running mean: -9.641421
episode 2998.000000, reward total was -14.000000. running mean: -9.685007
episode 2999.000000, reward total was -5.000000. running mean: -9.638157
episode 3000.000000, reward total was -15.000000. running mean: -9.691775
episode 3001.000000, reward total was -6.000000. running mean: -9.654857
episode 3002.000000, reward total was -12.000000. running mean: -9.678309
episode 3003.000000, reward total was -7.00

episode 3102.000000, reward total was -4.000000. running mean: -8.897380
episode 3103.000000, reward total was 7.000000. running mean: -8.738407
episode 3104.000000, reward total was -13.000000. running mean: -8.781023
episode 3105.000000, reward total was -3.000000. running mean: -8.723212
episode 3106.000000, reward total was -6.000000. running mean: -8.695980
episode 3107.000000, reward total was -7.000000. running mean: -8.679020
episode 3108.000000, reward total was -8.000000. running mean: -8.672230
episode 3109.000000, reward total was -16.000000. running mean: -8.745508
episode 3110.000000, reward total was -10.000000. running mean: -8.758053
episode 3111.000000, reward total was -11.000000. running mean: -8.780472
episode 3112.000000, reward total was -13.000000. running mean: -8.822668
episode 3113.000000, reward total was -16.000000. running mean: -8.894441
episode 3114.000000, reward total was 2.000000. running mean: -8.785496
episode 3115.000000, reward total was -15.00000

episode 3214.000000, reward total was -5.000000. running mean: -8.792540
episode 3215.000000, reward total was -10.000000. running mean: -8.804615
episode 3216.000000, reward total was -6.000000. running mean: -8.776568
episode 3217.000000, reward total was -10.000000. running mean: -8.788803
episode 3218.000000, reward total was -3.000000. running mean: -8.730915
episode 3219.000000, reward total was -15.000000. running mean: -8.793606
episode 3220.000000, reward total was -13.000000. running mean: -8.835670
episode 3221.000000, reward total was -7.000000. running mean: -8.817313
episode 3222.000000, reward total was -12.000000. running mean: -8.849140
episode 3223.000000, reward total was -8.000000. running mean: -8.840648
episode 3224.000000, reward total was -13.000000. running mean: -8.882242
episode 3225.000000, reward total was -14.000000. running mean: -8.933419
episode 3226.000000, reward total was -15.000000. running mean: -8.994085
episode 3227.000000, reward total was -5.00

episode 3326.000000, reward total was -12.000000. running mean: -8.507447
episode 3327.000000, reward total was -10.000000. running mean: -8.522372
episode 3328.000000, reward total was -14.000000. running mean: -8.577148
episode 3329.000000, reward total was -7.000000. running mean: -8.561377
episode 3330.000000, reward total was -13.000000. running mean: -8.605763
episode 3331.000000, reward total was -5.000000. running mean: -8.569706
episode 3332.000000, reward total was -3.000000. running mean: -8.514009
episode 3333.000000, reward total was -2.000000. running mean: -8.448868
episode 3334.000000, reward total was 6.000000. running mean: -8.304380
episode 3335.000000, reward total was -14.000000. running mean: -8.361336
episode 3336.000000, reward total was -17.000000. running mean: -8.447723
episode 3337.000000, reward total was -7.000000. running mean: -8.433245
episode 3338.000000, reward total was -13.000000. running mean: -8.478913
episode 3339.000000, reward total was -8.0000

episode 3438.000000, reward total was -3.000000. running mean: -7.140416
episode 3439.000000, reward total was -12.000000. running mean: -7.189012
episode 3440.000000, reward total was -11.000000. running mean: -7.227122
episode 3441.000000, reward total was -5.000000. running mean: -7.204851
episode 3442.000000, reward total was -14.000000. running mean: -7.272802
episode 3443.000000, reward total was -15.000000. running mean: -7.350074
episode 3444.000000, reward total was -2.000000. running mean: -7.296573
episode 3445.000000, reward total was -5.000000. running mean: -7.273608
episode 3446.000000, reward total was -3.000000. running mean: -7.230872
episode 3447.000000, reward total was -8.000000. running mean: -7.238563
episode 3448.000000, reward total was -10.000000. running mean: -7.266177
episode 3449.000000, reward total was -6.000000. running mean: -7.253515
episode 3450.000000, reward total was -13.000000. running mean: -7.310980
episode 3451.000000, reward total was -11.000

episode 3550.000000, reward total was -9.000000. running mean: -6.839751
episode 3551.000000, reward total was -10.000000. running mean: -6.871354
episode 3552.000000, reward total was -3.000000. running mean: -6.832640
episode 3553.000000, reward total was -2.000000. running mean: -6.784314
episode 3554.000000, reward total was -7.000000. running mean: -6.786471
episode 3555.000000, reward total was -11.000000. running mean: -6.828606
episode 3556.000000, reward total was -2.000000. running mean: -6.780320
episode 3557.000000, reward total was -7.000000. running mean: -6.782517
episode 3558.000000, reward total was -17.000000. running mean: -6.884692
episode 3559.000000, reward total was -2.000000. running mean: -6.835845
episode 3560.000000, reward total was -1.000000. running mean: -6.777486
episode 3561.000000, reward total was 2.000000. running mean: -6.689711
episode 3562.000000, reward total was -9.000000. running mean: -6.712814
episode 3563.000000, reward total was -8.000000. 

episode 3662.000000, reward total was -6.000000. running mean: -6.798530
episode 3663.000000, reward total was 3.000000. running mean: -6.700544
episode 3664.000000, reward total was -15.000000. running mean: -6.783539
episode 3665.000000, reward total was -13.000000. running mean: -6.845703
episode 3666.000000, reward total was -13.000000. running mean: -6.907246
episode 3667.000000, reward total was -11.000000. running mean: -6.948174
episode 3668.000000, reward total was -16.000000. running mean: -7.038692
episode 3669.000000, reward total was -6.000000. running mean: -7.028305
episode 3670.000000, reward total was -5.000000. running mean: -7.008022
episode 3671.000000, reward total was -1.000000. running mean: -6.947942
episode 3672.000000, reward total was -6.000000. running mean: -6.938463
episode 3673.000000, reward total was -3.000000. running mean: -6.899078
episode 3674.000000, reward total was 3.000000. running mean: -6.800087
episode 3675.000000, reward total was 2.000000. 

episode 3774.000000, reward total was -10.000000. running mean: -7.040042
episode 3775.000000, reward total was -9.000000. running mean: -7.059641
episode 3776.000000, reward total was -13.000000. running mean: -7.119045
episode 3777.000000, reward total was -8.000000. running mean: -7.127854
episode 3778.000000, reward total was -2.000000. running mean: -7.076576
episode 3779.000000, reward total was -3.000000. running mean: -7.035810
episode 3780.000000, reward total was -8.000000. running mean: -7.045452
episode 3781.000000, reward total was -14.000000. running mean: -7.114998
episode 3782.000000, reward total was -2.000000. running mean: -7.063848
episode 3783.000000, reward total was -6.000000. running mean: -7.053209
episode 3784.000000, reward total was -1.000000. running mean: -6.992677
episode 3785.000000, reward total was -2.000000. running mean: -6.942750
episode 3786.000000, reward total was -12.000000. running mean: -6.993323
episode 3787.000000, reward total was -10.00000

episode 3886.000000, reward total was -9.000000. running mean: -6.986691
episode 3887.000000, reward total was 6.000000. running mean: -6.856824
episode 3888.000000, reward total was -5.000000. running mean: -6.838255
episode 3889.000000, reward total was -5.000000. running mean: -6.819873
episode 3890.000000, reward total was 2.000000. running mean: -6.731674
episode 3891.000000, reward total was -10.000000. running mean: -6.764357
episode 3892.000000, reward total was -5.000000. running mean: -6.746714
episode 3893.000000, reward total was -6.000000. running mean: -6.739247
episode 3894.000000, reward total was -9.000000. running mean: -6.761854
episode 3895.000000, reward total was -3.000000. running mean: -6.724236
episode 3896.000000, reward total was -3.000000. running mean: -6.686993
episode 3897.000000, reward total was -7.000000. running mean: -6.690123
episode 3898.000000, reward total was -4.000000. running mean: -6.663222
episode 3899.000000, reward total was -12.000000. ru

episode 3999.000000, reward total was -12.000000. running mean: -6.236087
episode 4000.000000, reward total was -2.000000. running mean: -6.193726
episode 4001.000000, reward total was -9.000000. running mean: -6.221788
episode 4002.000000, reward total was -13.000000. running mean: -6.289571
episode 4003.000000, reward total was -5.000000. running mean: -6.276675
episode 4004.000000, reward total was -6.000000. running mean: -6.273908
episode 4005.000000, reward total was -12.000000. running mean: -6.331169
episode 4006.000000, reward total was -3.000000. running mean: -6.297857
episode 4007.000000, reward total was -4.000000. running mean: -6.274879
episode 4008.000000, reward total was -9.000000. running mean: -6.302130
episode 4009.000000, reward total was 4.000000. running mean: -6.199109
episode 4010.000000, reward total was -8.000000. running mean: -6.217118
episode 4011.000000, reward total was -11.000000. running mean: -6.264946
episode 4012.000000, reward total was -7.000000.

episode 4112.000000, reward total was -3.000000. running mean: -5.822879
episode 4113.000000, reward total was -13.000000. running mean: -5.894651
episode 4114.000000, reward total was 3.000000. running mean: -5.805704
episode 4115.000000, reward total was -8.000000. running mean: -5.827647
episode 4116.000000, reward total was -7.000000. running mean: -5.839371
episode 4117.000000, reward total was -11.000000. running mean: -5.890977
episode 4118.000000, reward total was -8.000000. running mean: -5.912067
episode 4119.000000, reward total was 3.000000. running mean: -5.822946
episode 4120.000000, reward total was -10.000000. running mean: -5.864717
episode 4121.000000, reward total was -8.000000. running mean: -5.886070
episode 4122.000000, reward total was -6.000000. running mean: -5.887209
episode 4123.000000, reward total was -11.000000. running mean: -5.938337
episode 4124.000000, reward total was -4.000000. running mean: -5.918954
episode 4125.000000, reward total was 7.000000. r

episode 4224.000000, reward total was 3.000000. running mean: -6.026135
episode 4225.000000, reward total was 2.000000. running mean: -5.945873
episode 4226.000000, reward total was -18.000000. running mean: -6.066415
episode 4227.000000, reward total was -5.000000. running mean: -6.055751
episode 4228.000000, reward total was 2.000000. running mean: -5.975193
episode 4229.000000, reward total was -7.000000. running mean: -5.985441
episode 4230.000000, reward total was -5.000000. running mean: -5.975587
episode 4231.000000, reward total was -13.000000. running mean: -6.045831
episode 4232.000000, reward total was -15.000000. running mean: -6.135373
episode 4233.000000, reward total was -6.000000. running mean: -6.134019
episode 4234.000000, reward total was -10.000000. running mean: -6.172679
episode 4235.000000, reward total was -1.000000. running mean: -6.120952
episode 4236.000000, reward total was 3.000000. running mean: -6.029742
episode 4237.000000, reward total was -11.000000. r

episode 4337.000000, reward total was -6.000000. running mean: -5.456566
episode 4338.000000, reward total was 3.000000. running mean: -5.372000
episode 4339.000000, reward total was 9.000000. running mean: -5.228280
episode 4340.000000, reward total was -4.000000. running mean: -5.215998
episode 4341.000000, reward total was -6.000000. running mean: -5.223838
episode 4342.000000, reward total was -7.000000. running mean: -5.241599
episode 4343.000000, reward total was -4.000000. running mean: -5.229183
episode 4344.000000, reward total was -1.000000. running mean: -5.186891
episode 4345.000000, reward total was -17.000000. running mean: -5.305022
episode 4346.000000, reward total was -6.000000. running mean: -5.311972
episode 4347.000000, reward total was -11.000000. running mean: -5.368852
episode 4348.000000, reward total was -9.000000. running mean: -5.405164
episode 4349.000000, reward total was -3.000000. running mean: -5.381112
episode 4350.000000, reward total was -13.000000. r

episode 4450.000000, reward total was 2.000000. running mean: -4.735176
episode 4451.000000, reward total was -9.000000. running mean: -4.777824
episode 4452.000000, reward total was -3.000000. running mean: -4.760046
episode 4453.000000, reward total was -5.000000. running mean: -4.762445
episode 4454.000000, reward total was -8.000000. running mean: -4.794821
episode 4455.000000, reward total was 8.000000. running mean: -4.666873
episode 4456.000000, reward total was -13.000000. running mean: -4.750204
episode 4457.000000, reward total was -10.000000. running mean: -4.802702
episode 4458.000000, reward total was 1.000000. running mean: -4.744675
episode 4459.000000, reward total was -6.000000. running mean: -4.757228
episode 4460.000000, reward total was -13.000000. running mean: -4.839656
episode 4461.000000, reward total was -13.000000. running mean: -4.921259
episode 4462.000000, reward total was -7.000000. running mean: -4.942047
episode 4463.000000, reward total was -3.000000. r

episode 4563.000000, reward total was -9.000000. running mean: -4.556507
episode 4564.000000, reward total was -13.000000. running mean: -4.640942
episode 4565.000000, reward total was -9.000000. running mean: -4.684533
episode 4566.000000, reward total was -6.000000. running mean: -4.697687
episode 4567.000000, reward total was -6.000000. running mean: -4.710710
episode 4568.000000, reward total was 7.000000. running mean: -4.593603
episode 4569.000000, reward total was -9.000000. running mean: -4.637667
episode 4570.000000, reward total was 1.000000. running mean: -4.581291
episode 4571.000000, reward total was -8.000000. running mean: -4.615478
episode 4572.000000, reward total was -5.000000. running mean: -4.619323
episode 4573.000000, reward total was 9.000000. running mean: -4.483130
episode 4574.000000, reward total was -9.000000. running mean: -4.528298
episode 4575.000000, reward total was -5.000000. running mean: -4.533015
episode 4576.000000, reward total was -6.000000. runn

episode 4676.000000, reward total was -11.000000. running mean: -4.800167
episode 4677.000000, reward total was -8.000000. running mean: -4.832165
episode 4678.000000, reward total was -8.000000. running mean: -4.863844
episode 4679.000000, reward total was 3.000000. running mean: -4.785205
episode 4680.000000, reward total was 2.000000. running mean: -4.717353
episode 4681.000000, reward total was -3.000000. running mean: -4.700180
episode 4682.000000, reward total was -2.000000. running mean: -4.673178
episode 4683.000000, reward total was -12.000000. running mean: -4.746446
episode 4684.000000, reward total was -3.000000. running mean: -4.728982
episode 4685.000000, reward total was -8.000000. running mean: -4.761692
episode 4686.000000, reward total was -5.000000. running mean: -4.764075
episode 4687.000000, reward total was -14.000000. running mean: -4.856434
episode 4688.000000, reward total was 2.000000. running mean: -4.787870
episode 4689.000000, reward total was -3.000000. ru

episode 4789.000000, reward total was -5.000000. running mean: -4.703287
episode 4790.000000, reward total was -7.000000. running mean: -4.726254
episode 4791.000000, reward total was 11.000000. running mean: -4.568991
episode 4792.000000, reward total was 4.000000. running mean: -4.483301
episode 4793.000000, reward total was -8.000000. running mean: -4.518468
episode 4794.000000, reward total was -7.000000. running mean: -4.543284
episode 4795.000000, reward total was -8.000000. running mean: -4.577851
episode 4796.000000, reward total was -8.000000. running mean: -4.612072
episode 4797.000000, reward total was -3.000000. running mean: -4.595952
episode 4798.000000, reward total was -7.000000. running mean: -4.619992
episode 4799.000000, reward total was -6.000000. running mean: -4.633792
episode 4800.000000, reward total was -5.000000. running mean: -4.637454
episode 4801.000000, reward total was -4.000000. running mean: -4.631080
episode 4802.000000, reward total was 0.000000. runn

episode 4902.000000, reward total was 3.000000. running mean: -4.308588
episode 4903.000000, reward total was 3.000000. running mean: -4.235502
episode 4904.000000, reward total was -8.000000. running mean: -4.273147
episode 4905.000000, reward total was -13.000000. running mean: -4.360416
episode 4906.000000, reward total was 5.000000. running mean: -4.266812
episode 4907.000000, reward total was -7.000000. running mean: -4.294143
episode 4908.000000, reward total was -9.000000. running mean: -4.341202
episode 4909.000000, reward total was -8.000000. running mean: -4.377790
episode 4910.000000, reward total was -3.000000. running mean: -4.364012
episode 4911.000000, reward total was 1.000000. running mean: -4.310372
episode 4912.000000, reward total was -9.000000. running mean: -4.357268
episode 4913.000000, reward total was -1.000000. running mean: -4.323696
episode 4914.000000, reward total was -11.000000. running mean: -4.390459
episode 4915.000000, reward total was -6.000000. runn

episode 5015.000000, reward total was -3.000000. running mean: -2.688564
episode 5016.000000, reward total was -11.000000. running mean: -2.771678
episode 5017.000000, reward total was -7.000000. running mean: -2.813962
episode 5018.000000, reward total was 9.000000. running mean: -2.695822
episode 5019.000000, reward total was 1.000000. running mean: -2.658864
episode 5020.000000, reward total was -17.000000. running mean: -2.802275
episode 5021.000000, reward total was -3.000000. running mean: -2.804252
episode 5022.000000, reward total was -2.000000. running mean: -2.796210
episode 5023.000000, reward total was -7.000000. running mean: -2.838248
episode 5024.000000, reward total was -3.000000. running mean: -2.839865
episode 5025.000000, reward total was -7.000000. running mean: -2.881467
episode 5026.000000, reward total was 2.000000. running mean: -2.832652
episode 5027.000000, reward total was 9.000000. running mean: -2.714325
episode 5028.000000, reward total was -6.000000. runn

episode 5128.000000, reward total was -11.000000. running mean: -3.546809
episode 5129.000000, reward total was -9.000000. running mean: -3.601341
episode 5130.000000, reward total was -2.000000. running mean: -3.585328
episode 5131.000000, reward total was -6.000000. running mean: -3.609474
episode 5132.000000, reward total was -5.000000. running mean: -3.623380
episode 5133.000000, reward total was -2.000000. running mean: -3.607146
episode 5134.000000, reward total was 2.000000. running mean: -3.551074
episode 5135.000000, reward total was -8.000000. running mean: -3.595564
episode 5136.000000, reward total was 2.000000. running mean: -3.539608
episode 5137.000000, reward total was -7.000000. running mean: -3.574212
episode 5138.000000, reward total was -5.000000. running mean: -3.588470
episode 5139.000000, reward total was -4.000000. running mean: -3.592585
episode 5140.000000, reward total was -1.000000. running mean: -3.566659
episode 5141.000000, reward total was 1.000000. runn

episode 5241.000000, reward total was -2.000000. running mean: -3.594457
episode 5242.000000, reward total was 6.000000. running mean: -3.498512
episode 5243.000000, reward total was 1.000000. running mean: -3.453527
episode 5244.000000, reward total was -7.000000. running mean: -3.488992
episode 5245.000000, reward total was -5.000000. running mean: -3.504102
episode 5246.000000, reward total was -1.000000. running mean: -3.479061
episode 5247.000000, reward total was -13.000000. running mean: -3.574270
episode 5248.000000, reward total was 2.000000. running mean: -3.518528
episode 5249.000000, reward total was -7.000000. running mean: -3.553342
episode 5250.000000, reward total was 8.000000. running mean: -3.437809
episode 5251.000000, reward total was -14.000000. running mean: -3.543431
episode 5252.000000, reward total was -1.000000. running mean: -3.517997
episode 5253.000000, reward total was 2.000000. running mean: -3.462817
episode 5254.000000, reward total was -3.000000. runni

episode 5354.000000, reward total was 10.000000. running mean: -3.715778
episode 5355.000000, reward total was -3.000000. running mean: -3.708621
episode 5356.000000, reward total was -10.000000. running mean: -3.771534
episode 5357.000000, reward total was -12.000000. running mean: -3.853819
episode 5358.000000, reward total was 7.000000. running mean: -3.745281
episode 5359.000000, reward total was -8.000000. running mean: -3.787828
episode 5360.000000, reward total was -1.000000. running mean: -3.759950
episode 5361.000000, reward total was -8.000000. running mean: -3.802350
episode 5362.000000, reward total was -11.000000. running mean: -3.874327
episode 5363.000000, reward total was -2.000000. running mean: -3.855583
episode 5364.000000, reward total was -7.000000. running mean: -3.887028
episode 5365.000000, reward total was -5.000000. running mean: -3.898157
episode 5366.000000, reward total was 7.000000. running mean: -3.789176
episode 5367.000000, reward total was -3.000000. r

episode 5467.000000, reward total was -1.000000. running mean: -4.180004
episode 5468.000000, reward total was -11.000000. running mean: -4.248204
episode 5469.000000, reward total was 5.000000. running mean: -4.155722
episode 5470.000000, reward total was -3.000000. running mean: -4.144165
episode 5471.000000, reward total was -4.000000. running mean: -4.142723
episode 5472.000000, reward total was -6.000000. running mean: -4.161296
episode 5473.000000, reward total was -2.000000. running mean: -4.139683
episode 5474.000000, reward total was 5.000000. running mean: -4.048286
episode 5475.000000, reward total was 5.000000. running mean: -3.957804
episode 5476.000000, reward total was 2.000000. running mean: -3.898226
episode 5477.000000, reward total was 9.000000. running mean: -3.769243
episode 5478.000000, reward total was -8.000000. running mean: -3.811551
episode 5479.000000, reward total was 10.000000. running mean: -3.673435
episode 5480.000000, reward total was -9.000000. runnin

episode 5580.000000, reward total was 1.000000. running mean: -2.502850
episode 5581.000000, reward total was -6.000000. running mean: -2.537821
episode 5582.000000, reward total was -3.000000. running mean: -2.542443
episode 5583.000000, reward total was 1.000000. running mean: -2.507019
episode 5584.000000, reward total was -2.000000. running mean: -2.501948
episode 5585.000000, reward total was 3.000000. running mean: -2.446929
episode 5586.000000, reward total was 1.000000. running mean: -2.412460
episode 5587.000000, reward total was -7.000000. running mean: -2.458335
episode 5588.000000, reward total was 1.000000. running mean: -2.423752
episode 5589.000000, reward total was 10.000000. running mean: -2.299514
episode 5590.000000, reward total was 2.000000. running mean: -2.256519
episode 5591.000000, reward total was -3.000000. running mean: -2.263954
episode 5592.000000, reward total was 10.000000. running mean: -2.141314
episode 5593.000000, reward total was 18.000000. running 

episode 5693.000000, reward total was 0.000000. running mean: -1.737634
episode 5694.000000, reward total was -5.000000. running mean: -1.770258
episode 5695.000000, reward total was -13.000000. running mean: -1.882555
episode 5696.000000, reward total was 3.000000. running mean: -1.833729
episode 5697.000000, reward total was -6.000000. running mean: -1.875392
episode 5698.000000, reward total was 3.000000. running mean: -1.826638
episode 5699.000000, reward total was -6.000000. running mean: -1.868372
episode 5700.000000, reward total was 5.000000. running mean: -1.799688
episode 5701.000000, reward total was 1.000000. running mean: -1.771691
episode 5702.000000, reward total was 7.000000. running mean: -1.683974
episode 5703.000000, reward total was -7.000000. running mean: -1.737135
episode 5704.000000, reward total was 2.000000. running mean: -1.699763
episode 5705.000000, reward total was -2.000000. running mean: -1.702766
episode 5706.000000, reward total was -7.000000. running 

episode 5806.000000, reward total was -2.000000. running mean: -1.735111
episode 5807.000000, reward total was 7.000000. running mean: -1.647760
episode 5808.000000, reward total was -3.000000. running mean: -1.661282
episode 5809.000000, reward total was -3.000000. running mean: -1.674670
episode 5810.000000, reward total was -2.000000. running mean: -1.677923
episode 5811.000000, reward total was -3.000000. running mean: -1.691144
episode 5812.000000, reward total was 1.000000. running mean: -1.664232
episode 5813.000000, reward total was -6.000000. running mean: -1.707590
episode 5814.000000, reward total was -6.000000. running mean: -1.750514
episode 5815.000000, reward total was 2.000000. running mean: -1.713009
episode 5816.000000, reward total was -10.000000. running mean: -1.795879
episode 5817.000000, reward total was -5.000000. running mean: -1.827920
episode 5818.000000, reward total was -3.000000. running mean: -1.839641
episode 5819.000000, reward total was 2.000000. runni

episode 5919.000000, reward total was -11.000000. running mean: -1.830679
episode 5920.000000, reward total was -15.000000. running mean: -1.962373
episode 5921.000000, reward total was -1.000000. running mean: -1.952749
episode 5922.000000, reward total was -9.000000. running mean: -2.023221
episode 5923.000000, reward total was 3.000000. running mean: -1.972989
episode 5924.000000, reward total was -1.000000. running mean: -1.963259
episode 5925.000000, reward total was -7.000000. running mean: -2.013627
episode 5926.000000, reward total was 11.000000. running mean: -1.883490
episode 5927.000000, reward total was -6.000000. running mean: -1.924655
episode 5928.000000, reward total was 1.000000. running mean: -1.895409
episode 5929.000000, reward total was 4.000000. running mean: -1.836455
episode 5930.000000, reward total was -6.000000. running mean: -1.878090
episode 5931.000000, reward total was -1.000000. running mean: -1.869309
episode 5932.000000, reward total was 5.000000. runn

In [4]:
# Call play_game (defined in an earlier cell, outside this excerpt) with the
# Gym environment `env` and the policy-network weights held in `model`.
# NOTE(review): judging by this cell's output, play_game presumably rolls out
# an episode and reports the accumulated reward; confirm against its definition.
play_game(env, model)

See here for more information: https://www.gymlibrary.ml/content/api/
  deprecation(
  logger.warn(


Episode finished without success, accumulated reward = 1.0
