In [None]:
import gym
from gym.wrappers import Monitor
import itertools
import numpy as np
import os
import random
import sys
import psutil
import tensorflow as tf
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

from SERI import StateProcessor, Estimator, ModelParametersCopier, make_epsilon_greedy_policy, deep_q_learning

from reinforcementlearning.lib import plotting
from collections import deque, namedtuple

# Environment the agent is trained on.
env = gym.envs.make("Breakout-v0")

# Action ids. Not referenced anywhere else in this cell; presumably mirrors
# the action set used inside SERI -- TODO confirm before removing.
VALID_ACTIONS = [0, 1, 2, 3]

# ---- training setup ----
# Start from an empty default graph so re-running the cell does not stack
# variables on top of a previous run.
tf.reset_default_graph()

# Where we save our checkpoints and graphs (one sub-directory per env id).
experiment_subpath = "./experiments_seri2/{}".format(env.spec.id)
experiment_dir = os.path.abspath(experiment_subpath)

# Create a global step variable (non-trainable counter).
global_step = tf.Variable(0, name='global_step', trainable=False)

# Create estimators; only the online Q-network is given a summaries directory.
q_estimator = Estimator(scope="q_estimator", summaries_dir=experiment_dir)
target_estimator = Estimator(scope="target_q")

# State processor
state_processor = StateProcessor()

# Everything passed by keyword to deep_q_learning, gathered in one place so
# the run configuration can be read (and tuned) at a glance.
# `ser_coef` is specific to SERI.deep_q_learning; see that module for its meaning.
dqn_settings = dict(
    q_estimator=q_estimator,
    target_estimator=target_estimator,
    state_processor=state_processor,
    experiment_dir=experiment_dir,
    num_episodes=10000,
    replay_memory_size=500000,
    replay_memory_init_size=200000,
    update_target_estimator_every=10000,
    epsilon_start=1.0,
    epsilon_end=0.1,
    epsilon_decay_steps=500000,
    discount_factor=0.99,
    batch_size=40,
    ser_coef=32,
)

# Most recent episode reward from each item yielded by deep_q_learning.
results = []

# Run it!
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # `stats` stays bound to the last yielded value after the loop ends.
    for t, stats in deep_q_learning(sess, env, **dqn_settings):
        latest_reward = stats.episode_rewards[-1]
        results.append(latest_reward)
        print("\nEpisode Reward: {}".format(latest_reward))




Instructions for updating:
Use `tf.cast` instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.flatten instead.


Instructions for updating:
Please switch to tf.train.get_global_step
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor







Populating replay memory...
Mean reward in replay buffer: 1.3582089552238805 STD: 1.2771236338426295
Target mean reward: 3.9134562229091396
Secondary experience collection...
Last reward: 5.0
Mean reward in replay buffer: 1.3819095477386936 STD: 1.29680040

Last reward: 3.0
Mean reward in replay buffer: 1.7868852459016393 STD: 1.17363089417846
Last reward: 0.0
Mean reward in replay buffer: 1.7868852459016393 STD: 1.17363089417846
Last reward: 1.0
Mean reward in replay buffer: 1.7923497267759563 STD: 1.1675777725878194
Last reward: 1.0
Mean reward in replay buffer: 1.8076923076923077 STD: 1.1569373524089648
Last reward: 0.0
Mean reward in replay buffer: 1.8076923076923077 STD: 1.1569373524089648
Last reward: 1.0
Mean reward in replay buffer: 1.8131868131868132 STD: 1.1506624362757683
Last reward: 0.0
Mean reward in replay buffer: 1.8131868131868132 STD: 1.1506624362757683
Last reward: 1.0
Mean reward in replay buffer: 1.8186813186813187 STD: 1.1443265855275813
Last reward: 3.0
Mean reward in replay buffer: 1.8555555555555556 STD: 1.129207079406277
Last reward: 1.0
Mean reward in replay buffer: 1.8611111111111112 STD: 1.1224668010364165
Last reward: 1.0
Mean reward in replay buffer: 1.8666666666666667 STD: 1.1156579840749439
Last reward: 0.

Last reward: 1.0
Mean reward in replay buffer: 2.2951807228915664 STD: 1.0632601226197724
Last reward: 3.0
Mean reward in replay buffer: 2.3072289156626504 STD: 1.0598208119020098
Last reward: 1.0
Mean reward in replay buffer: 2.3072289156626504 STD: 1.0598208119020098
Last reward: 0.0
Mean reward in replay buffer: 2.3072289156626504 STD: 1.0598208119020098
Last reward: 1.0
Mean reward in replay buffer: 2.3072289156626504 STD: 1.0598208119020098
Last reward: 0.0
Mean reward in replay buffer: 2.3072289156626504 STD: 1.0598208119020098
Last reward: 1.0
Mean reward in replay buffer: 2.3072289156626504 STD: 1.0598208119020098
Last reward: 3.0
Mean reward in replay buffer: 2.327272727272727 STD: 1.0543963500369182
Last reward: 0.0
Mean reward in replay buffer: 2.327272727272727 STD: 1.0543963500369182
Last reward: 3.0
Mean reward in replay buffer: 2.3393939393939394 STD: 1.0505339507461726
Last reward: 1.0
Mean reward in replay buffer: 2.3393939393939394 STD: 1.0505339507461726
Last reward:

Last reward: 1.0
Mean reward in replay buffer: 2.6903225806451614 STD: 0.9014255804469974
Last reward: 1.0
Mean reward in replay buffer: 2.6903225806451614 STD: 0.9014255804469974
Last reward: 2.0
Mean reward in replay buffer: 2.6903225806451614 STD: 0.9014255804469974
Last reward: 1.0
Mean reward in replay buffer: 2.6903225806451614 STD: 0.9014255804469974
Last reward: 1.0
Mean reward in replay buffer: 2.6903225806451614 STD: 0.9014255804469974
Last reward: 0.0
Mean reward in replay buffer: 2.6903225806451614 STD: 0.9014255804469974
Last reward: 0.0
Mean reward in replay buffer: 2.6903225806451614 STD: 0.9014255804469974
Last reward: 2.0
Mean reward in replay buffer: 2.6903225806451614 STD: 0.9014255804469974
Last reward: 0.0
Mean reward in replay buffer: 2.6903225806451614 STD: 0.9014255804469974
Last reward: 1.0
Mean reward in replay buffer: 2.6903225806451614 STD: 0.9014255804469974
Last reward: 1.0
Mean reward in replay buffer: 2.6903225806451614 STD: 0.9014255804469974
Last rewar

Last reward: 2.0
Mean reward in replay buffer: 2.827814569536424 STD: 0.9220382450778891
Last reward: 0.0
Mean reward in replay buffer: 2.8092105263157894 STD: 0.9471711289086739
Last reward: 2.0
Mean reward in replay buffer: 2.827814569536424 STD: 0.9220382450778891
Last reward: 2.0
Mean reward in replay buffer: 2.827814569536424 STD: 0.9220382450778891
Last reward: 1.0
Mean reward in replay buffer: 2.8157894736842106 STD: 0.9308619706300629
Last reward: 2.0
Mean reward in replay buffer: 2.827814569536424 STD: 0.9220382450778891
Last reward: 2.0
Mean reward in replay buffer: 2.827814569536424 STD: 0.9220382450778891
Last reward: 0.0
Mean reward in replay buffer: 2.8092105263157894 STD: 0.9471711289086739
Last reward: 2.0
Mean reward in replay buffer: 2.8223684210526314 STD: 0.9214297390627668
Last reward: 0.0
Mean reward in replay buffer: 2.8223684210526314 STD: 0.9214297390627668
Last reward: 0.0
Mean reward in replay buffer: 2.8223684210526314 STD: 0.9214297390627668
Last reward: 1.

Last reward: 2.0
Mean reward in replay buffer: 2.9395973154362416 STD: 0.9316375289768816
Last reward: 0.0
Mean reward in replay buffer: 2.9395973154362416 STD: 0.9316375289768816
Last reward: 0.0
Mean reward in replay buffer: 2.9395973154362416 STD: 0.9316375289768816
Last reward: 2.0
Mean reward in replay buffer: 2.9395973154362416 STD: 0.9316375289768816
Last reward: 0.0
Mean reward in replay buffer: 2.9395973154362416 STD: 0.9316375289768816
Last reward: 3.0
Mean reward in replay buffer: 2.9527027027027026 STD: 0.9282743807619815
Last reward: 0.0
Mean reward in replay buffer: 2.9328859060402683 STD: 0.9562343765210277
Last reward: 1.0
Mean reward in replay buffer: 2.9527027027027026 STD: 0.9282743807619815
Last reward: 3.0
Mean reward in replay buffer: 2.9594594594594597 STD: 0.9249260193131269
Last reward: 1.0
Mean reward in replay buffer: 2.9594594594594597 STD: 0.9249260193131269
Last reward: 1.0
Mean reward in replay buffer: 2.9594594594594597 STD: 0.9249260193131269
Last rewar

Last reward: 4.0
Mean reward in replay buffer: 3.1888111888111887 STD: 0.9112060492003338
Last reward: 1.0
Mean reward in replay buffer: 3.1888111888111887 STD: 0.9112060492003338
Last reward: 1.0
Mean reward in replay buffer: 3.1888111888111887 STD: 0.9112060492003338
Last reward: 0.0
Mean reward in replay buffer: 3.1888111888111887 STD: 0.9112060492003338
Last reward: 2.0
Mean reward in replay buffer: 3.1888111888111887 STD: 0.9112060492003338
Last reward: 3.0
Mean reward in replay buffer: 3.204225352112676 STD: 0.903411441450794
Last reward: 2.0
Mean reward in replay buffer: 3.204225352112676 STD: 0.903411441450794
Last reward: 1.0
Mean reward in replay buffer: 3.204225352112676 STD: 0.903411441450794
Last reward: 1.0
Mean reward in replay buffer: 3.204225352112676 STD: 0.903411441450794
Last reward: 1.0
Mean reward in replay buffer: 3.204225352112676 STD: 0.903411441450794
Last reward: 2.0
Mean reward in replay buffer: 3.204225352112676 STD: 0.903411441450794
Last reward: 0.0
Mean 

Last reward: 0.0
Mean reward in replay buffer: 3.3597122302158273 STD: 0.8341201582071966
Last reward: 1.0
Mean reward in replay buffer: 3.3597122302158273 STD: 0.8341201582071966
Last reward: 3.0
Mean reward in replay buffer: 3.366906474820144 STD: 0.8265861773469003
Last reward: 1.0
Mean reward in replay buffer: 3.366906474820144 STD: 0.8265861773469003
Last reward: 0.0
Mean reward in replay buffer: 3.366906474820144 STD: 0.8265861773469003
Last reward: 0.0
Mean reward in replay buffer: 3.366906474820144 STD: 0.8265861773469003
Last reward: 3.0
Mean reward in replay buffer: 3.3840579710144927 STD: 0.8134136750963862
Last reward: 0.0
Mean reward in replay buffer: 3.3597122302158273 STD: 0.859787589580801
Last reward: 3.0
Mean reward in replay buffer: 3.391304347826087 STD: 0.8054085657547835
Last reward: 3.0
Mean reward in replay buffer: 3.398550724637681 STD: 0.7972567479963706
Last reward: 1.0
Mean reward in replay buffer: 3.398550724637681 STD: 0.7972567479963706
Last reward: 1.0
M

Last reward: 1.0
Mean reward in replay buffer: 3.6194029850746268 STD: 1.0096882279124617
Last reward: 3.0
Mean reward in replay buffer: 3.6194029850746268 STD: 1.0096882279124617
Last reward: 0.0
Mean reward in replay buffer: 3.6194029850746268 STD: 1.0096882279124617
Last reward: 1.0
Mean reward in replay buffer: 3.6194029850746268 STD: 1.0096882279124617
Last reward: 2.0
Mean reward in replay buffer: 3.6194029850746268 STD: 1.0096882279124617
Last reward: 0.0
Mean reward in replay buffer: 3.6194029850746268 STD: 1.0096882279124617
Last reward: 3.0
Mean reward in replay buffer: 3.6194029850746268 STD: 1.0096882279124617
Last reward: 2.0
Mean reward in replay buffer: 3.6194029850746268 STD: 1.0096882279124617
Last reward: 0.0
Mean reward in replay buffer: 3.6194029850746268 STD: 1.0096882279124617
Last reward: 1.0
Mean reward in replay buffer: 3.6194029850746268 STD: 1.0096882279124617
Last reward: 1.0
Mean reward in replay buffer: 3.6194029850746268 STD: 1.0096882279124617
Last rewar

Last reward: 0.0
Mean reward in replay buffer: 3.757575757575758 STD: 1.1059948519489544
Last reward: 1.0
Mean reward in replay buffer: 3.757575757575758 STD: 1.1059948519489544
Last reward: 0.0
Mean reward in replay buffer: 3.757575757575758 STD: 1.1059948519489544
Last reward: 2.0
Mean reward in replay buffer: 3.757575757575758 STD: 1.1059948519489544
Last reward: 1.0
Mean reward in replay buffer: 3.757575757575758 STD: 1.1059948519489544
Last reward: 2.0
Mean reward in replay buffer: 3.757575757575758 STD: 1.1059948519489544
Last reward: 0.0
Mean reward in replay buffer: 3.757575757575758 STD: 1.1059948519489544
Last reward: 2.0
Mean reward in replay buffer: 3.757575757575758 STD: 1.1059948519489544
Last reward: 1.0
Mean reward in replay buffer: 3.757575757575758 STD: 1.1059948519489544
Last reward: 1.0
Mean reward in replay buffer: 3.757575757575758 STD: 1.1059948519489544
Last reward: 3.0
Mean reward in replay buffer: 3.757575757575758 STD: 1.1059948519489544
Last reward: 1.0
Mean

Last reward: 3.0
Mean reward in replay buffer: 3.89922480620155 STD: 1.2044511278502938
Last reward: 1.0
Mean reward in replay buffer: 3.89922480620155 STD: 1.2044511278502938
Last reward: 0.0
Mean reward in replay buffer: 3.8692307692307693 STD: 1.2475617244891157
Last reward: 0.0
Mean reward in replay buffer: 3.8692307692307693 STD: 1.2475617244891157
Last reward: 0.0
Mean reward in replay buffer: 3.8692307692307693 STD: 1.2475617244891157
Last reward: 3.0
Mean reward in replay buffer: 3.89922480620155 STD: 1.2044511278502938
Last reward: 0.0
Mean reward in replay buffer: 3.8692307692307693 STD: 1.2475617244891157
Last reward: 0.0
Mean reward in replay buffer: 3.8692307692307693 STD: 1.2475617244891157
Last reward: 2.0
Mean reward in replay buffer: 3.89922480620155 STD: 1.2044511278502938
Last reward: 1.0
Mean reward in replay buffer: 3.89922480620155 STD: 1.2044511278502938
Last reward: 0.0
Mean reward in replay buffer: 3.8692307692307693 STD: 1.2475617244891157
Last reward: 1.0
Mea

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Number of consecutive episodes averaged for the smoothed reward curve.
ROLLING_WINDOW = 50

# `stats` is the last value bound by the training loop in the previous cell,
# so that cell must have yielded at least once before this one is run.
data = {'Reward': stats.episode_rewards}
df = pd.DataFrame(data)

# Moving average of the reward; the first ROLLING_WINDOW - 1 entries are NaN
# because pandas requires a full window by default.
rolling_mean = df.Reward.rolling(window=ROLLING_WINDOW).mean()

# Explicit figure/axes instead of the implicit pyplot state, with a title and
# axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(df.index, df.Reward, label='SERI Reward')
ax.plot(df.index, rolling_mean, label='SERI MA Reward', color='orange')
ax.set(
    title='SERI training: reward per episode',
    xlabel='Episode',
    ylabel='Reward',
)
ax.legend(loc='upper left')
plt.show()

# Persist the raw per-episode rewards (single 'Reward' column, no index column).
df.to_csv('output_SERI2.csv', index=False)