In [1]:
import matplotlib
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt
%matplotlib inline
import gym
import gridworld
from gym import wrappers, logger
import torch.optim as optim
import pandas as pd
from torch import nn
import seaborn as sns
import os,sys,inspect

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)

from modules.utils import ReplayMemory
from modules.DQN import DQNAgent

In [6]:
if __name__ == '__main__':

    ### parameters ###
    BUFFER_SIZE = 200
    BATCH_SIZE = 20

    GAMMA = 0.70

    EPS = 0.2
    EPS_DECAY = 0.00001
    UPDATE_FREQ = 20


    inSize  = 4
    outSize = 2
    layers = [24, 24]

    ### Environment setting ###
    env_name = 'CartPole-v1'
    env = gym.make(env_name)
    outdir = env_name+'/DQN-agent-results'
    envm = wrappers.Monitor(env, directory=outdir, force=True, video_callable=False)
    env.seed(0)
    env.reset()
    done = False
    verbose = False


    ### Initialization ###  DQN + Experience replay
    replay_mem = ReplayMemory(BUFFER_SIZE, BATCH_SIZE)
    dqn_target = DQNAgent(inSize, outSize, layers, eps=EPS, eps_decay=EPS_DECAY).double()
    dqn_agent = DQNAgent(inSize, outSize, layers).double()


    ### Training Settings ###
    episode_count = 100000
    optimizer = optim.Adam(dqn_agent.parameters())
    huberLoss = nn.SmoothL1Loss()
    rsum_hist = []

    ### Training loop ###

    for episode in range(episode_count): 

        it = 0
        obs = envm.reset()

        rsum = 0.0
        done = False

        while(True):

            if verbose == True:
                env.render()
                
            state = obs
            action = dqn_agent.act(state, env)
            obs, reward, done, _ = envm.step(action)
            rsum += reward

            
            ### storing experience ###
            replay_mem.store(state, action, reward, obs, done)

            if len(replay_mem) > BATCH_SIZE:
                
                ### sampling batchs ###
                states, actions, rewards, next_states, dones = replay_mem.sample()
                
                ### Updating DQN agent ###
                q_targets = rewards + GAMMA * dqn_target.evaluate_max(next_states) * (1.0 - dones.double())
                
                ### forward + Computing loss + backprop ###
                optimizer.zero_grad()
                q_estimates = dqn_agent.evaluate(states, actions)
                loss = huberLoss(q_estimates, q_targets)
                loss.backward()
                optimizer.step()

            ### Resetting target DQN ###
            if it % UPDATE_FREQ == 0:
                dqn_target = dqn_agent

            if done == True:
                print("Episode : " + str(episode) + " rsum=" + str(rsum) +  " iter = "+ str(it) + "eps = " + str(EPS))
                rsum_hist.append(rsum)
                break
                
            it += 1

Episode : 0 rsum=11.0 iter = 10eps = 0.2
Episode : 1 rsum=8.0 iter = 7eps = 0.2
Episode : 2 rsum=15.0 iter = 14eps = 0.2
Episode : 3 rsum=10.0 iter = 9eps = 0.2
Episode : 4 rsum=9.0 iter = 8eps = 0.2
Episode : 5 rsum=11.0 iter = 10eps = 0.2
Episode : 6 rsum=9.0 iter = 8eps = 0.2
Episode : 7 rsum=9.0 iter = 8eps = 0.2
Episode : 8 rsum=9.0 iter = 8eps = 0.2
Episode : 9 rsum=8.0 iter = 7eps = 0.2
Episode : 10 rsum=11.0 iter = 10eps = 0.2
Episode : 11 rsum=8.0 iter = 7eps = 0.2
Episode : 12 rsum=9.0 iter = 8eps = 0.2
Episode : 13 rsum=10.0 iter = 9eps = 0.2
Episode : 14 rsum=8.0 iter = 7eps = 0.2
Episode : 15 rsum=11.0 iter = 10eps = 0.2
Episode : 16 rsum=8.0 iter = 7eps = 0.2
Episode : 17 rsum=11.0 iter = 10eps = 0.2
Episode : 18 rsum=13.0 iter = 12eps = 0.2
Episode : 19 rsum=8.0 iter = 7eps = 0.2
Episode : 20 rsum=10.0 iter = 9eps = 0.2
Episode : 21 rsum=9.0 iter = 8eps = 0.2
Episode : 22 rsum=10.0 iter = 9eps = 0.2
Episode : 23 rsum=9.0 iter = 8eps = 0.2
Episode : 24 rsum=11.0 iter = 10

Episode : 194 rsum=59.0 iter = 58eps = 0.2
Episode : 195 rsum=184.0 iter = 183eps = 0.2
Episode : 196 rsum=58.0 iter = 57eps = 0.2
Episode : 197 rsum=27.0 iter = 26eps = 0.2
Episode : 198 rsum=79.0 iter = 78eps = 0.2
Episode : 199 rsum=140.0 iter = 139eps = 0.2
Episode : 200 rsum=139.0 iter = 138eps = 0.2
Episode : 201 rsum=100.0 iter = 99eps = 0.2
Episode : 202 rsum=193.0 iter = 192eps = 0.2
Episode : 203 rsum=166.0 iter = 165eps = 0.2
Episode : 204 rsum=93.0 iter = 92eps = 0.2
Episode : 205 rsum=327.0 iter = 326eps = 0.2
Episode : 206 rsum=231.0 iter = 230eps = 0.2
Episode : 207 rsum=34.0 iter = 33eps = 0.2
Episode : 208 rsum=60.0 iter = 59eps = 0.2
Episode : 209 rsum=13.0 iter = 12eps = 0.2
Episode : 210 rsum=113.0 iter = 112eps = 0.2
Episode : 211 rsum=122.0 iter = 121eps = 0.2
Episode : 212 rsum=259.0 iter = 258eps = 0.2
Episode : 213 rsum=31.0 iter = 30eps = 0.2
Episode : 214 rsum=111.0 iter = 110eps = 0.2
Episode : 215 rsum=245.0 iter = 244eps = 0.2
Episode : 216 rsum=25.0 iter 

KeyboardInterrupt: 

In [9]:
env.close()

In [8]:
import seaborn as sns
import pandas as pd
window = 20
rsum_hist = pd.concat([pd.Series(rsum_hist, name='mean').rolling(window).mean(),
           pd.Series(rsum_hist, name='std').rolling(window).std()],
          axis=1)

sns.set()
plt.figure(figsize=(10,5))
ax = rsum_hist['mean'].plot()

ax.fill_between(rsum_hist.index, rsum_hist['mean'] - rsum_hist['std'], rsum_hist['mean'] + rsum_hist['std'],
                alpha=.25)
plt.tight_layout()
plt.ylabel("Cumulated sum of rewards (Y)")
plt.title('%s :  $\gamma = %.2f$ | $\epsilon = %.2f$ | $ update \: rate \:(C) = %d$'%(env_name, GAMMA, EPS, UPDATE_FREQ))
plt.legend(['70-Episod rolling mean of Y', '70-Episod rolling std of Y'])
plt.xlabel('Episod')
sns.despine()


ValueError: Wrong number of items passed 2, placement implies 291