In [11]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gym

# for animation
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display

In [12]:

def display_frames_as_gif(frames):
    """Save a sequence of frames as a movie file and show it inline.

    Args:
        frames: Non-empty sequence of image arrays accepted by ``plt.imshow``.
            The first frame's shape determines the figure size.

    Side effects:
        Writes ``movie_cartpole_DQN.mp4`` to the current working directory and
        displays a looping JSAnimation player in the notebook output.
    """
    # Size the figure so that, at 72 dpi, the canvas has the same pixel
    # dimensions as a single frame.
    height, width = frames[0].shape[0], frames[0].shape[1]
    fig = plt.figure(figsize=(width / 72.0, height / 72.0), dpi=72)
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        # Swap the image data in place instead of redrawing the whole axes.
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(fig, animate, frames=len(frames), interval=50)

    # The output name needs an extension: the movie writer infers the
    # container format from it and fails on a bare file name.
    anim.save('movie_cartpole_DQN.mp4')
    display(display_animation(anim, default_mode='loop'))

In [13]:

from collections import namedtuple

# One step of experience: the state the agent was in, the action it took,
# the state that followed, and the reward it received.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])

# Run configuration
ENV = 'CartPole-v0'   # id passed to gym.make
GAMMA = 0.99          # discount applied to the next state's value in the TD target
MAX_STEPS = 200       # upper bound on steps per episode
NUM_EPISODES = 500    # upper bound on training episodes


In [14]:

# Replay memory that stores Transition records
class ReplayMemory:
    """Bounded store of ``Transition`` tuples used for experience replay.

    Once ``capacity`` entries are held, each new push overwrites the slot at
    the current write position, which then advances and wraps around.
    """

    def __init__(self, CAPACITY):
        self.capacity = CAPACITY  # maximum number of transitions kept
        self.memory = []          # stored transitions
        self.index = 0            # slot the next push writes to

    def push(self, state, action, state_next, reward):
        """Store one (state, action, state_next, reward) transition."""
        record = Transition(state, action, state_next, reward)

        # Grow the list by one placeholder slot until capacity is reached.
        if len(self.memory) < self.capacity:
            self.memory.append(None)

        self.memory[self.index] = record
        # Advance the write position, wrapping back to 0 at capacity.
        self.index = (self.index + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn at random without replacement."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Return how many transitions are currently stored (not the capacity)."""
        return len(self.memory)


In [15]:

#-----------------------------------------------------------------------
import random
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F

# Replay / training constants
BATCH_SIZE = 32   # transitions sampled per Brain.replay() update; no update happens until this many are stored
CAPACITY = 10000  # maximum number of transitions held by ReplayMemory

In [16]:

class Brain:
    """Q-network, replay memory and optimizer used by the DQN agent.

    Args:
        num_states: Number of input features of the first linear layer.
        num_actions: Number of outputs of the last linear layer, one Q-value
            per discrete action.
    """

    def __init__(self, num_states, num_actions):
        self.num_actions = num_actions
        # Replay memory object that stores the observed transitions.
        self.memory = ReplayMemory(CAPACITY)

        # Build the network: two hidden layers of 32 units with ReLU.
        self.model = nn.Sequential()
        self.model.add_module('fc1', nn.Linear(num_states, 32))
        self.model.add_module('relu1', nn.ReLU())
        self.model.add_module('fc2', nn.Linear(32, 32))
        self.model.add_module('relu2', nn.ReLU())
        self.model.add_module('fc3', nn.Linear(32, num_actions))
        # Print the model so its structure shows up in the cell output.
        print(self.model)

        # Optimizer
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.0001)

    def replay(self):
        """Run one optimization step on a random minibatch from replay memory.

        Returns without doing anything while fewer than ``BATCH_SIZE``
        transitions are stored. Otherwise it minimizes the smooth-L1 loss
        between Q(s_t, a_t) and ``reward + GAMMA * max_a Q(s_{t+1}, a)``,
        where the second term is 0 for transitions whose next state is None.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        # Sample a minibatch and transpose it:
        # BATCH_SIZE x (s, a, s_next, r) -> (all s, all a, all s_next, all r).
        transitions = self.memory.sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))

        # Concatenate the per-step tensors along dim 0.
        # assumes each stored state is (1, num_states), each action (1, 1)
        # and each reward (1,), as produced by the training loop.
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # True where the transition has a next state. A bool mask is used
        # because indexing with a uint8 (ByteTensor) mask is deprecated.
        non_final_mask = torch.tensor(
            [s is not None for s in batch.next_state], dtype=torch.bool
        )

        # Evaluation mode for the forward passes.
        self.model.eval()

        # Q(s_t, a_t): pick the Q-value of the action actually taken.
        state_action_values = self.model(state_batch).gather(1, action_batch)

        # max_a Q(s_{t+1}, a); stays 0 where there is no next state.
        next_state_values = torch.zeros(BATCH_SIZE)
        if non_final_mask.any():
            # Guarded because torch.cat raises on an empty list, which would
            # happen if every sampled transition were terminal.
            non_final_next_states = torch.cat(
                [s for s in batch.next_state if s is not None]
            )
            # The target is a constant with respect to the loss, so no graph is built.
            with torch.no_grad():
                next_state_values[non_final_mask] = self.model(non_final_next_states).max(1)[0]

        expected_state_action_values = reward_batch + GAMMA * next_state_values

        # Training mode for the weight update.
        self.model.train()

        # unsqueeze(1) makes the target match the column layout of state_action_values.
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()  # clear accumulated gradients
        loss.backward()             # backpropagation
        self.optimizer.step()       # apply the update

    def decide_action(self, state, episode):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: Input tensor for the network; ``max(1)`` is taken over its
                output, so a leading batch dimension is expected.
            episode: Zero-based episode index; epsilon is ``0.5 / (episode + 1)``.

        Returns:
            A tensor of shape (1, 1) holding the chosen action index.
        """
        # Exploration probability shrinks as the episode count grows.
        epsilon = 0.5 * (1 / (episode + 1))

        if epsilon <= np.random.uniform(0, 1):
            # Greedy action: index of the largest Q-value.
            self.model.eval()
            with torch.no_grad():
                action = self.model(state).max(1)[1].view(1, 1)
        else:
            # Random action, shaped 1x1 like the greedy branch.
            action = torch.LongTensor(
                [[random.randrange(self.num_actions)]]
            )
        return action


In [17]:

# Agent
class Agent:
    """Thin wrapper that forwards acting, remembering and learning to a ``Brain``."""

    def __init__(self, num_states, num_actions):
        # The Brain holds the network, the replay memory and the optimizer.
        self.brain = Brain(num_states, num_actions)

    def update_q_function(self):
        """Trigger one replay update on the brain; returns nothing."""
        self.brain.replay()

    def get_action(self, state, episode):
        """Return the action the brain chooses for ``state`` in this ``episode``."""
        return self.brain.decide_action(state, episode)

    def memorize(self, state, action, state_next, reward):
        """Push one transition into the brain's replay memory."""
        self.brain.memory.push(state, action, state_next, reward)

# Env

In [18]:

# Env
class Environment:
    """Owns the gym environment and the agent, and runs the training loop."""

    def __init__(self):
        self.env = gym.make(ENV)
        # Network input/output sizes are taken from the environment's spaces.
        num_states = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.n
        self.agent = Agent(num_states, num_actions)

    def run(self):
        """Train the agent for up to ``NUM_EPISODES`` episodes.

        After 10 consecutive episodes that end at step index 195 or later,
        one more episode is run with every frame rendered, the frames are
        passed to ``display_frames_as_gif``, and training stops.

        NOTE(review): written against the classic gym API, where ``reset()``
        returns only the observation and ``step()`` returns a 4-tuple.
        """
        # Step counts of the last 10 episodes. It starts as zeros, so the
        # printed mean is understated until 10 episodes have finished.
        episode_10_list = np.zeros(10)
        complete_episodes = 0   # consecutive successful episodes
        episode_final = False   # True while running the recorded final episode
        frames = []

        for episode in range(NUM_EPISODES):
            observation = self.env.reset()

            # Convert the numpy observation to a float tensor with a leading
            # batch dimension of 1.
            state = torch.from_numpy(observation).type(torch.FloatTensor)
            state = torch.unsqueeze(state, 0)

            for step in range(MAX_STEPS):
                if episode_final is True:
                    frames.append(self.env.render(mode='rgb_array'))

                action = self.agent.get_action(state, episode)

                # The environment's own reward is ignored; a custom one is built below.
                observation_next, _, done, _ = self.env.step(action.item())

                if done:
                    # None marks "no next state" for the replay target.
                    state_next = None
                    episode_10_list = np.hstack((episode_10_list[1:], step + 1))

                    if step < 195:
                        # Ended early: penalize and reset the success streak.
                        reward = torch.FloatTensor([-1.0])
                        complete_episodes = 0
                    else:
                        # Lasted to step index 195 or beyond: reward and extend the streak.
                        reward = torch.FloatTensor([1.0])
                        complete_episodes = complete_episodes + 1
                else:
                    reward = torch.FloatTensor([0.0])
                    state_next = torch.from_numpy(observation_next).type(torch.FloatTensor)
                    state_next = torch.unsqueeze(state_next, 0)

                self.agent.memorize(state, action, state_next, reward)

                self.agent.update_q_function()

                state = state_next

                if done:
                    print('%d Episode: Finished after %d steps: last 10 episodes mean step = %.1f' % (
                        episode, step + 1, episode_10_list.mean()
                    ))
                    break

            if episode_final is True:
                display_frames_as_gif(frames)
                # Stop here. Without this break every remaining episode would
                # be rendered, appended to the same frame list, and saved again.
                break

            if complete_episodes >= 10:
                print('10 episode success')
                # The next episode is the recorded one.
                episode_final = True


In [None]:
# Build the environment and its agent, then run the training loop.
cartpole_env = Environment()
# NOTE: Environment defines no render() method; frames are captured inside run().
cartpole_env.run()



Sequential(
  (fc1): Linear(in_features=4, out_features=32, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=32, out_features=32, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=32, out_features=2, bias=True)
)
0 Episode: Finished after 17 steps: last 10 episodes mean step = 1.7
1 Episode: Finished after 10 steps: last 10 episodes mean step = 2.7
2 Episode: Finished after 12 steps: last 10 episodes mean step = 3.9
3 Episode: Finished after 10 steps: last 10 episodes mean step = 4.9
4 Episode: Finished after 9 steps: last 10 episodes mean step = 5.8
5 Episode: Finished after 8 steps: last 10 episodes mean step = 6.6
6 Episode: Finished after 10 steps: last 10 episodes mean step = 7.6




7 Episode: Finished after 9 steps: last 10 episodes mean step = 8.5
8 Episode: Finished after 9 steps: last 10 episodes mean step = 9.4
9 Episode: Finished after 9 steps: last 10 episodes mean step = 10.3
10 Episode: Finished after 10 steps: last 10 episodes mean step = 9.6
11 Episode: Finished after 9 steps: last 10 episodes mean step = 9.5




12 Episode: Finished after 11 steps: last 10 episodes mean step = 9.4
13 Episode: Finished after 8 steps: last 10 episodes mean step = 9.2
14 Episode: Finished after 10 steps: last 10 episodes mean step = 9.3
15 Episode: Finished after 14 steps: last 10 episodes mean step = 9.9
16 Episode: Finished after 10 steps: last 10 episodes mean step = 9.9




17 Episode: Finished after 14 steps: last 10 episodes mean step = 10.4
18 Episode: Finished after 14 steps: last 10 episodes mean step = 10.9
19 Episode: Finished after 13 steps: last 10 episodes mean step = 11.3
20 Episode: Finished after 12 steps: last 10 episodes mean step = 11.5
21 Episode: Finished after 13 steps: last 10 episodes mean step = 11.9




22 Episode: Finished after 20 steps: last 10 episodes mean step = 12.8
23 Episode: Finished after 15 steps: last 10 episodes mean step = 13.5
24 Episode: Finished after 18 steps: last 10 episodes mean step = 14.3




25 Episode: Finished after 14 steps: last 10 episodes mean step = 14.3
26 Episode: Finished after 15 steps: last 10 episodes mean step = 14.8
27 Episode: Finished after 17 steps: last 10 episodes mean step = 15.1
28 Episode: Finished after 18 steps: last 10 episodes mean step = 15.5




29 Episode: Finished after 16 steps: last 10 episodes mean step = 15.8
30 Episode: Finished after 13 steps: last 10 episodes mean step = 15.9
31 Episode: Finished after 19 steps: last 10 episodes mean step = 16.5
32 Episode: Finished after 16 steps: last 10 episodes mean step = 16.1




33 Episode: Finished after 17 steps: last 10 episodes mean step = 16.3
34 Episode: Finished after 15 steps: last 10 episodes mean step = 16.0
35 Episode: Finished after 17 steps: last 10 episodes mean step = 16.3
36 Episode: Finished after 16 steps: last 10 episodes mean step = 16.4




37 Episode: Finished after 13 steps: last 10 episodes mean step = 16.0
38 Episode: Finished after 20 steps: last 10 episodes mean step = 16.2
39 Episode: Finished after 14 steps: last 10 episodes mean step = 16.0
40 Episode: Finished after 15 steps: last 10 episodes mean step = 16.2




41 Episode: Finished after 18 steps: last 10 episodes mean step = 16.1
42 Episode: Finished after 16 steps: last 10 episodes mean step = 16.1
43 Episode: Finished after 14 steps: last 10 episodes mean step = 15.8
44 Episode: Finished after 13 steps: last 10 episodes mean step = 15.6




45 Episode: Finished after 8 steps: last 10 episodes mean step = 14.7
46 Episode: Finished after 9 steps: last 10 episodes mean step = 14.0
47 Episode: Finished after 8 steps: last 10 episodes mean step = 13.5
48 Episode: Finished after 9 steps: last 10 episodes mean step = 12.4
49 Episode: Finished after 10 steps: last 10 episodes mean step = 12.0
50 Episode: Finished after 11 steps: last 10 episodes mean step = 11.6




51 Episode: Finished after 15 steps: last 10 episodes mean step = 11.3
52 Episode: Finished after 15 steps: last 10 episodes mean step = 11.2
53 Episode: Finished after 16 steps: last 10 episodes mean step = 11.4




54 Episode: Finished after 16 steps: last 10 episodes mean step = 11.7
55 Episode: Finished after 19 steps: last 10 episodes mean step = 12.8
56 Episode: Finished after 20 steps: last 10 episodes mean step = 13.9




57 Episode: Finished after 16 steps: last 10 episodes mean step = 14.7
58 Episode: Finished after 17 steps: last 10 episodes mean step = 15.5
59 Episode: Finished after 11 steps: last 10 episodes mean step = 15.6
60 Episode: Finished after 13 steps: last 10 episodes mean step = 15.8
61 Episode: Finished after 16 steps: last 10 episodes mean step = 15.9




62 Episode: Finished after 17 steps: last 10 episodes mean step = 16.1
63 Episode: Finished after 14 steps: last 10 episodes mean step = 15.9
64 Episode: Finished after 16 steps: last 10 episodes mean step = 15.9




65 Episode: Finished after 16 steps: last 10 episodes mean step = 15.6
66 Episode: Finished after 15 steps: last 10 episodes mean step = 15.1
67 Episode: Finished after 15 steps: last 10 episodes mean step = 15.0
68 Episode: Finished after 12 steps: last 10 episodes mean step = 14.5
69 Episode: Finished after 13 steps: last 10 episodes mean step = 14.7




70 Episode: Finished after 11 steps: last 10 episodes mean step = 14.5
71 Episode: Finished after 12 steps: last 10 episodes mean step = 14.1
72 Episode: Finished after 11 steps: last 10 episodes mean step = 13.5
73 Episode: Finished after 14 steps: last 10 episodes mean step = 13.5
74 Episode: Finished after 11 steps: last 10 episodes mean step = 13.0




75 Episode: Finished after 11 steps: last 10 episodes mean step = 12.5
76 Episode: Finished after 12 steps: last 10 episodes mean step = 12.2
77 Episode: Finished after 13 steps: last 10 episodes mean step = 12.0
78 Episode: Finished after 15 steps: last 10 episodes mean step = 12.3
79 Episode: Finished after 13 steps: last 10 episodes mean step = 12.3




80 Episode: Finished after 14 steps: last 10 episodes mean step = 12.6
81 Episode: Finished after 14 steps: last 10 episodes mean step = 12.8
82 Episode: Finished after 13 steps: last 10 episodes mean step = 13.0
83 Episode: Finished after 16 steps: last 10 episodes mean step = 13.2




84 Episode: Finished after 13 steps: last 10 episodes mean step = 13.4
85 Episode: Finished after 16 steps: last 10 episodes mean step = 13.9
86 Episode: Finished after 15 steps: last 10 episodes mean step = 14.2
87 Episode: Finished after 16 steps: last 10 episodes mean step = 14.5




88 Episode: Finished after 15 steps: last 10 episodes mean step = 14.5
89 Episode: Finished after 22 steps: last 10 episodes mean step = 15.4
90 Episode: Finished after 17 steps: last 10 episodes mean step = 15.7




91 Episode: Finished after 21 steps: last 10 episodes mean step = 16.4
92 Episode: Finished after 31 steps: last 10 episodes mean step = 18.2




93 Episode: Finished after 39 steps: last 10 episodes mean step = 20.5
94 Episode: Finished after 24 steps: last 10 episodes mean step = 21.6




95 Episode: Finished after 83 steps: last 10 episodes mean step = 28.3
96 Episode: Finished after 28 steps: last 10 episodes mean step = 29.6




97 Episode: Finished after 25 steps: last 10 episodes mean step = 30.5
98 Episode: Finished after 28 steps: last 10 episodes mean step = 31.8
99 Episode: Finished after 25 steps: last 10 episodes mean step = 32.1




100 Episode: Finished after 32 steps: last 10 episodes mean step = 33.6
101 Episode: Finished after 37 steps: last 10 episodes mean step = 35.2




102 Episode: Finished after 82 steps: last 10 episodes mean step = 40.3
103 Episode: Finished after 36 steps: last 10 episodes mean step = 40.0




104 Episode: Finished after 30 steps: last 10 episodes mean step = 40.6
105 Episode: Finished after 21 steps: last 10 episodes mean step = 34.4




106 Episode: Finished after 65 steps: last 10 episodes mean step = 38.1
107 Episode: Finished after 31 steps: last 10 episodes mean step = 38.7




108 Episode: Finished after 41 steps: last 10 episodes mean step = 40.0




109 Episode: Finished after 70 steps: last 10 episodes mean step = 44.5
110 Episode: Finished after 36 steps: last 10 episodes mean step = 44.9




111 Episode: Finished after 85 steps: last 10 episodes mean step = 49.7
112 Episode: Finished after 40 steps: last 10 episodes mean step = 45.5




113 Episode: Finished after 26 steps: last 10 episodes mean step = 44.5
114 Episode: Finished after 46 steps: last 10 episodes mean step = 46.1




115 Episode: Finished after 41 steps: last 10 episodes mean step = 48.1
116 Episode: Finished after 37 steps: last 10 episodes mean step = 45.3




117 Episode: Finished after 22 steps: last 10 episodes mean step = 44.4
118 Episode: Finished after 53 steps: last 10 episodes mean step = 45.6




119 Episode: Finished after 26 steps: last 10 episodes mean step = 41.2
120 Episode: Finished after 29 steps: last 10 episodes mean step = 40.5




121 Episode: Finished after 67 steps: last 10 episodes mean step = 38.7
122 Episode: Finished after 34 steps: last 10 episodes mean step = 38.1




123 Episode: Finished after 34 steps: last 10 episodes mean step = 38.9
124 Episode: Finished after 30 steps: last 10 episodes mean step = 37.3




125 Episode: Finished after 55 steps: last 10 episodes mean step = 38.7
126 Episode: Finished after 30 steps: last 10 episodes mean step = 38.0




127 Episode: Finished after 56 steps: last 10 episodes mean step = 41.4
128 Episode: Finished after 48 steps: last 10 episodes mean step = 40.9




129 Episode: Finished after 37 steps: last 10 episodes mean step = 42.0
130 Episode: Finished after 43 steps: last 10 episodes mean step = 43.4




131 Episode: Finished after 29 steps: last 10 episodes mean step = 39.6
132 Episode: Finished after 31 steps: last 10 episodes mean step = 39.3




133 Episode: Finished after 38 steps: last 10 episodes mean step = 39.7




134 Episode: Finished after 94 steps: last 10 episodes mean step = 46.1




135 Episode: Finished after 57 steps: last 10 episodes mean step = 46.3




136 Episode: Finished after 71 steps: last 10 episodes mean step = 50.4




137 Episode: Finished after 84 steps: last 10 episodes mean step = 53.2




138 Episode: Finished after 62 steps: last 10 episodes mean step = 54.6
139 Episode: Finished after 55 steps: last 10 episodes mean step = 56.4




140 Episode: Finished after 38 steps: last 10 episodes mean step = 55.9




141 Episode: Finished after 61 steps: last 10 episodes mean step = 59.1




142 Episode: Finished after 94 steps: last 10 episodes mean step = 65.4




143 Episode: Finished after 60 steps: last 10 episodes mean step = 67.6




144 Episode: Finished after 113 steps: last 10 episodes mean step = 69.5




145 Episode: Finished after 65 steps: last 10 episodes mean step = 70.3




146 Episode: Finished after 71 steps: last 10 episodes mean step = 70.3




147 Episode: Finished after 52 steps: last 10 episodes mean step = 67.1




148 Episode: Finished after 73 steps: last 10 episodes mean step = 68.2




149 Episode: Finished after 166 steps: last 10 episodes mean step = 79.3




150 Episode: Finished after 82 steps: last 10 episodes mean step = 83.7




151 Episode: Finished after 80 steps: last 10 episodes mean step = 85.6




152 Episode: Finished after 92 steps: last 10 episodes mean step = 85.4




153 Episode: Finished after 125 steps: last 10 episodes mean step = 91.9




154 Episode: Finished after 110 steps: last 10 episodes mean step = 91.6




155 Episode: Finished after 168 steps: last 10 episodes mean step = 101.9




156 Episode: Finished after 165 steps: last 10 episodes mean step = 111.3




157 Episode: Finished after 149 steps: last 10 episodes mean step = 121.0




158 Episode: Finished after 116 steps: last 10 episodes mean step = 125.3




159 Episode: Finished after 200 steps: last 10 episodes mean step = 128.7




160 Episode: Finished after 200 steps: last 10 episodes mean step = 140.5




161 Episode: Finished after 200 steps: last 10 episodes mean step = 152.5




162 Episode: Finished after 200 steps: last 10 episodes mean step = 163.3




163 Episode: Finished after 200 steps: last 10 episodes mean step = 170.8




164 Episode: Finished after 200 steps: last 10 episodes mean step = 179.8




165 Episode: Finished after 200 steps: last 10 episodes mean step = 183.0




166 Episode: Finished after 200 steps: last 10 episodes mean step = 186.5




167 Episode: Finished after 200 steps: last 10 episodes mean step = 191.6




168 Episode: Finished after 200 steps: last 10 episodes mean step = 200.0
10 episode success
