In [1]:
import numpy as np
import tensorflow as tf
import keras
import gym
from keras import layers, optimizers, losses
import matplotlib.pyplot as plt
import time


In [53]:
class experience_memory():

    def __init__(self, buffer_capacity, batch_size, state_dim, action_dim):
        # Number of "experiences" to store at max
        self.buffer_capacity = buffer_capacity
        # Num of tuples to train on.
        self.batch_size = batch_size

        # Its tells us num of times record() was called.
        self.buffer_counter = 0

        # Instead of list of tuples as the exp.replay concept go
        # We use different np.arrays for each tuple element
        self.state_buffer = np.zeros((self.buffer_capacity, state_dim), dtype=np.float32)
        self.action_buffer = np.zeros((self.buffer_capacity, action_dim), dtype=np.float32)
        self.reward_buffer = np.zeros((self.buffer_capacity, 1), dtype=np.float32)
        self.next_state_buffer = np.zeros((self.buffer_capacity, state_dim), dtype=np.float32)
        self.done_buffer = np.zeros((self.buffer_capacity, 1), dtype=np.float32)

    # Takes (s,a,r,s') obervation tuple as input
    def record(self, obs_tuple):
        # Set index to zero if buffer_capacity is exceeded,
        # replacing old records
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]
        self.done_buffer[index] = obs_tuple[4]


        self.buffer_counter += 1

class MountainCar():

    def __init__(self):
        
        self.env = gym.make("CartPole-v1")

        self.batch_size = 64
        self.max_memory_size = 10000

        self.state_dim = 4
        self.action_dim = 1

        self.gamma = 0.8
        self.tau = 0.1
        self.lower_action_bound = -1
        self.upper_action_bound = 1

        self.action_space = np.array([0,1])
        self.num_a = len(self.action_space)

        self.buffer = experience_memory(self.max_memory_size, self.batch_size, self.state_dim, self.action_dim)

        # init the neural netsf
        self.critic = self.get_critic_NN()
        self.target_critic = self.get_critic_NN()
        self.target_critic.set_weights(self.critic.get_weights())
        
        self.alpha = 0.1
        self.critic_optimizer = tf.keras.optimizers.Adam(self.alpha)

    #@tf.function
    def update(self, state_batch, action_batch, reward_batch, next_state_batch, done_batch):
             
    
        with tf.GradientTape() as tape:
            
            next_q_vals =  self.target_critic(next_state_batch, training=True)
            
            target_vals = tf.reshape(tf.reduce_max(next_q_vals, axis =1),[self.batch_size,1])
            y = reward_batch + (1-done_batch)* self.gamma*target_vals
           
            critic_value = self.critic(state_batch, training=True)
            
            mask = tf.ones((self.batch_size, self.num_a))
            critic_value[:, action_batch].assign(critic_value[:,action_batch] - y)
            y = tf.multiply(mask, y)
            
            dif = tf.add(y,-critic_value)
           
            dif = tf.reduce_sum(dif,axis = 0)
            
            critic_loss = tf.reduce_mean(tf.math.square(dif))
        
        
        critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(zip(critic_grad, self.critic.trainable_variables))

        
    def learn(self):
        # get sample

        record_range = min(self.buffer.buffer_counter, self.buffer.buffer_capacity)

        batch_indices = np.random.choice(record_range, self.batch_size)
        
        state_batch = tf.convert_to_tensor(self.buffer.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.buffer.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.buffer.reward_buffer[batch_indices])
        next_state_batch = tf.convert_to_tensor(self.buffer.next_state_buffer[batch_indices])
        done_batch = tf.convert_to_tensor(self.buffer.done_buffer[batch_indices])
        
        self.update(state_batch, action_batch, reward_batch, next_state_batch,done_batch)

    @tf.function
    def update_target(self, target_weights, weights):
        for (a,b) in zip(target_weights, weights):
            a.assign(self.tau *b + (1-self.tau) *a)

    def get_critic_NN(self):
        # input [state, action]
        
        state_input = layers.Input(shape =(self.state_dim,))

        out = layers.Dense(32, activation = 'relu')(state_input)
        out = layers.BatchNormalization()(out)
        out = layers.Dense(32, activation = 'relu')(out)
        out = layers.Dense(self.num_a)(out)

        
        model = keras.Model(inputs = state_input, outputs = out)
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate= 0.001),
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.MeanSquaredError()],
        )
        return model

    def epsilon_greedy(self, state, eps):

        q_vals = self.critic(state)
        
        if (eps > np.random.rand()):
           
            rand_ind = np.random.choice(self.num_a)
            
            return self.action_space[rand_ind]
        
        else:
            
            a_ind = tf.argmax(q_vals,axis = 1)
           
            return self.action_space[a_ind]
    
    def run_MC(self):
        done = False
        state = self.env.reset()
        state = tf.expand_dims(tf.convert_to_tensor(state),0)
        t_counter = 0
        while (True):
            self.env.render()
            time.sleep(0.01)
            
            a_ind = np.argmax(self.critic(state))
            
            action = self.action_space[a_ind]
            new_state, reward, done, info = self.env.step(action)
        
            new_state = tf.expand_dims(tf.convert_to_tensor(new_state.reshape(self.state_dim)),0)
            state = new_state
            t_counter += 1

            if (done):
                break
        print('Zeit: ', t_counter)
        

In [54]:
MC = MountainCar()

ep_reward_list = []
# To store average reward history of last few episodes
avg_reward_list = []
num_episode = 3000

for ep in range(num_episode):
    
    done = False
    state = MC.env.reset()
    #state = tf.expand_dims(tf.convert_to_tensor(state),0)
    state = np.reshape(state, [1,MC.state_dim])
    episodic_reward = 0
    t_counter = 0

    if (ep % 100 == 0 and ep != 0):
        MC.run_MC()
    while(t_counter < 200):
        MC.alpha = np.max([0.01,0.1 * (1 / (1 +ep))])
        eps = np.max([0.01,0.9 * (100 / (100 +ep))])
        action = MC.epsilon_greedy(state, eps)
        
        new_state, reward, done, info = MC.env.step(action)
        
        #new_state = tf.expand_dims(tf.convert_to_tensor(new_state.reshape(MC.state_dim)),0)
        new_state = np.reshape(new_state, [1,MC.state_dim])
        episodic_reward += reward
        
        MC.buffer.record((state,action,reward, new_state, done))
       
        MC.learn()
    
        MC.update_target(MC.target_critic.variables, MC.critic.variables)

        state = new_state
        t_counter +=1
        if (done):
            break
    ep_reward_list.append(episodic_reward)
    # Mean of last 40 episodes
    avg_reward = np.mean(ep_reward_list[-20:])
    print("Episode * {} * AVG Reward is ==> {}".format(ep, avg_reward))
    avg_reward_list.append(avg_reward)
# Plotting graph
# Episodes versus Avg. Rewards
plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Epsiodic Reward")
plt.show()



ValueError: No gradients provided for any variable: (['dense_144/kernel:0', 'dense_144/bias:0', 'batch_normalization_48/gamma:0', 'batch_normalization_48/beta:0', 'dense_145/kernel:0', 'dense_145/bias:0', 'dense_146/kernel:0', 'dense_146/bias:0'],). Provided `grads_and_vars` is ((None, <tf.Variable 'dense_144/kernel:0' shape=(4, 32) dtype=float32, numpy=
array([[-4.00936395e-01, -2.73486882e-01,  1.22568667e-01,
         6.96156919e-02, -9.44505632e-02,  4.23667431e-02,
        -1.74931169e-01,  2.66924918e-01,  3.07104945e-01,
        -3.51616383e-01,  1.83020175e-01,  2.73801386e-01,
         3.20230424e-01, -2.18223467e-01,  2.97599316e-01,
        -4.06813115e-01, -1.49141848e-02, -2.08690673e-01,
        -2.18872100e-01,  2.55886197e-01,  3.06799531e-01,
         2.89450407e-01, -3.70396882e-01, -1.94990143e-01,
         1.26841724e-01,  1.57133043e-01, -4.00922060e-01,
        -2.48249635e-01,  3.04789484e-01,  4.02402043e-01,
        -3.99610877e-01,  1.71460986e-01],
       [-2.66813934e-01,  3.51128161e-01,  2.45302677e-01,
         2.66947091e-01, -2.34754965e-01, -2.01329798e-01,
        -2.44265169e-01,  2.25244284e-01,  3.09244931e-01,
         2.17825055e-01,  1.72226548e-01, -3.07151884e-01,
        -2.22975209e-01,  1.00005925e-01, -3.86716276e-01,
        -8.73490572e-02,  4.01823103e-01, -2.74759322e-01,
         1.46090448e-01,  1.93512201e-01, -4.81333137e-02,
         8.66463184e-02, -9.58945155e-02, -1.44664079e-01,
        -3.33165556e-01,  1.54869556e-01, -2.29780897e-01,
         3.05997729e-02, -7.59783089e-02, -3.94917250e-01,
        -1.47163868e-04,  1.72646999e-01],
       [-3.97770017e-01,  2.12070704e-01,  3.85072708e-01,
        -2.50706851e-01, -1.39537305e-01,  1.48036003e-01,
        -9.17261839e-02,  4.97787297e-02, -2.68852800e-01,
        -9.83342826e-02, -6.56511784e-02, -1.63328856e-01,
        -1.84429884e-01, -2.51028419e-01, -8.40532482e-02,
        -4.41760123e-02,  1.04923487e-01, -2.40390599e-02,
         2.48792291e-01, -1.93467438e-01, -2.64415443e-01,
        -1.35006607e-01, -1.25621974e-01, -4.03649479e-01,
        -2.01423332e-01, -9.32657123e-02, -2.92053044e-01,
         7.21679032e-02,  1.28977954e-01, -1.46484435e-01,
        -8.31231475e-03, -3.72183621e-02],
       [ 1.68977678e-02,  1.43736303e-01,  2.65276730e-01,
        -3.69222045e-01,  2.20938981e-01, -3.70237529e-01,
        -1.32144302e-01, -2.54301488e-01, -3.36276054e-01,
         6.96541369e-02, -1.19539082e-01, -2.78853476e-01,
         1.85334206e-01,  3.87022197e-01,  1.11860394e-01,
        -1.34776711e-01,  5.81405163e-02,  1.68263674e-01,
        -1.52286708e-01,  2.69138217e-01,  2.48173714e-01,
        -7.60022700e-02, -1.23761803e-01,  5.62369525e-02,
        -1.13178581e-01, -2.34067976e-01,  2.97794282e-01,
        -1.52031302e-01, -3.35489303e-01, -1.84138268e-01,
        -6.88560009e-02, -3.40509802e-01]], dtype=float32)>), (None, <tf.Variable 'dense_144/bias:0' shape=(32,) dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)>), (None, <tf.Variable 'batch_normalization_48/gamma:0' shape=(32,) dtype=float32, numpy=
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
      dtype=float32)>), (None, <tf.Variable 'batch_normalization_48/beta:0' shape=(32,) dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)>), (None, <tf.Variable 'dense_145/kernel:0' shape=(32, 32) dtype=float32, numpy=
array([[ 0.27584746, -0.06496839, -0.09250213, ...,  0.0032478 ,
         0.09209427,  0.05609125],
       [ 0.26899102, -0.18551037,  0.27240524, ...,  0.0933547 ,
         0.1751354 , -0.06559962],
       [ 0.25935897,  0.06988373,  0.01142356, ..., -0.2896401 ,
         0.10134986, -0.04356027],
       ...,
       [-0.1810588 , -0.24481012,  0.10682935, ..., -0.19621429,
         0.28625175, -0.2935816 ],
       [ 0.2693306 , -0.11832605,  0.09773582, ..., -0.26795778,
        -0.15686703,  0.29578075],
       [-0.16326559, -0.12623784, -0.0426068 , ...,  0.19628242,
        -0.21311536,  0.29363546]], dtype=float32)>), (None, <tf.Variable 'dense_145/bias:0' shape=(32,) dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)>), (None, <tf.Variable 'dense_146/kernel:0' shape=(32, 2) dtype=float32, numpy=
array([[-0.07349417, -0.08894059],
       [-0.3475104 , -0.09511098],
       [ 0.27645335,  0.32201687],
       [ 0.25151494, -0.35627246],
       [-0.04199424, -0.33878264],
       [-0.40793544, -0.3201866 ],
       [ 0.3453938 ,  0.05230099],
       [ 0.14364138,  0.11987033],
       [ 0.2981361 ,  0.00301608],
       [-0.18693806, -0.22724429],
       [-0.01885152,  0.0340575 ],
       [ 0.3465357 ,  0.00538707],
       [ 0.34752986, -0.05299577],
       [ 0.3433369 ,  0.26623127],
       [ 0.2642022 , -0.09142986],
       [ 0.37769786,  0.02710146],
       [-0.24965996, -0.29034677],
       [ 0.21406886, -0.04898313],
       [ 0.24426469, -0.27606606],
       [-0.20602977, -0.08940771],
       [ 0.20729873, -0.08199587],
       [ 0.26309648,  0.36739483],
       [ 0.13651058, -0.35549104],
       [-0.29195088,  0.10620824],
       [ 0.03803366, -0.01575941],
       [-0.14032584, -0.23998621],
       [ 0.15245304, -0.22797431],
       [-0.1606122 , -0.01687026],
       [-0.1956373 ,  0.19712535],
       [-0.1534813 , -0.21791126],
       [ 0.22740343,  0.06818458],
       [ 0.16559258,  0.1989766 ]], dtype=float32)>), (None, <tf.Variable 'dense_146/bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>)).

In [212]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gym # for environment
from collections import deque
from keras.layers import Dense

import random

In [213]:
class DQLAgent(): 
    
    def __init__(self, env):
        # parameters and hyperparameters
        
        # this part is for neural network or build_model()
        self.state_size = env.observation_space.shape[0] # this is for input of neural network node size
        self.action_size = env.action_space.n # this is for out of neural network node size
        
        # this part is for replay()
        self.gamma = 0.95
        self.learning_rate = 0.001
        
        # this part is for adaptiveEGreedy()
        self.epsilon = 1 # initial exploration rate
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        
        self.memory = deque(maxlen = 1000) # a list with 1000 memory, if it becomes full first inputs will be deleted
        
        self.model = self.build_model()
    
    def build_model(self):
        # neural network for deep Q learning
        model = keras.Sequential()
        model.add(Dense(48, input_dim = self.state_size, activation = 'tanh')) # first hidden layer
        model.add(Dense(self.action_size, activation = 'linear')) # output layer
        model.compile(loss = 'mse', optimizer = tf.keras.optimizers.Adam(lr = self.learning_rate))
        return model
    
    def remember(self, state, action, reward, next_state, done):
        # storage
        self.memory.append((state, action, reward, next_state, done))
    
    def act(self, state):
        # acting, exploit or explore
        if random.uniform(0,1) <= self.epsilon:
            return env.action_space.sample()
        else:
            act_values = self.model.predict(state)
            return np.argmax(act_values[0])
            
    
    def replay(self, batch_size):
        # training
        
        if len(self.memory) < batch_size:
            return # memory is still not full
        
        minibatch = random.sample(self.memory, batch_size) # take 16 (batch_size) random samples from memory
        for state, action, reward, next_state, done in minibatch:
            if done: # if the game is over, I dont have next state, I just have reward 
                target = reward
            else:
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0]) 
                # target = R(s,a) + gamma * max Q`(s`,a`)
                # target (max Q` value) is output of Neural Network which takes s` as an input 
                # amax(): flatten the lists (make them 1 list) and take max value
            train_target = self.model.predict(state) # s --> NN --> Q(s,a)=train_target
            train_target[0][action] = target
            self.model.fit(state, train_target, verbose = 0) # verbose: dont show loss and epoch
    
    def adaptiveEGreedy(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            

if __name__ == "__main__":
    
    # initialize gym environment and agent
    env = gym.make('CartPole-v0')
    agent = DQLAgent(env)

    batch_size = 16
    episodes = 50
    for e in range(episodes):
        
        # initialize environment
        state = env.reset()
        state = np.reshape(state, [1,4])
        
        time = 0 # each second I will get reward, because I want to sustain a balance forever
        while True:
            
            # act
            action = agent.act(state)
            
            # step
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1,4])
            
            # remember / storage
            agent.remember(state, action, reward, next_state, done)
            
            # update state
            state = next_state
            
            # replay
            agent.replay(batch_size)
            
            # adjust epsilon
            agent.adaptiveEGreedy()
            
            time += 1
            
            if done:
                print('episode: {}, time: {}'.format(e, time))
                break

  logger.warn(
  super(Adam, self).__init__(name, **kwargs)


episode: 0, time: 11
episode: 1, time: 15
episode: 2, time: 25
episode: 3, time: 10
episode: 4, time: 40
episode: 5, time: 46
episode: 6, time: 21
episode: 7, time: 29
episode: 8, time: 20
episode: 9, time: 12
episode: 10, time: 18
episode: 11, time: 36
episode: 12, time: 68
episode: 13, time: 63
episode: 14, time: 49
episode: 15, time: 39
episode: 16, time: 24
episode: 17, time: 24
episode: 18, time: 49
episode: 19, time: 59
episode: 20, time: 30
episode: 21, time: 62
episode: 22, time: 32
episode: 23, time: 65
episode: 24, time: 67
episode: 25, time: 65
episode: 26, time: 63
episode: 27, time: 36
episode: 28, time: 56
episode: 29, time: 66
episode: 30, time: 47
episode: 31, time: 54
episode: 32, time: 55
episode: 33, time: 121
episode: 34, time: 60
episode: 35, time: 94
episode: 36, time: 75
episode: 37, time: 146
episode: 38, time: 114
episode: 39, time: 123
episode: 40, time: 200
episode: 41, time: 200
episode: 42, time: 200
episode: 43, time: 200
episode: 44, time: 184


KeyboardInterrupt: 