In [1]:
import numpy as np
import tensorflow as tf
import keras
import gym
from keras import layers, optimizers, losses
import matplotlib.pyplot as plt
import time


In [41]:
class experience_memory():

    def __init__(self, buffer_capacity, batch_size, state_dim, action_dim):
        # Number of "experiences" to store at max
        self.buffer_capacity = buffer_capacity
        # Num of tuples to train on.
        self.batch_size = batch_size

        # Its tells us num of times record() was called.
        self.buffer_counter = 0

        # Instead of list of tuples as the exp.replay concept go
        # We use different np.arrays for each tuple element
        self.state_buffer = np.zeros((self.buffer_capacity, state_dim), dtype=np.float32)
        self.action_buffer = np.zeros((self.buffer_capacity, action_dim), dtype=np.float32)
        self.reward_buffer = np.zeros((self.buffer_capacity, 1), dtype=np.float32)
        self.next_state_buffer = np.zeros((self.buffer_capacity, state_dim), dtype=np.float32)
        self.done_buffer = np.zeros((self.buffer_capacity, 1), dtype=np.float32)

    # Takes (s,a,r,s') obervation tuple as input
    def record(self, obs_tuple):
        # Set index to zero if buffer_capacity is exceeded,
        # replacing old records
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]
        self.done_buffer[index] = obs_tuple[4]


        self.buffer_counter += 1

class MountainCar():

    def __init__(self):
        
        self.env = gym.make("CartPole-v1")

        self.batch_size = 64
        self.max_memory_size = 10000

        self.state_dim = 4
        self.action_dim = 1

        self.gamma = 0.99
        self.tau = 0.01
        self.lower_action_bound = -1
        self.upper_action_bound = 1

        self.action_space = np.array([0,1])
        self.num_a = len(self.action_space)

        self.buffer = experience_memory(self.max_memory_size, self.batch_size, self.state_dim, self.action_dim)

        # init the neural netsf
        self.critic = self.get_critic_NN()
        self.target_critic = self.get_critic_NN()
        self.alpha = 1
        self.critic_optimizer = tf.keras.optimizers.Adam(self.alpha)

    #@tf.function
    def update(self, state_batch, action_batch, reward_batch, next_state_batch, done_batch):
             
        next_q_vals =  self.target_critic(next_state_batch, training=True)
        
        target_vals = tf.reduce_max(next_q_vals, axis =1)
        target_vals = tf.reshape(target_vals, [self.batch_size,1])
        y = reward_batch + (tf.ones_like(done_batch,dtype = np.float32)-done_batch)* self.gamma*target_vals
        
    
  
     

        
        self.critic.fit(state_batch,
        y = y,
        verbose = 0,
        batch_size =  self.batch_size)


        # with tf.GradientTape() as tape:
            
        #     next_q_vals =  self.target_critic(next_state_batch, training=True)
            
        #     target_vals = tf.reduce_max(next_q_vals, axis =1)
        #     y = reward_batch + (tf.ones_like(done_batch)-done_batch)* self.gamma*target_vals
           
        #     critic_value = tf.reduce_max(self.critic(state_batch, training=True),axis =1)
            
        #     critic_loss = tf.math.reduce_mean(tf.math.square(y- critic_value))
        
        
        # critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
        # self.critic_optimizer.apply_gradients(zip(critic_grad, self.critic.trainable_variables))

        
    def learn(self):
        # get sample

        record_range = min(self.buffer.buffer_counter, self.buffer.buffer_capacity)

        batch_indices = np.random.choice(record_range, self.batch_size)
        
        state_batch = tf.convert_to_tensor(self.buffer.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.buffer.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.buffer.reward_buffer[batch_indices])
        next_state_batch = tf.convert_to_tensor(self.buffer.next_state_buffer[batch_indices])
        done_batch = tf.convert_to_tensor(self.buffer.done_buffer[batch_indices])
        
        self.update(state_batch, action_batch, reward_batch, next_state_batch,done_batch)

    @tf.function
    def update_target(self, target_weights, weights):
        for (a,b) in zip(target_weights, weights):
            a.assign(self.tau *b + (1-self.tau) *a)

    def get_critic_NN(self):
        # input [state, action]
        
        state_input = layers.Input(shape =(self.state_dim,))

        out = layers.Dense(256, activation = 'relu')(state_input)
        #out = layers.BatchNormalization()(out)
        out = layers.Dense(256, activation = 'relu')(out)
        out = layers.Dense(self.num_a)(out)

        
        model = keras.Model(inputs = state_input, outputs = out)
        model.compile(
            optimizer='adam',
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.MeanSquaredError()],
        )
        return model

    def epsilon_greedy(self, state, eps):

        q_vals = self.critic(state)
        
        if (eps > np.random.rand()):
           
            rand_ind = np.random.choice(self.num_a)
            
            return self.action_space[rand_ind]
        
        else:
            
            a_ind = tf.argmax(q_vals,axis = 1)
           
            return self.action_space[a_ind]
    
    def run_MC(self):
        done = False
        state = self.env.reset()
        state = tf.expand_dims(tf.convert_to_tensor(state),0)
        t_counter = 0
        while (True):
            self.env.render()
            time.sleep(0.01)
            
            a_ind = np.argmax(self.critic(state))
            
            action = self.action_space[a_ind]
            new_state, reward, done, info = self.env.step(action)
        
            new_state = tf.expand_dims(tf.convert_to_tensor(new_state.reshape(self.state_dim)),0)
            state = new_state
            t_counter += 1

            if (done):
                break
        print('Zeit: ', t_counter)
        

In [42]:
MC = MountainCar()

ep_reward_list = []
# To store average reward history of last few episodes
avg_reward_list = []
num_episode = 2000

for ep in range(num_episode):
    MC.env.reset()
    done = False
    state = MC.env.reset()
    #state = tf.expand_dims(tf.convert_to_tensor(state),0)
    state = np.reshape(state, [1,MC.state_dim])
    episodic_reward = 0
    t_counter = 0

    if (ep % 100 == 0 and ep != 0):
        MC.run_MC()
    while(True):
        MC.alpha = 0.0001
        eps = np.max([0.05,0.1 * (1 / (1 +ep))])
        action = MC.epsilon_greedy(state, eps)
        
        new_state, reward, done, info = MC.env.step(action)
        
        #new_state = tf.expand_dims(tf.convert_to_tensor(new_state.reshape(MC.state_dim)),0)
        new_state = np.reshape(new_state, [1,MC.state_dim])
        episodic_reward += reward
        
        MC.buffer.record((state,action,reward, new_state, done))
        if(MC.buffer.buffer_counter >= MC.batch_size):
            MC.learn()
        
            MC.update_target(MC.target_critic.variables, MC.critic.variables)

        state = new_state
        t_counter +=1
        if (done):
            break
    ep_reward_list.append(episodic_reward)
    # Mean of last 40 episodes
    avg_reward = np.mean(ep_reward_list[-30:])
    print("Episode * {} * AVG Reward is ==> {}".format(ep, avg_reward))
    avg_reward_list.append(avg_reward)
# Plotting graph
# Episodes versus Avg. Rewards
plt.plot(ep_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Epsiodic Reward")
plt.show()



  return self.action_space[a_ind]


Episode * 0 * Reward is ==> 16.0
Episode * 1 * Reward is ==> 18.0
Episode * 2 * Reward is ==> 10.0
Episode * 3 * Reward is ==> 28.0
Episode * 4 * Reward is ==> 15.0
Episode * 5 * Reward is ==> 11.0
Episode * 6 * Reward is ==> 10.0
Episode * 7 * Reward is ==> 13.0
Episode * 8 * Reward is ==> 14.0
Episode * 9 * Reward is ==> 10.0
Episode * 10 * Reward is ==> 22.0
Episode * 11 * Reward is ==> 11.0
Episode * 12 * Reward is ==> 13.0
Episode * 13 * Reward is ==> 10.0
Episode * 14 * Reward is ==> 33.0
Episode * 15 * Reward is ==> 15.0
Episode * 16 * Reward is ==> 46.0
Episode * 17 * Reward is ==> 26.0
Episode * 18 * Reward is ==> 13.0
Episode * 19 * Reward is ==> 15.0
Episode * 20 * Reward is ==> 39.0
Episode * 21 * Reward is ==> 18.0
Episode * 22 * Reward is ==> 29.0
Episode * 23 * Reward is ==> 21.0
Episode * 24 * Reward is ==> 12.0
Episode * 25 * Reward is ==> 13.0
Episode * 26 * Reward is ==> 29.0
Episode * 27 * Reward is ==> 16.0
Episode * 28 * Reward is ==> 37.0
Episode * 29 * Reward is

KeyboardInterrupt: 

In [212]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gym # for environment
from collections import deque
from keras.layers import Dense

import random

In [213]:
class DQLAgent(): 
    
    def __init__(self, env):
        # parameters and hyperparameters
        
        # this part is for neural network or build_model()
        self.state_size = env.observation_space.shape[0] # this is for input of neural network node size
        self.action_size = env.action_space.n # this is for out of neural network node size
        
        # this part is for replay()
        self.gamma = 0.95
        self.learning_rate = 0.001
        
        # this part is for adaptiveEGreedy()
        self.epsilon = 1 # initial exploration rate
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        
        self.memory = deque(maxlen = 1000) # a list with 1000 memory, if it becomes full first inputs will be deleted
        
        self.model = self.build_model()
    
    def build_model(self):
        # neural network for deep Q learning
        model = keras.Sequential()
        model.add(Dense(48, input_dim = self.state_size, activation = 'tanh')) # first hidden layer
        model.add(Dense(self.action_size, activation = 'linear')) # output layer
        model.compile(loss = 'mse', optimizer = tf.keras.optimizers.Adam(lr = self.learning_rate))
        return model
    
    def remember(self, state, action, reward, next_state, done):
        # storage
        self.memory.append((state, action, reward, next_state, done))
    
    def act(self, state):
        # acting, exploit or explore
        if random.uniform(0,1) <= self.epsilon:
            return env.action_space.sample()
        else:
            act_values = self.model.predict(state)
            return np.argmax(act_values[0])
            
    
    def replay(self, batch_size):
        # training
        
        if len(self.memory) < batch_size:
            return # memory is still not full
        
        minibatch = random.sample(self.memory, batch_size) # take 16 (batch_size) random samples from memory
        for state, action, reward, next_state, done in minibatch:
            if done: # if the game is over, I dont have next state, I just have reward 
                target = reward
            else:
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0]) 
                # target = R(s,a) + gamma * max Q`(s`,a`)
                # target (max Q` value) is output of Neural Network which takes s` as an input 
                # amax(): flatten the lists (make them 1 list) and take max value
            train_target = self.model.predict(state) # s --> NN --> Q(s,a)=train_target
            train_target[0][action] = target
            self.model.fit(state, train_target, verbose = 0) # verbose: dont show loss and epoch
    
    def adaptiveEGreedy(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            

if __name__ == "__main__":
    
    # initialize gym environment and agent
    env = gym.make('CartPole-v0')
    agent = DQLAgent(env)

    batch_size = 16
    episodes = 50
    for e in range(episodes):
        
        # initialize environment
        state = env.reset()
        state = np.reshape(state, [1,4])
        
        time = 0 # each second I will get reward, because I want to sustain a balance forever
        while True:
            
            # act
            action = agent.act(state)
            
            # step
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1,4])
            
            # remember / storage
            agent.remember(state, action, reward, next_state, done)
            
            # update state
            state = next_state
            
            # replay
            agent.replay(batch_size)
            
            # adjust epsilon
            agent.adaptiveEGreedy()
            
            time += 1
            
            if done:
                print('episode: {}, time: {}'.format(e, time))
                break

  logger.warn(
  super(Adam, self).__init__(name, **kwargs)


episode: 0, time: 11
episode: 1, time: 15
episode: 2, time: 25
episode: 3, time: 10
episode: 4, time: 40
episode: 5, time: 46
episode: 6, time: 21
episode: 7, time: 29
episode: 8, time: 20
episode: 9, time: 12
episode: 10, time: 18
episode: 11, time: 36
episode: 12, time: 68
episode: 13, time: 63
episode: 14, time: 49
episode: 15, time: 39
episode: 16, time: 24
episode: 17, time: 24
episode: 18, time: 49
episode: 19, time: 59
episode: 20, time: 30
episode: 21, time: 62
episode: 22, time: 32
episode: 23, time: 65
episode: 24, time: 67
episode: 25, time: 65
episode: 26, time: 63
episode: 27, time: 36
episode: 28, time: 56
episode: 29, time: 66
episode: 30, time: 47
episode: 31, time: 54
episode: 32, time: 55
episode: 33, time: 121
episode: 34, time: 60
episode: 35, time: 94
episode: 36, time: 75
episode: 37, time: 146
episode: 38, time: 114
episode: 39, time: 123
episode: 40, time: 200
episode: 41, time: 200
episode: 42, time: 200
episode: 43, time: 200
episode: 44, time: 184


KeyboardInterrupt: 