In [None]:
import tensorflow.python.keras.backend as K
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

from tensorflow.keras.layers import Dense, Input, Add
from tensorflow.keras.models import  Model
from tensorflow.keras.optimizers import Adam

import random
import gym
from collections import deque
import numpy as np

import os
import pickle
import matplotlib
import matplotlib.pyplot as plt
from os import path
!pip3 install box2d-py

In [None]:
# Make sure the checkpoint directory exists; the training loop writes its
# 'test/DDPG_actor_model_*.h5' weight files into it. `exist_ok=True` makes the
# call idempotent without a separate check-then-create step.
os.makedirs('test', exist_ok=True)

# DDPG

In [None]:
class Actor:
    """DDPG actor: an online policy network plus a soft-updated target copy.

    The online network maps a state to an action squashed by a ``tanh`` output
    layer. It is not trained through a Keras loss; instead ``train`` feeds the
    critic's action gradients into a ``tf.gradients`` graph and applies the
    resulting parameter gradients with a TF1 Adam optimizer.
    """

    def __init__(self, env,sess, learning_rate, tau):
        """Build both networks, the policy-gradient update op and sync the target.

        Args:
            env: environment; ``observation_space.shape`` and
                ``action_space.shape`` define the network input/output sizes.
            sess: ``tf.Session`` used for every ``run`` call made by this actor.
            learning_rate: step size of the Adam optimizer behind ``self.optimize``.
            tau: interpolation factor used by ``update_target``.
        """
        self.sess = sess
        K.set_session(sess)
        self.env = env

        self.learning_rate = learning_rate
        self.tau = tau

        self.model, self.state_input = self.create_model(self.env)
        self.target, _ = self.create_model(self.env)

        # Placeholder for dQ/da coming from the critic, one row per sample.
        self.critic_grads = tf.placeholder(tf.float32, [None, self.env.action_space.shape[0]])

        # Chain rule: d(action)/d(theta) weighted by -dQ/da, so that *minimising*
        # with Adam performs gradient ascent on Q.
        self.params_grads = tf.gradients(self.model.output, self.model.trainable_weights, -self.critic_grads)

        # Materialised as a list so the pairs can be both kept on the instance
        # and handed to the optimizer (a bare zip would be exhausted after one use).
        self.grads = list(zip(self.params_grads, self.model.trainable_weights))
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(self.grads)

        # Initialise only what is still uninitialised (e.g. the Adam slots
        # created above). Running a blanket initializer here would re-randomise
        # every variable in the graph, including already-synced weights.
        self._initialize_new_variables()

        # Sync the target *after* initialisation so both networks really start
        # from identical weights.
        self.target.set_weights(self.model.get_weights())

    def _initialize_new_variables(self):
        """Run the initializers of graph variables that have no value yet."""
        candidates = tf.global_variables()
        if not candidates:
            return
        ready_flags = self.sess.run([tf.is_variable_initialized(var) for var in candidates])
        pending = [var for var, ready in zip(candidates, ready_flags) if not ready]
        if pending:
            self.sess.run(tf.variables_initializer(pending))

    def create_model(self, env):
        """Create one policy network.

        Architecture: state -> Dense(32, relu) -> Dense(256, relu)
        -> Dense(32, relu) -> Dense(action_dim, tanh).

        Returns:
            ``(model, state_input)`` where ``state_input`` is the Keras input
            tensor, needed later as a ``feed_dict`` key.
        """
        state_input = Input(shape=env.observation_space.shape)
        h1 = Dense(32, activation='relu')(state_input)
        h2 = Dense(256, activation='relu')(h1)
        h3 = Dense(32, activation='relu')(h2)
        output = Dense(env.action_space.shape[0], activation='tanh')(h3)

        model = Model(state_input,output)
        # The compiled loss/optimizer are not used by `train`, which applies
        # gradients through `self.optimize` instead.
        adam = Adam(lr=0.00001)
        model.compile(loss="mse", optimizer=adam)
        return model, state_input

    def predict_action(self, state):
        """Return the online network's action(s) for a batch of states."""
        action = self.model.predict(state)
        return action

    def predict_next_action(self, next_state):
        """Return the target network's action(s) for a batch of next states."""
        next_action = self.target.predict(next_state)
        return next_action

    def train(self, batch, critic_grads):
        """Apply one policy-gradient step.

        Args:
            batch: 6-element sequence whose first entry is the batch of
                current states; the remaining entries are ignored here.
            critic_grads: dQ/da for each sample, fed into ``self.critic_grads``.

        Returns:
            The parameter gradients evaluated in a second ``run`` call, i.e.
            after the weights have already been updated.
        """
        current_states,_,_,_,_,_ = batch
        feed = {self.state_input: current_states, self.critic_grads: critic_grads}
        self.sess.run(self.optimize, feed_dict=feed)
        grads = self.sess.run(self.params_grads, feed_dict=feed)
        return grads

    def update_target(self):
        """Soft-update: target <- tau * online + (1 - tau) * target."""
        actor_weights = self.model.get_weights()
        target_weights = self.target.get_weights()
        for i in range(len(actor_weights)):
            target_weights[i] = self.tau * actor_weights[i] + (1 - self.tau) * target_weights[i]
        self.target.set_weights(target_weights)

    def save_model_architecture(self, file_name):
        """Write the online network's architecture as JSON to ``file_name``."""
        model_json = self.model.to_json()
        with open(file_name, "w") as json_file:
            json_file.write(model_json)

    def save_weights(self, file_name):
        """Save the online network's weights to ``file_name``."""
        self.model.save_weights(file_name)

In [None]:
class Critic:
    """DDPG critic: an online Q(s, a) network plus a soft-updated target copy.

    The online network is fitted on TD targets with Keras' ``fit``; the
    gradient of its output with respect to the action input is exposed through
    ``calc_grads`` for the actor update.
    """

    def __init__(self,env,sess, learning_rate, tau, discount_factor):
        """Build both networks and the dQ/da gradient op, then sync the target.

        Args:
            env: environment; ``observation_space.shape`` and
                ``action_space.shape`` define the two network inputs.
            sess: ``tf.Session`` used for the explicit ``run`` calls.
            learning_rate: learning rate of the Adam optimizer the models are
                compiled with.
            tau: interpolation factor used by ``update_target``.
            discount_factor: gamma used when forming TD targets.
        """
        self.sess = sess
        K.set_session(sess)
        self.env = env

        self.gamma = discount_factor
        self.tau = tau
        self.learning_rate = learning_rate  # read by create_model, so set first
        self.model, self.state_input, self.action_input = self.create_model(self.env)
        self.target, _, _ = self.create_model(self.env)

        # dQ/da; tf.gradients returns a one-element list here.
        self.gradients = tf.gradients(self.model.output, self.action_input)

        # Initialise only variables that have no value yet. A blanket
        # initializer would re-randomise every variable in the graph, including
        # networks that were built (and synced) before this critic.
        self._initialize_new_variables()

        # Sync the target *after* initialisation so both networks really start
        # from identical weights.
        self.target.set_weights(self.model.get_weights())

    def _initialize_new_variables(self):
        """Run the initializers of graph variables that have no value yet."""
        candidates = tf.global_variables()
        if not candidates:
            return
        ready_flags = self.sess.run([tf.is_variable_initialized(var) for var in candidates])
        pending = [var for var, ready in zip(candidates, ready_flags) if not ready]
        if pending:
            self.sess.run(tf.variables_initializer(pending))

    def create_model(self,env):
        """Create one Q-network.

        Architecture: the state goes through Dense(32, relu) -> Dense(64), the
        action through Dense(64); both branches are summed and followed by
        Dense(256, relu) -> Dense(32, relu) -> Dense(1, linear).

        Returns:
            ``(model, state_input, action_input)``; the two input tensors are
            needed later as ``feed_dict`` keys.
        """
        state_input = Input(shape=env.observation_space.shape)
        state_h1 = Dense(32, activation='relu')(state_input)
        state_h2 = Dense(64)(state_h1)

        action_input = Input(shape=env.action_space.shape)
        action_h1 = Dense(64)(action_input)

        merged = Add()([state_h2, action_h1])
        m2 = Dense(256,activation='relu')(merged)
        merged_h1 = Dense(32, activation='relu')(m2)
        output = Dense(1, activation='linear')(merged_h1)
        model = Model([state_input, action_input], output)

        adam = Adam(lr=self.learning_rate)
        model.compile(loss="mse", optimizer=adam)
        return model, state_input, action_input

    def predict_qvalue(self,state,action):
        """Return the online network's Q-values for state/action batches."""
        qvalue = self.model.predict([state, action])
        return qvalue

    def predict_next_qvalue(self,next_state, next_action):
        """Return the target network's Q-values for next-state/next-action batches."""
        next_qvalue = self.target.predict([next_state, next_action])
        return next_qvalue

    def train(self, batch):
        """Fit the online network on one batch of TD targets.

        Args:
            batch: ``[current_states, actions, rewards, next_states,
                next_actions, dones]``. ``rewards`` and ``dones`` are expected
                to broadcast against the target network's output
                (assumes shape ``(batch, 1)`` — TODO confirm for other callers).
        """
        current_states, actions, rewards, next_states, next_actions, dones = batch

        next_qvalues = self.predict_next_qvalue(next_states, next_actions)
        # Force a boolean mask: with an integer `dones` array the same indexing
        # expression would select rows 0/1 instead of masking terminal samples.
        terminal = np.asarray(dones, dtype=bool)
        next_qvalues[terminal] = 0  # no bootstrapping past a terminal transition

        td_targets = rewards + self.gamma*next_qvalues

        self.model.fit([current_states,actions], td_targets, verbose=0)

    def calc_grads(self,batch):
        """Return dQ/da evaluated at the batch's states and actions.

        Only the first two entries of ``batch`` (states, actions) are used.
        """
        current_states,actions,_,_,_,_ = batch
        feed = {self.state_input: current_states, self.action_input: actions}
        grads = self.sess.run(self.gradients, feed_dict=feed)[0]
        return grads

    def update_target(self):
        """Soft-update: target <- tau * online + (1 - tau) * target."""
        critic_weights = self.model.get_weights()
        target_weights = self.target.get_weights()
        for i in range(len(critic_weights)):
            target_weights[i] = self.tau * critic_weights[i] + (1 - self.tau) * target_weights[i]
        self.target.set_weights(target_weights)

    def save_model_architecture(self, file_name):
        """Write the online network's architecture as JSON to ``file_name``."""
        model_json = self.model.to_json()
        with open(file_name, "w") as json_file:
            json_file.write(model_json)

    def save_weights(self, file_name):
        """Save the online network's weights to ``file_name``."""
        self.model.save_weights(file_name)

In [None]:
class ReplayMemory:
    """Bounded FIFO buffer of transitions with uniform random sampling."""

    def __init__(self, size):
        # A deque with maxlen discards the oldest entry once `size` is reached.
        self.memory = deque(maxlen=size)

    def append(self,state, action, reward, next_state, next_action, done):
        """Store a single transition as a six-element list."""
        transition = [state, action, reward, next_state, next_action, done]
        self.memory.append(transition)

    def sample(self,batch_size):
        """Draw ``batch_size`` distinct transitions and return them column-wise.

        Returns:
            ``[current_states, actions, rewards, next_states, next_actions,
            dones]`` as numpy arrays; ``rewards`` and ``dones`` are reshaped to
            ``(batch_size, 1)``.
        """
        picked = random.sample(self.memory, batch_size)

        def column(index):
            # Gather field `index` of every sampled transition into one array.
            return np.array([transition[index] for transition in picked])

        current_states = column(0)
        actions = column(1)
        rewards = column(2).reshape((batch_size, 1))
        next_states = column(3)
        next_actions = column(4)
        dones = column(5).reshape((batch_size, 1))

        return [current_states, actions, rewards, next_states, next_actions, dones]

    def count(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [None]:
class OUNoise:
    """Ornstein-Uhlenbeck exploration noise with a linearly annealed sigma.

    The internal state follows ``x += theta * (mu - x) + sigma * N(0, 1)`` and
    is added to the action before clipping to the action space bounds.
    """

    def __init__(self, action_space, mu=0.0, theta=0.05, max_sigma=0.25, min_sigma=0.25, decay_period=100000):
        """
        Args:
            action_space: object exposing ``shape``, ``low`` and ``high``.
            mu: value the process reverts to (also the reset value).
            theta: mean-reversion rate.
            max_sigma: noise scale at ``t = 0``.
            min_sigma: noise scale once ``t >= decay_period``.
            decay_period: number of steps over which sigma is annealed.
        """
        self.mu = mu
        self.theta = theta
        self.sigma = max_sigma
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.action_dim = action_space.shape[0]
        self.low = action_space.low
        self.high = action_space.high
        self.reset()

    def reset(self):
        """Reset the process state to ``mu`` in every action dimension."""
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        """Advance the process by one step and return the new state."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state

    def get_action(self, action, t=0):
        """Return ``action`` perturbed by the noise process and clipped to bounds.

        The noise state is evolved with the current sigma first; sigma is then
        re-computed from ``t`` and takes effect on the next call.
        """
        ou_state = self.evolve_state()
        # Fraction of the decay completed, clamped to [0, 1]. The clamp must be
        # a `min` against 1.0: with `max` the fraction is never below 1, so
        # sigma jumps to min_sigma at once and keeps shrinking past it.
        progress = min(1.0, max(0.0, t / self.decay_period))
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * progress
        return np.clip(action + ou_state, self.low, self.high)

In [None]:
class GaussNoise:
    """Gaussian exploration noise added to an action and clipped to [-1, 1]."""

    def __init__(self,mu=0,std=0.25,sigma=0.25):
        # mu/std parameterise the normal distribution; sigma rescales each draw.
        self.mu, self.std = mu, std
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Set ``state`` back to ``mu`` (``state`` is not read by the other methods)."""
        self.state = self.mu

    def noise(self):
        """Draw one sample from N(mu, std)."""
        return np.random.normal(loc=self.mu, scale=self.std)

    def get_action(self,action,t=0):
        """Return ``action + sigma * noise()`` clipped to [-1, 1].

        ``t`` is accepted for signature parity with other noise classes and is
        not used. With scalar ``mu``/``std`` a single draw is broadcast over
        all entries of ``action``.
        """
        perturbed = action + self.sigma * self.noise()
        return np.clip(perturbed, -1, 1)

In [None]:

def main(env,noise=0):
    """Train a DDPG agent on ``env`` and return the trained ``Actor``.

    Args:
        env: a gym environment using the 4-tuple ``step`` API. An observation
            size of 2 selects the 'MCC' preset, any other size keeps the
            default 'LLC' settings.
        noise: ``0`` selects ``OUNoise`` for exploration, any other value
            selects ``GaussNoise``.

    Returns:
        The ``Actor`` instance after training. The TF session it was built
        with is left open so the returned actor can still be used.

    Side effects: writes weight checkpoints under ``test/``, prints one
    progress line per episode, closes ``env`` and shows two reward plots.
    """
    sess = tf.Session()
    K.set_session(sess)

    # ---- hyper-parameters (defaults) ----
    memory_size = 100000
    batch_size = 64
    tau = 0.001
    lr_actor = 0.00001
    lr_critic = 0.0001
    discount_factor = 0.99
    episodes = 3000
    time_steps = 1000
    save_frequency = 250
    checkpoint_reward = 90  # episode reward above which a finished episode is checkpointed
    name='LLC'
    size = env.observation_space.shape[0]

    # Environment-specific overrides. These must be resolved *before* the
    # networks are built, otherwise the learning rates below never reach them.
    if size == 2:
        win_score = 90
        episodes = 1500
        lr_actor = 0.0001
        lr_critic = 0.001
        batch_size = 32
        name = 'MCC'
    elif size == 8:
        win_score = 200
    else:
        # No known solve threshold: never stop early instead of failing on an
        # undefined name at the end of the first finished episode.
        win_score = float('inf')

    # Keep the exploration object under its own name so the checkpoint
    # filenames can carry a readable label rather than an object repr.
    if noise == 0:
        explorer = OUNoise(env.action_space)
    else:
        explorer = GaussNoise()
    noise_name = type(explorer).__name__

    actor = Actor(env,sess, lr_actor, tau)
    critic = Critic(env,sess, lr_critic, tau, discount_factor)
    replay_memory = ReplayMemory(memory_size)

    def checkpoint_path(episode_index):
        return 'test/DDPG_actor_model_{}-win-{}-{}-noise-{}.h5'.format(
            episode_index, time_steps, name, noise_name)

    ep_reward = []
    avgrewards = []
    training = False
    for episode in range(episodes):
        state = env.reset()
        explorer.reset()
        episode_reward = 0
        done = False
        # Warm-up: learning is switched on once `batch_size` episodes have
        # been collected, and stays on.
        if episode == batch_size:
            training = True

        for step in range(time_steps):
            action = actor.predict_action(state.reshape((1,size)))[0]
            exploratory_action = explorer.get_action(action, step)
            next_state, reward, done, _ = env.step(exploratory_action)
            episode_reward += reward
            # NOTE(review): the target action is computed at collection time
            # and stored, so it goes stale as the target actor changes.
            next_action = actor.predict_next_action(next_state.reshape((1,size)))[0]
            replay_memory.append(state, exploratory_action, reward, next_state, next_action, done)

            if training and replay_memory.count() >= batch_size:
                batch = replay_memory.sample(batch_size)
                # NOTE(review): dQ/da is evaluated at the stored exploratory
                # actions, not at the actor's current output — confirm intent.
                grads = critic.calc_grads(batch)
                actor.train(batch, grads)
                critic.train(batch)
                actor.update_target()
                critic.update_target()

            state = next_state
            if done:
                if episode_reward > checkpoint_reward and episode > 2*batch_size:
                    actor.save_weights(checkpoint_path(episode))
                # Always leave the loop on `done`; stepping a finished
                # environment produces meaningless transitions and rewards.
                break

        ep_reward.append(episode_reward)
        avgrewards.append(np.mean(ep_reward[-10:]))
        if done and np.mean(ep_reward[-100:])>win_score and episode>128:
            actor.save_weights(checkpoint_path(episode))
            print('That was a good enough model')
            print(np.mean(ep_reward[-10:]), ep_reward[-10:])
            break
        print("Episode: {},\t steps {},\t episode reward: {:.2f} \t Running avg(100) {:.2f}"
              .format( episode,step, episode_reward,np.mean(ep_reward[-100:])))

        if training and (episode % save_frequency) == 0:
            print('Data saved at epsisode:', episode)
            actor.save_weights(checkpoint_path(episode))

    env.close()

    # First the raw per-episode reward, then the 10-episode running mean.
    for series in (ep_reward, avgrewards):
        plt.plot(series,label='rewards')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.show()
    return actor

# Lunar Lander Continuous

In [None]:
# Environment handed to the LunarLander training runs; its 8-dimensional
# observation makes main() use win_score=200 and the 'LLC' checkpoint name.
env = gym.make('LunarLanderContinuous-v2')

In [None]:
# noise=0 makes main() explore with OUNoise; the trained Actor is returned.
actorllc_ou = main(env,0)

In [None]:
# noise=1 makes main() explore with GaussNoise. Stored under its own name so
# the actor trained with OU noise is not overwritten.
actorllc_gauss = main(env,1)

# Mountain Car Continuous

In [None]:
# Rebinds `env` to MountainCar; its 2-dimensional observation makes main()
# switch to the 'MCC' preset (win_score=90, 1500 episodes, batch size 32).
env = gym.make('MountainCarContinuous-v0')

In [None]:
# noise=0 makes main() explore with OUNoise; the trained Actor is returned.
actor_ou=main(env,0)

In [None]:
# noise=1 makes main() explore with GaussNoise; the trained Actor is returned.
actor_gauss=main(env,1)

# Test

In [None]:
def test(model,episodes,steps,size,env_name='MountainCarContinuous-v0'):
    """Roll out ``model`` greedily and print per-episode and summary statistics.

    Args:
        model: object with a ``predict(batch_of_states)`` method returning one
            action per state.
        episodes: number of evaluation episodes.
        steps: maximum number of environment steps per episode.
        size: observation dimensionality, used to reshape a state into a
            batch of one.
        env_name: gym environment id to evaluate on.

    An episode counts as a win when the environment reports ``done`` within
    ``steps`` steps and the step was not flagged as a time-limit truncation.
    Every tenth episode is rendered.
    """
    wins = 0
    rewards = []
    testenv = gym.make(env_name)
    try:
        for ep in range(episodes):
            total = 0
            state = testenv.reset()
            for step in range(steps):
                action = model.predict(state.reshape((1,size)))[0]
                new_state, reward, done, info = testenv.step(action)

                if ep%10==0:
                    # Render the environment that is actually being stepped.
                    testenv.render()

                state = new_state
                total += reward
                if done:
                    # NOTE(review): relies on gym's TimeLimit wrapper setting
                    # this key on truncation; when it is absent every `done`
                    # counts as a win.
                    if not info.get('TimeLimit.truncated', False):
                        wins += 1
                        print(f'Won in episode {ep} Steps {step} Reward{total}')
                    # Stop here: a finished episode must not be stepped further
                    # (and must not be counted as a win more than once).
                    break
            print(f'Episode {ep} Reward {total}')
            # Track the episode return, not the reward of its last step.
            rewards.append(total)
    finally:
        testenv.close()

    win_percent = (wins/episodes)*100 if episodes else 0.0
    mean_reward = np.mean(rewards) if rewards else float('nan')
    print(f'Win {wins} percent {win_percent} mean rew {mean_reward}')
