In [1]:
from keras.models import Sequential,Model
from keras.layers import Dropout,Dense,Input,Activation
from keras.optimizers import Adam
import numpy as np
from collections import deque
import random

class Walker:
    """Epsilon-greedy agent backed by a small dense Keras network.

    The agent stores transitions in a bounded replay memory, picks actions
    either uniformly at random (with probability ``epsilon``) or from the
    network's prediction, and is trained on random minibatches drawn from
    the replay memory.

    NOTE(review): ``training`` writes one scalar target into every output
    unit of the network, while ``get_action`` uses those same outputs
    directly as the action vector. Confirm this is the intended algorithm.
    """

    # Weight of the bootstrapped (discounted next-state) term in the
    # training target; the immediate reward gets ``1 - TARGET_BLEND``.
    TARGET_BLEND = 0.1

    def __init__(self, nx, ny, lr, gamma):
        """Create the agent and build its network.

        Args:
            nx: Input dimension of the network's first layer.
            ny: Number of output units; also the length of a random action.
            lr: Learning rate handed to the Adam optimizer.
            gamma: Discount applied to the best predicted next-state value.
        """
        self.nx = nx
        self.ny = ny
        self.lr = lr
        self.los = []  # per-sample losses of the most recent training() call
        self.gamma = gamma
        self.memory_deck = deque(maxlen=2000)  # replay memory, oldest dropped first
        self.epsilon = 0.7    # current exploration probability
        self.epsilon_ = 0.01  # threshold below which epsilon stops decaying
        self.decay = 0.995    # multiplicative epsilon decay per training() call
        self.model = self.get_model()
        self._reset_episode_buffers()

    def _reset_episode_buffers(self):
        """Clear the per-episode bookkeeping lists."""
        self.episode_observation = []
        self.episode_rewards = []
        self.episode_action = []
        self.new_episode_observation = []
        self.episode_flag = []

    def get_action(self, observation):
        """Return an action for ``observation`` using epsilon-greedy selection.

        With probability ``epsilon`` the action is drawn uniformly from
        [-1, 1) with ``ny`` components; otherwise it is the first row of the
        network's prediction for ``observation``.
        """
        if np.random.rand() <= self.epsilon:
            # Size the random action from ny rather than a hard-coded 4 so the
            # agent also works with action spaces of other dimensions.
            return np.random.uniform(-1, 1, self.ny)
        return self.model.predict(observation)[0]

    def memory_recall(self, observation, action, reward, new_observation, flags):
        """Store one transition in replay memory and record its reward."""
        self.memory_deck.append((observation, action, reward, new_observation, flags))
        self.episode_rewards.append(reward)

    def get_model(self):
        """Build and compile the network: 400 and 300 ReLU units, linear output.

        The model is compiled with mean-squared-error loss and Adam.
        """
        model = Sequential()
        model.add(Dense(400, input_dim=self.nx, activation='relu'))
        model.add(Dense(300, activation='relu'))
        model.add(Dense(self.ny, activation='linear'))
        # NOTE(review): newer Keras releases name this argument
        # `learning_rate`; confirm against the installed version.
        model.compile(loss='mse', optimizer=Adam(lr=self.lr))
        return model

    def training(self, batch):
        """Fit the network on up to ``batch`` transitions sampled from memory.

        Each sampled transition is fitted individually for one epoch. The
        per-episode buffers are cleared and, when at least one transition was
        fitted, ``epsilon`` is decayed once.

        Args:
            batch: Desired number of transitions to sample. If the replay
                memory holds fewer, all stored transitions are used.

        Returns:
            A ``(history, mean_loss)`` tuple: the Keras history of the last
            fit and the mean loss over the sampled transitions. When nothing
            could be sampled the result is ``(None, nan)``.
        """
        self.los = []
        self._reset_episode_buffers()

        # random.sample raises ValueError when asked for more items than the
        # population holds, so clamp the request to what is available.
        sample_size = min(batch, len(self.memory_deck))
        if sample_size <= 0:
            return None, float('nan')

        history = None
        for obs, _act, rew, new_obs, done in random.sample(self.memory_deck, sample_size):
            target = rew
            if not done:
                best_next = np.amax(self.model.predict(new_obs)[0])
                target = ((1.0 - self.TARGET_BLEND) * rew
                          + self.TARGET_BLEND * (self.gamma * best_next))

            # Assigning a scalar to row 0 overwrites every output of the
            # prediction with the same target value.
            old_target = self.model.predict(obs)
            old_target[0] = target
            history = self.model.fit(x=obs, y=old_target, verbose=0, epochs=1)
            self.los.append(history.history['loss'])

        mm = np.mean(self.los)
        if self.epsilon >= self.epsilon_:
            self.epsilon *= self.decay
        return history, mm

In [None]:
import gym
import numpy as np
import random
import time

# Seed both random sources used during training: NumPy (exploration in
# Walker.get_action) and the stdlib `random` module (replay sampling in
# Walker.training). np.random.seed() returns None, so keep the seed value
# in its own variable instead of binding the call's result.
seed = 666
np.random.seed(seed)
random.seed(seed)

episodes = 10000
render = False

# An episode is cut short once it exceeds this wall-clock budget (seconds)
# or once its accumulated reward drops below this floor.
max_episode_seconds = 20
min_episode_return = -300

env = gym.make('BipedalWalker-v3')
env = env.unwrapped

lr = 0.001
gamma = 0.98
nx = env.observation_space.shape[0]
ny = env.action_space.shape[0]
agent = Walker(nx, ny, lr, gamma)
win = 0
rewards_over_time = []

for i in range(episodes):
    # The network expects a batch axis, hence the (1, -1) reshape.
    observation = env.reset().reshape(1, -1)
    # Running return for this episode; avoids re-summing the whole reward
    # list on every step.
    total_episode_rewards = 0
    start = time.time()
    while True:
        if render:
            env.render()

        action = agent.get_action(observation)
        new_observation, reward, flag, inf = env.step(action)
        new_observation = new_observation.reshape(1, -1)
        # The transition is stored with the environment's own done flag; the
        # cut-offs below only end the episode loop.
        agent.memory_recall(observation, action, reward, new_observation, flag)
        observation = new_observation
        total_episode_rewards += reward

        end = time.time()
        t = end - start
        if t > max_episode_seconds or total_episode_rewards < min_episode_return:
            flag = True

        if not flag:
            continue

        rewards_over_time.append(total_episode_rewards)
        max_reward = np.max(rewards_over_time)
        # Start rendering once a late episode scores well.
        if int(total_episode_rewards) > 270 and i > 2000:
            render = True
        episode_max = np.argmax(rewards_over_time)
        if total_episode_rewards >= 300:
            win = win + 1
        print('##################################################################')
        print('Walk# : ', i)
        print('Reward : ', int(total_episode_rewards))
        print('Time : ', np.round(t, 2), 'sec')
        print('Maximum Reward : '+str(int(max_reward))+'       (in episode#:'+str(episode_max)+')')
        print('Wins : '+str(win))

        # One training pass on a 16-transition minibatch per finished episode.
        hist, mm = agent.training(16)
        break



##################################################################
Walk# :  0
Reward :  -85
Time :  20.03 sec
Maximum Reward : -85       (in episode#:0)
Wins : 0
##################################################################
Walk# :  1
Reward :  -90
Time :  20.0 sec
Maximum Reward : -85       (in episode#:0)
Wins : 0
##################################################################
Walk# :  2
Reward :  -81
Time :  20.01 sec
Maximum Reward : -81       (in episode#:2)
Wins : 0
##################################################################
Walk# :  3
Reward :  -113
Time :  20.02 sec
Maximum Reward : -81       (in episode#:2)
Wins : 0
##################################################################
Walk# :  4
Reward :  -73
Time :  20.01 sec
Maximum Reward : -73       (in episode#:4)
Wins : 0
##################################################################
Walk# :  5
Reward :  -78
Time :  20.01 sec
Maximum Reward : -73       (in episode#:4)
Wins : 0
############################

##################################################################
Walk# :  51
Reward :  -100
Time :  0.41 sec
Maximum Reward : -45       (in episode#:43)
Wins : 0
##################################################################
Walk# :  52
Reward :  -104
Time :  1.23 sec
Maximum Reward : -45       (in episode#:43)
Wins : 0
##################################################################
Walk# :  53
Reward :  -98
Time :  1.32 sec
Maximum Reward : -45       (in episode#:43)
Wins : 0
##################################################################
Walk# :  54
Reward :  -47
Time :  20.03 sec
Maximum Reward : -45       (in episode#:43)
Wins : 0
##################################################################
Walk# :  55
Reward :  -111
Time :  0.87 sec
Maximum Reward : -45       (in episode#:43)
Wins : 0
##################################################################
Walk# :  56
Reward :  -64
Time :  20.0 sec
Maximum Reward : -45       (in episode#:43)
Wins : 0
##################