In [1]:
import gym
from gym import wrappers
import numpy as np
import random
from collections import deque

In [8]:
class ANN:
    def __init__(self, input_size, hidden_size, output_size):
        self.learning_rate = 0.995
        self.epsilon = 0.35 
        self.decay_factor = 0.9975

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        # Initialize weights
        self.W1 = np.random.randn(hidden_size, input_size) * np.sqrt(1/input_size)
        self.b1 = np.zeros((hidden_size, ))
        self.W2 = np.random.randn(output_size, hidden_size) * np.sqrt(1/output_size)
        self.b2 = np.zeros((output_size, ))

        self.replay_buffer = deque(maxlen=1000)

    def forward(self, observation):
        self.hidden_in = np.dot(self.W1, observation) + self.b1
        self.hidden_out = self.relu(self.hidden_in)
        self.output_in = np.dot(self.W2, self.hidden_out) + self.b2        
        self.output_out = self.sigmoid(self.output_in)
        return self.output_out
    
    def backprop(self, observation, Error):
        """
        # Backpropagation
        reluOutput = self.sigmoid_backward(self.output_in)
        #print("Shape of ReluOutput: ", reluOutput.shape)
        delta2 = reluOutput * Error 
        #print("Shape of delta 2:", delta2.shape)
        dw2 = np.outer(delta2, self.hidden_out)
        #print("Shape of dw2:", dw2.shape)
        db2 = delta2
        #print("shape of db2:", db2.shape)
        # Backpropagation: hidden layer -> input layer
        reluInput = self.relu_backward(self.hidden_in)
        #print("Shape of ReluInput: ", reluInput.shape)
        delta1 = reluInput * np.dot(self.W2.T, delta2)
        #print("Shape of delta 1:", delta1.shape)
        dw1 = np.outer(delta1, observation)
        #print("Shape of dw1:", dw1.shape)
        db1 = delta1
        #print("Shape of db1:", db1.shape)"""
       
        reluOutput = self.sigmoid_backward(self.output_in)    
        delta2 = reluOutput * Error

        dw2 = np.outer(delta2, self.hidden_out)
        db2 = delta2

        reluInput = self.relu_backward(self.hidden_in)
        delta1 = reluInput * np.dot(self.W2.T, delta2)

        dw1 = np.outer(delta1, observation)
        db1 = delta1

        # Update Rules
        self.W1 += self.learning_rate * dw1
        self.W2 += self.learning_rate * dw2
        self.b1 += self.learning_rate * db1
        self.b2 += self.learning_rate * db2  

    def store_experience(self, state, Error):
        experience = (state, Error)
        self.replay_buffer.append(experience)
    
    def sample_experiences(self):
        return random.sample(self.replay_buffer, 10)
    
    def update_net(self):
        sample = random.sample(self.replay_buffer, 10)
        for (state, Error) in sample:
            self.backprop(state, Error)

    def ep_greedy(self, Q_values):
        if random.random() < self.epsilon:
            return random.randint(0, 1)
        else: 
            return np.argmax(Q_values)
        
    def decay(self):
        self.epsilon *= self.decay_factor

    def softmax(self, Z):
        e_Z = np.exp(Z - np.max(Z, axis=0))
        return e_Z / np.sum(e_Z, axis=0)
    
    def sigmoid(self, Z):
        return 1/(1+np.exp(-Z))

    def relu(self, Z):
        return np.maximum(0,Z)

    def sigmoid_backward(self, layer):
        sig = self.sigmoid(layer)
        return sig * (1 - sig)

    def relu_backward(self, layer):
        return np.where(layer > 0, 1, 0)


In [15]:
gamma = 0.85  
env = gym.make('CartPole-v1', render_mode="human")
observation = env.reset()

s_size = env.observation_space.shape[0]
a_size = env.action_space.n
hidden_size = 8
net = ANN(s_size, hidden_size, a_size)

for i in range(1, 10000):
    done = False
    observation = env.reset()
    reward_avg = 0
    observation = observation[0]
    env.render()

    while not done:
        net.decay()

        # Forward prop
        Q_values = net.forward(observation)
        
        # Policy Decision
        Q_current = np.max(Q_values)
        action = net.ep_greedy(Q_values)
        
        # Next step
        observation_next, reward, done, info, _ = env.step(action)
        reward_avg += reward
        
        if not done: 
            Error = reward - Q_current
            net.store_experience(observation, Error)
        else:
            Q_values_next = net.forward(observation_next)
            Q_max = np.max(Q_values_next)
            Q_target = reward + gamma * Q_max 
            Error = Q_target - Q_current 
            net.store_experience(observation, Error)

        if i % 100 == 0:
            net.update_net()
            print(reward_avg / 100)
            reward_avg = 0
        
        observation = observation_next
env.close()

0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01
0.01


In [23]:
for i in gym.envs.registry:
    print(i)

CartPole-v0
CartPole-v1
MountainCar-v0
MountainCarContinuous-v0
Pendulum-v1
Acrobot-v1
LunarLander-v2
LunarLanderContinuous-v2
BipedalWalker-v3
BipedalWalkerHardcore-v3
CarRacing-v2
Blackjack-v1
FrozenLake-v1
FrozenLake8x8-v1
CliffWalking-v0
Taxi-v3
Reacher-v2
Reacher-v4
Pusher-v2
Pusher-v4
InvertedPendulum-v2
InvertedPendulum-v4
InvertedDoublePendulum-v2
InvertedDoublePendulum-v4
HalfCheetah-v2
HalfCheetah-v3
HalfCheetah-v4
Hopper-v2
Hopper-v3
Hopper-v4
Swimmer-v2
Swimmer-v3
Swimmer-v4
Walker2d-v2
Walker2d-v3
Walker2d-v4
Ant-v2
Ant-v3
Ant-v4
Humanoid-v2
Humanoid-v3
Humanoid-v4
HumanoidStandup-v2
HumanoidStandup-v4


In [34]:

env = gym.make('LunarLander-v2', render_mode="human")
observation = env.reset()
print(observation)
print(env.action_space)
print(env.action_space.sample())

(array([ 1.3648033e-03,  1.4137057e+00,  1.3822515e-01,  1.2380719e-01,
       -1.5746783e-03, -3.1310089e-02,  0.0000000e+00,  0.0000000e+00],
      dtype=float32), {})
Discrete(4)
3
