In [1]:
import gym
import random
import numpy as np
from collections import deque

# Print the installed Gym version.
print("Gym:", gym.__version__)

Gym: 0.17.2


In [2]:
# Create the environment by name and print its observation and action spaces.
# `env` is reused by the later cells (agent construction and the training loop).
env_name = "Pendulum-v0"
env = gym.make(env_name)
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)

Observation space: Box(3,)
Action space: Box(1,)


In [3]:
# Draw one random observation from the observation space to inspect its layout.
env.observation_space.sample()

array([ 0.44322655,  0.43647087, -0.9285762 ], dtype=float32)

In [4]:
class QNetwork():
    """
    Small fully connected network (input -> 7 -> 7 -> 1) trained with plain
    gradient descent.

    The weights are created once, in the constructor, and are then only
    modified by `update_model`, so that training accumulates across calls.

    Arguments:
    state_dim -- shape (or size) of one observation; its product is the width
                 of the input layer
    learning_rate -- step size used by `update_model` (default 0.009)
    """
    def __init__(self,state_dim, learning_rate=0.009):
        self.state_dim=state_dim
        self.learning_rate=learning_rate
        layer_dims=[int(np.prod(state_dim)),7,7,1]
        self.parameters=initialize_parameters(layer_dims)
        self.state_in=None
        self.cache=None

    def update_model(self,state, action, q_target):
        """
        Take one gradient-descent step towards `q_target` on the given states.

        Arguments:
        state -- batch of observations, one per row
        action -- unused; kept so existing callers do not have to change
        q_target -- regression targets for the network output
        """
        # Backpropagation needs the activations of *these* states. The cache
        # left behind by the last get_q_state() call may belong to a different
        # batch, so run a fresh forward pass here.
        _,cache=forward_propagation2(np.array(state),self.parameters)
        grads=backward_propagation(state,q_target,cache)

        # Every gradient must have exactly the shape of its parameter.
        # np.reshape is a no-op for correctly shaped gradients, fixes vectors
        # that only differ by orientation, and raises on a size mismatch; this
        # keeps broadcasting in the update step from silently changing the
        # shape of a weight matrix.
        for name,value in self.parameters.items():
            grads['d'+name]=np.reshape(grads['d'+name],value.shape)

        self.parameters=update_parameters2_without_optimization(
            self.parameters, grads, learning_rate=self.learning_rate)

    def get_q_state(self,state):
        """
        Run a forward pass and return the network output for `state`.

        Arguments:
        state -- batch of observations, one per row (anything np.array accepts)

        Returns:
        q_state -- output of forward_propagation2 for the batch
        """
        state=np.array(state)
        self.state_in=state
        q_state,self.cache=forward_propagation2(self.state_in,self.parameters)
        return q_state

In [5]:
# Scratch check: random.random() returns a float in [0.0, 1.0).
random.random()

0.33639645639305293

In [6]:
class DQNAgent():
    """
    Epsilon-greedy agent that trains a QNetwork from a replay buffer.

    Arguments:
    env -- environment; only `env.observation_space.shape` is read here
    batch_size -- number of transitions requested from the replay buffer for
                  each training step (default 50)
    """
    def __init__(self, env, batch_size=50):
        self.state_dim = env.observation_space.shape
        self.q_network = QNetwork(self.state_dim)
        self.replay_buffer = ReplayBuffer(maxlen=10000)
        self.gamma = 0.97          # discount factor applied to the bootstrap value
        self.eps = 1.0             # probability of taking a random action
        self.batch_size = batch_size

    def get_action(self, state):
        """
        Return a random action with probability `eps`, otherwise the greedy one.

        Arguments:
        state -- a single observation

        Returns:
        action -- integer action
        """
        q_state = self.q_network.get_q_state([state])
        # NOTE(review): argmax yields an index into the network output. The
        # QNetwork in this file has a single output unit, so this is always 0
        # -- confirm what the greedy action is meant to be.
        action_greedy = np.argmax(q_state)
        # np.random.randint excludes its upper bound: 3 is required to draw
        # from {-2, -1, 0, 1, 2}; randint(-2, 2) could never return 2.
        action_random = np.random.randint(-2, 3)
        action = action_random if random.random() < self.eps else action_greedy
        return action

    def train(self, state, action, next_state, reward, done):
        """
        Store one transition and run one training step on a sampled batch.

        Arguments:
        state, action, next_state, reward, done -- the transition just observed
        """
        self.replay_buffer.add((state, action, next_state, reward, done))
        states, actions, next_states, rewards, dones = self.replay_buffer.sample(self.batch_size)

        q_next_states = self.q_network.get_q_state(next_states)

        # forward_propagation2 returns an array laid out (n_outputs, batch).
        # Reducing over axis 0 keeps one bootstrap value per sample; reducing
        # over axis 1 would collapse the whole batch into a single number that
        # is then shared by every target.
        # NOTE(review): `dones` is not used to stop bootstrapping on terminal
        # transitions -- confirm this is intended.
        q_targets = rewards + self.gamma * np.max(q_next_states, axis=0)
        self.q_network.update_model(states, actions, q_targets)
        # Decay exploration once per finished episode, never below 0.1.
        if done: self.eps = max(0.1, 0.99*self.eps)
    
    

In [7]:
class ReplayBuffer():
    """Bounded FIFO store of experience tuples with random sampling."""

    def __init__(self, maxlen):
        # A deque with `maxlen` discards its oldest entry when a new one is
        # appended to a full buffer.
        self.buffer = deque(maxlen=maxlen)

    def add(self, experience):
        """Append one experience tuple to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """
        Draw experiences at random (with replacement) and regroup them by field.

        Arguments:
        batch_size -- requested number of experiences; capped at the number
                      currently stored

        Returns:
        A lazy iterator that yields one list per field of the experience tuple.
        """
        stored = len(self.buffer)
        draw_count = batch_size if batch_size < stored else stored
        batch = random.choices(self.buffer, k=draw_count)
        # zip(*batch) turns a list of tuples into one tuple per field.
        per_field = zip(*batch)
        return map(list, per_field)

In [8]:
def initialize_parameters(layer_dims, seed=3):
    """
    Create He-initialized weights and zero biases for a fully connected network.

    Arguments:
    layer_dims -- sequence of layer widths, input layer first
    seed -- seed of the private random generator that draws the weights
            (default 3)

    Returns:
    parameters -- python dictionary with, for l = 1 .. len(layer_dims) - 1:
                    parameters['W' + str(l)] of shape (layer_dims[l], layer_dims[l-1])
                    parameters['b' + str(l)] of shape (layer_dims[l], 1)
    """
    # A private RandomState produces the same numbers as np.random.seed(seed)
    # followed by np.random.randn, but leaves NumPy's global random state
    # alone, so other users of np.random are not reset by this call.
    rng = np.random.RandomState(seed)
    parameters = {}
    L = len(layer_dims) # number of layers in the network

    for l in range(1, L):
        parameters['W' + str(l)] = rng.randn(layer_dims[l], layer_dims[l-1]) * np.sqrt(2 / (layer_dims[l-1]))
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
        # Compare the full shape tuples; `assert a == b, c` would only use c
        # as the failure message.
        assert parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l-1])
        assert parameters['b' + str(l)].shape == (layer_dims[l], 1)

    return parameters

In [9]:
def relu(Z):
    """
    Rectified linear unit applied element-wise.

    Arguments:
    Z -- array of pre-activation values, of any shape

    Returns:
    Array of the same shape as Z holding max(0, z) for every element
    """
    rectified = np.maximum(0, Z)
    # The activation must never change the layout of its input.
    assert rectified.shape == Z.shape
    return rectified



In [10]:
def tanh(Z):
    """
    Hyperbolic tangent applied element-wise.

    Arguments:
    Z -- array (or scalar) of pre-activation values

    Returns:
    A -- tanh(Z), same shape as Z
    """
    # np.tanh saturates cleanly at +/-1. The explicit
    # (e^z - e^-z) / (e^z + e^-z) form overflows to inf/inf = NaN once |z|
    # exceeds roughly 710.
    A = np.tanh(Z)
    return A

In [11]:
def forward_propagation2(X,parameters):
    """
    Forward pass of the three-layer network: ReLU -> ReLU -> 2*tanh.

    Arguments:
    X -- input array; it is transposed before being multiplied by W1
    parameters -- python dictionary holding 'W1', 'b1', 'W2', 'b2', 'W3', 'b3'

    Returns:
    a3 -- output of the last layer, 2*tanh(Z3)
    cache -- tuple (a1, W1, b1, Z1, a2, W2, b2, Z2, a3, W3, b3, Z3)
    """
    # One (key suffix, activation) pair per layer, in evaluation order.
    stages = (('1', relu), ('2', relu), ('3', lambda Z: 2*tanh(Z)))

    signal = X.T
    cache = ()
    for suffix, activate in stages:
        W = parameters['W' + suffix]
        b = parameters['b' + suffix]
        Z = np.dot(W, signal) + b
        signal = activate(Z)
        # Each layer contributes (activation, W, b, Z), in that order.
        cache += (signal, W, b, Z)

    return signal, cache

In [12]:
def compute_cost(AL,Y):
    """
    Sum of squared differences between targets and predictions.

    Arguments:
    AL -- predicted values
    Y -- ground-truth values, broadcastable against AL

    Returns:
    cost -- sum over all elements of (Y - AL)**2, squeezed to a scalar
    """
    residual = Y - AL
    total = np.sum(residual ** 2)
    # Squeeze so the result carries no leftover singleton dimensions.
    return np.squeeze(total)

In [13]:
def relu_backward(dA, cache):
    """
    Backward pass through a ReLU unit.

    Arguments:
    dA -- post-activation gradient
    cache -- 'Z', the pre-activation values stored during the forward pass

    Returns:
    Gradient of the cost with respect to Z: a copy of dA with the entries
    where Z <= 0 set to zero.

    NOTE(review): when the masked gradient does not have Z's shape, the shapes
    are printed and only index 0 of the third axis is kept. This looks like a
    debugging workaround -- confirm whether any caller still relies on it.
    """
    pre_activation = cache
    grad = np.array(dA, copy=True)   # work on a copy; the caller's dA is left untouched

    # ReLU passes no gradient where its input was not positive.
    inactive = pre_activation <= 0
    grad[inactive] = 0

    if grad.shape == pre_activation.shape:
        return grad

    print('dz shape',grad.shape)
    print('z shape',pre_activation.shape)
    grad = grad[:,:,0]
    print(grad.shape)
    return grad

In [14]:
def tanh_backward(dA,cache):
    """
    Backward pass through a tanh unit.

    Arguments:
    dA -- post-activation gradient
    cache -- 'Z', the pre-activation values stored during the forward pass

    Returns:
    dZ -- gradient of the cost with respect to Z, dA * (1 - tanh(Z)**2)
    """
    Z=cache
    # 1 - tanh(Z)**2 is the same derivative as 4*e^(2Z) / (e^(2Z) + 1)**2,
    # but it goes to 0 for large |Z| where the exponential form evaluates
    # inf/inf and returns NaN.
    dZ=dA*(1-np.tanh(Z)**2)
    return dZ

In [15]:
def backward_propagation(x2,Y,cache):
    """
    Backpropagate the squared-error cost through the three-layer network
    built by forward_propagation2 (ReLU -> ReLU -> 2*tanh).

    Arguments:
    x2 -- batch of inputs, one sample per row (transposed internally, as in
          the forward pass)
    Y -- targets, broadcastable against the network output a3
    cache -- tuple (a1, W1, b1, Z1, a2, W2, b2, Z2, a3, W3, b3, Z3) returned
             by forward_propagation2 for the same inputs

    Returns:
    gradients -- python dictionary with 'dW1'..'dW3' and 'db1'..'db3' (each
                 shaped like its parameter) plus the intermediate
                 'dz1'..'dz3', 'da1' and 'da2'
    """
    x2=np.array(x2)
    x2=np.transpose(x2)   # one sample per column, as in the forward pass
    (a1,W1,b1,Z1,a2,W2,b2,Z2,a3,W3,b3,Z3)=cache

    # Output layer: cost = sum((Y - a3)**2) with a3 = 2*tanh(Z3).
    dA3 = - 2*(Y-a3)
    # The factor 2 is the derivative of the 2*tanh output scaling.
    dz3=2*tanh_backward(dA3,Z3)
    # dz3 . a2^T has the shape of W3, exactly like dW2 and dW1 below.
    dW3 = np.dot(dz3, a2.T)
    db3 = np.sum(dz3, axis=1, keepdims = True)

    # Second hidden layer.
    da2 = np.dot(W3.T, dz3)
    dz2 = relu_backward(da2,Z2)
    dW2 = np.dot(dz2, a1.T)
    db2 = np.sum(dz2, axis=1, keepdims = True)

    # First hidden layer.
    da1 = np.dot(W2.T, dz2)
    dz1 = relu_backward(da1,Z1)
    dW1 = np.dot(dz1, x2.T)
    db1 = np.sum(dz1, axis=1, keepdims = True)

    gradients = {"dz3": dz3, "dW3": dW3, "db3": db3,
                 "da2": da2, "dz2": dz2, "dW2": dW2, "db2": db2,
                 "da1": da1, "dz1": dz1, "dW1": dW1, "db1": db1}

    return gradients

In [16]:
def update_parameters2_without_optimization(parameters, grads, learning_rate=0.009):
    """
    Apply one step of plain gradient descent to every weight and bias.

    Arguments:
    parameters -- python dictionary with keys "W1", "b1", ..., "WL", "bL"
    grads -- python dictionary with the matching keys "dW1", "db1", ...
    learning_rate -- step size

    Returns:
    parameters -- the same dictionary, its entries replaced by the updated
                  arrays
    """
    layer_count = len(parameters) // 2   # one W and one b per layer

    for layer in range(1, layer_count + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer)
            step = learning_rate * grads["d" + key]
            parameters[key] = parameters[key] - step

    return parameters

In [17]:
#OPTIMIZER ADAM
def initialize_adam(parameters) :
    """
    Build the zero-filled moment accumulators used by Adam.

    Arguments:
    parameters -- python dictionary with keys "W1", "b1", ..., "WL", "bL",
                  each a 2-D array

    Returns:
    v -- python dictionary for the moving average of the gradients,
         keys "dW1", "db1", ..., "dWL", "dbL"
    s -- python dictionary for the moving average of the squared gradients,
         same keys as v
    Every value is a zero array with the first two dimensions of the
    corresponding parameter.
    """
    layer_count = len(parameters) // 2   # one W and one b per layer
    v, s = {}, {}

    for layer in range(1, layer_count + 1):
        for prefix in ("W", "b"):
            name = prefix + str(layer)
            dims = parameters[name].shape
            rows, cols = dims[0], dims[1]
            # v and s get independent arrays so they can be updated separately.
            v["d" + name] = np.zeros((rows, cols))
            s["d" + name] = np.zeros((rows, cols))

    return v, s

In [18]:
#UPDATING PARAMETERS WITH ADAM FOR L LAYERS
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate ,
                                beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8):
    """
    Update parameters using Adam

    Arguments:
    parameters -- python dictionary containing your parameters:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    t -- number of the current update step, starting at 1
    learning_rate -- the learning rate, scalar.
    beta1 -- Exponential decay hyperparameter for the first moment estimates
    beta2 -- Exponential decay hyperparameter for the second moment estimates
    epsilon -- hyperparameter preventing division by zero in Adam updates

    Returns:
    parameters -- python dictionary containing your updated parameters
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary

    Raises:
    ValueError -- if t < 1
    """
    # With t = 0 both bias corrections divide by 1 - beta**0 = 0; the NaNs
    # that follow would be zeroed below and silently wipe every parameter.
    if t < 1:
        raise ValueError("t must be >= 1 for Adam bias correction, got {}".format(t))

    L = len(parameters) // 2                 # number of layers in the neural networks
    v_correction = 1 - beta1**t              # bias correction of the first moment
    s_correction = 1 - beta2**t              # bias correction of the second moment

    # Perform Adam update on all parameters
    for l in range(L):
        for prefix in ("W", "b"):
            name = prefix + str(l+1)
            grad = grads["d" + name]

            # Moving averages of the gradient and of the squared gradient.
            v["d" + name] = beta1*v["d" + name]+(1-beta1)*grad
            s["d" + name] = beta2*s["d" + name]+(1-beta2)*(grad**2)

            v_corrected = v["d" + name]/v_correction
            s_corrected = s["d" + name]/s_correction

            parameters[name] = parameters[name] - learning_rate*(v_corrected/(np.sqrt(s_corrected)+epsilon))
            # Entries that became NaN (e.g. from NaN gradients) are reset to 0.
            parameters[name][np.isnan(parameters[name])] = 0

    return parameters, v, s

In [20]:
# Train the agent: one training step per environment step, every frame
# rendered, and the return of each episode printed when it ends.
agent = DQNAgent(env)
num_episodes = 2000

try:
    for ep in range(num_episodes):
        state = env.reset()
        total_reward = 0
        done = False

        while not done:
            action = agent.get_action(state)
            # The action is wrapped in a list before it is passed to step().
            next_state, reward, done, info = env.step([action])
            next_state=np.squeeze(next_state)
            agent.train(state, action, next_state, reward, done)
            env.render()
            total_reward += reward
            state = next_state

        print("Episode: {}, total_reward: {:.2f}".format(ep, total_reward.item()))
finally:
    # Release the environment (and its render window) even when the run is
    # interrupted part-way through.
    env.close()

Episode: 0, total_reward: -1177.24
Episode: 1, total_reward: -631.94
Episode: 2, total_reward: -1501.49
Episode: 3, total_reward: -970.62
Episode: 4, total_reward: -1549.31
Episode: 5, total_reward: -1824.75
Episode: 6, total_reward: -503.47
Episode: 7, total_reward: -1611.00
Episode: 8, total_reward: -1042.46
Episode: 9, total_reward: -1183.94
Episode: 10, total_reward: -969.89
Episode: 11, total_reward: -1354.03
Episode: 12, total_reward: -1736.33
Episode: 13, total_reward: -1681.31
Episode: 14, total_reward: -1744.11
Episode: 15, total_reward: -1201.09
Episode: 16, total_reward: -1309.20
Episode: 17, total_reward: -807.89
Episode: 18, total_reward: -374.59
Episode: 19, total_reward: -1498.48
Episode: 20, total_reward: -1780.03
Episode: 21, total_reward: -629.43
Episode: 22, total_reward: -1070.16
Episode: 23, total_reward: -1068.21
Episode: 24, total_reward: -864.34
Episode: 25, total_reward: -502.48
Episode: 26, total_reward: -985.53
Episode: 27, total_reward: -1805.78
Episode: 28,

In [24]:
# Scratch check: slicing with [:, :1, 0] on a (106, 106, 3) array keeps a
# length-1 second axis and drops the third one.
x=np.zeros((106,106,3))
r=x[:,:1,0]
r.shape

(106, 1)

In [2]:
import numpy as np