# Deep Q-Learning Network From Scratch

Algorithm: 
![dqn](https://i.imgur.com/uevfmj2.png)

In [1]:
import numpy as np
import gym
import random
from tqdm import tqdm

In [2]:
import pygame

def get_surface(rgb_array):
    surface = pygame.surfarray.make_surface(np.transpose(rgb_array, (1, 0, 2)))
    return surface
# utility function to view how our agent plays the cartpole, using pygame
# After done; this function will print the score (total reward)
def play(net,env):
    pygame.init()
    screen = pygame.display.set_mode((600,400))
    pygame.display.set_caption('CartPole')   

    state,_ = env.reset()
    done = False
    rewards = 0
    while not done:
        action = np.argmax(net.predict_single(state))
        state, r, done, _,_ = env.step(action)
        rewards += r
        surface = get_surface(env.render())

        screen.blit(surface, (0, 0))
        pygame.display.flip()
        
        for event in pygame.event.get():
            if event.type == pygame.QUIT: done = True
    
    print(rewards)
    pygame.quit()


## Implementation

In [3]:
# If enough data in replay memory, learn with that
def learn(data,batch_size):
    D.append(data)
    if len(D)<batch_size: return
    
    minibatch = random.sample(D,batch_size)
    X = np.zeros((batch_size, state_shape))
    y = np.zeros((batch_size, action_size))
    for i, (state, action, reward, nxt_state, done) in enumerate(minibatch):
        X[i] = state
        y_i = reward + (1-done) * gamma * np.max(Q_target.predict_single(nxt_state))
        y[i] = Q.predict_single(state)
        y[i][action] = y_i 
        # for Q(s_i,a_i) - y_i we let our network to compute Q(s_i,a_i), 
        # so every index except the action became zero:- [0,si_ai-y_i,0,0])**2
        
    Q.train_on_batch(X, y,epoch=1)
    

In [4]:
# Update Q' with Q weights
def update_target(target):
    for i in range(Q.L):
        W1,b1 = Q.NN[i]
        W2,b2 = target[i]
        W2[:] = W1[:]
        b2[:] = b1[:]
        

In [5]:
# Train for N episodes
def train(num_episode=100,batch_size=32,C=10,ep=10):
    global epsilon,best_score
    steps = 0
    for i in tqdm(range(1,num_episode+1)):
        episode_reward = 0
        episode_loss = 0

        # Sample Phase
        done = False
        nxt_state,_ = env.reset()
        while not done:
            state = nxt_state
            epsilon = min(epsilon_min,epsilon*epsilon_decay) # e decay

            # e-greedy(Q)
            if np.random.randn() < epsilon: action = np.random.randint(action_size)
            else:
                q_vals = Q.predict_single(state)
                action = np.argmax(q_vals)

            nxt_state,reward,done,_,_ = env.step(action)
            episode_reward += reward
            
            # Learining Phase
            learn((state,action,reward,nxt_state,done),batch_size)
            steps+=1
            
            if steps%C ==0: update_target(Q_target.NN)
        if episode_reward > best_score:
            best_score = episode_reward
        if i%ep==0: print(f"Episode: {i} Reward: {episode_reward}, Epsilon{epsilon}")


# Lets train our agent to play Cartpole

In [6]:
from nn import NeuralNetwork
import copy
from collections import deque

In [7]:
env = gym.make('CartPole-v1',render_mode= "rgb_array")

In [8]:
state, _ = env.reset()
len(state),env.action_space

(4, Discrete(2))

In [9]:
arch = [4,4,3,2]
af = ["sigmoid","relu","linear"]

In [10]:
# Q
Q = NeuralNetwork(arch,af,eta=5e-4,momentum=0,seed=8)

# Q'
Q_target = copy.deepcopy(Q) #Q' NeuralNetwork(same parms as above) then update_target(Q_target.NN) will also work

# Replay Memory
D = deque(maxlen=10000) # if D==maxlen and we append new data oldest one will get removed

action_size = 2 # Action Space
state_shape = 4 # State Size

# Epsilon
epsilon = 0.1
epsilon_min = 0.01
epsilon_decay = 0.995

# Gamma
gamma = 0.95
# Just to check the highest score obtained during training
best_score = -np.inf

In [11]:
train(1000,42,ep=100)

  if not isinstance(terminated, (bool, np.bool8)):
 11%|████████▎                                                                      | 106/1000 [00:03<00:27, 32.09it/s]

Episode: 100 Reward: 17.0, Epsilon1.2225933740973863e-05


 20%|███████████████▉                                                               | 201/1000 [00:09<02:07,  6.28it/s]

Episode: 200 Reward: 17.0, Epsilon1.5637104873087343e-08


 30%|███████████████████████▋                                                       | 300/1000 [00:26<02:08,  5.47it/s]

Episode: 300 Reward: 9.0, Epsilon2.5440332556796825e-11


 40%|███████████████████████████████▌                                               | 400/1000 [00:49<02:08,  4.66it/s]

Episode: 400 Reward: 14.0, Epsilon1.8100790524860866e-14


 50%|███████████████████████████████████████▌                                       | 501/1000 [01:12<01:38,  5.04it/s]

Episode: 500 Reward: 16.0, Epsilon1.629999629591796e-17


 60%|███████████████████████████████████████████████▍                               | 601/1000 [01:28<00:58,  6.85it/s]

Episode: 600 Reward: 26.0, Epsilon2.0639918509307174e-20


 70%|███████████████████████████████████████████████████████▎                       | 700/1000 [01:44<00:51,  5.81it/s]

Episode: 700 Reward: 10.0, Epsilon2.6664664756378626e-23


 80%|███████████████████████████████████████████████████████████████▏               | 800/1000 [02:03<00:35,  5.64it/s]

Episode: 800 Reward: 9.0, Epsilon2.2723748779428145e-26


 90%|███████████████████████████████████████████████████████████████████████▏       | 901/1000 [02:22<00:18,  5.25it/s]

Episode: 900 Reward: 14.0, Epsilon2.463292332391373e-29


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:02<00:00,  5.49it/s]

Episode: 1000 Reward: 12.0, Epsilon9.122259356643529e-35





In [12]:
best_score

89.0

In [13]:
play(Q,env)

37.0


## Saving The Network

In [14]:
from saveload import save_network,load_network

In [15]:
save_network(Q,"CartPoleScratchNet500") 
save_network(Q,"CartPoleScratchNetBetter")

## Loading Pretrained Network

when loading a network, please move that network out of the `networks` folder.

#### CartPoleScratchNet500
- Avg score 500 
- HyperParameters `(eta=5e-3,momentum=0.3,num_episodes=300,batch=42,C=10)`
#### CartPoleScratchNetBetter
- Avg score range `500<score<5000` (one time it got 6283)
- HyperParameters `(eta=5e-4,momentum=0,num_episodes=2000,batch=42,C=10)`


In [16]:
cpsn500 = load_network("CartPoleScratchNet500")

In [17]:
play(cpsn500,env)

33.0


In [18]:
cpsnb = load_network("CartPoleScratchNetBetter")

In [19]:
play(cpsnb,env)

34.0
