<a href="https://colab.research.google.com/github/Armin-Abdollahi/Machine-Learning/blob/main/Deep_Q_Networks_(DQN).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Deep Q-Networks (DQN)

Deep Q-Networks (DQN) is a reinforcement learning algorithm that combines Q-Learning with deep neural networks. It uses a neural network to approximate the Q-values for each action in a given state, allowing it to handle environments with high-dimensional and continuous state spaces. DQN uses experience replay (storing past experiences and training on random batches) and a target network to stabilize training.

Here’s a basic implementation using TensorFlow and Gym, trained on the CartPole-v1 environment:

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import gym
import random
from collections import deque

# Set up the environment
env = gym.make("CartPole-v1")
num_actions = env.action_space.n # Number of actions; sizes the network's output layer
state_shape = env.observation_space.shape # Observation shape; used as the network's input shape

# Parameters
gamma = 0.99 # Discount factor applied to the bootstrapped next-state value
epsilon = 1.0 # Exploration rate: probability of picking a random action (starts fully random)
epsilon_min = 0.1 # Floor below which epsilon is never decayed
epsilon_decay = 0.995 # Multiplicative decay applied to epsilon at the end of each episode
learning_rate = 0.001 # Step size for the Adam optimizer
batch_size = 32 # Number of transitions sampled from memory per training call
memory = deque(maxlen=2000) # Replay buffer; oldest transitions are discarded once 2000 are stored

# Neural network model
def build_model():
  """Build and compile the Q-network.

  The network maps an observation of shape ``state_shape`` to one linear
  output per action (``num_actions`` outputs), through two hidden ReLU
  layers of 24 units each. It is compiled with the Adam optimizer at
  ``learning_rate`` and a mean-squared-error loss.

  Returns:
    A compiled ``tf.keras.Sequential`` model.
  """
  model = tf.keras.Sequential([
      # Declare the input with an explicit Input object instead of passing
      # `input_shape` to the first Dense layer, which newer Keras versions
      # warn against.
      tf.keras.Input(shape=state_shape),
      layers.Dense(24, activation='relu'),
      layers.Dense(24, activation='relu'),
      layers.Dense(num_actions, activation='linear')
  ])
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='mse')
  return model

# Create the online network and a second network with the same architecture
# to serve as the target network, then copy the online weights across so both
# start out identical.
model, target_model = build_model(), build_model()
target_model.set_weights(model.get_weights())

# Training function
def train_model():
  """Run one minibatch Q-learning update on ``model`` from replay memory.

  Does nothing until ``memory`` holds at least ``batch_size`` transitions.
  Otherwise samples ``batch_size`` transitions uniformly at random, computes
  the target ``reward + gamma * max_a Q_target(next_state, a)`` for each one
  (just ``reward`` when the transition ended the episode), and fits ``model``
  so that the Q-value of the action actually taken moves toward that target.

  The whole minibatch is handled with one forward pass per network and a
  single ``fit`` call, instead of separate predict/fit calls per transition.
  """
  if len(memory) < batch_size:
    return
  batch = random.sample(memory, batch_size)

  # Each stored state is a (1, state_dim) array, so concatenating along
  # axis 0 yields a (batch_size, state_dim) batch.
  states = np.concatenate([transition[0] for transition in batch], axis=0)
  actions = np.array([transition[1] for transition in batch])
  rewards = np.array([transition[2] for transition in batch], dtype=np.float32)
  next_states = np.concatenate([transition[3] for transition in batch], axis=0)
  dones = np.array([transition[4] for transition in batch], dtype=bool)

  # Bootstrap from the target network; terminal transitions use the reward only.
  max_next_q = np.amax(target_model.predict(next_states, verbose=0), axis=1)
  targets = np.where(dones, rewards, rewards + gamma * max_next_q)

  # Start from the current predictions so that only the Q-value of the action
  # taken contributes to the loss; the other outputs are left unchanged.
  target_f = model.predict(states, verbose=0)
  target_f[np.arange(batch_size), actions] = targets
  model.fit(states, target_f, batch_size=batch_size, epochs=1, verbose=0)

# DQN algorithm
# Epsilon-greedy DQN training: one environment episode per outer iteration.
num_episodes = 1000
for episode in range(num_episodes):
  # Gym >= 0.26 returns (observation, info) from reset(); older versions
  # return only the observation.
  reset_result = env.reset()
  state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
  state = np.reshape(state, [1, state_shape[0]])
  for time in range(500):
    if np.random.rand() <= epsilon:
      action = random.randrange(num_actions) # Exploration
    else:
      # verbose=0 stops Keras printing a progress bar on every single step.
      action = np.argmax(model.predict(state, verbose=0)[0]) # Exploitation
    # Gym >= 0.26 returns (obs, reward, terminated, truncated, info) from
    # step(); older versions return (obs, reward, done, info).
    step_result = env.step(action)
    if len(step_result) == 5:
      next_state, reward, terminated, truncated, _ = step_result
      done = terminated or truncated
    else:
      next_state, reward, done, _ = step_result
    reward = reward if not done else -10 # Penalize the step that ends the episode
    next_state = np.reshape(next_state, [1, state_shape[0]])
    memory.append((state, action, reward, next_state, done))
    state = next_state
    if done:
      target_model.set_weights(model.get_weights()) # Update target network
      epsilon = max(epsilon_min, epsilon * epsilon_decay) # Decay epsilon
      print(f"Episode: {episode + 1}/{num_episodes}, Score: {time}, Epsilon: {epsilon:.2}")
      break
    # Actually invoke the training step: the bare name `train_model` is a
    # no-op expression, which left the network untrained.
    train_model()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Episode: 563/1000, Score: 9, Epsilon: 0.1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━