In [4]:
import numpy as np
import random
class MazeEnv:
    """Grid maze in which an agent walks from a start cell to a goal cell.

    ``maze_layout`` is a list of rows; a cell equal to 1 is a wall and any
    other value is walkable. States are ``(row, col)`` tuples.
    """

    def __init__(self, maze_layout, start_state, goal_state):
        """Store the grid, its dimensions and the start/goal cells.

        The current state is initialised to ``start_state``.
        """
        self.maze = maze_layout
        self.rows, self.cols = len(maze_layout), len(maze_layout[0])
        self.start_state = start_state
        self.goal_state = goal_state
        self.state = start_state

    def reset(self):
        """Move the agent back to the start cell and return that state."""
        self.state = self.start_state
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``.

        Action encoding: 0 -> row + 1, 1 -> col + 1, 2 -> row - 1, 3 -> col - 1.
        Any other value leaves the position unchanged. A move that would leave
        the grid or enter a wall is ignored. The reward is 10 (with
        ``done=True``) whenever the resulting state is the goal, otherwise -1.
        ``info`` is always an empty dict.
        """
        # The comparisons evaluate to booleans that are used as 0/1 offsets.
        target_row = self.state[0] + (action == 0) - (action == 2)
        target_col = self.state[1] + (action == 1) - (action == 3)

        in_bounds = 0 <= target_row < self.rows and 0 <= target_col < self.cols
        if in_bounds and self.maze[target_row][target_col] != 1:
            self.state = (target_row, target_col)

        if self.state == self.goal_state:
            return self.state, 10, True, {}
        return self.state, -1, False, {}


In [9]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense
from collections import deque

class DDQNAgent:
    """Double DQN agent: online network, target network and replay memory.

    The online network selects the greedy next action and the target network
    evaluates it. Exploration is epsilon-greedy, with epsilon multiplied by
    ``epsilon_decay`` after every learning step until it drops to
    ``epsilon_min``.
    """

    def __init__(self, num_states, num_actions, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        """Create both networks and an empty replay memory.

        Args:
            num_states: Stored on the instance; the networks do not use it
                (their input is fixed to 2 features in ``_build_model``).
            num_actions: Number of discrete actions / network outputs.
            gamma: Discount factor applied to the bootstrapped value.
            epsilon: Initial probability of taking a random action.
            epsilon_decay: Multiplicative decay applied in ``replay``.
            epsilon_min: Once epsilon is at or below this value it stops decaying.
        """
        self.num_states = num_states
        self.num_actions = num_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.memory = deque(maxlen=2000)

        self.model = self._build_model()
        self.target_model = self._build_model()
        # Start with identical online and target weights.
        self.target_model.set_weights(self.model.get_weights())

    def _build_model(self):
        """Return a compiled MLP mapping a 2-feature state to one Q-value per action."""
        model = keras.Sequential([
            Dense(25, activation='relu', input_shape=(2,)),
            Dense(32, activation='relu'),
            Dense(self.num_actions)
        ])
        model.compile(loss='mse', optimizer='adam')
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory (oldest entries are evicted)."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an epsilon-greedy action index for ``state`` as a plain ``int``."""
        if np.random.rand() <= self.epsilon:
            return np.random.randint(self.num_actions)
        # verbose=0 avoids printing a progress bar for every single-state prediction.
        q_values = self.model.predict(np.expand_dims(state, axis=0), verbose=0)[0]
        return int(np.argmax(q_values))

    def replay(self, batch_size):
        """Run one Double-DQN minibatch update on ``batch_size`` sampled transitions.

        All predictions for the minibatch are computed in three batched calls
        and the online network is fitted once on the whole minibatch, instead
        of calling ``predict``/``fit`` separately for every transition.

        Returns without learning (and without decaying epsilon) when the
        memory holds fewer than ``batch_size`` transitions.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        states = np.array([t[0] for t in minibatch], dtype=np.float32)
        actions = np.array([t[1] for t in minibatch], dtype=np.int64)
        rewards = np.array([t[2] for t in minibatch], dtype=np.float32)
        next_states = np.array([t[3] for t in minibatch], dtype=np.float32)
        dones = np.array([t[4] for t in minibatch], dtype=bool)

        # Copy so the regression targets can be edited in place.
        q_targets = np.array(self.model.predict(states, verbose=0), dtype=np.float32)
        next_q_online = np.asarray(self.model.predict(next_states, verbose=0))
        next_q_target = np.asarray(self.target_model.predict(next_states, verbose=0))

        # Double DQN: the online network picks the action, the target network scores it.
        rows = np.arange(len(minibatch))
        best_next_actions = np.argmax(next_q_online, axis=1)
        bootstrap = next_q_target[rows, best_next_actions]
        # Terminal transitions use the raw reward with no bootstrapped term.
        updates = np.where(dones, rewards, rewards + self.gamma * bootstrap)

        # Only the taken action's Q-value changes; other outputs keep their
        # current prediction so they contribute zero error.
        q_targets[rows, actions] = updates
        self.model.fit(states, q_targets, batch_size=len(minibatch), epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())


In [None]:
import numpy as np 

# --- Create your maze ---
# 5x5 grid: cells equal to 1 are walls (MazeEnv.step refuses to enter them),
# every other cell is walkable.
maze_layout = [
    [0, 0, 1, 0, 0],
    [0, 0, 0, 0, 0],
    [0, 1, 0, 1, 0],
    [0, 0, 0, 0, 1], 
    [0, 0, 0, 0, 0],
]
# States are (row, col) tuples.
start_state = (0, 0)
goal_state = (4, 4)

# --- Initialize classes ---
env = MazeEnv(maze_layout, start_state, goal_state)
num_states = env.rows * env.cols  # Number of grid cells; DDQNAgent stores it but its network input is the 2-element (row, col) state
num_actions = 4  # MazeEnv.step encoding: 0 = row + 1, 1 = col + 1, 2 = row - 1, 3 = col - 1
agent = DDQNAgent(num_states, num_actions)

# --- Training Loop ---
num_episodes = 500
batch_size = 32
target_model_update_freq = 10 # Update target model every 10 episodes


for episode in range(num_episodes):
    state = env.reset()
    done = False
    
    # An episode only ends when env.step reports done (goal reached);
    # there is no cap on the number of steps.
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        state = next_state

        # Start replay only if there's enough experience
        # (replay runs after every environment step once the memory is large enough)
        if len(agent.memory) >= batch_size: 
            agent.replay(batch_size)

    # Sync the target network on episodes 0, 10, 20, ...
    if episode % target_model_update_freq == 0:
        agent.update_target_model()

    print(f"Episode {episode} finished")


# Note: You might want a way to visualize the agent's progress 
















In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model, layers
import random
from collections import deque
import matplotlib.pyplot as plt

class CustomMazeEnv:
    """Obstacle-free ``size`` x ``size`` grid world.

    The agent starts at ``(0, 0)`` and the goal is the opposite corner
    ``(size - 1, size - 1)``. States are ``(row, col)`` tuples.
    """

    def __init__(self, size=10):
        """Set up the grid size, start position, goal and action table."""
        self.size = size
        self.state = (0, 0)
        self.goal = (size - 1, size - 1)
        # Action index -> (row delta, col delta): right, down, left, up
        self.actions = [(0, 1), (1, 0), (0, -1), (-1, 0)]

    def reset(self):
        """Put the agent back at ``(0, 0)`` and return that state."""
        self.state = (0, 0)
        return self.state

    def step(self, action):
        """Apply the action with index ``action``; return ``(state, reward, done, info)``.

        A move that would leave the grid is ignored. The reward is 1 when the
        resulting state is the goal and -0.1 otherwise; ``done`` is True
        exactly when the agent is on the goal. ``info`` is an empty dict.
        """
        d_row, d_col = self.actions[action]
        row, col = self.state[0] + d_row, self.state[1] + d_col
        if 0 <= row < self.size and 0 <= col < self.size:
            self.state = (row, col)

        done = self.state == self.goal
        reward = 1 if done else -0.1
        return self.state, reward, done, {}

    def render(self):
        """Show the grid with matplotlib: goal cell 0.6, agent cell 0.3, rest 0."""
        grid = np.zeros((self.size, self.size))
        grid[self.goal] = 0.6
        # Written second, so the agent's value wins when it stands on the goal.
        grid[self.state] = 0.3
        plt.imshow(grid)
        plt.show()

class DQNNetwork(Model):
    """Q-network: two 24-unit ReLU layers followed by a linear output layer.

    The output layer has ``action_size`` units, one Q-value per action.
    """

    def __init__(self, action_size):
        """Create the three dense layers; ``action_size`` sets the output width."""
        super().__init__()
        self.fc1 = layers.Dense(24, activation='relu')
        self.fc2 = layers.Dense(24, activation='relu')
        self.fc3 = layers.Dense(action_size, activation='linear')

    def call(self, state):
        """Forward pass: feed ``state`` through fc1, fc2 and fc3 in order."""
        hidden = self.fc2(self.fc1(state))
        return self.fc3(hidden)

class DDQN:
    """Double DQN agent that trains a ``DQNNetwork`` on the given environment.

    Keeps an online network, a target network and a replay memory. Exploration
    is epsilon-greedy, with epsilon decayed once per ``replay`` call.
    """

    def __init__(self, env):
        """Create the networks, optimizer and replay memory for ``env``.

        ``env`` must provide ``reset()`` and ``step(action)`` returning
        ``(state, reward, done, info)``, as used by ``train``.
        """
        self.state_size = 2
        self.action_size = 4
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = DQNNetwork(self.action_size)
        self.target_model = DQNNetwork(self.action_size)
        # A subclassed Keras model has no variables until it has been called
        # once, so build both networks first; otherwise the initial sync below
        # would copy an empty weight list and leave the networks different.
        probe = tf.zeros((1, self.state_size), dtype=tf.float32)
        self.model(probe)
        self.target_model(probe)
        self.update_target_model()
        self.optimizer = tf.optimizers.Adam(learning_rate=self.learning_rate)
        self.env = env

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory (oldest entries are evicted)."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an epsilon-greedy action index for ``state`` as a plain ``int``."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = np.array([state], dtype=np.float32)
        act_values = self.model(state)
        return int(np.argmax(np.asarray(act_values[0])))

    def _td_target(self, reward, next_state, done):
        """Return the Double-DQN regression target for a single transition.

        For a terminal transition this is just ``reward``. Otherwise the
        online network chooses the greedy next action and the target network
        supplies that action's value: ``reward + gamma * Q_target(s', a*)``.
        """
        if done:
            return reward
        batch = np.array([next_state], dtype=np.float32)
        online_q = np.asarray(self.model(batch)[0])
        target_q = np.asarray(self.target_model(batch)[0])
        best_action = int(np.argmax(online_q))
        return float(reward + self.gamma * target_q[best_action])

    def replay(self, batch_size):
        """Train the online network on ``batch_size`` sampled transitions.

        Each sampled transition gets its own gradient step. Epsilon is decayed
        once at the end. Returns without doing anything when the memory holds
        fewer than ``batch_size`` transitions.
        """
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = self._td_target(reward, next_state, done)
            state = np.array([state], dtype=np.float32)
            with tf.GradientTape() as tape:
                predicted = self.model(state)
                # Regress only the taken action's Q-value towards the target;
                # the other entries equal the prediction, so their error is 0.
                target_f = predicted.numpy()
                target_f[0][action] = target
                loss = tf.reduce_mean(tf.square(target_f - predicted))
            grads = tape.gradient(loss, self.model.trainable_variables)
            self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def train(self, episodes=1000, batch_size=32):
        """Run ``episodes`` episodes, then save the online network's weights.

        After each episode one ``replay`` pass is run (once the memory holds
        more than ``batch_size`` transitions) and the target network is
        synced. Episodes end only when the environment reports ``done``.
        """
        for e in range(episodes):
            state = self.env.reset()
            done = False
            while not done:
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)
                self.remember(state, action, reward, next_state, done)
                state = next_state
            if len(self.memory) > batch_size:
                self.replay(batch_size)
            self.update_target_model()
            print(f"Episode: {e+1}/{episodes}, Epsilon: {self.epsilon:.2}")
        # DQNNetwork is a subclassed Keras model, which cannot be written as a
        # full model in HDF5 format; persist its weights instead. Restore them
        # by building a DQNNetwork and calling load_weights on this file.
        self.model.save_weights('ddqn_model.weights.h5')

# Driver: train a DDQN agent on a 10x10 CustomMazeEnv, then roll out one
# episode while rendering the grid after every step.
if __name__ == "__main__":
    env = CustomMazeEnv(size=10)
    agent = DDQN(env)
    agent.train(episodes=500)

    # Testing
    # NOTE(review): agent.act is still epsilon-greedy here (nothing in this
    # block changes agent.epsilon), and the loop has no step limit; it ends
    # only when env.step reports done.
    done = False
    state = env.reset()
    env.render()
    while not done:
        action = agent.act(state)
        next_state, _, done, _ = env.step(action)
        env.render()
        state = next_state


Episode: 1/500, Epsilon: 0.99
Episode: 2/500, Epsilon: 0.99
Episode: 3/500, Epsilon: 0.99
Episode: 4/500, Epsilon: 0.98
Episode: 5/500, Epsilon: 0.98
Episode: 6/500, Epsilon: 0.97
Episode: 7/500, Epsilon: 0.97
Episode: 8/500, Epsilon: 0.96
Episode: 9/500, Epsilon: 0.96
Episode: 10/500, Epsilon: 0.95
Episode: 11/500, Epsilon: 0.95
Episode: 12/500, Epsilon: 0.94
Episode: 13/500, Epsilon: 0.94
Episode: 14/500, Epsilon: 0.93
Episode: 15/500, Epsilon: 0.93
Episode: 16/500, Epsilon: 0.92
Episode: 17/500, Epsilon: 0.92
Episode: 18/500, Epsilon: 0.91
Episode: 19/500, Epsilon: 0.91
Episode: 20/500, Epsilon: 0.9
Episode: 21/500, Epsilon: 0.9
Episode: 22/500, Epsilon: 0.9
Episode: 23/500, Epsilon: 0.89
Episode: 24/500, Epsilon: 0.89
Episode: 25/500, Epsilon: 0.88
Episode: 26/500, Epsilon: 0.88
Episode: 27/500, Epsilon: 0.87
Episode: 28/500, Epsilon: 0.87
Episode: 29/500, Epsilon: 0.86
Episode: 30/500, Epsilon: 0.86
Episode: 31/500, Epsilon: 0.86
