In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

class MazeEnv:
    """Square grid maze with random obstacles, a fixed start and a fixed goal.

    Cell values stored in ``self.maze``: ``0`` free, ``-1`` obstacle,
    ``0.5`` goal. The agent always starts at ``(0, 0)``.

    Actions are integers 0..3 indexing ``_MOVES``; each entry is a
    ``(row_offset, col_offset)`` pair added to the current position.

    Args:
        size: Side length of the square grid.
        obstacles: Number of obstacle cells placed on every ``reset``.
        goal: ``(row, col)`` of the goal cell; must lie inside the grid.

    Raises:
        ValueError: If ``goal`` is outside the grid or more obstacles are
            requested than there are free cells.
        RuntimeError: If no obstacle layout that keeps the goal reachable
            is found within ``_MAX_LAYOUT_ATTEMPTS`` tries.
    """

    # (row_offset, col_offset) for actions 0, 1, 2, 3.
    _MOVES = ((0, -1), (0, 1), (-1, 0), (1, 0))
    # Upper bound on rejection-sampling retries in ``_add_obstacles``.
    _MAX_LAYOUT_ATTEMPTS = 1000

    def __init__(self, size=5, obstacles=2, goal=(4, 4)):
        goal = tuple(goal)
        if len(goal) != 2 or not all(0 <= g < size for g in goal):
            raise ValueError(
                f"goal {goal} must be a (row, col) inside a {size}x{size} grid"
            )
        self.size = size
        self.obstacles = obstacles
        self.goal = goal
        self.state = None
        self.reset()

    def reset(self):
        """Rebuild the maze with fresh obstacles and return the start state."""
        self.state = (0, 0)
        self.maze = np.zeros((self.size, self.size))
        self._add_obstacles()
        self.maze[self.goal] = 0.5  # Mark goal on the maze
        return self.state

    def _add_obstacles(self):
        """Place ``self.obstacles`` distinct obstacles without blocking the goal.

        Cells are drawn without replacement so the requested count is always
        honoured, and a layout is only accepted when the goal can still be
        reached from the start; otherwise an episode could never terminate.
        """
        if self.obstacles <= 0:
            return

        candidates = [
            (row, col)
            for row in range(self.size)
            for col in range(self.size)
            if (row, col) != self.goal and (row, col) != (0, 0)
        ]
        if self.obstacles > len(candidates):
            raise ValueError(
                f"cannot place {self.obstacles} obstacles in "
                f"{len(candidates)} free cells"
            )

        for _ in range(self._MAX_LAYOUT_ATTEMPTS):
            picks = np.random.choice(
                len(candidates), size=self.obstacles, replace=False
            )
            blocked = {candidates[i] for i in picks}
            if self._goal_reachable(blocked):
                for cell in blocked:
                    self.maze[cell] = -1
                return

        raise RuntimeError(
            "could not find an obstacle layout that keeps the goal reachable"
        )

    def _goal_reachable(self, blocked):
        """Return True if the goal can be reached from (0, 0) avoiding ``blocked``."""
        seen = {(0, 0)}
        frontier = [(0, 0)]
        while frontier:
            row, col = frontier.pop()
            if (row, col) == self.goal:
                return True
            for d_row, d_col in self._MOVES:
                nxt = (row + d_row, col + d_col)
                inside = 0 <= nxt[0] < self.size and 0 <= nxt[1] < self.size
                if inside and nxt not in blocked and nxt not in seen:
                    seen.add(nxt)
                    frontier.append(nxt)
        return False

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``.

        A move that would leave the grid or enter an obstacle leaves the
        agent where it is. The reward is ``1`` on reaching the goal and
        ``-0.1`` for every other step. ``info`` is always an empty dict.
        """
        d_row, d_col = self._MOVES[action]
        row = self.state[0] + d_row
        col = self.state[1] + d_col

        if 0 <= row < self.size and 0 <= col < self.size:
            if self.maze[row, col] != -1:  # Check for obstacles
                # Keep the state as plain Python ints.
                self.state = (int(row), int(col))

        done = self.state == self.goal
        reward = 1 if done else -0.1
        return self.state, reward, done, {}

    def render(self):
        """Print the maze array with the agent's cell marked as ``0.75``."""
        maze_copy = self.maze.copy()
        maze_copy[self.state] = 0.75  # Mark agent on the maze
        print(maze_copy)


class DuelingDQN:
    """Dueling deep Q-network agent trained online, one transition at a time.

    The network splits into a scalar state-value stream and a per-action
    advantage stream, recombined as ``V + (A - mean(A))``.

    Args:
        state_size: Number of features in a state vector.
        action_size: Number of discrete actions.
        gamma: Discount factor applied to the bootstrapped future value.
        epsilon: Initial probability of taking a random action in ``act``.
        epsilon_decay: Multiplicative decay applied to ``epsilon`` after
            every ``train`` call.
        epsilon_min: Lower bound for ``epsilon``.
        learning_rate: Adam learning rate.
    """

    def __init__(self, state_size, action_size, gamma=0.95, epsilon=1.0,
                 epsilon_decay=0.995, epsilon_min=0.01, learning_rate=0.001):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.learning_rate = learning_rate
        self.model = self._build_model()

    @staticmethod
    def _combine_streams(streams):
        """Merge ``[value, advantages]`` into Q-values: V + (A - mean(A))."""
        value, advantages = streams
        centered = advantages - tf.reduce_mean(advantages, axis=1, keepdims=True)
        return value + centered

    def _build_model(self):
        """Build and compile the dueling Q-network."""
        inputs = layers.Input(shape=(self.state_size,))
        fc1 = layers.Dense(24, activation='relu')(inputs)
        fc2 = layers.Dense(24, activation='relu')(fc1)

        # Dueling Networks
        fc_value = layers.Dense(24, activation='relu')(fc2)
        value = layers.Dense(1, activation='linear')(fc_value)

        fc_advantages = layers.Dense(24, activation='relu')(fc2)
        advantages = layers.Dense(self.action_size, activation='linear')(fc_advantages)

        # Raw tf ops must be wrapped in a layer: Keras 3 refuses to apply
        # TensorFlow functions directly to symbolic Keras tensors.
        output = layers.Lambda(
            self._combine_streams, output_shape=(self.action_size,)
        )([value, advantages])

        model = models.Model(inputs=inputs, outputs=output)
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
            loss='mse',
        )
        return model

    def _q_values(self, state):
        """Return a writable array of Q-values for a batch of states.

        The model is called directly instead of via ``predict``; for a
        single small batch this avoids ``predict``'s per-call overhead and
        its progress-bar output.
        """
        batch = np.asarray(state, dtype=np.float32)
        return np.array(self.model(batch, training=False))

    def train(self, state, action, reward, next_state, done):
        """Run one Q-learning update for a single transition.

        ``state`` and ``next_state`` are arrays with a leading batch
        dimension of 1. The target for ``action`` is ``reward`` on terminal
        transitions, otherwise ``reward + gamma * max_a Q(next_state, a)``.
        Exploration is decayed after each update.
        """
        target = self._q_values(state)
        if done:
            target[0][action] = reward
        else:
            q_future = np.max(self._q_values(next_state)[0])
            target[0][action] = reward + self.gamma * q_future

        self.model.train_on_batch(np.asarray(state, dtype=np.float32), target)

        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def act(self, state):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value. Without the
        random branch an untrained network can repeat a blocked move forever.
        """
        if np.random.rand() < self.epsilon:
            return int(np.random.randint(self.action_size))
        return int(np.argmax(self._q_values(state)[0]))


if __name__ == "__main__":
    # Train a DuelingDQN agent on a freshly randomised 5x5 maze per episode.
    env = MazeEnv(size=5, obstacles=2)
    state_size = 2
    action_size = 4
    episodes = 1000
    # Cap the episode length so a policy that keeps bumping into a wall
    # (or an unsolvable layout) cannot stall training forever.
    max_steps_per_episode = 200

    agent = DuelingDQN(state_size, action_size)

    for e in range(episodes):
        # The network expects a batch dimension: shape (1, state_size).
        state = np.reshape(env.reset(), [1, state_size])

        for _step in range(max_steps_per_episode):
            action = agent.act(state)
            next_state, reward, done, _info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            agent.train(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        if (e + 1) % 100 == 0:
            print(f"Episode: {e + 1}/{episodes}")

    print("Training finished.")




In [8]:
import numpy as np

class MazeEnv:
    """Grid maze defined by an explicit layout.

    Args:
        maze_layout: Rectangular list of rows; ``1`` marks a wall, any
            other value is treated as walkable.
        start_state: ``(row, col)`` where every episode begins.
        goal_state: ``(row, col)`` that ends the episode when reached.
    """

    # (row_offset, col_offset) per action. Row indices grow downwards, so
    # action 0 moves down, 1 right, 2 up and 3 left in the printed maze.
    _MOVES = {0: (1, 0), 1: (0, 1), 2: (-1, 0), 3: (0, -1)}

    def __init__(self, maze_layout, start_state, goal_state):
        self.maze = maze_layout
        self.rows = len(maze_layout)
        self.cols = len(maze_layout[0])
        self.start_state = start_state
        self.goal_state = goal_state
        self.state = start_state

    def reset(self):
        """Move the agent back to the start and return that state."""
        self.state = self.start_state
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``.

        Moves that would leave the grid or enter a wall keep the agent in
        place, as does an action outside 0..3. Each step costs ``-1``;
        standing on the goal after the step yields ``10`` and ends the
        episode. ``info`` is always an empty dict.
        """
        d_row, d_col = self._MOVES.get(action, (0, 0))
        new_row = self.state[0] + d_row
        new_col = self.state[1] + d_col

        # Check bounds
        if 0 <= new_row < self.rows and 0 <= new_col < self.cols:
            if self.maze[new_row][new_col] != 1:  # Not a wall
                self.state = (int(new_row), int(new_col))

        if self.state == self.goal_state:
            return self.state, 10, True, {}  # Reward for reaching the goal
        return self.state, -1, False, {}  # Default move penalty

    def render(self):
        """Print the maze: ``A`` agent, ``_`` free cell, ``X`` anything else.

        The picture is built from characters rather than by writing ``'A'``
        into a numeric copy of the layout, which numpy rejects for an
        integer array.
        """
        agent_cell = tuple(self.state)
        for r, row in enumerate(self.maze):
            symbols = []
            for c, cell in enumerate(row):
                if (r, c) == agent_cell:
                    symbols.append('A')
                elif cell == 0:
                    symbols.append('_')
                else:
                    symbols.append('X')
            print(" ".join(symbols))

    
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense, Input, concatenate
from collections import deque

class DuelingDQNAgent:
    """Dueling DQN agent with epsilon-greedy exploration and experience replay.

    Args:
        num_states: Stored for reference only; the network input width is
            ``state_dim``, not this value.
        num_actions: Number of discrete actions.
        gamma: Discount factor for bootstrapped future value.
        epsilon: Initial probability of acting randomly.
        epsilon_decay: Multiplicative decay applied after each ``replay``.
        epsilon_min: Once ``epsilon`` is at or below this, decay stops.
        state_dim: Number of features per state fed to the network
            (``2`` for a ``(row, col)`` position).
    """

    def __init__(self, num_states, num_actions, gamma=0.99, epsilon=1.0,
                 epsilon_decay=0.995, epsilon_min=0.01, state_dim=2):
        self.num_states = num_states
        self.num_actions = num_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.state_dim = state_dim
        self.memory = deque(maxlen=2000)

        self.model = self._build_model()

    @staticmethod
    def _combine_streams(streams):
        """Merge ``[value, advantage]`` into Q-values: V + (A - mean(A))."""
        value, advantage = streams
        return value + (advantage - tf.reduce_mean(advantage, axis=1, keepdims=True))

    def _build_model(self):
        """Build and compile the dueling Q-network."""
        inputs = Input(shape=(self.state_dim,))
        hidden = Dense(32, activation='relu')(inputs)
        value = Dense(1)(hidden)  # State Value
        advantage = Dense(self.num_actions)(hidden)  # Advantage for each action

        # Raw tf ops must be wrapped in a layer: Keras 3 refuses to apply
        # TensorFlow functions directly to symbolic Keras tensors.
        output = keras.layers.Lambda(
            self._combine_streams, output_shape=(self.num_actions,)
        )([value, advantage])

        model = keras.Model(inputs=inputs, outputs=output)
        model.compile(loss='mse', optimizer='adam')
        return model

    def _q_values(self, states):
        """Return a writable array of Q-values for a batch of states.

        Calls the model directly rather than ``predict`` to avoid the
        per-call overhead and progress output on tiny batches.
        """
        batch = np.asarray(states, dtype=np.float32)
        return np.array(self.model(batch, training=False))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return a random action with probability ``epsilon``, else the greedy one."""
        if np.random.rand() <= self.epsilon:
            return np.random.randint(self.num_actions)
        q_values = self._q_values(np.expand_dims(state, axis=0))[0]
        return np.argmax(q_values)

    def replay(self, batch_size):
        """Train on a random minibatch from memory, then decay ``epsilon``.

        At most ``len(self.memory)`` transitions are sampled; with an empty
        memory the call is a no-op. Targets for the whole minibatch are
        computed in two forward passes and applied in one gradient step,
        instead of three Keras calls per sampled transition.
        """
        # Imported here because the cell defining this class does not
        # import ``random`` itself.
        import random

        sample_size = min(batch_size, len(self.memory))
        if sample_size <= 0:
            return

        minibatch = random.sample(self.memory, sample_size)
        states = np.array([t[0] for t in minibatch], dtype=np.float32)
        actions = np.array([t[1] for t in minibatch], dtype=np.int64)
        rewards = np.array([t[2] for t in minibatch], dtype=np.float32)
        next_states = np.array([t[3] for t in minibatch], dtype=np.float32)
        terminal = np.array([t[4] for t in minibatch], dtype=bool)

        q_values = self._q_values(states)
        best_next = self._q_values(next_states).max(axis=1)

        # Terminal transitions do not bootstrap from the next state.
        q_update = rewards + self.gamma * best_next * (~terminal)
        q_values[np.arange(sample_size), actions] = q_update

        self.model.train_on_batch(states, q_values)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [None]:
import numpy as np 
import random
# --- Create your maze ---
# 0 = free cell, 1 = wall.
maze_layout = [
    [0, 0, 1, 0, 0],
    [0, 0, 0, 0, 0],
    [0, 1, 0, 1, 0],
    [0, 0, 0, 0, 1],
    [0, 0, 0, 0, 0],
]
start_state = (0, 0)
goal_state = (4, 4)

# --- Initialize classes ---
env = MazeEnv(maze_layout, start_state, goal_state)
num_states = env.rows * env.cols
num_actions = 4
agent = DuelingDQNAgent(num_states, num_actions)

# --- Training Loop ---
num_episodes = 500
batch_size = 32
# Bound each episode so a policy stuck against a wall cannot stall training.
max_steps_per_episode = 500

for episode in range(num_episodes):
    state = env.reset()

    for _step in range(max_steps_per_episode):
        action = agent.act(state)
        next_state, reward, done, _info = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        state = next_state

        # Start replay only if there's enough experience
        if len(agent.memory) >= batch_size:
            agent.replay(batch_size)

        if done:
            break

    print(f"Episode {episode} finished")





