## Q-6 Reinforcement Learning

Implement Reinforcement Learning using an example of a maze environment that the 
agent needs to explore.

In [1]:
import numpy as np

def create_maze():
    rows = int(input("Enter the number of rows: "))
    cols = int(input("Enter the number of columns: "))
    maze = np.zeros((rows, cols), dtype=int)
    print("Enter the maze layout row by row (0 for free space, 1 for wall):")
    for row in range(rows):
        while True:
            try:
                row_data = list(map(int, input().strip().split()))
                if len(row_data) != cols:
                    raise ValueError(f"Expected {cols} columns, but got {len(row_data)}.")
                maze[row] = row_data
                break
            except ValueError as e:
                print(f"Error: {e}. Please enter {cols} integers separated by spaces.")
    return maze

maze = create_maze()

class QLearningAgent:
    def __init__(self, num_states, num_actions, learning_rate=0.1, discount_factor=0.9, exploration_prob=0.2):
        self.num_states = num_states
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob
        self.q_table = np.zeros((num_states, num_actions))

    def choose_action(self, state):
        if np.random.rand() < self.exploration_prob:
            return np.random.choice(self.num_actions)
        else:
            return np.argmax(self.q_table[state])

    def learn(self, state, action, reward, next_state):
        predicted = self.q_table[state, action]
        target = reward + self.discount_factor * np.max(self.q_table[next_state])
        self.q_table[state, action] += self.learning_rate * (target - predicted)

# Map 2D maze coordinates to 1D state space
def get_state(row, col, cols):
    return row * cols + col

def get_coordinates(state, cols):
    return divmod(state, cols)

# Initialize agent
num_states = maze.size
num_actions = 4

initial_state = get_state(0, 0, maze.shape[1])
goal_state = get_state(maze.shape[0] - 1, maze.shape[1] - 1, maze.shape[1])

agent = QLearningAgent(num_states, num_actions)

def train_agent(agent, num_episodes=1000):
    for episode in range(num_episodes):
        state = initial_state
        done = False
        while not done:
            action = agent.choose_action(state)
            row, col = get_coordinates(state, maze.shape[1])
            next_state = state
            if action == 0:  # Move Up
                if row > 0:
                    next_row, next_col = row - 1, col
                    next_state = get_state(next_row, next_col, maze.shape[1])
            elif action == 1:  # Move Down
                if row < maze.shape[0] - 1:
                    next_row, next_col = row + 1, col
                    next_state = get_state(next_row, next_col, maze.shape[1])
            elif action == 2:  # Move Left
                if col > 0:
                    next_row, next_col = row, col - 1
                    next_state = get_state(next_row, next_col, maze.shape[1])
            elif action == 3:  # Move Right
                if col < maze.shape[1] - 1:
                    next_row, next_col = row, col + 1
                    next_state = get_state(next_row, next_col, maze.shape[1])

            if (0 <= next_state < num_states) and (maze.flat[next_state] == 0):  # Check if the move is valid
                if next_state == goal_state:
                    reward = 1  # Reached the goal
                    done = True
                else:
                    reward = 0  # Moved to an empty cell
                agent.learn(state, action, reward, next_state)
                state = next_state

train_agent(agent, num_episodes=1000)

def test_agent(agent):
    state = initial_state
    while state != goal_state:
        action = agent.choose_action(state)
        row, col = get_coordinates(state, maze.shape[1])
        print(f"Current State: ({row}, {col}), Chosen Action: {action}")
        if action == 0:
            state = get_state(row - 1, col, maze.shape[1])
        elif action == 1:
            state = get_state(row + 1, col, maze.shape[1])
        elif action == 2:
            state = get_state(row, col - 1, maze.shape[1])
        elif action == 3:
            state = get_state(row, col + 1, maze.shape[1])
        print(f"New State: ({get_coordinates(state, maze.shape[1])})")
    print("Agent reached the goal!")

test_agent(agent)

Enter the number of rows:  3
Enter the number of columns:  3


Enter the maze layout row by row (0 for free space, 1 for wall):


 0 0 0
 1 1 0
 0 0 0


Current State: (0, 0), Chosen Action: 3
New State: ((0, 1))
Current State: (0, 1), Chosen Action: 3
New State: ((0, 2))
Current State: (0, 2), Chosen Action: 1
New State: ((1, 2))
Current State: (1, 2), Chosen Action: 1
New State: ((2, 2))
Agent reached the goal!


In [1]:
import numpy as np

class TicTacToeEnvironment:
    def __init__(self):
        self.state = [0] * 9
        self.is_terminal = False

    def reset(self):
        self.state = [0] * 9
        self.is_terminal = False

    def get_available_moves(self):
        return [i for i, mark in enumerate(self.state) if mark == 0]

    def make_move(self, move, player_mark):
        self.state[move] = player_mark

    def check_win(self, player_mark):
        winning_states = [
            [0, 1, 2], [3, 4, 5], [6, 7, 8],  # rows
            [0, 3, 6], [1, 4, 7], [2, 5, 8],  # columns
            [0, 4, 8], [2, 4, 6]              # diagonals
        ]
        for state_indices in winning_states:
            if all(self.state[i] == player_mark for i in state_indices):
                self.is_terminal = True
                return True
        return False

    def is_draw(self):
        return 0 not in self.state


class QLearningAgent:
    def __init__(self, learning_rate=0.9, discount_factor=0.9, exploration_rate=0.3):
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.q_table = np.zeros((3 ** 9, 9))

    def get_state_index(self, state):
        state_index = 0
        for i, mark in enumerate(state):
            state_index += (3 ** i) * (mark + 1)
        return state_index

    def choose_action(self, state, available_moves):
        state_index = self.get_state_index(state)
        if np.random.random() < self.exploration_rate:
            return np.random.choice(available_moves)
        else:
            return np.argmax(self.q_table[state_index, available_moves])

    def update_q_table(self, state, action, next_state, reward):
        state_index = self.get_state_index(state)
        next_state_index = self.get_state_index(next_state) if next_state is not None else None
        max_q_value = np.max(self.q_table[next_state_index]) if next_state is not None else 0
        self.q_table[state_index, action] = (1 - self.learning_rate) * self.q_table[state_index, action] + \
            self.learning_rate * (reward + self.discount_factor * max_q_value)


def evaluate_agents(agent1, agent2, num_episodes=1000):
    environment = TicTacToeEnvironment()
    agent1_wins = 0
    agent2_wins = 0
    draws = 0

    for _ in range(num_episodes):
        environment.reset()
        current_agent = agent1

        while not environment.is_terminal:
            available_moves = environment.get_available_moves()
            current_state = environment.state.copy()
            action = current_agent.choose_action(current_state, available_moves)
            environment.make_move(action, 1 if current_agent == agent1 else -1)

            if environment.check_win(1 if current_agent == agent1 else -1):
                current_agent.update_q_table(current_state, action, None, 10)
                if current_agent == agent1:
                    agent1_wins += 1
                else:
                    agent2_wins += 1
                break
            elif environment.is_draw():
                current_agent.update_q_table(current_state, action, None, 0)
                draws += 1
                break

            next_state = environment.state.copy()
            reward = 0
            if environment.check_win(1 if current_agent == agent1 else -1):
                reward = -10
            current_agent.update_q_table(current_state, action, next_state, reward)
            current_agent = agent2 if current_agent == agent1 else agent1

    return agent1_wins, agent2_wins, draws


# Create agents
agent1 = QLearningAgent()
agent2 = QLearningAgent()

# Evaluate agents
agent1_wins, agent2_wins, draws = evaluate_agents(agent1, agent2)

# Print results
print(f"Agent 1 wins: {agent1_wins}")
print(f"Agent 2 wins: {agent2_wins}")
print(f"Draws: {draws}")


Agent 1 wins: 453
Agent 2 wins: 486
Draws: 61


In [3]:
pip install numpy tensorflow




In [4]:
import numpy as np

class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3))  # 3x3 board initialized to zeros
        self.current_player = 1  # Player 1 starts

    def reset(self):
        self.board = np.zeros((3, 3))
        self.current_player = 1

    def make_move(self, row, col):
        if self.board[row, col] == 0:  # Check if the cell is empty
            self.board[row, col] = self.current_player
            self.current_player = 3 - self.current_player  # Switch players
            return True
        return False

    def check_winner(self):
        # Check rows, columns, and diagonals for a winner
        for i in range(3):
            if abs(sum(self.board[i, :])) == 3:  # Check rows
                return int(self.board[i, 0])
            if abs(sum(self.board[:, i])) == 3:  # Check columns
                return int(self.board[0, i])
        if abs(sum(self.board.diagonal())) == 3:  # Check diagonal
            return int(self.board[0, 0])
        if abs(sum(np.fliplr(self.board).diagonal())) == 3:  # Check anti-diagonal
            return int(self.board[0, 2])
        if not np.any(self.board == 0):  # Check for draw
            return 0
        return None  # Game ongoing

    def get_available_moves(self):
        return np.argwhere(self.board == 0)

    def render(self):
        print(self.board)


In [5]:
import tensorflow as tf
from tensorflow.keras import layers

class QLearningModel:
    def __init__(self):
        self.model = self.build_model()

    def build_model(self):
        model = tf.keras.Sequential()
        model.add(layers.Input(shape=(9,)))
        model.add(layers.Dense(24, activation='relu'))
        model.add(layers.Dense(24, activation='relu'))
        model.add(layers.Dense(9, activation='linear'))  # 9 possible moves
        model.compile(optimizer='adam', loss='mse')
        return model

    def predict(self, state):
        state = state.flatten().reshape(1, -1)
        return self.model.predict(state).flatten()

    def train(self, states, targets):
        self.model.fit(np.array(states), np.array(targets), epochs=10, verbose=0)


In [6]:
import random

def train_agent(episodes):
    model = QLearningModel()
    gamma = 0.9  # Discount factor
    epsilon = 1.0  # Exploration rate
    epsilon_decay = 0.99
    min_epsilon = 0.1
    batch_size = 32

    for episode in range(episodes):
        game = TicTacToe()
        states, targets = [], []
        while True:
            state = game.board.copy()
            if random.random() < epsilon:  # Exploration
                move = random.choice(game.get_available_moves())
            else:  # Exploitation
                q_values = model.predict(state)
                available_moves = game.get_available_moves()
                move = available_moves[np.argmax(q_values[available_moves[:, 0] * 3 + available_moves[:, 1]])]

            game.make_move(move[0], move[1])
            winner = game.check_winner()
            if winner is not None:
                target = 1 if winner == 1 else -1 if winner == 2 else 0
                target_q_values = model.predict(state)
                target_q_values[move[0] * 3 + move[1]] = target
                states.append(state)
                targets.append(target_q_values)
                break
            else:
                states.append(state)
                targets.append(model.predict(state))

        if len(states) >= batch_size:
            model.train(states, targets)

        if epsilon > min_epsilon:
            epsilon *= epsilon_decay

train_agent(10000)  # Adjust the number of episodes as needed


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2

KeyboardInterrupt: 

In [None]:
def test_agent(num_games):
    wins = 0
    draws = 0
    for _ in range(num_games):
        game = TicTacToe()
        while True:
            state = game.board.copy()
            q_values = model.predict(state)
            available_moves = game.get_available_moves()
            move = available_moves[np.argmax(q_values[available_moves[:, 0] * 3 + available_moves[:, 1]])]
            game.make_move(move[0], move[1])
            winner = game.check_winner()
            if winner is not None:
                if winner == 1:
                    wins += 1
                elif winner == 0:
                    draws += 1
                break
    print(f"Wins: {wins}, Draws: {draws}")

test_agent(1000)  # Test the agent
