In [None]:
import numpy as np
import gym
import matplotlib.pyplot as plt

In [None]:
class DamGateControlEnv(gym.Env):
    """Toy dam-gate control environment with discrete gate positions.

    The state is a float array of length ``num_gates``; each entry is a gate
    position kept within ``[0, num_levels - 1]``. An action is an integer in
    ``[0, num_levels - 1]`` that shifts every gate by
    ``action - num_levels // 2``. The reward is the negative summed distance
    of the gates from the middle level ``num_levels // 2``.

    Args:
        num_gates: Number of gates tracked in the state array.
        num_levels: Number of discrete positions per gate (also the number
            of actions).
        max_steps: Episode length limit. ``step`` reports ``done=True`` once
            this many steps have been taken since the last ``reset``. Pass
            ``None`` to never terminate.
    """

    def __init__(self, num_gates=1, num_levels=8, max_steps=100):
        super(DamGateControlEnv, self).__init__()
        self.num_gates = num_gates
        self.num_levels = num_levels
        self.max_steps = max_steps
        # NOTE(review): this Discrete size matches the state only when
        # num_gates == 1; a multi-gate state is a vector, not a single index.
        self.observation_space = gym.spaces.Discrete(num_levels * num_gates)
        self.action_space = gym.spaces.Discrete(num_levels)
        self.state = np.zeros(num_gates)
        self.steps_taken = 0

    def reset(self):
        """Move all gates to position 0, restart the step counter, return the state."""
        self.state = np.zeros(self.num_gates)
        self.steps_taken = 0
        # Return a copy so callers never hold a reference to internal state.
        return self.state.copy()

    def step(self, action):
        """Apply ``action`` to every gate.

        Returns:
            A ``(state, reward, done, info)`` tuple, where ``info`` is an
            empty dict.
        """
        # Actions are centred on the middle level: values below it close the
        # gates, values above it open them.
        delta = action - (self.num_levels // 2)
        # Build a new array instead of updating in place, so arrays handed
        # out by earlier reset()/step() calls are not modified.
        self.state = np.clip(self.state + delta, 0, self.num_levels - 1)

        # Calculate the reward (you may need to customize this based on your problem)
        reward = -np.sum(np.abs(self.state - (self.num_levels // 2)))

        # The episode ends once the step budget is used up; without this the
        # environment would never signal termination.
        self.steps_taken += 1
        done = self.max_steps is not None and self.steps_taken >= self.max_steps

        return self.state.copy(), reward, done, {}



# Q-learning algorithm

In [None]:
def q_learning(env, num_episodes=1000, alpha=0.1, gamma=0.99, epsilon=0.1,
               plot_state=(0, 0), max_steps_per_episode=200):
    """Train a tabular Q-learning agent and plot its training curves.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``,
            ``action_space.sample()``, ``reset()`` and a ``step(action)`` that
            returns ``(next_state, reward, done, info)``.
        num_episodes: Number of training episodes.
        alpha: Learning rate of the Q-value update.
        gamma: Discount factor.
        epsilon: Probability of picking a random action (epsilon-greedy).
        plot_state: ``(state, action)`` pair whose Q-value is recorded after
            every episode and shown in the second subplot.
        max_steps_per_episode: Upper bound on steps per episode, so training
            finishes even if the environment never reports ``done``.

    Returns:
        ``(q_table, rewards_history)``: the learned table of shape
        ``(n_states, n_actions)`` and the list of per-episode total rewards.
    """
    q_table = np.zeros((env.observation_space.n, env.action_space.n))
    rewards_history = []
    q_value_history = []  # Q-value of `plot_state`, sampled once per episode
    tracked_pair = tuple(plot_state)

    for _ in range(num_episodes):
        state = _state_index(env.reset())
        total_reward = 0

        for _step in range(max_steps_per_episode):
            if np.random.rand() < epsilon:
                action = env.action_space.sample()  # Exploration
            else:
                action = int(np.argmax(q_table[state]))  # Exploitation

            next_state, reward, done, _info = env.step(action)
            next_state = _state_index(next_state)

            # Q-value update
            td_target = reward + gamma * np.max(q_table[next_state])
            q_table[state, action] += alpha * (td_target - q_table[state, action])

            state = next_state
            total_reward += reward

            if done:
                break

        rewards_history.append(total_reward)
        q_value_history.append(float(q_table[tracked_pair]))

    # Plot the rewards and Q-values
    plt.figure(figsize=(12, 6))

    # Plot rewards
    plt.subplot(1, 2, 1)
    plt.plot(rewards_history)
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('Training Progress')

    # Plot Q-values for the specified state-action pair
    plt.subplot(1, 2, 2)
    plt.plot(q_value_history)
    plt.xlabel('Episode')
    plt.ylabel('Q-value')
    plt.title(f'Q-values for State-Action Pair {plot_state}')

    plt.tight_layout()
    plt.show()

    return q_table, rewards_history


def _state_index(observation):
    """Convert an observation (one-element array or plain number) to an int Q-table row index."""
    return int(observation.item()) if hasattr(observation, 'item') else int(observation)

In [None]:
# Function to visualize the learned Q-values of one state, highlighting a specific action
def visualize_q_values(q_table, state_action_pair):
    """Draw a bar chart of one state's Q-values with the chosen action highlighted.

    A trained Q-table holds a single value per state-action pair, so there is
    no per-episode series to draw as a line. The chart instead shows the
    selected pair's value next to the values of the other actions available
    in the same state.

    Args:
        q_table: 2-D table of Q-values indexed as ``[state, action]``.
        state_action_pair: ``(state, action)`` indices of the pair to highlight.
    """
    state_idx, action_idx = state_action_pair
    action_values = np.asarray(q_table)[state_idx]
    actions = list(range(len(action_values)))
    # One colour per bar; the requested action gets the accent colour.
    colors = ['tab:orange' if a == action_idx else 'tab:blue' for a in actions]

    plt.bar(actions, action_values, color=colors)
    plt.xlabel('Action')
    plt.ylabel('Q-value')
    plt.title(f'Q-values for State-Action Pair {state_action_pair}')
    plt.show()

# Training the agent

In [None]:
# Create the environment with its default configuration (one gate, eight levels).
env = DamGateControlEnv()

In [None]:
# Train with q_learning's default hyperparameters; the call also draws the training plots.
# NOTE(review): no random seed is set anywhere visible, so results will differ between runs.
trained_q_table, rewards_history = q_learning(env)

In [None]:
# Inspect the learned Q-value for state 0 and action 0.
visualize_q_values(trained_q_table, (0, 0))

# Testing the trained agent

In [None]:
# Begin a fresh evaluation episode and turn the first observation into a Q-table row index.
done = False
initial_observation = env.reset()
if hasattr(initial_observation, 'item'):
    state = int(initial_observation.item())
else:
    state = int(initial_observation)

In [None]:
# Roll out the greedy policy from the current state. The rollout is capped so
# the cell always finishes, even if the environment never reports `done`.
MAX_TEST_STEPS = 20

for _ in range(MAX_TEST_STEPS):
    if done:
        break

    action = np.argmax(trained_q_table[state])
    next_state, reward, done, _ = env.step(action)
    next_state = int(next_state.item()) if hasattr(next_state, 'item') else int(next_state)

    print(f"Current State: {state}, Action: {action}, Next State: {next_state}, Reward: {reward}")

    state = next_state