In [None]:
import numpy as np
from gym import Env
from gym.spaces import Discrete, Box
import random
import matplotlib.pyplot as plt


In [None]:
import time

def change_variable_every_second(x, interval=1, max_iterations=None):
    """Print a running total and grow it by ``x`` after every pause.

    Args:
        x: Increment added to the running total on every iteration.
        interval: Seconds to sleep between iterations (default 1, matching
            the function name).
        max_iterations: Number of iterations to run. ``None`` (the default)
            keeps the original behaviour of looping until interrupted.

    Returns:
        The running total after the last iteration. Only reachable when
        ``max_iterations`` is given.
    """
    variable_to_change = 0
    iterations = 0

    # With max_iterations=None this is the original endless loop; a finite
    # value lets the cell finish under "Restart & Run All".
    while max_iterations is None or iterations < max_iterations:
        print("Current value:", variable_to_change)
        variable_to_change += x
        time.sleep(interval)
        iterations += 1

    return variable_to_change


In [None]:
class DamGateControlEnv(Env):
    """Gym environment in which the agent picks a gate setting to steer a water level.

    The observation is the current water level (a single float) and the action
    is an integer in ``range(num_levels)``. Each call to :meth:`step` simulates
    ``SECONDS_PER_STEP`` one-second updates in which the level rises with
    ``total_inflow`` and falls in proportion to the chosen action.

    The reward is +1 while the level sits in ``(TARGET_LOW, TARGET_HIGH]`` and
    -1 otherwise. An episode ends when the level exceeds ``TARGET_HIGH`` or
    after ``max_steps`` steps.
    """

    # Bounds of the water level; they match the Box observation space.
    MIN_LEVEL = 0.0
    MAX_LEVEL = 500.0
    # Reward band: +1 when TARGET_LOW < level <= TARGET_HIGH.
    TARGET_LOW = 438
    TARGET_HIGH = 438.65
    # Divisor turning a per-second flow into a level change.
    # NOTE(review): presumably the reservoir surface area (23.7 km^2 in m^2) — confirm.
    LEVEL_DIVISOR = 23.70000076 * 1000000
    # Flow released per unit of action, per simulated second.
    OUTFLOW_PER_ACTION_UNIT = 150000
    # Number of one-second updates simulated by a single step() call.
    SECONDS_PER_STEP = 60
    # Half-width of the uniform noise added to the level after every step.
    NOISE_AMPLITUDE = 0.02

    def __init__(self, num_levels=16, max_steps=200):
        """Create the environment.

        Args:
            num_levels: Number of discrete gate settings (size of the action space).
            max_steps: Maximum number of step() calls per episode before ``done``
                is forced to True. ``None`` disables the cap.
        """
        self.num_levels = num_levels
        self.max_steps = max_steps
        self.total_inflow = random.uniform(0, 1000000)
        self.observation_space = Box(
            low=np.array([self.MIN_LEVEL], dtype=np.float32),
            high=np.array([self.MAX_LEVEL], dtype=np.float32),
            dtype=np.float32,
        )
        self.action_space = Discrete(num_levels)
        self.state = random.uniform(436, 438.65)
        self._steps_taken = 0

    def step(self, action):
        """Advance the simulation by one step under the given gate setting.

        Args:
            action: Gate setting, expected to be an integer in ``range(num_levels)``.

        Returns:
            ``(state, reward, done, info)`` where ``state`` is the noisy water
            level, ``reward`` is +1/-1, ``done`` is a bool and ``info`` is an
            empty dict.
        """
        # The flow is divided by the full LEVEL_DIVISOR product. The previous
        # `x / 23.7 * 1e6` multiplied by 1e6 because `/` and `*` associate
        # left to right, producing changes far beyond the 0-500 level range.
        net_flow = self.total_inflow - action * self.OUTFLOW_PER_ACTION_UNIT
        for _ in range(self.SECONDS_PER_STEP):
            self.state += net_flow / self.LEVEL_DIVISOR
            # Clip to the level bounds, not to the number of gate settings;
            # clipping to num_levels - 1 made the target band unreachable.
            self.state = np.clip(self.state, self.MIN_LEVEL, self.MAX_LEVEL)

        # Only the level at the end of the step determines the reward.
        if self.TARGET_LOW < self.state <= self.TARGET_HIGH:
            reward = 1
        else:
            reward = -1

        self._steps_taken += 1
        overflowed = bool(self.state > self.TARGET_HIGH)
        # Without a cap, an episode whose level keeps falling would never end.
        out_of_time = self.max_steps is not None and self._steps_taken >= self.max_steps
        done = overflowed or out_of_time

        # Add some random noise to the state
        self.state += random.uniform(-self.NOISE_AMPLITUDE, self.NOISE_AMPLITUDE)

        info = {}
        return self.state, reward, done, info

    def render(self):
        """Print the current water level."""
        print(f"Current Water Level: {self.state:}")

    def reset(self):
        """Start a new episode with a random level and a random inflow.

        Returns:
            The initial water level.
        """
        self.state = random.uniform(400, 500)
        self.total_inflow = random.uniform(-1000000, 1000000)
        self._steps_taken = 0
        return self.state


In [None]:
# Build the environment with its default settings (num_levels=16).
env = DamGateControlEnv()

In [None]:
# Sanity check: draw one random observation from the environment's observation space.
env.observation_space.sample()

In [None]:
# Smoke-test the environment by running a few episodes with a random policy.
episodes = 10
max_steps_per_episode = 200  # hard stop so a non-terminating episode cannot hang the cell
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0
    steps_taken = 0

    while not done and steps_taken < max_steps_per_episode:
        env.render()
        # Actions must be drawn from the action space; sampling the
        # observation space yields a float array, not a gate setting.
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        state = n_state
        score += reward
        steps_taken += 1
    print("Episode:{} Score:{}".format(episode, score))

# Q-learning algorithm

In [None]:
def _state_index(observation, low, n_states):
    """Map a scalar (or one-element array) observation to a valid Q-table row."""
    value = observation.item() if hasattr(observation, 'item') else observation
    # Clip because noise can push the observation just outside the space bounds.
    return int(np.clip(np.floor(value - low), 0, n_states - 1))


def q_learning(env, num_episodes=1000, alpha=0.1, gamma=0.99, epsilon=0.1,
               plot_state=(0, 0), max_steps=1000):
    """Train a tabular Q-learning agent on ``env`` and plot the training curves.

    The one-dimensional observation is discretised into unit-wide bins that
    span the observation space, giving one Q-table row per bin.

    Args:
        env: Environment with a 1-D ``Box`` observation space, a ``Discrete``
            action space, and the 4-tuple ``step`` / single-value ``reset`` API.
        num_episodes: Number of training episodes.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of taking a random (exploratory) action.
        plot_state: ``(state_index, action)`` pair whose Q-value is recorded
            after every episode and shown in the right-hand plot.
        max_steps: Upper bound on steps per episode, so training finishes even
            if the environment never reports ``done``. ``None`` disables it.

    Returns:
        ``(q_table, rewards_history)``: the learned table of shape
        ``(n_states, n_actions)`` and the total reward of each episode.
    """
    low = float(env.observation_space.low[0])
    high = float(env.observation_space.high[0])
    # One row per unit-wide bin. The observation space's shape[0] is the
    # number of observation dimensions (1 here), not the number of states.
    n_states = int(np.floor(high - low)) + 1
    q_table = np.zeros((n_states, env.action_space.n))

    plot_state = tuple(plot_state)
    rewards_history = []
    q_value_history = []  # Q-value of `plot_state` after each episode

    for episode in range(num_episodes):
        state = _state_index(env.reset(), low, n_states)
        done = False
        total_reward = 0
        steps_taken = 0

        while not done and (max_steps is None or steps_taken < max_steps):
            if np.random.rand() < epsilon:
                action = env.action_space.sample()  # Exploration
            else:
                action = np.argmax(q_table[state])  # Exploitation

            next_obs, reward, done, _ = env.step(action)
            next_state = _state_index(next_obs, low, n_states)

            # Q-value update
            td_target = reward + gamma * np.max(q_table[next_state])
            q_table[state, action] += alpha * (td_target - q_table[state, action])

            state = next_state
            total_reward += reward
            steps_taken += 1

        rewards_history.append(total_reward)
        q_value_history.append(q_table[plot_state])

    # Plot the rewards and Q-values
    fig, (ax_reward, ax_q) = plt.subplots(1, 2, figsize=(12, 6))

    ax_reward.plot(rewards_history)
    ax_reward.set_xlabel('Episode')
    ax_reward.set_ylabel('Total Reward')
    ax_reward.set_title('Training Progress')

    ax_q.plot(q_value_history)
    ax_q.set_xlabel('Episode')
    ax_q.set_ylabel('Q-value')
    ax_q.set_title(f'Q-values for State-Action Pair {plot_state}')

    fig.tight_layout()
    plt.show()

    return q_table, rewards_history

In [None]:
def visualize_q_values(q_table, state_action_pair):
    """Plot the Q-table entry selected by ``state_action_pair``.

    Args:
        q_table: Array indexed as ``q_table[state, action]``. With a 2-D table
            the selected entry is a single number; if the array carries a
            trailing axis, the selected entry is plotted as a series along it.
        state_action_pair: ``(state_index, action)`` pair to look up.
    """
    state_idx, action_idx = state_action_pair[0], state_action_pair[1]
    # atleast_1d plus a marker keeps a single value visible; a one-point line
    # without a marker draws nothing.
    values = np.atleast_1d(q_table[state_idx, action_idx])
    plt.plot(values, marker='o')
    plt.xlabel('Episode')
    plt.ylabel('Q-value')
    plt.title(f'Q-values for State-Action Pair {state_action_pair}')
    plt.show()

# Training the agent

In [None]:
# Re-create the environment so training starts from a freshly constructed instance.
env = DamGateControlEnv()

In [None]:
# Train with the default hyperparameters; q_learning also plots the training curves.
trained_q_table, rewards_history = q_learning(env)

In [None]:
# Plot the learned Q-value stored at state index 0, action 0.
visualize_q_values(trained_q_table, (0, 0))

# Testing the trained agent

In [None]:
# Begin an evaluation episode and convert the first observation to an integer state.
state = env.reset()
if hasattr(state, 'item'):
    # NumPy scalars and one-element arrays expose .item(); unwrap before truncating.
    state = state.item()
state = int(state)
done = False

In [None]:
# Follow the greedy policy from the trained Q-table until the episode ends.
max_test_steps = 200  # hard stop in case the greedy policy never reaches `done`
last_row = trained_q_table.shape[0] - 1
steps_taken = 0

while not done and steps_taken < max_test_steps:
    # Clamp the row index so a level outside the table's range cannot raise IndexError.
    row = min(max(state, 0), last_row)
    action = np.argmax(trained_q_table[row])
    next_state, reward, done, _ = env.step(action)
    next_state = int(next_state.item()) if hasattr(next_state, 'item') else int(next_state)

    print(f"Current State: {state}, Action: {action}, Next State: {next_state}, Reward: {reward}")

    state = next_state
    steps_taken += 1