In [1]:
import numpy as np
import pandas as pd
import random

# Step 1: Create or load a dataset (simplified for this example)
data = {
    "State": ["S1", "S2", "S3", "S4", "S5"],
    "Action1_Reward": [10, -10, 5, 15, 20],
    "Action2_Reward": [5, 20, -5, 10, 25],
}
df = pd.DataFrame(data)

# Step 2: Define environment and Q-learning parameters
states = df["State"].tolist()  # States from the dataset
actions = ["Action1", "Action2"]  # Possible actions
q_table = np.zeros((len(states), len(actions)))  # Initialize Q-table

# Hyperparameters
learning_rate = 0.1
discount_factor = 0.9
epsilon = 0.8  # Exploration rate
episodes = 500

# Step 3: Define the RL loop
for episode in range(episodes):
    state = random.choice(states)  # Start from a random state
    state_index = states.index(state)

    for step in range(10):  # Simulate agent steps
        # Choose action: epsilon-greedy
        if random.uniform(0, 1) < epsilon:
            action_index = random.randint(0, len(actions) - 1)  # Explore
        else:
            action_index = np.argmax(q_table[state_index])  # Exploit

        action = actions[action_index]

        # Get reward for chosen action from dataset
        reward = df.iloc[state_index][f"{action}_Reward"]

        # Simulate the next state (stay in the same state here)
        next_state_index = state_index  # (Static environment in this example)

        # Q-Learning update rule
        q_table[state_index, action_index] = q_table[state_index, action_index] + learning_rate * (
            reward + discount_factor * np.max(q_table[next_state_index]) - q_table[state_index, action_index]
        )

# Step 4: Evaluate the Q-table
print("Trained Q-Table:")
print(pd.DataFrame(q_table, index=states, columns=actions))

# Step 5: Use the Q-table for decision-making
def choose_best_action(state):
    state_index = states.index(state)
    action_index = np.argmax(q_table[state_index])
    return actions[action_index]

# Test the agent
for state in states:
    print(f"In state {state}, the agent chooses: {choose_best_action(state)}")


Trained Q-Table:
       Action1     Action2
S1   99.845591   94.841207
S2  169.211223  199.258974
S3   49.805196   39.801806
S4  149.607341  144.562592
S5  244.419032  249.441049
In state S1, the agent chooses: Action1
In state S2, the agent chooses: Action2
In state S3, the agent chooses: Action1
In state S4, the agent chooses: Action1
In state S5, the agent chooses: Action2


In [2]:
pip install gym


Collecting gym
  Downloading gym-0.26.2.tar.gz (721 kB)
     ---------------------------------------- 0.0/721.7 kB ? eta -:--:--
      --------------------------------------- 10.2/721.7 kB ? eta -:--:--
      --------------------------------------- 10.2/721.7 kB ? eta -:--:--
     - ----------------------------------- 20.5/721.7 kB 131.3 kB/s eta 0:00:06
     -- ---------------------------------- 41.0/721.7 kB 217.9 kB/s eta 0:00:04
     --- --------------------------------- 61.4/721.7 kB 328.2 kB/s eta 0:00:03
     -------- --------------------------- 163.8/721.7 kB 756.6 kB/s eta 0:00:01
     ---------------------------- --------- 532.5/721.7 kB 2.0 MB/s eta 0:00:01
     -------------------------------------- 721.7/721.7 kB 2.3 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pypro


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import gym
import numpy as np

# Initialize the Taxi environment
env = gym.make("Taxi-v3")

# Initialize the Q-table with zeros. It will have one row for each state and one column for each action.
q_table = np.zeros([env.observation_space.n, env.action_space.n])

# RL Training loop
for episode in range(5000):
    state, _ = env.reset()  # Correctly unpack the state from env.reset()
    done = False

    while not done:
        # Select an action using epsilon-greedy policy
        action = np.argmax(q_table[state]) if np.random.uniform(0, 1) > 0.1 else env.action_space.sample()

        # Take the action and observe the next state and reward
        next_state, reward, done, _, _ = env.step(action)  # Unpack the returned tuple correctly

        # Update the Q-table using the Q-learning formula
        q_table[state, action] += 0.1 * (reward + 0.9 * np.max(q_table[next_state]) - q_table[state, action])

        # Move to the next state
        state = next_state

# After training, you can test the policy
state = env.reset()[0]
done = False
while not done:
    action = np.argmax(q_table[state])  # Select the best action based on the Q-table
    state, reward, done, _, _ = env.step(action)  # Take the action in the environment
    env.render()  # Render the environment to see the results


AttributeError: module 'numpy' has no attribute 'bool8'

In [5]:
pip install --upgrade numpy


Collecting numpy
  Downloading numpy-2.1.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     ------------ ------------------------- 20.5/60.8 kB 217.9 kB/s eta 0:00:01
     ------------------- ------------------ 30.7/60.8 kB 330.3 kB/s eta 0:00:01
     ------------------------------- ------ 51.2/60.8 kB 292.6 kB/s eta 0:00:01
     -------------------------------------- 60.8/60.8 kB 294.2 kB/s eta 0:00:00
Downloading numpy-2.1.3-cp312-cp312-win_amd64.whl (12.6 MB)
   ---------------------------------------- 0.0/12.6 MB ? eta -:--:--
   ---------------------------------------- 0.1/12.6 MB 3.4 MB/s eta 0:00:04
   ---------------------------------------- 0.2/12.6 MB 3.1 MB/s eta 0:00:05
   - -------------------------------------- 0.4/12.6 MB 3.4 MB/s eta 0:00:04
   --- ------------------------------------ 1.0/12.6 MB 6.5 MB/s eta 0:00:02
   ----- --

  You can safely remove it manually.
  You can safely remove it manually.

[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
