In [2]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

In [11]:
class PairedSlotMachineEnv(gym.Env):
    """Paired slot-machine choice task with a learning and a transfer phase.

    On every step the agent is shown two machines (out of eight, labelled
    'A'..'H') and picks one of them. Each machine pays a fixed number of
    points with a fixed probability. The outcome of both machines is sampled
    *before* the agent acts, and each machine displays a binary indicator that
    agrees with its sampled outcome 95% of the time.

    Observation: ``[index_0, indicator_0, index_1, indicator_1]`` where the
    indices are in ``0..7`` and the indicators are ``0`` or ``1``.
    Action: ``0`` picks the first machine of the pair, ``1`` the second.
    Reward: the pre-sampled payout of the chosen machine (its points or 0).

    Episodes never end on their own: ``step`` always reports
    ``terminated=False`` and ``truncated=False``.

    All randomness is drawn from ``self.np_random`` so that
    ``reset(seed=...)`` makes the environment reproducible.
    """

    # Fixed label -> index mapping used to encode machines in observations.
    _MACHINE_INDICES = {'A': 0, 'B': 1, 'C': 2, 'D': 3,
                        'E': 4, 'F': 5, 'G': 6, 'H': 7}

    # Probability that an indicator agrees with the pre-sampled outcome.
    _INDICATOR_ACCURACY = 0.95

    def __init__(self):
        super().__init__()

        # Define action space: pick option 1 or option 2 in the current pair
        self.action_space = spaces.Discrete(2)

        # Observation space: represent each machine with its index and indicator status (0 or 1)
        self.observation_space = spaces.MultiDiscrete([8, 2, 8, 2])

        # Define slot machine points and probabilities
        self.points = {
            'A': 10, 'B': 10, 'C': 1, 'D': 1,
            'E': 10, 'F': 10, 'G': 1, 'H': 1
        }
        self.probabilities = {
            'A': 0.75, 'B': 0.25, 'C': 0.75, 'D': 0.25,
            'E': 0.75, 'F': 0.25, 'G': 0.75, 'H': 0.25
        }

        # Original pairs for Learning Phase and new pairs for Transfer Phase
        self.learning_pairs = [('A', 'B'), ('C', 'D'), ('E', 'F'), ('G', 'H')]
        self.transfer_pairs = [('A', 'C'), ('B', 'D'), ('E', 'H'), ('F', 'G')]
        self.transfer_phase = False  # Start in learning phase by default

        # Per-step state, populated by reset() / step()
        self.current_pair = None
        self.rewards = None
        self.indicators = None

    def set_transfer_phase(self, transfer: bool):
        """Toggle between learning and transfer phases.

        The new setting takes effect the next time a pair is drawn, i.e. on
        the next ``reset`` or ``step`` call.
        """
        self.transfer_phase = transfer

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        Args:
            seed: Optional seed for ``self.np_random``.
            options: Accepted for Gymnasium API compatibility (wrappers pass
                it through); currently unused.
        """
        super().reset(seed=seed)
        # Start with a random pair from the current phase
        self.current_pair = self._get_random_pair()
        # Set up the initial observation with pre-sampled rewards and indicators
        return self._get_observation(), {}

    def _get_machine_index(self, machine):
        """Map a machine label ('A'..'H') to its observation index (0..7)."""
        return self._MACHINE_INDICES[machine]

    def _get_random_pair(self):
        """Draw a pair for the current phase, in random left/right order."""
        # Select from learning pairs or transfer pairs based on the phase
        pairs = self.transfer_pairs if self.transfer_phase else self.learning_pairs
        pair = pairs[self.np_random.integers(len(pairs))]
        # Randomly flip the order with 50% probability
        if self.np_random.random() < 0.5:
            pair = pair[::-1]
        return pair

    def _get_indicator_status(self, machine, reward_given):
        """Return the indicator (0 or 1) shown for a machine.

        The indicator matches ``reward_given`` with probability 0.95 and is
        inverted otherwise. ``machine`` is accepted but not used.
        """
        truthful = self.np_random.random() < self._INDICATOR_ACCURACY
        shown = bool(reward_given) if truthful else not reward_given
        return 1 if shown else 0

    def _get_observation(self):
        """Pre-sample outcomes for the current pair and build the observation.

        Side effect: overwrites ``self.rewards`` and ``self.indicators`` with
        the freshly sampled values for both machines of ``self.current_pair``.
        """
        # Generate machine indices for the current pair
        machine_indices = [self._get_machine_index(m) for m in self.current_pair]

        # Sample rewards and indicator statuses once, store them in instance variables
        self.rewards = []
        self.indicators = []

        for machine in self.current_pair:
            reward_given = self.np_random.random() < self.probabilities[machine]
            self.rewards.append(self.points[machine] if reward_given else 0)
            self.indicators.append(self._get_indicator_status(machine, reward_given))

        # Return observation as [machine index, indicator] pairs, using the
        # dtype declared by the observation space.
        return np.array(
            [machine_indices[0], self.indicators[0],
             machine_indices[1], self.indicators[1]],
            dtype=self.observation_space.dtype,
        )

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, terminated, truncated, info)``.

        Raises:
            RuntimeError: if called before ``reset``.
            ValueError: if ``action`` is not contained in ``action_space``.
        """
        if self.rewards is None:
            raise RuntimeError("reset() must be called before step()")
        if not self.action_space.contains(action):
            # Guards against e.g. -1 silently indexing from the end of the list.
            raise ValueError(f"Invalid action {action!r}; expected 0 or 1")

        # Use the pre-sampled reward for the selected action
        reward = self.rewards[action]

        # Select a new random pair for the next step
        self.current_pair = self._get_random_pair()

        # Get the new observation (with updated pre-sampled rewards and indicators)
        observation = self._get_observation()
        return observation, reward, False, False, {}

    def render(self):
        """Print the current pair together with the indicator of each machine."""
        print(f"Current pair: {self.current_pair[0]} (with indicator {self.indicators[0]}) "
              f"vs {self.current_pair[1]} (with indicator {self.indicators[1]})")


In [12]:
# Fixed seed so the training run can be repeated after Restart Kernel -> Run All.
SEED = 0

# Initialize the environment and validate it against the Gymnasium API
env = PairedSlotMachineEnv()
check_env(env)

# Create the PPO model
# NOTE(review): the environment never reports terminated/truncated, so with
# gamma=1 the return being estimated is an undiscounted sum over an unbounded
# horizon; the large `loss` in the training log may be related -- confirm
# whether gamma < 1 or a time limit is intended.
model = PPO("MlpPolicy", env, verbose=1, gamma=1, seed=SEED)

# Train the model (trailing ';' suppresses the object repr in the cell output)
model.learn(total_timesteps=100000);


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 371  |
|    iterations      | 1    |
|    time_elapsed    | 5    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 313         |
|    iterations           | 2           |
|    time_elapsed         | 13          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010977835 |
|    clip_fraction        | 0.126       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.00278     |
|    learning_rate        | 0.0003      |
|    loss                 | 987         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0182     |
|    value_loss        

<stable_baselines3.ppo.ppo.PPO at 0x22ea80cb350>

In [13]:
# Evaluate the trained policy on the learning-phase pairs.
# Set the phase explicitly so this cell gives the same kind of result even if
# it is re-run after the transfer-phase cell has toggled the environment.
env.set_transfer_phase(False)
obs, info = env.reset()

# Named loop variable and full unpacking: avoids rebinding `_`, which IPython
# uses for the last cell output and which the loop itself was also using.
for step_idx in range(50):
    env.render()
    # deterministic=True: take the policy's most likely action, no sampling
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    print(f"Action taken: {action}, Reward received: {reward}")
    if done:
        # Start a fresh episode if the environment (or a wrapper) ends one.
        obs, info = env.reset()


Current pair: E (with indicator 1) vs F (with indicator 0)
Action taken: 0, Reward received: 10
Current pair: C (with indicator 1) vs D (with indicator 1)
Action taken: 1, Reward received: 1
Current pair: H (with indicator 0) vs G (with indicator 1)
Action taken: 1, Reward received: 1
Current pair: F (with indicator 1) vs E (with indicator 1)
Action taken: 1, Reward received: 10
Current pair: A (with indicator 1) vs B (with indicator 0)
Action taken: 0, Reward received: 10
Current pair: E (with indicator 1) vs F (with indicator 0)
Action taken: 0, Reward received: 10
Current pair: F (with indicator 0) vs E (with indicator 0)
Action taken: 1, Reward received: 0
Current pair: H (with indicator 0) vs G (with indicator 0)
Action taken: 0, Reward received: 0
Current pair: E (with indicator 1) vs F (with indicator 0)
Action taken: 0, Reward received: 10
Current pair: E (with indicator 0) vs F (with indicator 0)
Action taken: 0, Reward received: 0
Current pair: G (with indicator 1) vs H (with

In [14]:
# Switch to Transfer Phase: subsequent pairs are drawn from the transfer pairs
env.set_transfer_phase(True)
obs, info = env.reset()

# Test agent in Transfer Phase
# Named loop variable and full unpacking: avoids rebinding `_`, which IPython
# uses for the last cell output and which the loop itself was also using.
for step_idx in range(20):
    env.render()
    # deterministic=True: take the policy's most likely action, no sampling
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    print(f"Action taken: {action}, Reward received: {reward}")
    if done:
        # Start a fresh episode if the environment (or a wrapper) ends one.
        obs, info = env.reset()

Current pair: G (with indicator 1) vs F (with indicator 0)
Action taken: 0, Reward received: 1
Current pair: B (with indicator 0) vs D (with indicator 1)
Action taken: 1, Reward received: 1
Current pair: B (with indicator 0) vs D (with indicator 0)
Action taken: 1, Reward received: 0
Current pair: B (with indicator 0) vs D (with indicator 1)
Action taken: 1, Reward received: 1
Current pair: G (with indicator 1) vs F (with indicator 0)
Action taken: 0, Reward received: 1
Current pair: B (with indicator 0) vs D (with indicator 1)
Action taken: 1, Reward received: 1
Current pair: F (with indicator 0) vs G (with indicator 1)
Action taken: 1, Reward received: 1
Current pair: G (with indicator 1) vs F (with indicator 0)
Action taken: 0, Reward received: 1
Current pair: B (with indicator 1) vs D (with indicator 0)
Action taken: 0, Reward received: 0
Current pair: H (with indicator 0) vs E (with indicator 1)
Action taken: 1, Reward received: 10
Current pair: D (with indicator 0) vs B (with ind