In [35]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

In [55]:
class PairedSlotMachineEnv(gym.Env):
    """Paired slot-machine (two-armed bandit) environment.

    On every step the agent is shown a pair of machines and picks one of them.
    Each machine pays a fixed number of points with a fixed probability and
    pays 0 otherwise (see ``self.points`` / ``self.probabilities``).

    Two sets of pairs exist: ``learning_pairs`` (the default) and
    ``transfer_pairs``, which recombine the same eight machines. Use
    :meth:`set_transfer_phase` to switch between them.

    Observation: ``MultiDiscrete([8, 8])`` -- the indices (0-7, for machines
    'A'-'H') of the first and second machine of the current pair.
    Action: ``Discrete(2)`` -- 0 picks the first machine of the pair, 1 the
    second.

    ``step`` never reports termination or truncation, so an episode only ends
    when the caller stops stepping or wraps the env in a time limit.
    """

    # Label -> index mapping used to encode machines in observations.
    # Kept as a class constant so it is not rebuilt on every lookup.
    _MACHINE_INDICES = {'A': 0, 'B': 1, 'C': 2, 'D': 3,
                        'E': 4, 'F': 5, 'G': 6, 'H': 7}

    def __init__(self):
        super().__init__()

        # Define action space: pick option 1 or option 2 in the current pair
        self.action_space = spaces.Discrete(2)

        # Observation space: represent each machine as an integer (0 to 7)
        # We return the observation as a NumPy array
        self.observation_space = spaces.MultiDiscrete([8, 8])

        # Points paid by each machine when it pays out...
        self.points = {
            'A': 10, 'B': 10, 'C': 1, 'D': 1,
            'E': 10, 'F': 10, 'G': 1, 'H': 1
        }
        # ...and the probability that it pays out on a given pull.
        self.probabilities = {
            'A': 0.75, 'B': 0.25, 'C': 0.75, 'D': 0.25,
            'E': 0.75, 'F': 0.25, 'G': 0.75, 'H': 0.25
        }

        # Original pairs for Learning Phase and new pairs for Transfer Phase
        self.learning_pairs = [('A', 'B'), ('C', 'D'), ('E', 'F'), ('G', 'H')]
        self.transfer_pairs = [('A', 'C'), ('B', 'D'), ('E', 'H'), ('F', 'G')]
        self.transfer_phase = False  # Start in learning phase by default
        self.current_pair = None  # Set by reset(); None until then

    def set_transfer_phase(self, transfer: bool):
        """Toggle between learning and transfer phases.

        The new phase takes effect the next time a pair is drawn (i.e. on the
        next ``reset`` or ``step``); the pair currently on display is kept.
        """
        self.transfer_phase = transfer

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset``, which seeds
                ``self.np_random``. All sampling in this env goes through
                ``self.np_random``, so a seed makes the sequence of pairs and
                payouts reproducible.
            options: Accepted for Gymnasium API compatibility; unused.

        Returns:
            A tuple of the initial observation and an empty info dict.
        """
        super().reset(seed=seed)
        # Start with a random pair from the current phase
        self.current_pair = self._get_random_pair()
        return self._get_obs(), {}

    def _get_machine_index(self, machine):
        """Map a machine label ('A'-'H') to its observation index (0-7)."""
        return self._MACHINE_INDICES[machine]

    def _get_obs(self):
        """Encode the current pair as an array matching ``observation_space``."""
        first, second = self.current_pair
        return np.array(
            [self._get_machine_index(first), self._get_machine_index(second)],
            # Use the space's own dtype so the observation does not depend on
            # the platform's default integer width.
            dtype=self.observation_space.dtype,
        )

    def _get_random_pair(self):
        """Draw a pair for the active phase, in random left/right order."""
        # Select from learning pairs or transfer pairs based on the phase
        pairs = self.transfer_pairs if self.transfer_phase else self.learning_pairs
        # Sample through the env's own generator (seeded by reset) rather than
        # the global NumPy RNG, so reset(seed=...) actually controls the draw.
        pair = pairs[int(self.np_random.integers(len(pairs)))]
        # Randomly shuffle the order with 50% probability
        if self.np_random.random() < 0.5:
            pair = pair[::-1]
        return pair

    def step(self, action):
        """Pull the chosen machine and move on to a freshly drawn pair.

        Args:
            action: 0 or 1 (a Python int, NumPy integer, or size-1 array),
                selecting the first or second machine of the current pair.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``terminated`` and ``truncated`` are always ``False`` and ``info``
            is an empty dict.

        Raises:
            RuntimeError: If called before ``reset``.
            ValueError: If ``action`` is not 0 or 1.
        """
        if self.current_pair is None:
            raise RuntimeError("reset() must be called before step()")

        # Policies commonly hand back NumPy scalars/arrays; normalise to int.
        action = int(np.asarray(action).item())
        # Guard explicitly: a negative index would otherwise silently wrap
        # around and select a machine instead of failing.
        if action not in (0, 1):
            raise ValueError(f"Invalid action {action!r}; expected 0 or 1")

        # Map the action (0 or 1) to the selected machine in the current pair
        chosen_machine = self.current_pair[action]

        # Pay the machine's points with its payout probability, otherwise 0
        pays_out = self.np_random.random() < self.probabilities[chosen_machine]
        reward = self.points[chosen_machine] if pays_out else 0

        # Select a new random pair for the next step
        self.current_pair = self._get_random_pair()

        return self._get_obs(), reward, False, False, {}

    def render(self):
        """Print the pair currently on offer (or a hint if there is none yet)."""
        if self.current_pair is None:
            print("No current pair: call reset() first")
            return
        print(f"Current pair: {self.current_pair[0]} vs {self.current_pair[1]}")


In [56]:
# Seed for the training run. Passing it to PPO makes Stable-Baselines3 seed
# its RNGs (Python, NumPy, PyTorch) and the wrapped env, so the run can be
# reproduced after a kernel restart.
SEED = 0

# Initialize the environment and validate it against the Gymnasium API
env = PairedSlotMachineEnv()
check_env(env)

# Create the PPO model
# NOTE(review): the env never signals terminated/truncated, so with gamma=1
# returns are undiscounted over a non-ending episode. This is presumably why
# the logged loss is so large -- consider a smaller gamma or a time limit;
# TODO confirm against the intended experiment design.
model = PPO("MlpPolicy", env, verbose=1, gamma=1, seed=SEED)

# Train the model
model.learn(total_timesteps=20000)


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 436  |
|    iterations      | 1    |
|    time_elapsed    | 4    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 364         |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008721656 |
|    clip_fraction        | 0.0264      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.689      |
|    explained_variance   | 0.00144     |
|    learning_rate        | 0.0003      |
|    loss                 | 940         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0104     |
|    value_loss        

<stable_baselines3.ppo.ppo.PPO at 0x232fa0d7470>

In [57]:
# Greedy rollout of the trained policy on whichever phase the env is in
# (the learning phase, unless it has been switched beforehand).
obs, _ = env.reset()
for _ in range(10):
    # Show the pair on offer before the agent commits to a choice.
    env.render()
    prediction = model.predict(obs, deterministic=True)
    action, _states = prediction
    # Advance the env; only the next observation and reward are used below.
    step_result = env.step(action)
    obs, reward, done, _, info = step_result
    print(f"Action taken: {action}, Reward received: {reward}")


Current pair: D vs C
Action taken: 1, Reward received: 1
Current pair: F vs E
Action taken: 1, Reward received: 0
Current pair: B vs A
Action taken: 1, Reward received: 0
Current pair: A vs B
Action taken: 0, Reward received: 10
Current pair: G vs H
Action taken: 1, Reward received: 1
Current pair: B vs A
Action taken: 1, Reward received: 10
Current pair: H vs G
Action taken: 1, Reward received: 1
Current pair: C vs D
Action taken: 0, Reward received: 1
Current pair: A vs B
Action taken: 0, Reward received: 10
Current pair: D vs C
Action taken: 1, Reward received: 0


In [58]:
# Flip the environment over to the recombined transfer-phase pairs and
# draw a fresh starting pair from them.
env.set_transfer_phase(True)
obs, _ = env.reset()

# Greedy rollout of the already-trained policy on the transfer pairs
# (no further learning happens in this cell).
for _ in range(20):
    # Show the pair on offer before the agent commits to a choice.
    env.render()
    prediction = model.predict(obs, deterministic=True)
    action, _states = prediction
    # Advance the env; only the next observation and reward are used below.
    step_result = env.step(action)
    obs, reward, done, _, info = step_result
    print(f"Action taken: {action}, Reward received: {reward}")

Current pair: D vs B
Action taken: 0, Reward received: 0
Current pair: E vs H
Action taken: 0, Reward received: 10
Current pair: E vs H
Action taken: 0, Reward received: 0
Current pair: B vs D
Action taken: 1, Reward received: 1
Current pair: F vs G
Action taken: 1, Reward received: 0
Current pair: C vs A
Action taken: 1, Reward received: 10
Current pair: A vs C
Action taken: 0, Reward received: 10
Current pair: E vs H
Action taken: 0, Reward received: 0
Current pair: B vs D
Action taken: 1, Reward received: 0
Current pair: A vs C
Action taken: 0, Reward received: 10
Current pair: G vs F
Action taken: 0, Reward received: 1
Current pair: D vs B
Action taken: 0, Reward received: 0
Current pair: B vs D
Action taken: 1, Reward received: 1
Current pair: C vs A
Action taken: 1, Reward received: 10
Current pair: E vs H
Action taken: 0, Reward received: 0
Current pair: D vs B
Action taken: 0, Reward received: 0
Current pair: H vs E
Action taken: 1, Reward received: 10
Current pair: B vs D
Acti