<a href="https://colab.research.google.com/github/COSMIC5545/outfit-recommender/blob/main/OpenAIGym.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This script is designed to be run in a Google Colab notebook.
# Please run these installation commands in a Colab cell first.

# !pip install gymnasium
# !pip install "gymnasium[classic_control,box2d]"
# !pip install pygame

import gymnasium as gym
import numpy as np
import random
import time

# Reaching this line means none of the imports above raised an ImportError.
print("Gymnasium, NumPy, and Random imported successfully.")

# ---
# 2. Build the environment with Gymnasium (the successor to OpenAI Gym)
# ---
# "FrozenLake-v1" is a small grid world; its default map is 4x4 with tiles
#   'S' = Start, 'F' = Frozen (safe), 'H' = Hole, 'G' = Goal
# render_mode="ansi" makes env.render() hand back a text drawing of the grid
# that can simply be printed.
#
# is_slippery=False switches the random slipping off, so the environment is
# deterministic: asking for 'Right' always moves Right. That is far easier
# for a basic tabular Q-agent to learn than the default (is_slippery=True),
# where the same action may slide the agent sideways instead.
env = gym.make(
    "FrozenLake-v1",
    render_mode="ansi",
    is_slippery=False,
)
print("Environment 'FrozenLake-v1' created.")

# ---
# 3. States, actions, and rewards
# ---
# Nothing to define by hand: both spaces are discrete, so their sizes are
# read straight off the environment (rewards come back from env.step()).
num_actions = env.action_space.n
num_states = env.observation_space.n

print(f"Number of States: {num_states}")   # 16 (one per cell of the 4x4 grid)
print(f"Number of Actions: {num_actions}")  # 4 (Left, Down, Right, Up)

# ---
# 4. Set up the tabular Q-learning agent
# ---

# The agent's whole "brain" is one table with a row per state and a column
# per action; every estimate starts at zero.
q_table = np.zeros(shape=(num_states, num_actions))

print("Q-Table initialized with shape:", q_table.shape)

# ---
# 5. Train the agent using the environment
# ---

print("Starting agent training...")

# Hyperparameters
total_episodes = 10000        # Total episodes to train
learning_rate = 0.1           # Alpha: step size of each Q-value update
gamma = 0.99                  # Discount factor: importance of future rewards
epsilon = 1.0                 # Exploration rate: start by exploring 100% of the time
max_epsilon = 1.0             # Upper end of the exploration schedule
min_epsilon = 0.01            # Floor of the exploration schedule
epsilon_decay_rate = 0.0005   # Exponential decay constant, applied per episode

# Total reward collected in each training episode (for monitoring)
training_rewards = []

for episode in range(total_episodes):
    # .reset() returns a tuple (initial_state, info); only the state is needed
    state = env.reset()[0]

    # Both flags are overwritten by every env.step() call below.
    terminated = False  # True when the agent falls in a hole or reaches the goal
    truncated = False   # True when the episode is cut short by the env's step
                        # limit (FrozenLake-v1 is registered with one, so this
                        # CAN happen, e.g. while the agent wanders early on)

    current_reward = 0

    while not terminated and not truncated:
        # --- Epsilon-greedy strategy ---
        # With probability epsilon explore (random action), otherwise exploit
        # the action with the highest current Q-value for this state.
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state, :])

        # --- Take the action ---
        # .step() returns (new_state, reward, terminated, truncated, info)
        new_state, reward, terminated, truncated, info = env.step(action)

        # --- Q-learning update rule ---
        # Q(s,a) <- Q(s,a) + lr * [target - Q(s,a)]
        #   target = R                          if s' is terminal
        #   target = R + gamma * max_a' Q(s',a') otherwise
        # A terminal state has no future, so nothing is bootstrapped from it.
        # A merely *truncated* episode is not terminal: the state still has
        # value, so the bootstrap term is kept in that case.
        if terminated:
            td_target = reward
        else:
            td_target = reward + gamma * np.max(q_table[new_state, :])
        q_table[state, action] += learning_rate * (td_target - q_table[state, action])

        # Move on to the next state and keep the episode's running total
        state = new_state
        current_reward += reward

    # --- End of episode ---

    # Decay epsilon exponentially from max_epsilon towards min_epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-epsilon_decay_rate * episode)

    training_rewards.append(current_reward)

    if (episode + 1) % 1000 == 0:
        print(f"Training... Episode: {episode + 1}/{total_episodes}")

print("Training finished.")

# ---
# 6. Evaluate the trained agent
# ---

print("\nTesting the trained agent...")
test_episodes = 100
total_wins = 0

for episode in range(test_episodes):
    state, info = env.reset()
    terminated = truncated = False

    # Evaluation is purely greedy: always take the best-known action,
    # never a random exploratory one.
    while not (terminated or truncated):
        action = np.argmax(q_table[state])
        state, reward, terminated, truncated, info = env.step(action)

    # The loop body runs at least once, so `reward` is the reward of the
    # episode's final step. A win is a terminal step that paid out 1.0.
    if terminated and reward == 1.0:
        total_wins += 1

# ---
# Result
# ---
print("\n--- Test Results ---")
success_rate = (total_wins / test_episodes) * 100
print(f"Success Rate: {success_rate:.2f}%")
print(f"Total wins over {test_episodes} episodes: {total_wins}")


print("\n--- Final Q-Table ---")
# Three decimals are enough to compare actions and keep the table scannable
print(q_table.round(3))

# ---
# Bonus: Watch the agent play one episode
# ---
print("\n--- Watching the agent play (text) ---")

# Human-readable labels, indexed by FrozenLake's action number
action_names = ['Left', 'Down', 'Right', 'Up']

state = env.reset()[0]
terminated = False
truncated = False
print(env.render())  # Show initial state
time.sleep(1)

# Follow the greedy policy one step at a time, pausing so each frame is readable
while not terminated and not truncated:
    action = np.argmax(q_table[state, :])
    new_state, reward, terminated, truncated, info = env.step(action)
    state = new_state

    print(f"\nAction: {action_names[action]}")
    print(env.render())
    time.sleep(0.5)

# Report how the episode really ended. `reward` alone cannot tell a hole from
# a timeout (both give 0), so the terminated/truncated flags are consulted too.
# The loop above always runs at least once, so `reward` is from the last step.
if terminated and reward == 1.0:
    print("Agent reached the goal! 🏆")
elif terminated:
    print("Agent fell in a hole. 🕳️")
else:
    print("Episode was cut off by the step limit before reaching the goal. ⏱️")

env.close()

Gymnasium, NumPy, and Random imported successfully.
Environment 'FrozenLake-v1' created.
Number of States: 16
Number of Actions: 4
Q-Table initialized with shape: (16, 4)
Starting agent training...
Training... Episode: 1000/10000
Training... Episode: 2000/10000
Training... Episode: 3000/10000
Training... Episode: 4000/10000
Training... Episode: 5000/10000
Training... Episode: 6000/10000
Training... Episode: 7000/10000
Training... Episode: 8000/10000
Training... Episode: 9000/10000
Training... Episode: 10000/10000
Training finished.

Testing the trained agent...

--- Test Results ---
Success Rate: 100.00%
Total wins over 100 episodes: 100

--- Final Q-Table ---
[[0.941 0.951 0.951 0.941]
 [0.941 0.    0.961 0.948]
 [0.942 0.97  0.845 0.936]
 [0.937 0.    0.098 0.043]
 [0.951 0.961 0.    0.941]
 [0.    0.    0.    0.   ]
 [0.    0.98  0.    0.956]
 [0.    0.    0.    0.   ]
 [0.961 0.    0.97  0.951]
 [0.961 0.98  0.98  0.   ]
 [0.97  0.99  0.    0.97 ]
 [0.    0.    0.    0.   ]
 [0.   