ARPIT S. BURLEY                                                          
D16AD                                                                    
09                                    

In [None]:
import numpy as np
import random

# Toy problem dimensions: a deliberately tiny state space keeps the tables readable
num_states = 5   # states are indexed 0..4
num_actions = 2  # two actions per state (e.g., left or right)

# Double Q-Learning maintains two independent, zero-initialised estimators so
# that action selection and action evaluation can use different tables
# (this is what reduces maximization bias)
Q1 = np.zeros((num_states, num_actions))
Q2 = np.zeros_like(Q1)

# Hyperparameters
alpha = 0.1    # learning rate: step size applied to each temporal-difference error
gamma = 0.9    # discount factor: weight given to the estimated next-state value
epsilon = 0.1  # exploration probability used by the epsilon-greedy policy

def choose_action(state, q1=None, q2=None, explore_prob=None):
    """
    Pick an action for ``state`` with an epsilon-greedy policy over Q1 + Q2.

    - With probability ``explore_prob`` a uniformly random action is returned
      (exploration).
    - Otherwise the action with the highest combined estimate is returned
      (exploitation); ties are broken uniformly at random.

    Args:
        state: Row index into the Q-tables.
        q1: First action-value table; defaults to the module-level ``Q1``.
        q2: Second action-value table; defaults to the module-level ``Q2``.
        explore_prob: Exploration probability; defaults to the module-level
            ``epsilon``.

    Returns:
        The chosen action index as a plain Python ``int``.
    """
    q1 = Q1 if q1 is None else q1
    q2 = Q2 if q2 is None else q2
    if explore_prob is None:
        explore_prob = epsilon

    combined = np.asarray(q1[state]) + np.asarray(q2[state])

    if random.random() < explore_prob:
        return random.randrange(combined.size)  # Explore (random action)

    # np.argmax always returns the first maximum, which would make every
    # untrained (all-equal) row deterministically select action 0. Sampling
    # among all maximisers removes that index bias.
    best_actions = np.flatnonzero(combined == combined.max())
    return int(best_actions[random.randrange(best_actions.size)])  # Exploit

# Training: 1000 independent one-step interactions (each "episode" is a single transition)
for _ in range(1000):
    state = random.randrange(num_states)  # uniformly random start state
    action = choose_action(state)  # epsilon-greedy pick from the combined tables

    # Example dynamics: always step to the next state, wrapping around at the end.
    # NOTE: neither the transition nor the reward depends on the chosen action.
    next_state = (state + 1) % num_states
    reward = int(next_state == num_states - 1)  # 1 only when the last state is reached

    # Flip a fair coin to decide which table learns on this step; the other
    # table evaluates the learner's greedy next action (reduces overestimation bias)
    learner, evaluator = (Q1, Q2) if random.random() < 0.5 else (Q2, Q1)
    greedy_next = np.argmax(learner[next_state])
    td_target = reward + gamma * evaluator[next_state, greedy_next]
    learner[state, action] += alpha * (td_target - learner[state, action])

# Report each learned table, followed by the element-wise mean of the two estimators
for heading, table in (
    ("Q1 Table:\n", Q1),
    ("Q2 Table:\n", Q2),
    ("Final Q-Value Estimate (Averaged):\n", (Q1 + Q2) / 2),
):
    print(heading, table)

Q1 Table:
 [[0.99934521 0.12009005]
 [1.18318636 0.15225754]
 [1.44406667 0.18812757]
 [1.67918487 0.38666038]
 [0.85136662 0.14297449]]
Q2 Table:
 [[1.01088778 0.44127548]
 [1.2021054  0.39335903]
 [1.42142338 0.2310557 ]
 [1.69756098 0.19762396]
 [0.83322835 0.17433988]]
Final Q-Value Estimate (Averaged):
 [[1.0051165  0.28068276]
 [1.19264588 0.27280829]
 [1.43274502 0.20959164]
 [1.68837293 0.29214217]
 [0.84229748 0.15865719]]


In [None]:
import numpy as np
import random

# Restaurant rating environment dimensions
num_states = 5   # five customer satisfaction levels, indexed 0..4
num_actions = 3  # actions: (0) improve food, (1) improve service, (2) improve ambiance

# Two independent, zero-initialised estimators for Double Q-Learning
Q1 = np.zeros((num_states, num_actions))
Q2 = np.zeros_like(Q1)

# Hyperparameters
alpha = 0.1    # learning rate: step size applied to each temporal-difference error
gamma = 0.9    # discount factor: weight given to the estimated next-state value
epsilon = 0.1  # exploration probability used by the epsilon-greedy policy

def choose_action(state, q1=None, q2=None, explore_prob=None):
    """
    Pick an action for ``state`` with an epsilon-greedy policy over Q1 + Q2.

    - With probability ``explore_prob`` a uniformly random action is returned
      (exploration).
    - Otherwise the action with the highest combined estimate is returned
      (exploitation); ties are broken uniformly at random.

    Args:
        state: Row index into the Q-tables.
        q1: First action-value table; defaults to the module-level ``Q1``.
        q2: Second action-value table; defaults to the module-level ``Q2``.
        explore_prob: Exploration probability; defaults to the module-level
            ``epsilon``.

    Returns:
        The chosen action index as a plain Python ``int``.
    """
    q1 = Q1 if q1 is None else q1
    q2 = Q2 if q2 is None else q2
    if explore_prob is None:
        explore_prob = epsilon

    combined = np.asarray(q1[state]) + np.asarray(q2[state])

    if random.random() < explore_prob:
        return random.randrange(combined.size)  # Explore (random action)

    # np.argmax always returns the first maximum, which would make every
    # untrained (all-equal) row deterministically select action 0. Sampling
    # among all maximisers removes that index bias.
    best_actions = np.flatnonzero(combined == combined.max())
    return int(best_actions[random.randrange(best_actions.size)])  # Exploit

# Training: 1000 independent one-step interactions with the restaurant model
for _ in range(1000):
    state = random.randrange(num_states)  # uniformly random customer satisfaction level
    action = choose_action(state)  # epsilon-greedy pick: food, service, or ambiance

    # Satisfaction shift per action: food +1, service +2, any other action
    # (ambiance) -1. The result is clamped to the valid range [0, num_states - 1].
    shift = {0: 1, 1: 2}.get(action, -1)
    next_state = min(max(state + shift, 0), num_states - 1)

    reward = next_state  # the reward equals the satisfaction level that was reached

    # Flip a fair coin to decide which table learns on this step; the other
    # table evaluates the learner's greedy next action
    learner, evaluator = (Q1, Q2) if random.random() < 0.5 else (Q2, Q1)
    greedy_next = np.argmax(learner[next_state])
    td_target = reward + gamma * evaluator[next_state, greedy_next]
    learner[state, action] += alpha * (td_target - learner[state, action])

# Report both learned tables and their element-wise mean, then a legend for reading them
print("\n--- Final Learned Q-Values ---")
for heading, table in (
    ("\nQ1 Table (Action-Value Estimates from Q1):\n", Q1),
    ("\nQ2 Table (Action-Value Estimates from Q2):\n", Q2),
    ("\nFinal Averaged Q-Values (Best Policy Estimate):\n", (Q1 + Q2) / 2),
):
    print(heading, table)

for legend_line in (
    "\nLegend:",
    "- Row index represents customer satisfaction levels (0 = Very Dissatisfied, 4 = Very Satisfied)",
    "- Columns represent actions: 0 = Improve Food, 1 = Improve Service, 2 = Improve Ambiance",
    "- Higher values indicate better long-term action choices for increasing customer satisfaction",
):
    print(legend_line)


--- Final Learned Q-Values ---

Q1 Table (Action-Value Estimates from Q1):
 [[18.77712542  3.05158369  4.2272931 ]
 [21.75682471  7.63658646  2.39810902]
 [24.23429419  7.5455421   1.25692048]
 [24.62552034  4.11215675  2.00764103]
 [24.59466828  8.28032217  5.16736455]]

Q2 Table (Action-Value Estimates from Q2):
 [[19.415341    7.17767576  0.93932024]
 [21.61876277  6.138231    0.        ]
 [23.46245133  8.08255041  2.17283571]
 [24.84413542  3.14933171  5.27710152]
 [24.43733785  8.51632715  1.54229379]]

Final Averaged Q-Values (Best Policy Estimate):
 [[19.09623321  5.11462972  2.58330667]
 [21.68779374  6.88740873  1.19905451]
 [23.84837276  7.81404626  1.71487809]
 [24.73482788  3.63074423  3.64237127]
 [24.51600307  8.39832466  3.35482917]]

Legend:
- Row index represents customer satisfaction levels (0 = Very Dissatisfied, 4 = Very Satisfied)
- Columns represent actions: 0 = Improve Food, 1 = Improve Service, 2 = Improve Ambiance
- Higher values indicate better long-term acti