In [46]:
import numpy as np

In [47]:
# Define the problem parameters
MAX_OFFER = 10  # Maximum possible offer
MAINTENANCE_COST = 0.1  # Maintenance cost per day
# Probability of receiving each offer; index k is the probability of offer k.
OFFER_PROBABILITIES = [0.0, 0.1, 0.1, 0.1, 0.1, 0.05, 0.05, 0.05, 0.05, 0.3, 0.1]
NUM_OFFERS = len(OFFER_PROBABILITIES)  # Number of distinct offers

# Sanity checks: the diagonal of the transition matrix is a cumulative sum of
# these probabilities, so they must form a valid distribution.
assert all(p >= 0 for p in OFFER_PROBABILITIES), "offer probabilities must be non-negative"
assert abs(sum(OFFER_PROBABILITIES) - 1.0) < 1e-9, "offer probabilities must sum to 1"


In [48]:
def prob_transition_matrix(offer_probabilities: list[float], max_offer: int) -> list[list[float]]:
    """Build the (max_offer + 1) x (max_offer + 1) state transition matrix.

    Entry ``[i][j]`` is the probability of moving from state ``i`` to state ``j``:

    * ``j < i``  -> 0 (the state never decreases),
    * ``j == i`` -> sum of the probabilities of offers ``0..i`` (any offer that
      does not exceed ``i`` leaves the state unchanged),
    * ``j > i``  -> ``offer_probabilities[j]``, or 0 when ``j`` lies beyond the
      end of ``offer_probabilities``.

    This presumably models "state = best offer seen so far"; the function itself
    only encodes the three rules above.

    Args:
        offer_probabilities: probability of each offer, indexed by offer value.
        max_offer: largest state index; the matrix has ``max_offer + 1`` rows.

    Returns:
        The transition matrix as a list of rows.
    """
    # Use the length of the argument rather than a module-level constant so the
    # function is correct for whatever distribution the caller passes in.
    num_offers = len(offer_probabilities)
    size = max_offer + 1

    matrix = [[0.0] * size for _ in range(size)]
    for i in range(size):
        # Staying put: every offer not exceeding the current state.
        matrix[i][i] = sum(offer_probabilities[:min(i + 1, num_offers)])
        # Moving up: only offers that actually exist in the distribution.
        for j in range(i + 1, min(size, num_offers)):
            matrix[i][j] = offer_probabilities[j]
    return matrix

In [49]:
def value_iteration(gamma, theta=1e-6):
    """Run value iteration on the offer problem, minimising cost.

    States are the integers 0..MAX_OFFER. In each state there are two actions:
    continue (index 0), costing MAINTENANCE_COST plus the discounted expected
    value of the next state under the transition matrix, and stop (index 1),
    costing ``-state``. Values are updated in place, so states later in a sweep
    already see the values refreshed earlier in that sweep.

    Args:
        gamma: discount factor applied to the expected next-state value.
        theta: sweeps stop once the largest value change in a sweep is below this.

    Returns:
        Tuple ``(V, Q, policy, iter_num)``: state values, the per-state action
        costs, the cost-minimising action per state, and the number of sweeps.
    """
    CONTINUE, STOP = 0, 1
    n_states = MAX_OFFER + 1

    # Start from the "stop everywhere" solution: value -state, action STOP.
    V = np.array([-s for s in range(n_states)], dtype=float)
    Q = np.zeros((n_states, 2))
    policy = np.ones(n_states, dtype=int)

    P = prob_transition_matrix(OFFER_PROBABILITIES, MAX_OFFER)

    iter_num = 0
    converged = False
    while not converged:
        iter_num += 1
        delta = 0

        for state in range(n_states):
            previous = V[state]

            # Expected value of the state reached after waiting one more step.
            expected_next = sum(P[state][offer] * V[min(offer, MAX_OFFER)] for offer in range(NUM_OFFERS))

            Q[state, CONTINUE] = MAINTENANCE_COST + gamma * expected_next
            Q[state, STOP] = -state

            # Costs are minimised: keep the cheaper action and its value.
            V[state] = min(Q[state, CONTINUE], Q[state, STOP])
            policy[state] = np.argmin(Q[state])

            # Track the largest change seen in this sweep.
            delta = max(delta, abs(previous - V[state]))

        converged = delta < theta

    return V, Q, policy, iter_num


In [50]:

# Solve with a discount factor of 0.5 and show the results.
v_value, Q, v_policy, num_iter = value_iteration(0.5)
print("Value Function:", v_value)
print("Q Matrix:", Q)
print("Policy:", v_policy)
print("Iterations:", num_iter)

Value Function: [ -3.0588235   -3.0588235   -3.0588235   -3.05882351  -4.
  -5.          -6.          -7.          -8.          -9.
 -10.        ]
Q Matrix: [[ -3.0588235    0.        ]
 [ -3.0588235   -1.        ]
 [ -3.0588235   -2.        ]
 [ -3.05882351  -3.        ]
 [ -3.2         -4.        ]
 [ -3.4         -5.        ]
 [ -3.625       -6.        ]
 [ -3.875       -7.        ]
 [ -4.15        -8.        ]
 [ -4.45        -9.        ]
 [ -4.9        -10.        ]]
Policy: [0 0 0 0 1 1 1 1 1 1 1]
Iterations: 8


In [51]:
# Discount factors 0.1, 0.2, ..., 1.0 (ten values, step 0.1); rounding removes
# floating-point noise such as 0.30000000000000004.
list_of_gamma = [round(0.1 * i, 1) for i in range(1, 11)]

for gamma in list_of_gamma:
    v_value, Q, v_policy, num_iter = value_iteration(gamma)
    print(f"Value Function for gamma= {gamma}: {v_value}", end=" ")
    print(f"Policy for gamma= {gamma}:{v_policy}", end=" ")
    print(f"Iterations to converge: {num_iter}")

Value Function for gamma= 0.1: [ -0.5  -1.   -2.   -3.   -4.   -5.   -6.   -7.   -8.   -9.  -10. ] Policy for gamma= 0.1:[0 1 1 1 1 1 1 1 1 1 1] Iterations to converge: 2
Value Function for gamma= 0.2: [ -1.1020408  -1.1020408  -2.         -3.         -4.         -5.
  -6.         -7.         -8.         -9.        -10.       ] Policy for gamma= 0.2:[0 0 1 1 1 1 1 1 1 1 1] Iterations to converge: 4
Value Function for gamma= 0.3: [ -1.72164947  -1.72164947  -2.          -3.          -4.
  -5.          -6.          -7.          -8.          -9.
 -10.        ] Policy for gamma= 0.3:[0 0 1 1 1 1 1 1 1 1 1] Iterations to converge: 5
Value Function for gamma= 0.4: [ -2.36956521  -2.36956521  -2.36956521  -3.          -4.
  -5.          -6.          -7.          -8.          -9.
 -10.        ] Policy for gamma= 0.4:[0 0 0 1 1 1 1 1 1 1 1] Iterations to converge: 7
Value Function for gamma= 0.5: [ -3.0588235   -3.0588235   -3.0588235   -3.05882351  -4.
  -5.          -6.          -7.          

In [52]:
import random

def policy_iteration(gamma, theta=1e-6):
    """Run policy iteration on the offer problem, minimising cost.

    States are the integers 0..MAX_OFFER with two actions per state:
    0 = continue, 1 = stop. Stopping in a state costs ``-state``; continuing
    costs MAINTENANCE_COST plus the discounted expectation of V over
    OFFER_PROBABILITIES. The algorithm alternates iterative policy evaluation
    (in-place sweeps until the largest change is below ``theta``) with a greedy
    policy improvement step, and ends when improvement leaves the policy
    unchanged.

    NOTE(review): the continue cost here weights V by the raw
    OFFER_PROBABILITIES and is therefore the same for every state, whereas
    value_iteration weights V by the state-dependent transition matrix from
    prob_transition_matrix. Confirm that this difference is intended.

    Args:
        gamma: discount factor applied to the expected next value.
        theta: convergence threshold for the policy evaluation sweeps.

    Returns:
        Tuple ``(V, policy, iter_num)``: state values, the action per state, and
        the number of evaluation/improvement rounds performed.
    """
    n_states = MAX_OFFER + 1

    # Start from the "stop everywhere" policy and its exact values.
    V = np.array([-s for s in range(n_states)], dtype=float)
    policy = np.ones(n_states, dtype=int)

    def _continue_value():
        # Cost of waiting one more step, given the current contents of V.
        return gamma*sum(OFFER_PROBABILITIES[offer] * V[min(offer, MAX_OFFER)] for offer in range(NUM_OFFERS)) + MAINTENANCE_COST

    iter_num = 0
    while True:
        iter_num += 1

        # Policy evaluation: V is updated in place, so the continue cost must be
        # recomputed for each state (earlier updates in the sweep feed into it).
        while True:
            delta = 0
            for state in range(n_states):
                v = V[state]

                if policy[state] == 1:  # Stop
                    V[state] = -state
                else:  # Continue
                    V[state] = _continue_value()

                delta = max(delta, abs(v - V[state]))

            if delta < theta:
                break

        # Policy improvement: V is fixed here and the continue cost does not
        # depend on the state, so compute it once instead of once per state.
        continue_value = _continue_value()
        policy_stable = True
        for state in range(n_states):
            old_action = policy[state]

            # Continue only when it is strictly cheaper than stopping (-state).
            policy[state] = 0 if continue_value < -state else 1

            if old_action != policy[state]:
                policy_stable = False

        if policy_stable:
            break

    return V, policy, iter_num


In [53]:
# Solve with a discount factor of 0.5 and report each result on its own line.
print("\nPolicy Iteration:")
p_value, p_policy, num_iter = policy_iteration(0.5)
for label, result in (
    ("Value Function:", p_value),
    ("Policy:", p_policy),
    ("Iterations:", num_iter),
):
    print(label, result)


Policy Iteration:
Value Function: [ -3.05882349  -3.05882349  -3.05882351  -3.05882352  -4.
  -5.          -6.          -7.          -8.          -9.
 -10.        ]
Policy: [0 0 0 0 1 1 1 1 1 1 1]
Iterations: 3


In [None]:
# NOTE(review): this cell re-declares the problem parameters with the same
# values as the parameter cell near the top of the notebook, and its prompt
# (In [None]) shows it was never executed. It looks redundant - consider
# removing it so the parameters have a single source of truth.
MAX_OFFER = 10
MAINTENANCE_COST = 0.1
OFFER_PROBABILITIES = [0.0, 0.1, 0.1, 0.1, 0.1, 0.05, 0.05, 0.05, 0.05, 0.3, 0.1]
NUM_OFFERS = len(OFFER_PROBABILITIES)

In [54]:
# Discount factors 0.1, 0.2, ..., 1.0 (ten values, step 0.1); rounding removes
# floating-point noise such as 0.30000000000000004.
list_of_gamma = [round(0.1 * i, 1) for i in range(1, 11)]

for gamma in list_of_gamma:
    p_value, p_policy, num_iter = policy_iteration(gamma)
    print(f"Policy for gamma= {gamma}:{p_policy}", end=" ")
    print(f"Iterations to converge: {num_iter}")

Policy for gamma= 0.1:[0 1 1 1 1 1 1 1 1 1 1] Iterations to converge: 2
Policy for gamma= 0.2:[0 0 1 1 1 1 1 1 1 1 1] Iterations to converge: 2
Policy for gamma= 0.3:[0 0 1 1 1 1 1 1 1 1 1] Iterations to converge: 2
Policy for gamma= 0.4:[0 0 0 1 1 1 1 1 1 1 1] Iterations to converge: 2
Policy for gamma= 0.5:[0 0 0 0 1 1 1 1 1 1 1] Iterations to converge: 3
Policy for gamma= 0.6:[0 0 0 0 1 1 1 1 1 1 1] Iterations to converge: 2
Policy for gamma= 0.7:[0 0 0 0 0 1 1 1 1 1 1] Iterations to converge: 2
Policy for gamma= 0.8:[0 0 0 0 0 0 1 1 1 1 1] Iterations to converge: 3
Policy for gamma= 0.9:[0 0 0 0 0 0 0 0 1 1 1] Iterations to converge: 3
Policy for gamma= 1.0:[0 0 0 0 0 0 0 0 0 1 1] Iterations to converge: 3
