In [2]:

import numpy as np

import matplotlib.pyplot as plt

class NODE:
    def __init__(self, reward, correctProbability):
        self.REWARD = reward
        self.PROBABILITY = correctProbability
        self.next = None

class AGENT:
    def __init__(self):
        self.N = 10
        self.THETA = 0.0001
        self.DISCOUNT_FACTOR = 0.9
        self.ACTIONS = ["CONTINUE", "QUIT"]
        
        REWARDS = [100, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000, 5000000]
        CORRECT_ANSWER_PROBABILITY = [0.99, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]
        
        START_NODE = NODE(-1,0)
        TEMP = START_NODE
        
        for i in range(self.N):  
            TEMP.next = NODE(REWARDS[i],CORRECT_ANSWER_PROBABILITY[i])
            TEMP = TEMP.next
        
        self.START_STATE = START_NODE.next        


class MDP_SOLUTION:
    def __init__(self):
        self.agent = AGENT()
        self.VALUE_FUNCTION = {s: 0 for s in range(self.agent.N)}
        self.ITERATIONS = 0
        self.TIMES_ENTERED = {s: 0 for s in range(self.agent.N)}
        self.TERMINATOR = False
        self.PLOT_STATES = [x for x in range(self.agent.N)]
    
    def helper(self, state, iteration):
        if(state == None): return 0
        self.TIMES_ENTERED[iteration] += 1
    
        OLD_VALUE = self.VALUE_FUNCTION[iteration]
        REWARD_ACHIEVED = 0
        if iteration == 0:
            QUIT_REWARD = 0
        else:
            QUIT_REWARD = self.VALUE_FUNCTION[iteration-1]
        
        ANSWER = np.random.rand()
        
        if ANSWER <= state.PROBABILITY:
            REWARD_ACHIEVED = state.PROBABILITY * (state.REWARD + (self.agent.DISCOUNT_FACTOR * self.helper(state.next, iteration+1)))
            self.VALUE_FUNCTION[iteration] = (self.VALUE_FUNCTION[iteration] * self.TIMES_ENTERED[iteration] + REWARD_ACHIEVED)/(self.TIMES_ENTERED[iteration]+1)
            
            if(abs(self.VALUE_FUNCTION[iteration] - OLD_VALUE) < self.agent.THETA):
                self.TERMINATOR = True

        return max(QUIT_REWARD, REWARD_ACHIEVED)
    
    def solver(self):
        while self.TERMINATOR == False:
            self.ITERATIONS += 1
            HEAD = self.agent.START_STATE
            self.helper(HEAD, 0)
        
        print("Total Iterations: ", self.ITERATIONS)
        print("VALUE FUNCTION:")
        print(self.VALUE_FUNCTION)
        
        for i in range(self.agent.N):
            self.TIMES_ENTERED[i] = (self.TIMES_ENTERED[i] / self.ITERATIONS) * 100
        
        print("TIMES ENTERED:")
        print(self.TIMES_ENTERED)
        
        EXPECTATION = 0
        for i in range(self.agent.N):
            EXPECTATION = EXPECTATION + ((self.TIMES_ENTERED[i]/100) * self.VALUE_FUNCTION[i])
        
        print("EXPECTED REWARD: ", EXPECTATION)
        

MDP_SOLUTION().solver()

Total Iterations:  6625
VALUE FUNCTION:
{0: 9473.398305163142, 1: 10377.756745551109, 2: 12627.657117756702, 3: 18149.10964969848, 4: 26888.101056763524, 5: 51042.565068385054, 6: 76514.78118309026, 7: 180786.20122217457, 8: 126830.12594201375, 9: 125850.34013605444}
TIMES ENTERED:
{0: 100.0, 1: 99.15471698113207, 2: 89.73584905660378, 3: 71.42641509433962, 4: 50.17358490566038, 5: 30.62641509433962, 6: 15.09433962264151, 7: 6.113207547169811, 8: 1.8415094339622642, 9: 0.3471698113207547}
EXPECTED REWARD:  98555.19917935529


In [4]:
import random

# Define the problem parameters
probabilities = [0.99, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]
rewards = [100, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000, 5000000]

# Define the number of simulations to run
num_simulations = 10000

# Define a function to simulate the game and return the total reward
def simulate_game(stop_question):
    total_reward = 0
    for i in range(stop_question):
        if random.random() < probabilities[i]:
            total_reward += rewards[i]
        else:
            return -total_reward
    return total_reward

# Calculate the average reward for each question number
avg_rewards = []
for i in range(len(probabilities)):
    total_reward = 0
    for j in range(num_simulations):
        total_reward += simulate_game(i+1)
    avg_reward = total_reward / num_simulations
    avg_rewards.append(avg_reward)

# Find the question number that corresponds to the highest average reward
max_reward = max(avg_rewards)
max_question = avg_rewards.index(max_reward) + 1

# Print the result
print("The player should stop at question", max_question+1, "to maximize total reward.")

The player should stop at question 7 to maximize total reward.
