In [None]:
def Gt(episode, state, discount_factor=1.0):
    """
    Return the discounted return G_t from the first occurrence of `state`.

    An episode is a sequence of (state, action, reward) tuples, so for each
    step `x` in the episode, `x[0]` is the state and `x[2]` is the reward.

    Args:
        episode: Sliceable sequence of (state, action, reward) tuples.
        state: The state to look for; compared with `==` against `x[0]`.
        discount_factor: Gamma. The reward collected `i` steps after the
            first occurrence is weighted by `discount_factor ** i`.
            Defaults to 1.0 (undiscounted).

    Returns:
        The sum of discounted rewards from the first occurrence of `state`
        (inclusive) to the end of the episode.

    Raises:
        ValueError: If `state` does not occur in `episode`.
    """
    # Use a default with next() so a missing state does not leak a bare
    # StopIteration out of this function.
    first_occurrence_idx = next(
        (i for i, x in enumerate(episode) if x[0] == state), None
    )
    if first_occurrence_idx is None:
        raise ValueError("state {!r} does not occur in episode".format(state))

    # Steps from the first occurrence of the state onwards.
    reactions = episode[first_occurrence_idx:]

    # Sum up all (discounted) rewards since the first occurrence.
    return sum(x[2] * discount_factor ** i for i, x in enumerate(reactions))
            
    
    

In [None]:
def mc_prediction(policy, env, num_episodes, discount_factor=1.0, max_steps=100):
    """
    First-visit Monte Carlo prediction. Estimates the value function
    for a given policy by averaging sampled returns.

    Args:
        policy: A callable that is invoked as `policy(state)` and whose
            result is passed directly to `env.step` as the action.
        env: Environment object providing `reset()` -> state and
            `step(action)` -> (next_state, reward, done, info).
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor applied to future rewards.
        max_steps: Maximum number of steps sampled per episode; an episode
            that has not reported `done` by then is truncated.

    Returns:
        A dictionary that maps from state -> value.
        The state is a tuple and the value is a float.
    """

    # Keeps track of sum and count of returns for each state
    # to calculate an average. Storing only the running sum and count
    # avoids keeping every individual return in memory.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    # The final value function
    V = defaultdict(float)

    for _episode_idx in range(num_episodes):

        # Generate an episode as a list of (state, action, reward) tuples.
        # Each state is converted to a tuple right away so that the same
        # hashable key is used both in the episode and in the dictionaries.
        episode = []
        state = env.reset()
        for _step in range(max_steps):
            action = policy(state)
            next_state, reward, done, _info = env.step(action)
            episode.append((tuple(state), action, reward))
            if done:
                break
            state = next_state

        # Walk the episode backwards, accumulating G = r + gamma * G.
        # Because later visits are processed first and then overwritten,
        # each state ends up with the return of its FIRST visit.
        first_visit_return = {}
        G = 0.0
        for state_key, _action, reward in reversed(episode):
            G = reward + discount_factor * G
            first_visit_return[state_key] = G

        for state_key, state_return in first_visit_return.items():
            returns_sum[state_key] += state_return
            returns_count[state_key] += 1.0

            # Running average of the first-visit returns seen so far.
            V[state_key] = returns_sum[state_key] / returns_count[state_key]

    return V