# =========================Libraries ============================

In [1]:
import gym
import matplotlib
import numpy as np
import sys

from collections import defaultdict
from lib.envs.blackjack import BlackjackEnv
from lib import plotting

# ======================== Parameters ==========================

In [2]:
num_episodes = 10000
num_iters    = 100
gamma        = 0.9

epsilon      = 0.1

epsilon_flage = 1

# ======================== Functions ==========================

In [3]:
def policy(state):
    """
    A policy that sticks if the player score is >= 20 and hits otherwise.
    """
    return 0 if state >= 20 else 1

In [4]:
def policy_epsilon(state,nA,epsilon):
    probs           = np.ones(nA, dtype=float) * epsilon / nA
    
    if state >= 20:
        best_action_idx = 0
    else:
        best_action_idx = 1    
        
    probs[best_action_idx] += (1.0 - epsilon)
    
    action = np.random.choice(np.arange(len(probs)), p=probs)
    return action

In [8]:
def estimate_Value_function(env,num_episodes,num_iters,gamma):
    """
    Monte Carlo prediction algorithm. Calculates the value function
    for a given policy using sampling.
    
    Args:
        policy: A function that maps an observation to action probabilities.
        env: OpenAI gym environment.
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
    
    Returns:
        A dictionary that maps from state -> value.
        The state is a tuple and the value is a float.
    """
    State_episode_return = defaultdict(float)
    State_episode_visit  = defaultdict(float)
    
    V = defaultdict(float)
    
    for ep_idx in range(num_episodes):
        
        episode = []
        state = env.reset()
        for t in range(num_iters):
            if epsilon_flage:
                action = policy_epsilon(state[0],env.nA,epsilon)
            else:
                action = policy(state[0])
            action = policy(state[0])
            next_state, reward, done, _ = env.step(action)
            episode.append((state[0],action,reward))
            if done:
                break
            state = next_state
            
        episode_states = set([x[0] for x in episode])
        for s in episode_states:
            first_occurence_idx = [x[0] for x in episode].index(s)
            G = 0
            for i,el in enumerate(episode[first_occurence_idx:]):
                G += el[2]*(gamma**i)
                
            State_episode_return[s] += G
            State_episode_visit[s]  += 1
            V[s] = State_episode_return[s]/State_episode_visit[s]

    return V

# =========================== Main =============================

In [9]:
env = BlackjackEnv()

In [10]:
V_10k = estimate_Value_function(env,num_episodes,num_iters,gamma)

In [11]:
V_10k

defaultdict(float,
            {16: -0.6073982056590741,
             18: -0.6936690551680401,
             12: -0.5277689878542475,
             17: -0.6530666891436271,
             20: 0.6147086031452359,
             21: 0.8881028938906752,
             14: -0.5831408161764683,
             15: -0.6338699210337378,
             13: -0.5468993982169366,
             19: -0.6996656151419548})