In [None]:
import urllib.request

# Download the plotting helper into the current working directory under the
# name "plot_utils.py". This runs on every execution and requires network
# access; the `from plot_utils import ...` line below relies on the file
# being present (presumably the working directory is on the import path).
url = "https://raw.githubusercontent.com/udacity/deep-reinforcement-learning/master/monte-carlo/plot_utils.py"
urllib.request.urlretrieve(url, "plot_utils.py")

import gym
import numpy as np
import sys
from collections import defaultdict
# Must stay after the download above, which creates plot_utils.py.
from plot_utils import plot_blackjack_values

# Single environment instance passed to the Monte Carlo prediction run below.
env = gym.make('Blackjack-v1')

def generate_episode_from_limit_stochastic(env):
    """Play one episode under a fixed stochastic threshold policy.

    The action distribution depends only on the first element of the state:
    when ``state[0] > 18`` action 0 is sampled with probability 0.8 and
    action 1 with probability 0.2; otherwise the probabilities are swapped.
    (For Gym's Blackjack, ``state[0]`` is presumably the player's current sum
    and actions 0/1 are stick/hit -- confirm against the environment docs.)

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step, in the
        order the steps were taken.
    """
    episode = []
    state, _ = env.reset()
    while True:
        probs = [0.8, 0.2] if state[0] > 18 else [0.2, 0.8]
        action = np.random.choice(np.arange(2), p=probs)
        next_state, reward, terminated, truncated, _ = env.step(action)
        episode.append((state, action, reward))
        state = next_state
        # Either flag ends the episode; ignoring `truncated` would spin
        # forever on an environment that stops via a step limit.
        if terminated or truncated:
            break
    return episode

def mc_prediction_q(env, num_episodes, generate_episode, gamma=1.0):
    """Estimate action values with first-visit Monte Carlo prediction.

    For every episode, the discounted return following the first occurrence
    of each (state, action) pair is added to a running sum, and the estimate
    for that pair is the sum divided by the number of episodes in which the
    pair appeared.

    Args:
        env: Environment passed through to ``generate_episode``; its
            ``action_space.n`` sets the length of each per-state value array.
        num_episodes: Number of episodes to sample.
        generate_episode: Callable taking ``env`` and returning a sequence of
            ``(state, action, reward)`` tuples. States must be hashable and
            actions must be valid indices into an array of length
            ``env.action_space.n``.
        gamma: Discount factor applied to future rewards.

    Returns:
        A ``defaultdict`` mapping each visited state to a NumPy array of
        estimated action values (zeros for actions never taken there).
    """
    n_actions = env.action_space.n
    returns_sum = defaultdict(lambda: np.zeros(n_actions))
    N = defaultdict(lambda: np.zeros(n_actions))
    Q = defaultdict(lambda: np.zeros(n_actions))

    for i_episode in range(1, num_episodes + 1):
        if i_episode % 1000 == 0:
            print(f"\rEpisode {i_episode}/{num_episodes}", end="")
            sys.stdout.flush()

        episode = generate_episode(env)

        # Record the earliest time step of every (state, action) pair once,
        # so the first-visit test below is a dict lookup instead of a scan
        # over the whole episode prefix at each step.
        first_visit = {}
        for t, (state, action, _) in enumerate(episode):
            first_visit.setdefault((state, action), t)

        # Walk backwards so the return G can be accumulated incrementally.
        # An empty episode simply contributes nothing.
        G = 0
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            G = gamma * G + reward
            if first_visit[(state, action)] == t:
                returns_sum[state][action] += G
                N[state][action] += 1
                Q[state][action] = returns_sum[state][action] / N[state][action]

    return Q

# Estimate action values from 500,000 episodes sampled under the stochastic
# threshold policy.
Q = mc_prediction_q(env, 500000, generate_episode_from_limit_stochastic)

# Collapse action values to state values by weighting each action value with
# the probability the policy assigns to that action: [0.8, 0.2] when
# state[0] > 18, otherwise [0.2, 0.8] (the same split the episode generator
# samples from).
V = {
    state: np.dot([0.8, 0.2] if state[0] > 18 else [0.2, 0.8], action_values)
    for state, action_values in Q.items()
}
plot_blackjack_values(V)
