In [10]:
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np

from collections import defaultdict
import sys
import random
from tqdm import tqdm, tqdm_notebook

import matplotlib.pyplot as plt
%matplotlib inline

In [1]:
import ray
from ray import tune

In [4]:
from blackjack_count import BlackjackEnv

In [5]:
# Shared environment instance; the training and tuning cells below read it as a global.
# NOTE(review): `natural=True` presumably enables the bonus payout for a natural
# blackjack (as in gym's stock Blackjack env) — confirm against blackjack_count.
env = BlackjackEnv(natural=True)

In [7]:
# Epsilon-greedy action probabilities for a single state.
def get_probs(Q_s, epsilon, nA):
    """Build the epsilon-greedy distribution over `nA` actions.

    Args:
        Q_s: action values of one state (anything `np.argmax` accepts).
        epsilon: exploration rate; this much probability mass is spread
            uniformly over all actions.
        nA: number of actions.

    Returns:
        A length-`nA` numpy array in which every action has probability
        `epsilon / nA`, except the greedy action (first argmax of `Q_s`),
        which has `1 - epsilon + epsilon / nA`.
    """
    explore_mass = epsilon / nA
    action_probs = np.ones(nA) * explore_mass
    # The greedy action keeps its uniform share plus all non-exploration mass.
    action_probs[np.argmax(Q_s)] = 1 - epsilon + explore_mass
    return action_probs

# Update Q from one finished episode.
def update_Q(env, episode, Q, alpha, gamma):
    """First-visit, constant-alpha Monte-Carlo update of the action-value table.

    For every (state, action) pair in the episode, the discounted return that
    follows the FIRST time step where the pair occurs is used as the target:
    `Q[s][a] += alpha * (G - Q[s][a])`. Each pair is updated at most once per
    episode.

    The previous implementation took the return from the first occurrence of
    the *state* regardless of the action, so when a state was revisited with a
    different action that action was credited with rewards earned before it
    was taken; a repeated pair was also updated several times with the same
    return. For episodes without repeated states the result is unchanged.

    Args:
        env: unused; kept so existing callers keep working.
        episode: sequence of `(state, action, reward)` tuples in time order.
        Q: mapping `state -> indexable of action values`; mutated in place.
        alpha: step size.
        gamma: discount factor.

    Returns:
        The same `Q` object, after the updates.
    """
    # Discounted return following every time step, in a single backward pass
    # (G_t = r_t + gamma * G_{t+1}) instead of re-summing the tail per step.
    returns = [0.0] * len(episode)
    running_return = 0.0
    for t in range(len(episode) - 1, -1, -1):
        running_return = episode[t][2] + gamma * running_return
        returns[t] = running_return

    visited_pairs = set()
    for (s, a, _), G in zip(episode, returns):
        if (s, a) in visited_pairs:
            continue  # first-visit: later occurrences of the pair are ignored
        visited_pairs.add((s, a))
        Q[s][a] = Q[s][a] + alpha * (G - Q[s][a])
    return Q

# Epsilon-greedy behaviour policy backed by Q.
def mc_control_policy(Q, state, epsilon, nA):
    """Sample an action for `state` from the epsilon-greedy policy over `Q`.

    Args:
        Q: mapping `state -> action values`.
        state: current state (used as a key into `Q`).
        epsilon: exploration rate passed on to `get_probs`.
        nA: number of actions.

    Returns:
        An action index drawn with `np.random.choice`, or `None` when `state`
        has no entry in `Q` (the caller decides what to do in that case).
    """
    if state not in Q:
        return None
    action_probs = get_probs(Q[state], epsilon, nA)
    return np.random.choice(np.arange(nA), p=action_probs)
    
# Episode generator driven by the current Q table.
def generate_episode_from_Q(env, Q, epsilon, nA):
    """Play one full episode in `env` following the epsilon-greedy policy.

    Args:
        env: environment exposing `reset()`, `step(action)` returning a
            4-tuple `(next_state, reward, done, info)`, and
            `action_space.sample()`.
        Q: mapping `state -> action values`.
        epsilon: exploration rate for `mc_control_policy`.
        nA: number of actions.

    Returns:
        List of `(state, action, reward)` tuples, one per step taken.
    """
    trajectory = []
    current_state = env.reset()
    finished = False
    while not finished:
        chosen_action = mc_control_policy(Q, current_state, epsilon, nA)
        if chosen_action is None:
            # The policy has no entry for this state: fall back to a random action.
            chosen_action = env.action_space.sample()

        following_state, reward, finished, _ = env.step(chosen_action)
        trajectory.append((current_state, chosen_action, reward))
        current_state = following_state
    return trajectory

In [18]:
# Monte-Carlo control
def mc_control(env, num_episodes, alpha, gamma=1.0, eps_start=1.0, eps_decay=.99995, eps_min=0.01,
               eval_every=100000, eval_episodes=100000):
    """Constant-alpha Monte-Carlo control with a decaying epsilon-greedy policy.

    Args:
        env: environment with `action_space.n`, consumed by
            `generate_episode_from_Q`.
        num_episodes: number of training episodes.
        alpha: step size passed to `update_Q`.
        gamma: discount factor passed to `update_Q`.
        eps_start: initial exploration rate.
        eps_decay: multiplicative decay applied to epsilon before every
            training episode.
        eps_min: lower bound for epsilon.
        eval_every: run an evaluation after every `eval_every` training
            episodes. A falsy value (0 / None) disables evaluation.
            Defaults to the previously hard-coded 100000.
        eval_episodes: number of epsilon=0 episodes per evaluation. A value
            of 0 disables evaluation. Defaults to the previously hard-coded
            100000.

    Returns:
        Tuple `(policy, Q, cum_reward, reward_hist)`:
            policy: dict `state -> greedy action` for every state in `Q`.
            Q: the learned `defaultdict` of action-value arrays.
            cum_reward: reward of the LAST step of each training episode
                (one entry per episode; not a running sum).
            reward_hist: one entry per evaluation — the mean last-step
                reward over `eval_episodes` episodes played with epsilon=0.
    """
    nA = env.action_space.n
    Q = defaultdict(lambda: np.zeros(nA))
    epsilon = eps_start
    cum_reward = []
    reward_hist = []
    evaluate = bool(eval_every) and bool(eval_episodes)
    for i_episode in range(1, num_episodes + 1):
        epsilon = max(epsilon * eps_decay, eps_min)
        episode = generate_episode_from_Q(env, Q, epsilon, nA)
        cum_reward.append(episode[-1][-1])
        Q = update_Q(env, episode, Q, alpha, gamma)

        if evaluate and i_episode % eval_every == 0:
            # Evaluation plays with epsilon=0; states missing from Q still get
            # a random action inside generate_episode_from_Q.
            final_rewards = [
                generate_episode_from_Q(env, Q, .0, nA)[-1][-1]
                for _ in range(eval_episodes)
            ]
            reward_hist.append(np.mean(final_rewards))
    policy = dict((k, np.argmax(v)) for k, v in Q.items())
    return policy, Q, cum_reward, reward_hist

In [13]:
%%time
# Single training run on the shared `env`; `%%time` reports the cost of the cell.
# mc_control returns (policy, Q, cum_reward, reward_hist).
policy, Q, cum_reward, reward_hist = mc_control(env, num_episodes = 100000, alpha = 0.02, 
                                                gamma=.90, eps_start=1.0, eps_decay=.999995, eps_min=0.01)

Episode 50000/100000.

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


Episode 100000/100000.

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


CPU times: user 1min 52s, sys: 8.6 s, total: 2min 1s
Wall time: 2min 3s


In [15]:
# Show the evaluation averages (episodes played with epsilon=0) collected by mc_control.
reward_hist

[-0.084705, -0.051925]

In [24]:
def objective(alpha, gamma, eps_decay):
    """Train on the global `env` for 500000 episodes and return the evaluation history.

    Args:
        alpha: step size forwarded to `mc_control`.
        gamma: discount factor forwarded to `mc_control`.
        eps_decay: epsilon decay factor forwarded to `mc_control`.

    Returns:
        The `reward_hist` list produced by `mc_control`; the policy, Q table
        and per-episode rewards are discarded.
    """
    training_kwargs = dict(
        num_episodes=500000,
        alpha=alpha,
        gamma=gamma,
        eps_start=1.0,
        eps_decay=eps_decay,
        eps_min=0.01,
    )
    _, _, _, reward_hist = mc_control(env, **training_kwargs)
    return reward_hist

def training_function(config):
    """Ray Tune trainable: run one training and report each evaluation result.

    Args:
        config: dict with keys "alpha", "gamma" and "eps_decay".

    Reports, once per evaluation in the history returned by `objective`:
        mean_reward: the mean evaluation reward (higher is better).
        mean_loss: the NEGATED mean reward, so that selecting on
            `metric="mean_loss", mode="min"` picks the configuration with the
            highest reward. Previously the raw reward was reported under this
            name, which made a "min" selection return the worst configuration.

    Note that `objective` returns only after training has finished, so all
    reports are emitted at the end of the trial rather than during training.
    """
    alpha, gamma, eps_decay = config["alpha"], config["gamma"], config['eps_decay']
    for mean_reward in objective(alpha, gamma, eps_decay):
        # Plain Python floats keep the reported results free of numpy scalars.
        mean_reward = float(mean_reward)
        tune.report(mean_loss=-mean_reward, mean_reward=mean_reward)

In [26]:
# Full grid search over step size, discount factor and epsilon decay.
# The grids are built with linspace + rounding and converted to plain Python
# floats: a float-step np.arange produces values such as 0.15000000000000002,
# which end up in the trial names and configs.
analysis = tune.run(
    training_function,
    config={
        # alpha = 0 is excluded: with a zero step size Q is never changed, so
        # every such trial only burns compute. Grid: 0.05, 0.10, ..., 0.95.
        "alpha": tune.grid_search(np.round(np.linspace(0.05, 0.95, 19), 2).tolist()),
        # Grid: 0.90, 0.91, ..., 0.99.
        "gamma": tune.grid_search(np.round(np.linspace(0.90, 0.99, 10), 2).tolist()),
        "eps_decay": tune.grid_search([.99995, .999995, .9999995])
    })

Trial name,status,loc,alpha,eps_decay,gamma
training_function_fc5ed_00001,PENDING,,0.05,0.99995,0.9
training_function_fc5ed_00002,PENDING,,0.1,0.99995,0.9
training_function_fc5ed_00003,PENDING,,0.15,0.99995,0.9
training_function_fc5ed_00004,PENDING,,0.2,0.99995,0.9
training_function_fc5ed_00005,PENDING,,0.25,0.99995,0.9
training_function_fc5ed_00006,PENDING,,0.3,0.99995,0.9
training_function_fc5ed_00007,PENDING,,0.35,0.99995,0.9
training_function_fc5ed_00008,PENDING,,0.4,0.99995,0.9
training_function_fc5ed_00009,PENDING,,0.45,0.99995,0.9
training_function_fc5ed_00010,PENDING,,0.5,0.99995,0.9


Episode 1000/500000.[0m 
Episode 1000/500000.[0m 
Episode 1000/500000.[0m 
Episode 2000/500000.[0m 
Episode 2000/500000.[0m 
Episode 2000/500000.[0m 
Episode 3000/500000.[0m 
Episode 3000/500000.[0m 
Episode 3000/500000.[0m 
Episode 1000/500000.[0m 
Episode 4000/500000.[0m 
Episode 4000/500000.[0m 
Episode 4000/500000.[0m 
Episode 2000/500000.[0m 
Episode 5000/500000.[0m 
Episode 5000/500000.[0m 
Episode 5000/500000.[0m 
Episode 3000/500000.[0m 
Episode 6000/500000.[0m 
Episode 6000/500000.[0m 
Episode 6000/500000.[0m 
Episode 4000/500000.[0m 
Episode 7000/500000.[0m 
Episode 7000/500000.[0m 
Episode 7000/500000.[0m 
Episode 5000/500000.[0m 
Episode 8000/500000.[0m 
Episode 8000/500000.[0m 
Episode 8000/500000.[0m 
Episode 6000/500000.[0m 
Episode 9000/500000.[0m 
Episode 9000/500000.[0m 
Episode 9000/500000.[0m 
Episode 7000/500000.[0m 
Episode 10000/500000.[0m 
Episode 10000/500000.[0m 
Episode 10000/500000.[0m 
Episode 8000/500000.[0m 
Episode 1100

KeyboardInterrupt: 

In [2]:
# Pick the configuration whose reported "mean_loss" is smallest.
# NOTE(review): the output stored under this cell lists alpha/beta trials, i.e. it
# appears to come from a different experiment than the grid defined above — re-run
# before relying on it.
print("Best config: ", analysis.get_best_config(metric="mean_loss", mode="min"))

# Get a dataframe for analyzing trial results.
df = analysis.results_df

2020-11-04 10:53:08,321	INFO services.py:1166 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


Trial name,status,loc,alpha,beta
training_function_c8018_00000,RUNNING,,0.001,1
training_function_c8018_00001,PENDING,,0.01,1
training_function_c8018_00002,PENDING,,0.1,2


Result for training_function_c8018_00002:
  date: 2020-11-04_10-53-12
  done: false
  experiment_id: 79beacd130ee4aaab426871128b14772
  experiment_tag: 2_alpha=0.1,beta=2
  hostname: MacBook-Air-Dima.local
  iterations_since_restore: 1
  mean_loss: 10.2
  neg_mean_loss: -10.2
  node_ip: 192.168.1.129
  pid: 93587
  time_since_restore: 0.0024771690368652344
  time_this_iter_s: 0.0024771690368652344
  time_total_s: 0.0024771690368652344
  timestamp: 1604476392
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: c8018_00002
  
Result for training_function_c8018_00000:
  date: 2020-11-04_10-53-12
  done: false
  experiment_id: a54b0093c54c4aa4b866b10ba4e6a5f5
  experiment_tag: 0_alpha=0.001,beta=1
  hostname: MacBook-Air-Dima.local
  iterations_since_restore: 1
  mean_loss: 10.1
  neg_mean_loss: -10.1
  node_ip: 192.168.1.129
  pid: 93590
  time_since_restore: 0.0018069744110107422
  time_this_iter_s: 0.0018069744110107422
  time_total_s: 0.0018069744110107422
  timestamp: 160

Trial name,status,loc,alpha,beta,loss,iter,total time (s),neg_mean_loss
training_function_c8018_00000,TERMINATED,,0.001,1,10.091,10,0.13094,-10.091
training_function_c8018_00001,TERMINATED,,0.01,1,10.0108,10,0.181395,-10.0108
training_function_c8018_00002,TERMINATED,,0.1,2,9.37431,10,0.124194,-9.37431


Best config:  {'alpha': 0.1, 'beta': 2}
