In [7]:
import functools
import time

import gym
import numpy as np
from tqdm import tqdm
# FrozenLake environment shared by the cells below (reset/stepped in the training loop).
env = gym.make('FrozenLake-v0')

In [8]:
def timeit(method):
    """Decorator that measures how long each call to ``method`` takes, in milliseconds.

    The wrapped call accepts two optional keyword arguments (they are also
    forwarded to ``method``, so ``method`` must tolerate them, e.g. via ``**kwargs``):

    * ``log_time``: a dict; when given, the elapsed time (truncated to an int)
      is stored in it instead of being printed.
    * ``log_name``: key to use in ``log_time``; defaults to the upper-cased
      name of ``method``.

    Without ``log_time`` the method name and the elapsed time are printed.

    Returns:
        A wrapper that returns whatever ``method`` returns.
    """
    # functools.wraps keeps the decorated function's __name__/__doc__ intact.
    @functools.wraps(method)
    def timed(*args, **kw):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # so short calls are not reported as 0 ms or skewed by clock changes.
        ts = time.perf_counter()
        result = method(*args, **kw)
        elapsed_ms = (time.perf_counter() - ts) * 1000
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int(elapsed_ms)
        else:
            print(method.__name__, elapsed_ms)
        return result
    return timed

In [9]:
@timeit
def get_all_employee_details(*args, **kwargs):
    """Dummy workload for the timing demo: add 2 ten thousand times and print the total.

    All positional and keyword arguments are accepted and ignored.
    """
    total = sum(2 for _ in range(10000))
    print(total)
    
@timeit
def tests2(**kwargs):
    """Larger dummy workload for the timing demo: add 2 two hundred thousand times and print the total.

    Keyword arguments are accepted and ignored.
    """
    total = sum(2 for _ in range(200000))
    print(total)

In [4]:
# Dict that the timeit wrapper fills in when it is passed as ``log_time``.
logtime_data ={}
# Timing is stored under 'GET_ALL_EMPLOYEE_DETAILS'; the function has no return
# statement, so ``employees`` is None.
employees = get_all_employee_details(21, 32, 150, log_time=logtime_data)
# Timing is stored under 'TESTS2'.
a = tests2(log_time=logtime_data)
# No ``log_time`` given: the wrapper prints the timing instead of storing it.
b = tests2()

20000
400000
400000
tests2 32.98521041870117


In [5]:
# Show the timings (integer milliseconds) recorded by the calls that passed ``log_time``.
logtime_data

{'GET_ALL_EMPLOYEE_DETAILS': 0, 'TESTS2': 60}

In [17]:
#################################################################################
#  Use these variables to change the RL algorithm and the exploration strategy  #
#################################################################################
# Reinforcement Learning algorithm: True -> Q-learning, False -> SARSA
qlearning = True
# Exploration strategy: True -> softmax, False -> epsilon-greedy
softMax = True
#################################################################################


nb_episodes = 10000  # number of training episodes
alpha = 0.4          # step size applied to the TD error
gamma = 0.999        # discount factor
epsilon = 0.7        # probability of a random action (epsilon-greedy only)
tau = 0.003          # softmax temperature (softmax only)
# One row per state, one column per action. Sized from the environment instead of
# being hard-coded, so it stays correct if the map changes
# (default FrozenLake: 16 = 4x4 grid; 4 = [left, down, right, up]).
q_table = np.ones((env.observation_space.n, env.action_space.n))
results = []         # final state of every finished episode

def getAction(env, q_table, observation):
    """Pick an action for ``observation`` using the globally selected exploration strategy.

    Reads the module-level settings ``softMax``, ``tau`` and ``epsilon``.

    Args:
        env: environment; only ``env.action_space.sample()`` is used, and only
            in the epsilon-greedy branch.
        q_table: action values indexed as ``q_table[state, action]``.
        observation: index of the current state (row of ``q_table``).

    Returns:
        Index of the chosen action.
    """
    action_values = np.asarray(q_table[observation], dtype=float)

    if softMax:
        # Boltzmann exploration: P(a) is proportional to exp(Q(s, a) / tau).
        # Subtracting the largest preference leaves the distribution unchanged
        # but keeps exp() from overflowing to inf (which would turn the
        # probabilities into NaN and make np.random.choice raise) when
        # Q / tau is large -- easy to hit with a small tau.
        preferences = action_values / tau
        weights = np.exp(preferences - np.max(preferences))
        probabilities = weights / np.sum(weights)
        return np.random.choice(len(probabilities), p=probabilities)

    if np.random.uniform(0, 1) > epsilon:
        # exploit: best action seen so far
        return np.argmax(action_values)
    # explore: uniformly random action
    return env.action_space.sample()


# Training: run nb_episodes episodes and update q_table after every step.
for i_episode in tqdm(range(nb_episodes)):
    observation = env.reset()
    action = getAction(env, q_table, observation)

    # at most 200 steps per episode
    for t in range(200):
        # take the chosen action in the environment
        observation_2, reward, done, info = env.step(action)

        # choose the action for the next state now, without executing it yet
        # (SARSA needs it for its update; it becomes the next step's action)
        action_2 = getAction(env, q_table, observation_2)

        # TD error: reward (+ discounted value of the next state) - current estimate
        error = reward - q_table[observation, action]
        # no bootstrap term when the episode has ended
        if not done:
            if qlearning:
                # Q-learning: bootstrap from the best action value in the next state
                error += gamma * np.max(q_table[observation_2])
            else:
                # SARSA: bootstrap from the action that was actually selected
                error += gamma * q_table[observation_2, action_2]


        q_table[observation, action] += alpha * error

        if done:
            # remember the state in which the episode ended (read by the report below)
            results.append(observation_2)
            break

        # move on: the next state/action become the current ones
        observation = observation_2
        action = action_2

print("State space dimension is:", env.observation_space.n)
print("Number of actions is:", env.action_space.n)

# Count how many of the most recent episodes ended in state 15
# (the last cell of the 4x4 grid).
window = 100
recent = results[-window:]
unique, counts = np.unique(recent, return_counts=True)
# np.unique returns sorted values, so state 15 (the highest index) is last if present.
# The size check avoids an IndexError when no episode has been recorded.
if unique.size > 0 and unique[-1] == 15:
    # report the real number of episodes looked at, which may be below `window`
    print("Number of successes = {} out of the {} last ones".format(counts[-1], len(recent)))
else:
    print("No success")



  0%|                                                                                                                                                      | 0/10000 [00:00<?, ?it/s]

  0%|▏                                                                                                                                           | 11/10000 [00:00<01:32, 107.71it/s]

  0%|▍                                                                                                                                           | 29/10000 [00:00<01:24, 117.91it/s]

  0%|▌                                                                                                                                           | 42/10000 [00:00<01:25, 116.75it/s]

  1%|▋                                                                                                                                           | 51/10000 [00:00<01:36, 102.81it/s]

  1%|▉                                                                             

State space dimension is: 16
Number of actions is: 4
Number of successes = 55 out of the 100 last ones
