Learning TD-Learning

SARSA

def epsilon_greedy(q_func, state, actions):
    """Pick an action for ``state`` using an epsilon-greedy rule.

    One draw from ``random.random()`` is compared with the module-level
    ``EPSILON``. When the draw is strictly larger, the action with the
    highest ``q_func[state[0], state[1], action]`` is returned (the first
    such action in ``actions`` wins on ties). Otherwise an action is drawn
    with ``random.choice(actions)``.

    Args:
        q_func: Value table indexed as ``q_func[state[0], state[1], action]``.
        state: Indexable whose first two items select the table cell.
        actions: Non-empty sequence of hashable candidate actions.

    Returns:
        One element of ``actions``.
    """
    if random.random() > EPSILON:
        # Exploit: look up the current estimate of every candidate action
        # and return the best one.
        actions_dict = {action: q_func[state[0], state[1], action] for action in actions}
        action = max(actions_dict, key=actions_dict.get)
        return action

    # Explore: ignore the estimates entirely.
    action = random.choice(actions)
    return action


def run_sarsa(q_func, env):
    """Train ``q_func`` in place with on-policy SARSA for 100000 steps.

    Each step executes the current action, picks the follow-up action with
    ``epsilon_greedy`` and moves ``Q(state, action)`` towards
    ``reward + GAMMA * Q(next_state, next_action)`` (or just ``reward`` when
    the step ended the episode) with step size ``ALPHA``.  The follow-up
    action that entered the target is the one executed on the next step,
    which is what makes the update SARSA.

    Args:
        q_func: Mutable value table indexed as
            ``q_func[state[0], state[1], action]``; updated in place.
        env: Environment object providing ``action_values()``, ``reset()``,
            ``step(action)`` returning ``(next_state, reward, done)``,
            ``render()`` and a ``last_counter`` attribute.

    Returns:
        None.  Progress is written to stdout and ``env.render()`` is called
        every 20th iteration and on every iteration after episode 250.
    """
    times = 100000
    actions = env.action_values()
    episodes = 1
    total_return = 0
    state = env.reset()
    # SARSA selects the first action before the loop; afterwards the action
    # for each step is the ``next_action`` chosen during the previous update.
    action = epsilon_greedy(q_func, state, actions)
    for i in range(times):
        next_state, reward, done = env.step(action)

        # Learning
        current_q = q_func[state[0], state[1], action]
        if not done:
            next_action = epsilon_greedy(q_func, next_state, actions)
            target = reward + GAMMA * q_func[next_state[0], next_state[1], next_action]
        else:
            # Terminal transition: there is no successor value to bootstrap from.
            target = reward
        q_func[state[0], state[1], action] = current_q + ALPHA * (target - current_q)

        # render + metrics
        if i % 20 == 0 or episodes > 250:
            env.render()

        total_return += reward
        # ``state`` here is still the state the step was taken from.
        print(f'\r[iter {i}, episode {episodes} step {env.last_counter}] state: {state}, reward: {total_return}, done: {done}',
              end='')

        if not done:
            # Execute exactly the action that was used in the update target.
            state, action = next_state, next_action
        else:
            # End of episode
            episodes += 1
            total_return = 0
            state = env.reset()
            action = epsilon_greedy(q_func, state, actions)
            print()

Q-Learning

def epsilon_greedy(q_func, state, actions):
    """Choose between the best-valued action and a random one.

    A single ``random.random()`` draw is compared with the module-level
    ``EPSILON``.  If the draw is strictly greater, the action maximising
    ``q_func[state[0], state[1], action]`` is returned (earliest candidate
    wins ties); otherwise the result comes from ``random.choice(actions)``.

    Args:
        q_func: Value table indexed as ``q_func[state[0], state[1], action]``.
        state: Indexable whose first two items select the table cell.
        actions: Non-empty sequence of hashable candidate actions.

    Returns:
        One element of ``actions``.
    """
    exploit = random.random() > EPSILON
    if not exploit:
        return random.choice(actions)

    # Collect the estimate of every candidate, then take the arg-max.
    estimates = {}
    for candidate in actions:
        estimates[candidate] = q_func[state[0], state[1], candidate]
    return max(estimates, key=lambda candidate: estimates[candidate])


def run_q_learning(q_func, env):
    """Train ``q_func`` in place with Q-learning for 100000 steps.

    Each step picks an action with ``epsilon_greedy``, executes it, and moves
    ``Q(state, action)`` towards ``reward + GAMMA * max_a Q(next_state, a)``
    (or just ``reward`` when the step ended the episode) with step size
    ``ALPHA``.

    Args:
        q_func: Mutable value table indexed as
            ``q_func[state[0], state[1], action]``; updated in place.
        env: Environment object providing ``action_values()``, ``reset()``,
            ``step(action)`` returning ``(next_state, reward, done)``,
            ``render()`` and a ``last_counter`` attribute.

    Returns:
        None.  Progress is written to stdout and ``env.render()`` is called
        every 20th iteration and on every iteration after episode 250.
    """
    times = 100000
    actions = env.action_values()
    episodes = 1
    total_return = 0
    state = env.reset()
    for i in range(times):
        action = epsilon_greedy(q_func, state, actions)
        next_state, reward, done = env.step(action)

        # Learning
        current_q = q_func[state[0], state[1], action]
        if not done:
            # Take the best successor value WITHOUT rebinding ``action``:
            # the entry being updated must be the one for the action that
            # was actually executed, not the greedy action of next_state.
            max_q = max(
                q_func[next_state[0], next_state[1], candidate]
                for candidate in actions
            )
            target = reward + GAMMA * max_q
        else:
            # Terminal transition: there is no successor value to bootstrap from.
            target = reward
        q_func[state[0], state[1], action] = current_q + ALPHA * (target - current_q)

        # render + metrics
        if i % 20 == 0 or episodes > 250:
            env.render()

        total_return += reward
        # ``state`` here is still the state the step was taken from.
        print(f'\r[iter {i}, episode {episodes} step {env.last_counter}] state: {state}, reward: {total_return}, done: {done}',
              end='')

        if not done:
            state = next_state
        else:
            # End of episode
            episodes += 1
            total_return = 0
            state = env.reset()
            print()

Credits

Sutton and Barto - Reinforcement Learning: An Introduction

Name		Name	Last commit message	Last commit date
Latest commit History 6 Commits
.gitignore		.gitignore
README.md		README.md
impl_env_windy_gridworld.py		impl_env_windy_gridworld.py
impl_q_learning.py		impl_q_learning.py
impl_sarsa.py		impl_sarsa.py

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

.gitignore

.gitignore

README.md

README.md

impl_env_windy_gridworld.py

impl_env_windy_gridworld.py

impl_q_learning.py

impl_q_learning.py

impl_sarsa.py

impl_sarsa.py

Repository files navigation

Learning TD-Learning

SARSA

Q-Learning

Credits

About

Releases

Packages

Languages

Arseni1919/Learning_TD_Learning

Folders and files

Latest commit

History

Repository files navigation

Learning TD-Learning

SARSA

Q-Learning

Credits

About

Resources

Stars

Watchers

Forks

Languages