## Домашнее задание: «Обучение с подкреплением»

ФИО: Лыжин Роман Денисович

# Задание 1

Обучите алгоритм Q-learning для сред FrozenLake-v1 и Blackjack-v1, в частности подберите оптимальную alpha. (2 балла)

In [12]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np

In [13]:
def q_learning(env, num_episodes, alpha, gamma, epsilon):
    """Train a tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Both Gym calling conventions are accepted: the classic one
    (``reset() -> obs``, ``step() -> (obs, reward, done, info)``) and the
    newer one (``reset() -> (obs, info)``,
    ``step() -> (obs, reward, terminated, truncated, info)``).

    Args:
        env: Environment whose ``observation_space.n`` and ``action_space.n``
            size the table and whose ``action_space.sample()`` supplies
            exploratory actions.
        num_episodes: Number of training episodes to run.
        alpha: Learning rate (step size of each update).
        gamma: Discount factor applied to the bootstrapped value.
        epsilon: Probability of taking a random action instead of the greedy one.

    Returns:
        ``numpy.ndarray`` of shape ``(n_states, n_actions)`` with the learned
        action values.
    """
    Q = np.zeros((env.observation_space.n, env.action_space.n))

    for _ in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple):
            # Newer API returns (observation, info); keep only the observation.
            state = state[0]
        done = False

        while not done:
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state])

            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
            else:
                next_state, reward, episode_over, info = step_result
                # The classic TimeLimit wrapper reports a step-limit cut-off
                # through this info key while still setting done=True.
                truncated = bool(
                    isinstance(info, dict) and info.get("TimeLimit.truncated", False)
                )
                terminated = bool(episode_over) and not truncated
            done = bool(terminated or truncated)

            # A true terminal state has no future return, so do not bootstrap
            # from it. A time-limit cut-off is not terminal: keep bootstrapping.
            if terminated:
                target = reward
            else:
                target = reward + gamma * np.max(Q[next_state])
            Q[state, action] += alpha * (target - Q[state, action])

            state = next_state

    return Q

In [14]:
def _greedy_average_reward(env, Q, episodes):
    """Return the mean per-episode reward of the greedy policy for table ``Q``."""
    total_reward = 0.0
    for _ in range(episodes):
        state = env.reset()
        if isinstance(state, tuple):
            # Newer API returns (observation, info); keep only the observation.
            state = state[0]
        done = False
        while not done:
            step_result = env.step(np.argmax(Q[state]))
            state, reward = step_result[0], step_result[1]
            # Everything between the reward and the trailing info dict is an
            # end-of-episode flag: `done`, or `terminated` and `truncated`.
            done = any(step_result[2:-1])
            total_reward += reward
    return total_reward / episodes


def optimize_alpha(env_name, alpha_values, num_episodes=1000, gamma=0.99, epsilon=0.1,
                   eval_episodes=100):
    """Pick the learning rate whose trained greedy policy scores best.

    For every candidate in ``alpha_values`` a fresh Q-table is trained with
    ``q_learning`` and then evaluated greedily (no exploration) for
    ``eval_episodes`` episodes on the same environment instance.

    Args:
        env_name: Environment id passed to ``gym.make``.
        alpha_values: Iterable of candidate learning rates.
        num_episodes: Training episodes per candidate.
        gamma: Discount factor forwarded to ``q_learning``.
        epsilon: Exploration rate forwarded to ``q_learning``.
        eval_episodes: Number of greedy evaluation episodes per candidate.

    Returns:
        Tuple ``(best_alpha, best_reward)``. This is ``(None, -inf)`` when
        ``alpha_values`` is empty.

    Raises:
        ValueError: If ``eval_episodes`` is smaller than 1.
    """
    if eval_episodes < 1:
        raise ValueError("eval_episodes must be a positive integer")

    env = gym.make(env_name)
    best_alpha = None
    best_reward = -float('inf')

    try:
        for alpha in alpha_values:
            Q = q_learning(env, num_episodes, alpha, gamma, epsilon)
            avg_reward = _greedy_average_reward(env, Q, eval_episodes)

            # Strict comparison: on ties the earliest candidate is kept.
            if avg_reward > best_reward:
                best_reward = avg_reward
                best_alpha = alpha
    finally:
        # Release the environment even if training or evaluation fails.
        env.close()

    return best_alpha, best_reward

In [15]:
# Candidate learning rates: 20 evenly spaced values from 0.01 to 1.0 inclusive.
alpha_values = np.linspace(start=0.01, stop=1.0, num=20)

# Search the candidates on FrozenLake with the default training settings.
(optimal_alpha_frozenlake, reward_frozenlake) = optimize_alpha(
    env_name="FrozenLake-v1",
    alpha_values=alpha_values,
)

In [16]:
# Display the learning rate selected for FrozenLake-v1.
optimal_alpha_frozenlake

0.7394736842105263

In [17]:
# Display the average greedy-evaluation reward achieved with that learning rate.
reward_frozenlake

0.18

In [18]:
def q_learning_blackjack(env, num_episodes, alpha, gamma, epsilon):
    """Train a dictionary-backed Q-learning agent with epsilon-greedy exploration.

    States are used as dictionary keys, so any hashable observation works.
    Both Gym calling conventions are accepted: the classic one
    (``reset() -> obs``, ``step() -> (obs, reward, done, info)``) and the
    newer one (``reset() -> (obs, info)``,
    ``step() -> (obs, reward, terminated, truncated, info)``).

    Args:
        env: Environment whose ``action_space.n`` sizes each table row and
            whose ``action_space.sample()`` supplies exploratory actions.
        num_episodes: Number of training episodes to run.
        alpha: Learning rate (step size of each update).
        gamma: Discount factor applied to the bootstrapped value.
        epsilon: Probability of taking a random action instead of the greedy one.

    Returns:
        ``dict`` mapping every observed state to a ``numpy.ndarray`` of
        action values.
    """
    n_actions = env.action_space.n
    Q = {}

    for _ in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple) and len(state) == 2 and isinstance(state[1], dict):
            # Newer API returns (observation, info). Observations may be
            # tuples themselves, so the trailing info dict is the marker.
            state = state[0]
        done = False

        while not done:
            if state not in Q:
                Q[state] = np.zeros(n_actions)

            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state])

            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
            else:
                next_state, reward, episode_over, info = step_result
                # The classic TimeLimit wrapper reports a step-limit cut-off
                # through this info key while still setting done=True.
                truncated = bool(
                    isinstance(info, dict) and info.get("TimeLimit.truncated", False)
                )
                terminated = bool(episode_over) and not truncated
            done = bool(terminated or truncated)

            # Every observed state gets a row, terminal observations included.
            if next_state not in Q:
                Q[next_state] = np.zeros(n_actions)

            # The final observation can equal a state that is also acted from,
            # so bootstrapping on termination would feed an entry's own value
            # back into its target. A terminal transition is worth its reward.
            if terminated:
                target = reward
            else:
                target = reward + gamma * np.max(Q[next_state])
            Q[state][action] += alpha * (target - Q[state][action])

            state = next_state

    return Q

In [19]:
def _greedy_average_reward_blackjack(env, Q, episodes):
    """Return the mean per-episode reward of the greedy policy for dict table ``Q``."""
    # States never seen in training fall back to an all-zero row (argmax -> 0).
    unseen_row = np.zeros(env.action_space.n)
    total_reward = 0.0
    for _ in range(episodes):
        state = env.reset()
        if isinstance(state, tuple) and len(state) == 2 and isinstance(state[1], dict):
            # Newer API returns (observation, info); keep only the observation.
            state = state[0]
        done = False
        while not done:
            step_result = env.step(np.argmax(Q.get(state, unseen_row)))
            state, reward = step_result[0], step_result[1]
            # Everything between the reward and the trailing info dict is an
            # end-of-episode flag: `done`, or `terminated` and `truncated`.
            done = any(step_result[2:-1])
            total_reward += reward
    return total_reward / episodes


def optimize_alpha_blackjack(alpha_values, num_episodes=1000, gamma=0.99, epsilon=0.1,
                             eval_episodes=100):
    """Pick the learning rate whose trained greedy policy scores best on Blackjack-v1.

    For every candidate in ``alpha_values`` a fresh Q-table is trained with
    ``q_learning_blackjack`` and then evaluated greedily (no exploration) for
    ``eval_episodes`` episodes on the same environment instance.

    Args:
        alpha_values: Iterable of candidate learning rates.
        num_episodes: Training episodes per candidate.
        gamma: Discount factor forwarded to ``q_learning_blackjack``.
        epsilon: Exploration rate forwarded to ``q_learning_blackjack``.
        eval_episodes: Number of greedy evaluation episodes per candidate.

    Returns:
        Tuple ``(best_alpha, best_reward)``. This is ``(None, -inf)`` when
        ``alpha_values`` is empty.

    Raises:
        ValueError: If ``eval_episodes`` is smaller than 1.
    """
    if eval_episodes < 1:
        raise ValueError("eval_episodes must be a positive integer")

    env = gym.make("Blackjack-v1")
    best_alpha = None
    best_reward = -float('inf')

    try:
        for alpha in alpha_values:
            Q = q_learning_blackjack(env, num_episodes, alpha, gamma, epsilon)
            avg_reward = _greedy_average_reward_blackjack(env, Q, eval_episodes)

            # Strict comparison: on ties the earliest candidate is kept.
            if avg_reward > best_reward:
                best_reward = avg_reward
                best_alpha = alpha
    finally:
        # Release the environment even if training or evaluation fails.
        env.close()

    return best_alpha, best_reward

In [20]:
# Repeat the learning-rate search for Blackjack over the same candidate grid.
(optimal_alpha_blackjack, reward_blackjack) = optimize_alpha_blackjack(
    alpha_values=alpha_values,
)

In [21]:
# Display the learning rate selected for Blackjack-v1.
optimal_alpha_blackjack

0.01

In [22]:
# Display the average greedy-evaluation reward achieved with that learning rate.
reward_blackjack

0.11

# Задание 2

Обучите алгоритм Policy Gradients (или Actor-Critic) для среды Breakout: https://www.gymlibrary.dev/environments/atari/breakout/. Продемонстрируйте, что у обученного агента растёт время игры. (3 балла)