In [568]:
import collections
import copy
import random
from importlib import reload

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from IPython.display import clear_output
from torch import nn
from torch.distributions import Categorical
from torch.optim import Adam

import game
from game import Game
%matplotlib inline
game = reload(game)

In [569]:
# Use the first CUDA GPU when one is available in the system; otherwise run on the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [570]:
def moving_average(x, **kw):
    """Return the exponentially weighted moving average of `x` as a NumPy array.

    `x` is converted with `np.asarray` and wrapped in a one-column DataFrame;
    every keyword argument (e.g. `span`, `min_periods`) is forwarded unchanged
    to pandas' `ewm`.
    """
    frame = pd.DataFrame({'x': np.asarray(x)})
    smoothed = frame.x.ewm(**kw).mean()
    return smoothed.values

In [571]:
class Agent(nn.Module):
    """Two-headed network: a softmax policy head and a scalar value head.

    Both heads branch off the output of one shared linear layer, each through
    its own sigmoid activation.

    Args:
        state_size: Number of input features of a state vector.
        act_size: Number of outputs of the policy head (one per action).
        hidden_size: Width of the shared hidden layer.
    """

    def __init__(self, state_size, act_size, hidden_size):
        super(Agent, self).__init__()
        self.linear1 = nn.Linear(state_size, hidden_size)
        # NOTE: despite the `relu*` names both activations are sigmoids; the
        # attribute names are kept so existing code touching them keeps working.
        self.relu1 = nn.Sigmoid()
        self.act = nn.Linear(hidden_size, act_size)
        self.relu2 = nn.Sigmoid()
        self.valid = nn.Linear(hidden_size, 1)
        self.sm = nn.Softmax(dim=-1)
        self.loss = nn.MSELoss()

    def forward(self, X, discount_rewards=None):
        """Compute action probabilities and the value head's output or loss.

        Args:
            X: State tensor whose last dimension is `state_size`.
            discount_rewards: Optional regression target for the value head.

        Returns:
            `(prob, loss)` when `discount_rewards` is given, where `loss` is
            the MSE between the value head's output and the target.
            `(prob, value)` when it is omitted, where `value` is the raw
            output of the value head (no dummy target needed for acting).
        """
        hidden = self.linear1(X)
        prob = self.sm(self.act(self.relu1(hidden)))
        value = self.valid(self.relu2(hidden))

        if discount_rewards is None:
            return prob, value
        return prob, self.loss(value, discount_rewards)


In [572]:
def process_traj_batch(agent, batch, discount):
    """Replay a batch of episodes through `agent` and collect per-step terms.

    For every step the discounted reward-to-go
    ``G_t = r_t + discount * G_{t+1}`` is computed and handed to the agent as
    the second argument of ``agent(state, target)``.

    Args:
        agent: Callable ``agent(state, target)`` returning ``(probs, loss)``,
            where ``probs`` parameterises a categorical distribution over
            actions and ``loss`` is a tensor that can be reshaped to one element.
        batch: Iterable of episodes. Each episode is a sequence of steps that
            expose ``.state`` (a tensor) and ``.action`` and carry the reward
            as their last element.
        discount: Discount factor applied per step.

    Returns:
        Tuple ``(log_probs, returns)``:
            log_probs: list with the log-probability of each taken action.
            returns: 1-D tensor with one entry per step holding the second
                output of ``agent`` for that step (the name is historical; it
                is whatever loss the agent returns, not the raw return).
                For an empty batch this is an empty CPU tensor.
    """
    log_probs = []
    step_losses = []

    for episode in batch:
        # Backward recursion: one pass per episode instead of re-summing the
        # remaining rewards from scratch for every step.
        rewards_to_go = []
        running = 0
        for *_, reward in reversed(episode):
            running = reward + discount * running
            rewards_to_go.append(running)
        rewards_to_go.reverse()

        for step, reward_to_go in zip(episode, rewards_to_go):
            # Build the target on the same device as the state it is paired with.
            target = torch.tensor([reward_to_go], dtype=torch.float32,
                                  device=step.state.device)
            probs, value_loss = agent(step.state, target)
            step_losses.append(value_loss.reshape(1))
            log_probs.append(Categorical(probs).log_prob(step.action))

    # Concatenate once at the end rather than growing a tensor inside the loop.
    returns = torch.cat(step_losses) if step_losses else torch.empty(0)
    return log_probs, returns

In [573]:
# a = torch.Tensor([]).to(device)
# env = Game()
# state = env.reset()
# state = torch.FloatTensor(state).to(device)
# probs, r = agent.forward(state, torch.Tensor([0]).to(device))
# action = torch.argmax(probs)
# previos_state = state
# done, state, reward, = env.step(action.item())
# rewards += reward
# traj.append(transition(previos_state, action, reward))

In [574]:
# r = r.reshape(1)
# a = torch.cat((a, r))
# print(a)
# print(r)

In [575]:
STATE_SIZE  = 4 # dimensionality of the environment's state space
ACT_SIZE = 3 # dimensionality of the environment's action space
HIDDEN_SIZE = 512 # size of the hidden layer of the policy
NUM_EPISODES = 500 # number of episodes that will be played for training
DISCOUNT = 0.5 # discount factor
TRAIN_EVERY = 50 # modulus applied to the episode index when the training loop decides whether to run an optimizer update

In [576]:
# Build the network, move it to the selected device, and give all of its
# parameters to the Adam optimizer.
agent = Agent(state_size=STATE_SIZE, act_size=ACT_SIZE, hidden_size=HIDDEN_SIZE)
agent = agent.to(device)
optimizer = Adam(params=agent.parameters(), lr=1e-6)

In [577]:
# Record of one environment step: the state acted on, the action taken, and
# the reward received for it.
transition = collections.namedtuple("transition", "state action reward")

In [578]:
# a = torch.tensor([ 0.1, 0.3, 0.4, 0.25 ])
# b = torch.argmax(a)
# print(b.item())
# m = Categorical(a)
# print(m.sample().item())
# print(m.logits)

In [579]:
returns_history = []  # total (undiscounted) reward of every episode played so far
traj_batch = []       # episodes collected since the last optimizer update
env = Game()
best_model = None     # snapshot of the agent at the time of the best episode
best_rewards = 0
queue = True          # True -> next update minimises the value loss, False -> the policy loss
PLOT_EVERY = 10       # redraw the learning curve once per this many episodes

for i in range(NUM_EPISODES):
    # ---- roll out one episode with the current policy ----
    state = env.reset()
    traj = []
    rewards = 0
    done = False  # reset at the start so every episode begins from a clean flag
    while not done:
        state = torch.FloatTensor(state).to(device)
        # Acting needs no autograd graph; the loss output is discarded, hence
        # the dummy zero target.
        with torch.no_grad():
            probs, _ = agent(state, torch.Tensor([0]).to(device))
        # Greedy action selection (sampling from Categorical(probs) is the
        # stochastic alternative that was tried before).
        action = torch.argmax(probs)
        previos_state = state
        done, state, reward, = env.step(action.item())
        rewards += reward
        traj.append(transition(previos_state, action, reward))
    returns_history.append(rewards)

    if rewards > best_rewards:
        # Deep copy: a plain `best_model = agent` would only alias the live
        # network, which keeps changing with every optimizer step.
        best_model = copy.deepcopy(agent)
        best_rewards = rewards
    traj_batch.append(traj)

    # ---- update once per TRAIN_EVERY episodes ----
    # The original `if i % TRAIN_EVERY:` was inverted: it fired on every
    # episode EXCEPT multiples of TRAIN_EVERY.
    if (i + 1) % TRAIN_EVERY == 0:
        log_probs, returns = process_traj_batch(agent, traj_batch, DISCOUNT)
        if queue:
            # Value-head update: sum of per-step MSE losses.
            loss = returns.sum()
        else:
            # NOTE(review): `returns` holds the per-step value-head MSE that
            # Agent.forward returns, not the discounted return itself, so the
            # log-probabilities are weighted by the critic's squared error.
            # Confirm this is intended; a return/advantage weight is the
            # usual policy-gradient form.
            loss = -(torch.stack(log_probs) * returns.detach()).sum()
        queue = not queue
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        traj_batch = []

    # ---- redraw the learning curve once per PLOT_EVERY episodes ----
    # Same inverted-modulo fix as above; it also guarantees at least
    # `min_periods` points exist the first time the average is drawn.
    if (i + 1) % PLOT_EVERY == 0:
        clear_output(True)
        plt.figure(figsize=[12, 6])
        plt.title('Returns'); plt.grid()
        plt.scatter(np.arange(len(returns_history)), returns_history, alpha=0.1)
        plt.plot(moving_average(returns_history, span=10, min_periods=10))
        plt.show()

KeyboardInterrupt: 

In [None]:
def test_agent(env, agent=None, n_episodes=100):
    """Runs agent for n_episodes in environment and calclates mean reward.
    
    Args:
        env: The environment for agent to play in
        agent: The agent to play with. Defaults to None - 
            in this case random agent is used.
        n_episodes: Number of episodes to play. Defaults to 100.

    Returns:
        Mean reward for 100 episodes.
    """
    total_reward = []
    for episode in range(n_episodes):
        episode_reward = 0
        observation = env.reset()
        t = 0
        while True:
            if agent:
                with torch.no_grad():
                    probs, _ = agent(torch.FloatTensor(observation).to(device), torch.Tensor([0]).to(device))
                    dist = Categorical(probs)
                    action = dist.sample().item()
            else:
                action = random.randint(0, 2)
            done, observation, reward,  = env.step(action)
            episode_reward += reward
            t += 1
            if done:
                print("Episode {} finished after {} timesteps".format(episode+1, t+1))
                break
        total_reward.append(episode_reward)
    env.stop()
                   
    return np.mean(total_reward)   

In [None]:
# Play a few episodes with drawing enabled; as the cell's last expression, the
# mean reward returned by test_agent is shown as the cell output.
env = Game(drawing=True)
test_agent(env=env, agent=agent, n_episodes=5)