
# Policy-Based REINFORCE method

In [1]:
import os
import sys
os.chdir("/Users/davidamat/Documents/projects/2048/src")

In [2]:
sys.path.remove('/Users/davidamat/Documents/projects/2048/src')
sys.path.remove('/Users/davidamat/Documents/projects/2048/src')
sys.path.insert(0, '/Users/davidamat/Documents/projects/2048')

In [3]:
import numpy as np
import warnings
import torch
import time
import torch.optim as optim
import torch.nn.functional as F
from tensorboardX import SummaryWriter

import src.common.constants as c
from src.env import Env
from src.model import Model
from src.agent import PolicyAgent
from src.epsilon import EpsilonPolicy
from src.experience import ExperienceSource
from src.common.utils import QValueCalc

warnings.filterwarnings("ignore")

In [4]:
# Game initialize
env = Env(c.GRID_LEN)
eps = EpsilonPolicy(eps_start=0.8, eps_decay=200, eps_final=0.01)
input_size = c.BINARY_POSITIONS * c.GRID_LEN**2 if c.STATE_REPR == "bin" else c.GRID_LEN**2
model = Model(input_size, len(env.actions))
agent = PolicyAgent(model=model, num_actions=len(env.actions), state_repr=c.STATE_REPR)
exp = ExperienceSource(env, agent, eps)
qv = QValueCalc()

# Training
version = "REINF-v1-log2"
writer = SummaryWriter(comment=f"-2048-{version}", log_dir=f"runs/{version}")
c.LEARNING_RATE = 0.
optimizer = optim.Adam(model.parameters(), lr=c.LEARNING_RATE)
#optimizer = optim.RMSprop(model.parameters(), lr=c.LEARNING_RATE, alpha=0.99)

# Log
game_scores = []  # scores for each episode
steps_reach = []  # steps reached for each episode
game_wins = []  # whether 0: game lost, 1: game won

# Counters
step_idx = 0
done_episodes = 0
epoch_idx = 0
mean_wins = 0

In [11]:
################
#   Epochs
################

while epoch_idx < c.EPOCHS:

    # Control
    start_time = time.time()

    # For each step in the episode, keep track also of states, actions, rewards -> qvals
    batch_states, batch_actions, batch_rewards, batch_transf_states = [], [], [], []

    ###############
    # Batchs
    ###############
    # Play several games with the same policy
    batch_episodes = 0

    # For each batch
    for batch_id in range(c.BATCHS):

        # Generate a episode
        model.eval()
        exp.populate_episode(epoch_idx)
        

        # Iterate through episode
        for idx, exp_step in enumerate(exp.history):

            # Ignore unfeasible moves
            #if not exp_step.done:
            #    continue

            # Fill with experience data
            batch_states.append(exp_step.state)
            batch_transf_states.append(agent.preprocess(exp_step.state).data.numpy())  # save as numpy the transformed game matrix
            batch_actions.append(int(exp_step.action))
            batch_rewards.append(exp_step.reward)

        # standarize and convert rewards to q values according to REINFORCE
        st_rew = np.round((np.array(batch_rewards) - np.mean(batch_rewards)) / (np.std(batch_rewards)), 3)
        batch_qvals = qv(st_rew, c.GAMMA)

        # Get last step number
        steps = len(exp.history)
        steps_reach.append(steps)

        # Get the final score in the episode
        game_score_final = exp.env.game_score
        game_scores.append(game_score_final)

        # Get if the game was won (1) or not (0)
        game_stat_final = 0 if exp.env.game_stat == -1 else 1
        game_wins.append(game_stat_final)

        # Reset the board to play another episode
        # inside this batch (we play BATCHS episodes in this batch)
        exp.reset()

        # Inform Tensorboard
    mean_game_scores = float(np.mean(game_scores[-c.BATCHS:]))
    mean_wins = np.round(float(np.mean(game_wins[-c.BATCHS:])) ,3)
    mean_steps = np.round(float(np.mean(steps_reach[-c.BATCHS:])) ,3)
    writer.add_scalar("mean_game_scores", mean_game_scores, epoch_idx)
    writer.add_scalar("mean_wins", mean_wins, epoch_idx)
    writer.add_scalar("mean_steps", mean_steps, epoch_idx)

    # When the problem is solved stop training
    if (mean_wins > c.GAME_WIN_RATE) & (epoch_idx > 20):
        break

    ##############################
    # Training neural network
    ##############################
    model.train()
    optimizer.zero_grad()

    # Converting to tensors the matrices of each observation in the episode
    # ----------------------------------------------------------------------

    # shape: [# steps, c.GRID_LEN, c.GRID_LEN]
    tensor_states = torch.FloatTensor(batch_transf_states)

    # shape [# steps]
    tensor_actions = torch.LongTensor(batch_actions)
    tensor_qvals = torch.FloatTensor(batch_qvals)

    # Forward to the network to get logits
    # we will forward tensor states with the following shape
    # [#steps, c.GRID_LEN * c.GRID_LEN]
    logits = model(tensor_states.view(-1, input_size))

    # Convert logits to log_softmax
    log_softmax = F.log_softmax(logits, dim=1)

    # From the probabilities got, mask with the actions taken
    # log_softmax is [#steps in game, 4 (actions)] so we will
    # convert it to [# steps, 1 (action taken)]
    log_softmax_action = log_softmax.gather(1, tensor_actions.unsqueeze(1)).squeeze(1)

    # The loss will be the weighted sum over steps in the episode
    # of the Q values (tensor_qvals) weighting the log(policy(s,a))
    # which is the log_softmax_action
    loss = -tensor_qvals * log_softmax_action
    loss_mean = loss.mean()
    writer.add_scalar("loss", np.round(loss_mean.item(), 4), epoch_idx)

    # Backpropagate
    loss_mean.backward()
    optimizer.step()

    # Control
    end_time = time.time()

    if ((epoch_idx % 1) == 0) & (epoch_idx > 0):
        print("Epoch: ", epoch_idx,
              ", Game_scores_mean: ", mean_game_scores,
              ", Mean reward: ", np.round(np.mean(batch_rewards), 2),
              ", Mean wins: ", mean_wins,
              ", Mean steps: ", mean_steps,
              ", Exec time epoch: ", round(end_time-start_time, 2),
              ", Epsilon: ", np.round(eps.get_epsilon(epoch_idx),3)
              )

    # Reset the experience source and add epoch counter
    exp.reset()
    epoch_idx += 1
    

writer.close()



ValueError: probabilities contain NaN

In [8]:
agent = PolicyAgent(model=model, num_actions=len(env.actions), state_repr=c.STATE_REPR)

In [9]:
env.matrix

array([[0., 0., 0., 0.],
       [0., 0., 4., 0.],
       [0., 0., 0., 2.],
       [0., 0., 0., 0.]])

In [10]:
state = agent.preprocess(env.matrix).to("cpu")
state2 = state.unsqueeze(0)
state2

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.,

In [11]:
state3 = torch.cat((state2,state2), 0 )

In [12]:
np.median(env.matrix)

0.0

In [13]:
model.eval()

Model(
  (fc1): Linear(in_features=240, out_features=64, bias=True)
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU()
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act2): ReLU()
  (fc3): Linear(in_features=64, out_features=64, bias=True)
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act3): ReLU()
  (fc4): Linear(in_features=64, out_features=4, bias=True)
)

In [14]:
model.fc1(state2)

tensor([[ 0.0619,  0.0249, -0.0974,  0.0548,  0.0385, -0.0394,  0.0965, -0.0233,
          0.0300, -0.1203,  0.0679, -0.0400,  0.0424,  0.0206,  0.0482, -0.0017,
         -0.0353, -0.0653,  0.0604, -0.0110, -0.0048, -0.0184,  0.0908, -0.0718,
          0.0397,  0.0171,  0.0490,  0.0237,  0.0588,  0.0685, -0.0597,  0.0605,
         -0.0485,  0.1150,  0.0003,  0.1036,  0.0406,  0.0079, -0.0672,  0.0654,
          0.0255,  0.0998,  0.0022, -0.0260,  0.0030,  0.0322, -0.0526, -0.0421,
          0.0676,  0.0125,  0.0959,  0.0512, -0.0255, -0.1473, -0.0494,  0.0667,
          0.0714, -0.1116,  0.0572, -0.0439, -0.1195,  0.0149,  0.0119, -0.0466]],
       grad_fn=<AddmmBackward>)

In [15]:
agent.get_action_probs_batch(state2)

array([[0.25419235, 0.2738103 , 0.22166967, 0.25032768]], dtype=float32)

In [126]:
agent.get_action_probs_batch(state3)

array([[0.2418167 , 0.25083888, 0.26148984, 0.24585463],
       [0.2418167 , 0.25083888, 0.26148984, 0.24585463]], dtype=float32)

In [99]:
probs = agent.get_action_probs(env.matrix)
probs

array([[0., 0., 0., 1.]], dtype=float32)

In [98]:
action_id = agent(env.matrix, eps.get_epsilon(400))
action_id

3

In [31]:
states = torch.cat((state.unsqueeze(0),state.unsqueeze(0),state.unsqueeze(0)), 0)

In [45]:
state = agent.preprocess(env.matrix)

In [46]:
agent.model.eval()

Model(
  (net): Sequential(
    (0): Linear(in_features=240, out_features=256, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=256, out_features=128, bias=True)
    (4): ReLU()
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_features=128, out_features=32, bias=True)
    (7): ReLU()
    (8): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): Linear(in_features=32, out_features=4, bias=True)
  )
)

In [47]:
state.unsqueeze(0).shape

torch.Size([1, 240])

In [48]:
agent.model(state.unsqueeze(0))

tensor([[-0.0759,  0.1209,  0.0951, -0.1528]], grad_fn=<AddmmBackward>)

In [36]:
F.softmax(agent.model(states), dim=1)

tensor([[0.2362, 0.2465, 0.2870, 0.2302],
        [0.2362, 0.2465, 0.2870, 0.2302],
        [0.2362, 0.2465, 0.2870, 0.2302]], grad_fn=<SoftmaxBackward>)

In [20]:
state.shape

torch.Size([240])

In [25]:
.shape

torch.Size([1, 240])

In [34]:
#agent.model(state.unsqueeze(0))

In [50]:
agent.model(state)

tensor([ 6578.4878, -6711.9526, -6119.0659, -5923.6694],
       grad_fn=<AddBackward0>)

In [51]:
F.softmax(agent.model(state), dim=0)

tensor([1., 0., 0., 0.], grad_fn=<SoftmaxBackward>)

In [102]:
w_params = {}
cc = 0
for pp in model.parameters():
    cc += 1
    w_params[cc] = pp


In [107]:
w_params[3]

Parameter containing:
tensor([ 2.0606,  1.0165,  0.2747,  0.7981,  0.9662,  1.1002,  0.4950,  0.8974,
         0.6916,  1.0788,  1.0809,  0.5211,  0.3244,  1.0796,  1.1076,  1.2800,
         0.1578,  1.2693,  0.1017,  0.5033,  1.2008,  0.1457,  0.1646,  0.4053,
         0.3129,  1.2557,  0.8618,  0.5862,  1.1870,  0.9735,  1.0659,  0.5500,
        -0.0431,  1.2288,  0.4618,  1.1748,  1.0159,  0.8320,  0.6053,  0.4324,
         1.3057,  0.3396,  0.6723,  0.6246,  0.9610,  0.4004,  0.9570,  0.4326,
         0.4118,  0.7392,  0.9885,  0.2897,  0.1424,  1.7120,  0.3997,  1.2166,
         2.4558,  1.2201,  0.8548,  1.0597,  0.7209,  1.3356,  0.9146,  0.8685,
         1.3619,  1.0538,  1.0836,  1.0526,  0.7275,  0.7517,  0.4334,  0.2229,
         0.4063,  0.6379,  0.5772,  0.6810,  0.5858,  0.7950,  0.2810,  0.2224,
         0.1221,  0.8526,  0.2517,  1.0931,  0.8010,  1.4060,  0.8976,  0.1573,
         0.0058,  1.3235,  0.5768,  0.9079,  1.2512,  0.3211,  0.6734,  0.9175,
         0.4954,  

In [9]:
w_params[1].shape

torch.Size([50, 240])