
# Policy-Based REINFORCE method

In [1]:
import os
import sys
# Switch the kernel's working directory to the project's `src` folder.
# NOTE(review): machine-specific absolute path; os.chdir raises if this
# directory does not exist, so the notebook only runs on this checkout.
os.chdir("/Users/davidamat/Documents/projects/2048/src")

In [2]:
# Put the project root first on the import path and drop the `src` directory
# itself, so the `src.*` package imports below resolve against the root.
# The original issued two unconditional sys.path.remove() calls, which raise
# ValueError unless the entry is present at least twice (fresh kernel, or a
# second run of this cell). Filtering is idempotent and safe to re-run.
# NOTE(review): machine-specific absolute paths; consider deriving them from
# the notebook location instead.
_project_root = '/Users/davidamat/Documents/projects/2048'
_src_dir = '/Users/davidamat/Documents/projects/2048/src'
# Slice-assign so the very same list object that the import system uses is updated.
sys.path[:] = [entry for entry in sys.path if entry not in (_src_dir, _project_root)]
sys.path.insert(0, _project_root)

In [3]:
import numpy as np
import warnings
import torch
import time
import torch.optim as optim
import torch.nn.functional as F
from tensorboardX import SummaryWriter

import src.common.constants as c
from src.env import Env
from src.model import Model
from src.agent import PolicyAgent
from src.epsilon import EpsilonPolicy
from src.experience import ExperienceSource
from src.common.utils import QValueCalc

warnings.filterwarnings("ignore")

In [4]:
# Game initialize
# Builds the environment, exploration schedule, policy network, agent and
# experience source used by the training cell below. All of these are
# notebook-level globals that later cells read directly.
env = Env(c.GRID_LEN)
# presumably an epsilon schedule decaying from eps_start to eps_final — confirm in src.epsilon
eps = EpsilonPolicy(eps_start=0.8, eps_decay=200, eps_final=0.01)
# Network input width: c.BINARY_POSITIONS values per cell for the "bin" state
# representation, otherwise a single value per cell.
input_size = c.BINARY_POSITIONS * c.GRID_LEN**2 if c.STATE_REPR == "bin" else c.GRID_LEN**2
model = Model(input_size, len(env.actions))
agent = PolicyAgent(model=model, num_actions=len(env.actions), state_repr=c.STATE_REPR)
exp = ExperienceSource(env, agent, eps)
qv = QValueCalc()

# Training
version = "REINF-v7-log2"
# NOTE(review): the tensorboardX docs state that `comment` has no effect when an
# explicit log directory is given — confirm for the installed version.
writer = SummaryWriter(comment=f"-2048-{version}", log_dir=f"runs/{version}")
# Overrides the learning rate stored in the shared constants module for the
# rest of this kernel session (any other reader of c.LEARNING_RATE sees 0.0005).
c.LEARNING_RATE = 0.0005 
optimizer = optim.Adam(model.parameters(), lr=c.LEARNING_RATE)
#optimizer = optim.RMSprop(model.parameters(), lr=c.LEARNING_RATE, alpha=0.99)

# Log
# These lists grow across epochs; the training cell reads the last c.BATCHS entries.
game_scores = []  # scores for each episode
steps_reach = []  # steps reached for each episode
game_wins = []  # whether 0: game lost, 1: game won

# Counters
# step_idx and done_episodes are not used by the training cell visible in this notebook.
step_idx = 0
done_episodes = 0
epoch_idx = 0
mean_wins = 0

In [5]:
################
#   Epochs
################
# REINFORCE training loop.
# Each epoch plays c.BATCHS episodes with the current policy, logs summary
# statistics to TensorBoard and then performs ONE policy-gradient step on every
# step collected during those episodes:
#     loss = mean( -Q(s, a) * log pi(a | s) )
#
# Changes with respect to the previous version of this cell:
#   * Q-values are computed per episode and concatenated. Before, they were
#     recomputed over the rewards of all episodes played so far in the epoch,
#     so returns leaked across episode boundaries (and the work was quadratic).
#   * The TensorBoard writer is closed in a `finally`, so it is released even
#     when an epoch raises.
#
# NOTE(review): the recorded output of this cell is
# "TypeError: slice indices must be integers ...". Confirm that the second
# positional argument expected by src.common.utils.QValueCalc.__call__ is a
# discount factor (c.GAMMA) and not an integer window length.

try:
    while epoch_idx < c.EPOCHS:

        # Control
        start_time = time.time()

        # Per-epoch buffers. The step-level lists hold one entry per step,
        # accumulated over all the episodes of this epoch, and must stay aligned.
        batch_states, batch_actions, batch_rewards, batch_transf_states = [], [], [], []
        batch_qvals = []
        batch_rw_last_steps = []  # one entry per episode

        ###############
        # Batchs
        ###############
        # Play several games with the same policy
        batch_episodes = 0  # kept for compatibility; not updated in this cell

        for batch_id in range(c.BATCHS):

            # Generate an episode (network in eval mode while acting)
            model.eval()
            exp.populate_episode(epoch_idx)
            rw_last_steps = []  # rewards of the current episode only

            # Iterate through the episode and fill the step-level buffers
            for exp_step in exp.history:
                batch_states.append(exp_step.state)
                # save the transformed game matrix as numpy
                batch_transf_states.append(agent.preprocess(exp_step.state).data.numpy())
                batch_actions.append(int(exp_step.action))
                batch_rewards.append(exp_step.reward)
                rw_last_steps.append(exp_step.reward)

            # Convert this episode's rewards to q-values according to REINFORCE.
            # Done per episode so that a return never includes rewards that
            # belong to the next game.
            batch_qvals.extend(qv(np.array(rw_last_steps), c.GAMMA))

            # Mean reward over the last (up to) 10 steps of the episode
            batch_rw_last_steps.append(np.mean(rw_last_steps[-10:]))

            # Get last step number
            steps = len(exp.history)
            steps_reach.append(steps)

            # Get the final score in the episode
            game_score_final = exp.env.game_score
            game_scores.append(game_score_final)

            # Get if the game was won (1) or not (0); -1 is treated as a loss
            game_stat_final = 0 if exp.env.game_stat == -1 else 1
            game_wins.append(game_stat_final)

            # Reset the board to play another episode
            # inside this batch (we play BATCHS episodes in this batch)
            exp.reset()

        # Inform Tensorboard (statistics over the episodes of this epoch)
        mean_game_scores = float(np.mean(game_scores[-c.BATCHS:]))
        mean_wins = np.round(float(np.mean(game_wins[-c.BATCHS:])), 3)
        mean_steps = np.round(float(np.mean(steps_reach[-c.BATCHS:])), 3)
        mean_last_steps_rew = np.round(float(np.mean(batch_rw_last_steps[-c.BATCHS:])), 3)
        writer.add_scalar("mean_game_scores", mean_game_scores, epoch_idx)
        writer.add_scalar("mean_wins", mean_wins, epoch_idx)
        writer.add_scalar("mean_steps", mean_steps, epoch_idx)
        writer.add_scalar("mean_last_steps_rew", mean_last_steps_rew, epoch_idx)

        # When the problem is solved stop training
        if mean_wins > c.GAME_WIN_RATE and epoch_idx > 20:
            break

        ##############################
        # Training neural network
        ##############################
        optimizer.zero_grad()
        model.train()

        # Converting to tensors the matrices of each observation in the epoch
        # ----------------------------------------------------------------------
        # Stack into a single ndarray first: building a tensor from a Python
        # list of ndarrays is much slower.
        tensor_states = torch.as_tensor(np.array(batch_transf_states), dtype=torch.float32)

        # shape [# steps]
        tensor_actions = torch.LongTensor(batch_actions)
        tensor_qvals = torch.FloatTensor(batch_qvals)

        # Forward to the network to get logits; states are flattened to
        # [# steps, input_size]
        logits = model(tensor_states.view(-1, input_size))

        # Convert logits to log-probabilities over the actions
        log_softmax = F.log_softmax(logits, dim=1)

        # Keep only the log-probability of the action actually taken:
        # [# steps, # actions] -> [# steps]
        log_softmax_action = log_softmax.gather(1, tensor_actions.unsqueeze(1)).squeeze(1)

        # Policy-gradient loss: q-values weight log(policy(s, a))
        loss = -tensor_qvals * log_softmax_action
        loss_mean = loss.mean()
        writer.add_scalar("loss", np.round(loss_mean.item(), 4), epoch_idx)

        # Backpropagate
        loss_mean.backward()
        optimizer.step()

        # Control
        end_time = time.time()

        # Report every epoch except the first one
        if epoch_idx > 0:
            print("Epoch: ", epoch_idx,
                  ", Game_scores_mean: ", mean_game_scores,
                  ", Mean reward: ", np.round(np.mean(batch_rewards), 2),
                  ", Mean wins: ", mean_wins,
                  ", Mean steps: ", mean_steps,
                  ", Mean last steps: ", mean_last_steps_rew,
                  ", Epsilon: ", np.round(eps.get_epsilon(epoch_idx), 3)
                  )

        # Reset the experience source and add epoch counter
        exp.reset()
        epoch_idx += 1
finally:
    # Always flush and release the TensorBoard writer, even after an error
    writer.close()

TypeError: slice indices must be integers or None or have an __index__ method

In [5]:
env = Env(c.GRID_LEN)

In [17]:
class QValueCalc:
    """Turns a sequence of per-step rewards into per-step return estimates."""

    def __init__(self):
        pass

    def __call__(self, rew, win = 10):
        """
        Linearly discounted look-ahead return.

        For every step, sums the rewards of that step and of the following
        `win - 1` steps, weighting the k-th reward ahead by (win - k) / win.
        Windows that run past the end of `rew` are simply shorter.

        rew: sequence or numpy array of rewards
        win: length of the look-ahead window (integer)
        returns: list with one value per step, each rounded to 3 decimals
        """
        rewards = rew if isinstance(rew, np.ndarray) else np.array(rew)
        # weights 1, (win-1)/win, ..., 1/win
        weights = np.array([(win - offset) / win for offset in range(win)])

        def window_value(start):
            window = rewards[start:start + win]
            # trim the weights when the window is cut short at the end
            return np.round(np.sum(window * weights[:window.shape[0]]), 3)

        return [window_value(start) for start in range(rewards.shape[0])]

    def old(self, rewards, gamma):
        """
        Exponentially discounted total reward for every step.

        rewards: list of rewards for the whole episode
        gamma: discount factor applied once per step of distance
        returns: list with the discounted return of each step, in step order
        """
        discounted = []
        running = 0.0

        # Walk backwards from the last step: each step's return is its own
        # reward plus gamma times the return of the step that follows it
        for reward in reversed(rewards):
            running = running * gamma
            running = running + reward
            discounted.append(running)

        # values were collected last-to-first; put them back in step order
        discounted.reverse()
        return discounted


In [18]:
qv2 = QValueCalc()

In [7]:
batch_qvals = qv(np.array(batch_rewards), c.GAMMA)

In [19]:
qv2(batch_rewards)[:10]

[33.184, 38.326, 47.069, 51.425, 39.842, 56.531, 58.31, 60.49, 31.445, 29.733]

In [14]:
np.array(batch_rewards[:10])

array([ 2.        ,  0.        ,  4.61538462, 17.23076923, -8.        ,
        8.        ,  6.90909091, 34.66666667,  6.66666667, 17.33333333])

In [15]:
win = 10
gamma_vec = np.array([(win - idx)  / win for idx in range(win)])

In [16]:
np.sum(gamma_vec * np.array(batch_rewards[:10]))

33.18414918414918

In [9]:
batch_rewards

[2.0,
 0.0,
 4.615384615384615,
 17.23076923076923,
 -8.0,
 8.0,
 6.909090909090909,
 34.666666666666664,
 6.666666666666666,
 17.333333333333332,
 -16.0,
 16.0,
 6.909090909090909,
 -16.0,
 22.4,
 18.90909090909091,
 0.0,
 0.0,
 0.0,
 12.0,
 36.0,
 24.0,
 12.0,
 28.0,
 0.0,
 32.0,
 -32.0,
 44.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 14.666666666666666,
 0.0,
 14.666666666666666,
 37.33333333333333,
 25.142857142857142,
 41.14285714285714,
 82.28571428571428,
 0.0,
 54.666666666666664,
 0.0,
 25.333333333333332,
 25.333333333333332,
 29.333333333333332,
 37.33333333333333,
 0.0,
 29.6,
 29.6,
 108.8,
 0.0,
 116.0,
 24.0,
 0.0,
 0.0,
 54.666666666666664,
 26.285714285714285,
 0.0,
 0.0,
 100.8,
 26.285714285714285,
 34.285714285714285,
 0.0,
 25.333333333333332,
 0.0,
 120.8,
 -41.714285714285715,
 26.285714285714285,
 98.28571428571428,
 82.28571428571428,
 22.285714285714285,
 36.57142857142857,
 44.57142857142857,
 0.0,
 128.0,
 -128.0,
 68.0,
 72.0,
 68.0,
 0.0,
 128.0,
 89.33333333333333,

In [8]:
batch_qvals

[14.466576442674514,
 17.80939491810645,
 25.441992740152074,
 29.75229732109637,
 17.88789727189592,
 36.982710388422745,
 41.40387198346107,
 49.278258677671666,
 20.873702872864293,
 20.29576600885375,
 4.232046679314884,
 28.90292382759269,
 18.432748325132415,
 16.46236773720215,
 46.37481105314593,
 34.24973007592276,
 21.915198809759794,
 31.307426871085422,
 44.72489553012203,
 63.89270790017434,
 74.13243985739192,
 54.474914081988445,
 43.535591545697784,
 45.050845065282545,
 24.358350093260785,
 34.79764299037255,
 3.9966328433893636,
 51.42376120484195,
 10.605373149774223,
 15.150533071106034,
 21.64361867300862,
 30.919455247155177,
 44.17065035307883,
 63.1009290758269,
 69.19180344165748,
 98.84543348808212,
 120.25538117345064,
 118.46006834302474,
 133.31030171452517,
 131.66777795952578,
 70.54580524830214,
 100.77972178328879,
 65.8757930237459,
 94.10827574820844,
 98.24991773553587,
 104.16654914600365,
 106.90459401810045,
 99.38751526395305,
 141.98216466279007

In [6]:
env.reset()

In [7]:
env.matrix

array([[0., 0., 2., 0.],
       [0., 0., 0., 0.],
       [4., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [12]:
# Replay a fixed sequence of moves on the board. Keys follow env.actions
# (1: down, 2: left, 3: right in the mapping displayed by this notebook).
# The last call is left as a bare expression so the cell displays its result.
moves = [3, 3, 2, 3, 3, 3, 2, 3, 1, 1, 3, 3, 2, 3, 1, 1, 1, 1]
for move in moves[:-1]:
    env.step(move)
env.step(moves[-1])

(array([[ 0.,  0.,  4.,  0.],
        [ 0.,  0.,  2.,  8.],
        [ 4., 16., 64., 32.],
        [ 8., 32.,  8.,  2.]]),
 array([[ 0.,  0.,  4.,  0.],
        [ 0.,  0.,  2.,  8.],
        [ 4., 16., 64., 32.],
        [ 8., 32.,  8.,  2.]]),
 False,
 0.0,
 0)

In [62]:
import copy
bk = copy.copy(env.matrix)
env.matrix

array([[128.,   2.,   8.,   4.],
       [ 64.,  32.,   2.,   0.],
       [  4.,   8.,   0.,   0.],
       [  0.,   2.,   0.,   0.]])

In [44]:
env.actions

{0: <function src.common.movements.Movements.up(game, added_merge=0)>,
 1: <function src.common.movements.Movements.down(game, added_merge=0)>,
 2: <function src.common.movements.Movements.left(game, added_merge=0)>,
 3: <function src.common.movements.Movements.right(game, added_merge=0)>}

In [64]:
env.step(0)

(array([[  0.,   2.,   2.,   0.],
        [128.,  32.,   0.,   0.],
        [ 64.,   8.,   8.,   0.],
        [  4.,   2.,   2.,   4.]]),
 array([[128.,   2.,   2.,   4.],
        [ 64.,  32.,   8.,   0.],
        [  4.,   8.,   2.,   0.],
        [  2.,   2.,   0.,   0.]]),
 True,
 128.0,
 0)

In [27]:
pos = np.where(env.matrix == env.matrix.max())

In [56]:
env.mask_position_val

array([[-1.,  0.,  0., -1.],
       [ 0.,  1.,  1.,  0.],
       [ 0.,  1.,  1.,  0.],
       [-1.,  0.,  0., -1.]])

In [57]:
env._calc_penalty_max_position(m1, env.mask_position_val)

-128.0

In [60]:
env._calc_penalty_max_position(m2[0], env.mask_position_val)

0.0

In [28]:
pos = [xx[0] for xx in pos]

In [29]:
pos

[2, 2]

In [67]:
32*2*2 / (6)

21.333333333333332

In [14]:
# Positional mask for the board: start from ones and subtract 1 along every
# border row and column, so inner cells stay at 1, edge cells become 0 and
# corner cells (hit by both a row and a column) become -1.
mask = np.ones((c.GRID_LEN, c.GRID_LEN))
for edge in (-1, 0):
    mask[:, edge] -= 1
    mask[edge, :] -= 1
env.mask_position_val = mask

In [15]:
env.mask_position_val

array([[-1.,  0.,  0., -1.],
       [ 0.,  1.,  1.,  0.],
       [ 0.,  1.,  1.,  0.],
       [-1.,  0.,  0., -1.]])

In [17]:
game_scores[-c.BATCHS:]

[1944.0,
 684.0,
 1008.0,
 896.0,
 652.0,
 576.0,
 532.0,
 680.0,
 536.0,
 940.0,
 1064.0,
 1020.0,
 908.0,
 428.0,
 428.0,
 644.0,
 608.0,
 992.0,
 1512.0,
 676.0,
 1088.0,
 1872.0,
 1656.0,
 1048.0,
 320.0]

In [12]:
batch_rw_last_steps

[]

In [11]:
mean_last_steps_rew

nan

In [57]:
env = Env(c.GRID_LEN)


In [59]:
env.actions

{0: <function src.common.movements.Movements.up(game, added_merge=0)>,
 1: <function src.common.movements.Movements.down(game, added_merge=0)>,
 2: <function src.common.movements.Movements.left(game, added_merge=0)>,
 3: <function src.common.movements.Movements.right(game, added_merge=0)>}

In [92]:
env.matrix

array([[ 2.,  0.,  4.,  0.],
       [ 2., 16.,  2.,  0.],
       [ 8.,  8.,  8.,  0.],
       [ 8., 64., 16.,  4.]])

In [91]:
env.step(1)

(array([[ 2., 16.,  4.,  0.],
        [ 4.,  8.,  2.,  0.],
        [ 4., 32.,  8.,  2.],
        [ 8., 32., 16.,  2.]]),
 array([[ 2.,  0.,  4.,  0.],
        [ 2., 16.,  2.,  0.],
        [ 8.,  8.,  8.,  0.],
        [ 8., 64., 16.,  4.]]),
 True,
 -2.752072486556415,
 0)

4

In [74]:
env._calc_penalty_max_position()

16.0

In [75]:
np.log2(16)

4.0

In [19]:
m1 = np.array([[ 2.,  0.,  0.,  4.],
        [ 2.,  4.,  2.,  4.],
        [ 0.,  8., 32.,  8.],
        [ 4.,  8.,  2., 16.]])
m1

array([[ 2.,  0.,  0.,  4.],
       [ 2.,  4.,  2.,  4.],
       [ 0.,  8., 32.,  8.],
       [ 4.,  8.,  2., 16.]])

In [20]:
m2 = np.array([[ 4.,  4.,  2.,  8.],
        [ 4., 16., 32.,  8.],
        [ 0.,  0.,  2., 16.],
        [ 0.,  0.,  2.,  0.]])
m2

array([[ 4.,  4.,  2.,  8.],
       [ 4., 16., 32.,  8.],
       [ 0.,  0.,  2., 16.],
       [ 0.,  0.,  2.,  0.]])

In [36]:
pos = np.where(m2 == m2.max())

In [41]:
pos

[1, 2]

array([[0. , 0.5, 0.5, 0. ],
       [0.5, 1. , 1. , 0.5],
       [0.5, 1. , 1. , 0.5],
       [0. , 0.5, 0.5, 0. ]])

In [55]:
np.max(m2) * mask_position_val[pos[0], pos[1]]

32.0

In [22]:
(m2 - m1) / np.max(m2)

array([[ 0.0625,  0.125 ,  0.0625,  0.125 ],
       [ 0.0625,  0.375 ,  0.9375,  0.125 ],
       [ 0.    , -0.25  , -0.9375,  0.25  ],
       [-0.125 , -0.25  ,  0.    , -0.5   ]])

In [16]:
penalty_cell_move = c.GRID_LEN**2 - np.sum(m1 == m2)
penalty_cell_move

14

In [17]:
collapsed_cells = np.log2(28) if 28>0 else 0
collapsed_cells

4.807354922057604

In [15]:
env.actions[0](m1)

(array([[ 4.,  4.,  2.,  8.],
        [ 4., 16., 32.,  8.],
        [ 0.,  0.,  2., 16.],
        [ 0.,  0.,  0.,  0.]]),
 True,
 3,
 28.0)

In [18]:
14 - 6

8