## Vanilla NN with REINFORCE

Implementing a vanilla neural network with one hidden layer that uses REINFORCE as the policy gradient method. The implementation is in PyTorch rather than TensorFlow, as many people seem to consider it the better framework. (I don't have any prior PyTorch experience, so I am learning it on the go as well.)

A typical training procedure for a neural network is as follows:

- Define the neural network that has some learnable parameters (or weights)
- Iterate over a dataset of inputs
- Process input through the network
- Compute the loss (how far is the output from being correct)
- Propagate gradients back into the network’s parameters
- Update the weights of the network, typically using a simple update rule: weight = weight - learning_rate * gradient

###### from [pytorch-tutorial](https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html)
###### [other sources](https://github.com/pytorch/examples/blob/master/reinforcement_learning/reinforce.py)

In [3]:
import numpy as np
import gym

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import os

import torch.distributions as TDist

In [4]:
# Function version
class Policy(nn.Module):
    """Fully connected policy network that outputs action probabilities.

    The network is a stack of ``nn.Linear`` layers. Every layer except the
    last is followed by (optional) dropout and a ReLU; the last layer is
    followed by a softmax over dimension 1, so ``forward`` expects a 2-D
    ``(batch, features)`` input.

    Args:
        num_nodes: Layer widths, ``[inputs, hidden_1, ..., outputs]``. The
            construction below is written for at least three entries.
        add_dropouts: When true, dropout is applied to each hidden layer's
            pre-activation output.
        dropout_rate: Drop probability handed to every ``nn.Dropout``.

    Attributes:
        saved_log_probs: Per-episode list of log-probabilities; filled by the
            caller while acting.
        rewards: Per-episode list of rewards; filled by the caller.
    """

    def __init__(self, num_nodes=(4, 128, 2), add_dropouts=True, dropout_rate=0.6):
        super().__init__()
        # Keep a private copy: the original stored the argument itself, so the
        # (mutable) default list was shared by every default-constructed
        # instance and a caller's list stayed aliased to the model.
        self.num_nodes = list(num_nodes)
        self.add_dropouts = add_dropouts

        # Rollout buffers, emptied again after each policy update.
        self.saved_log_probs = []
        self.rewards = []

        self.layers = nn.ModuleList()
        self.dropouts = nn.ModuleList()

        # Input layer and its dropout.
        self.layers.append(nn.Linear(self.num_nodes[0], self.num_nodes[1]))
        self.dropouts.append(nn.Dropout(p=dropout_rate))

        # Any additional hidden layers (none when there are three widths).
        for i in range(1, len(self.num_nodes) - 2):
            self.layers.append(nn.Linear(self.num_nodes[i], self.num_nodes[i + 1]))
            self.dropouts.append(nn.Dropout(p=dropout_rate))

        # Output layer: no dropout, softmax is applied in forward().
        self.layers.append(nn.Linear(self.num_nodes[-2], self.num_nodes[-1]))

    def forward(self, X):
        """Return a ``(batch, outputs)`` tensor of action probabilities."""
        for layer, dropout in zip(self.layers[:-1], self.dropouts):
            hidden = layer(X)
            if self.add_dropouts:
                hidden = dropout(hidden)
            X = F.relu(hidden)

        return F.softmax(self.layers[-1](X), dim=1)

In [5]:
class HexBoard:
    """Small Hex board with stone placement and win detection.

    Cells are addressed as ``[row, col]`` and hold ``'.'`` (empty), ``'B'`` or
    ``'W'``. White wins by linking column 0 to the last column; Black wins by
    linking row 0 to the last row. ``done`` becomes True once a win has been
    detected and is never reset by this class.

    Args:
        BOARD_SIZE: ``[rows, cols]`` of the board.
    """

    def __init__(self, BOARD_SIZE=(3, 3)):
        # Copy so neither the default nor the caller's sequence is shared.
        self.BOARD_SIZE = list(BOARD_SIZE)
        rows, cols = self.BOARD_SIZE[0], self.BOARD_SIZE[1]
        # rows x cols, matching the BOARD[row][col] indexing used everywhere
        # else (the original built cols x rows, breaking non-square boards).
        self.BOARD = [['.' for _ in range(cols)] for _ in range(rows)]
        self.done = False  # game is over or not

    # RUNNER FUNCTIONS ____________

    def step(self, color, action):
        """Place a stone for ``color`` at ``action`` and score the position.

        Args:
            color: ``'B'`` or ``'W'``.
            action: ``[row, col]`` of the target cell.

        Returns:
            ``(board, done, result, reward, placed)`` where ``result`` is
            ``'B'``, ``'W'`` or ``'='`` (no winner yet), ``reward`` is +1 if
            ``color`` has won, 0 if nobody has, -1 otherwise, and ``placed``
            is False when the target cell was already occupied. A malformed
            or out-of-range ``action`` yields ``(0, 0, 0, 0, False)``.
        """
        try:
            placed = self.placeStone(action, color)
        except (LookupError, TypeError):
            # Out-of-range or non-indexable action: same sentinel as before,
            # but other kinds of failure are no longer silently hidden.
            return 0, 0, 0, 0, False
        result = self.check_game_status()

        # reward system: win +1 / loss -1
        if result == color:
            reward = 1
        elif result == '=':
            reward = 0
        else:
            reward = -1
        return self.BOARD, self.done, result, reward, placed

    # HELPER FUNCTIONS ____________

    def checkEdge(self, color, node):
        """Return True if ``node`` lies on the edge ``color`` must reach."""
        if color == 'W' and node[1] == self.BOARD_SIZE[1] - 1:
            return True
        if color == 'B' and node[0] == self.BOARD_SIZE[0] - 1:
            return True
        return False

    def testConnections(self, cellToCheck):
        """Debug helper: print the neighbours of ``cellToCheck``."""
        print('connections are', self.cell_connections(cellToCheck))

    def printBoard(self):
        """Print the board with each row shifted to give the slanted layout."""
        for i in range(self.BOARD_SIZE[0]):
            print('  ' * (self.BOARD_SIZE[0] - i - 1), end='')
            for j in range(self.BOARD_SIZE[1]):
                print(self.BOARD[i][j], end=' ')
            print('')

    def placeStone(self, cell, color):
        """Put ``color`` on ``cell`` if it is empty; return whether it was."""
        if self.BOARD[cell[0]][cell[1]] != '.':
            print('Invalid Action')
            return False
        self.BOARD[cell[0]][cell[1]] = color
        return True

    def cell_connections(self, cell):
        """Return the in-bounds neighbours of ``cell`` as ``[row, col]`` lists.

        Neighbours are left/right, the two cells in the next row (same and
        next column) and the two in the previous row (same and previous
        column).
        """
        row = cell[0]
        col = cell[1]

        positions = []

        if col + 1 < self.BOARD_SIZE[1]:
            positions.append([row, col + 1])
        if col - 1 >= 0:
            positions.append([row, col - 1])
        if row + 1 < self.BOARD_SIZE[0]:
            positions.append([row + 1, col])
            if col + 1 < self.BOARD_SIZE[1]:
                positions.append([row + 1, col + 1])
        if row - 1 >= 0:
            positions.append([row - 1, col])
            if col - 1 >= 0:
                positions.append([row - 1, col - 1])

        return positions

    def check_game_status(self):
        """Return ``'W'`` or ``'B'`` if that colour has connected its edges, else ``'='``.

        White is examined first. ``self.done`` is set to True when a winner is
        found; the decision itself relies on the search result rather than on
        ``self.done``, so a win recorded earlier cannot be misattributed.
        """
        rows, cols = self.BOARD_SIZE[0], self.BOARD_SIZE[1]

        # checking for white: start from every white stone in column 0
        self.CHECK_BOARD = [[False for _ in range(cols)] for _ in range(rows)]
        for i in range(rows):
            if self.BOARD[i][0] == 'W' and self._wins_from([i, 0], 'W'):
                return 'W'

        # checking for black: start from every black stone in row 0
        self.CHECK_BOARD = [[False for _ in range(cols)] for _ in range(rows)]
        for j in range(cols):
            if self.BOARD[0][j] == 'B' and self._wins_from([0, j], 'B'):
                return 'B'
        return '='

    def _wins_from(self, start, color):
        """Search from the seed cell ``start``; True if ``color``'s far edge is reached."""
        self.CHECK_BOARD[start[0]][start[1]] = True
        # The seed itself may already sit on the far edge (one-wide boards).
        if self.checkEdge(color, start) or self.check_connections(
                self.cell_connections(start), color):
            self.done = True
            return True
        return False

    def check_connections(self, connections, color):
        """Depth-first search through unvisited ``color`` stones in ``connections``.

        Marks visited cells in ``self.CHECK_BOARD``. When a stone on the far
        edge for ``color`` is found, sets ``self.done`` and returns True;
        returns False when the search is exhausted.
        """
        for c in connections:
            row, col = c[0], c[1]
            if self.BOARD[row][col] != color or self.CHECK_BOARD[row][col]:
                continue
            if self.checkEdge(color, c):
                self.done = True
                return True
            self.CHECK_BOARD[row][col] = True
            if self.check_connections(self.cell_connections([row, col]), color):
                return True
        return False

In [6]:
def finish_episode():
    """Apply one REINFORCE update from the rollout stored on ``policy``.

    Reads the module-level ``policy``, ``optimizer``, ``gamma`` and ``eps``.
    Builds discounted returns from ``policy.rewards``, weights each saved
    log-probability by its (normalised) return, takes one optimizer step and
    then empties both rollout buffers. Does nothing except clear the buffers
    when no reward was recorded.
    """
    if not policy.rewards:
        # Empty episode: torch.cat on an empty list would raise below.
        del policy.saved_log_probs[:]
        return

    # Discounted reward, accumulated backwards then put in time order
    # (append + reverse avoids the quadratic insert(0, ...)).
    R = 0
    returns = []
    for r in reversed(policy.rewards):
        R = r + gamma * R
        returns.append(R)
    returns.reverse()

    returns = torch.tensor(returns)

    # Normalized discounted rewards. Skipped for a one-step episode: the
    # sample standard deviation of a single value is NaN, which would turn
    # the loss, and after the update the weights, into NaN.
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)

    # Policy losses: -log_prob * return (REINFORCE formula)
    policy_loss = [-log_prob * ret
                   for log_prob, ret in zip(policy.saved_log_probs, returns)]

    # reset the accumulated gradients
    optimizer.zero_grad()

    # concatenate the per-step losses and sum them
    policy_loss = torch.cat(policy_loss).sum()

    # gradients taken
    policy_loss.backward()

    # policy updated
    optimizer.step()

    # rollout buffers cleared for the next episode
    del policy.rewards[:]
    del policy.saved_log_probs[:]

In [7]:
def select_action(state, policy):
    """Sample an action index from ``policy`` for a single observation.

    ``state`` is a NumPy array; it is converted to a float tensor and given a
    leading batch dimension before being passed to ``policy``, whose output is
    used as the probabilities of a categorical distribution. The
    log-probability of the sampled action is appended to
    ``policy.saved_log_probs`` (``finish_episode`` multiplies these by the
    returns to form the loss). Returns the sampled index as a Python int.
    """
    # Add the batch dimension the network expects.
    batch = torch.unsqueeze(torch.from_numpy(state).float(), 0)

    # One probability per action, wrapped in a distribution we can sample.
    distribution = TDist.Categorical(probs=policy(batch))
    choice = distribution.sample()

    # Remember how likely the chosen action was, for the later update.
    policy.saved_log_probs.append(distribution.log_prob(choice))

    return choice.item()

In [None]:
if __name__ == '__main__':
    # Training script: creates the environment, a Policy and an Adam
    # optimizer, then for each episode collects rewards with select_action and
    # updates the policy with finish_episode (which reads the `policy`,
    # `optimizer`, `gamma` and `eps` names defined here).
    # NOTE(review): this cell appears to be mid-migration from a gym
    # environment to HexBoard and cannot run as written; each mismatch is
    # flagged inline below.
#     env = gym.make('LunarLander-v2')
    env = HexBoard()

    # NOTE(review): `filepath` is only assigned in the commented-out lines
    # below, yet it is used by torch.save at the end of the episode loop.
#     filepath = 'model/'
#     if not os.path.exists(filepath):
#         os.makedirs(filepath)
    
    num_of_episodes = 100
    num_of_steps = 10  # upper bound on steps per episode

    running_reward = 1  # initial value of the moving average updated below

    # With num_of_episodes == 100 only episode 0 satisfies ep % log_interval == 0.
    log_interval = 100
    gamma = 0.99  # discount factor used by finish_episode

#     num_of_nodes = [env.observation_space.shape[0], 128,env.action_space.n]
    # FIXME: incomplete expression (syntax error). Policy expects the layer
    # widths here, i.e. [inputs, hidden..., outputs].
    num_of_nodes = [env.]

    policy = Policy(num_of_nodes, add_dropouts=True)
    optimizer = optim.Adam(policy.parameters(), lr=0.01)
    # Small constant added to the std in finish_episode to avoid division by zero.
    eps = np.finfo(np.float32).eps.item()

    for ep in range(num_of_episodes):

        # NOTE(review): HexBoard as defined in this file has no reset() method.
        obs = env.reset()
        ep_reward = 0

        for step in range(num_of_steps):
            # select_action passes `obs` to torch.from_numpy, so it must be a
            # NumPy array.
            action = select_action(obs, policy)
            # NOTE(review): HexBoard.step takes (color, action) and returns
            # five values (board, done, result, reward, placed flag); this
            # gym-style call and 4-way unpacking do not match it.
            obs, reward, done, _ = env.step(action)
            policy.rewards.append(reward)
            ep_reward += reward
    #         if ep % log_interval == 0:
    #             env.render()
            if done:
                break

        # Exponential moving average of the episode reward (weight 0.05 on the newest).
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        finish_episode()
        if ep % log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  ep, ep_reward, running_reward))
        # NOTE(review): HexBoard defines no `spec` attribute, so this
        # gym-style threshold lookup does not apply to it.
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, step))
            break

        ######## SAVING THE MODEL TO CONTINUE TRAINING LATER ###########
        # Runs after every episode that did not hit the break above.
        state = {
            'episode': ep,
            'state_dict': policy.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        torch.save(state, filepath + 'policy_state.pth')
        #################################################################

Episode 0	Last reward: -75.67	Average reward: 5.72
Episode 100	Last reward: -207.12	Average reward: -141.66
Episode 200	Last reward: -298.06	Average reward: -119.06
Episode 300	Last reward: -1.40	Average reward: -101.15
Episode 400	Last reward: 21.95	Average reward: -25.13
Episode 500	Last reward: -93.21	Average reward: -54.36
Episode 600	Last reward: 82.66	Average reward: 1.94
Episode 700	Last reward: -196.67	Average reward: -80.34
Episode 800	Last reward: 69.28	Average reward: 62.57
Episode 900	Last reward: 211.06	Average reward: 40.56
Episode 1000	Last reward: -28.61	Average reward: 24.50
Episode 1100	Last reward: -85.98	Average reward: -16.44
Episode 1200	Last reward: -50.47	Average reward: 28.13
Episode 1300	Last reward: 1.07	Average reward: 39.17
Episode 1400	Last reward: -42.39	Average reward: 97.06
Episode 1500	Last reward: 9.21	Average reward: 69.35
Episode 1600	Last reward: -23.79	Average reward: -13.47
Episode 1700	Last reward: 101.74	Average reward: 48.42
Episode 1800	Last 