In [2]:
import random
import torch
from torch import nn
from torch.distributions import Categorical
import torch.nn.functional as F
import numpy as np
from collections import deque

In [3]:
# Run on the first CUDA GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
class TacoYaki():
    """Lights-out style puzzle played on a ``size`` x ``size`` board.

    The board is a flat float array of length ``size * size`` in row-major
    order.  Playing an action toggles that cell and its up/down/left/right
    neighbours between 1.0 and 0.0.  The puzzle counts as complete when no
    cell is zero.
    """

    def __init__(self):
        self.size = 4
        self.board = np.ones(self.size*self.size)
        # Maps a flat action index to its (row, col); filled by action_space().
        self.actions = {}
        self.action_space()
        # Row ``a`` is the one-hot encoding of action ``a``.
        self.encoded_actions = np.eye(self.size*self.size)

    def reset(self):
        """Put the board back into the solved (all-ones) configuration."""
        self.board = np.ones(self.size*self.size)

    def sample_game(self, shuffles):
        """Scramble a solved board with random moves and record the way back.

        Args:
            shuffles: upper bound on the number of extra random moves.  The
                actual count is drawn uniformly from ``[4, shuffles]``; when
                ``shuffles`` is below 4 the lower bound is clamped to
                ``shuffles`` instead of raising.

        Returns:
            ``(board, replay_buffer)`` where ``board`` is a copy of the
            scrambled board and ``replay_buffer`` is a list of
            ``(state, action, next_state, score, done, reward)`` tuples, most
            recent scramble move first.  ``state`` is the board right after
            ``action`` was played during scrambling and ``next_state`` the
            board right before it; because a move undoes itself, playing
            ``action`` on ``state`` yields ``next_state``.
        """
        self.reset()
        moves = list(self.actions.keys())
        next_state = self.board.copy()
        action = random.choice(moves)
        state, _, _, _ = self.step(action)
        replay_buffer = []
        # Labels for the first entry, whose next_state is the solved board.
        done, score, reward = True, 0, 0
        # Clamp the lower bound so a small ``shuffles`` does not make
        # random.randint raise on an empty range.
        extra_moves = random.randint(min(4, shuffles), shuffles)
        for _ in range(extra_moves):
            replay_buffer.append((state, action, next_state, score, done, reward))
            next_state = state
            action = random.choice(moves)
            # NOTE(review): from here on ``done``/``reward`` come from the
            # step() that produced ``state``, so they describe ``state`` rather
            # than ``next_state`` -- confirm this labelling is intended.
            state, reward, done, _ = self.step(action)
            score += reward
        replay_buffer.append((state, action, next_state, score, done, reward))
        # Hand back a snapshot: returning self.board itself would let later
        # step() calls silently change the caller's array.
        return self.board.copy(), list(reversed(replay_buffer))

    def step(self, action):
        """Play ``action`` and return ``(board_copy, reward, done, 0)``.

        Toggles the chosen cell and every orthogonal neighbour that lies on
        the board.  ``reward`` and ``done`` are taken from complete().  An
        out-of-range action makes action_to_coordinate() return -1, which
        fails to unpack here (TypeError) before the board is touched.
        """
        x, y = self.action_to_coordinate(action)
        # (0, 0) is the pressed cell itself; the rest are its neighbours.
        for dx, dy in ((0, 0), (-1, 0), (0, -1), (1, 0), (0, 1)):
            row, col = x + dx, y + dy
            if 0 <= row < self.size and 0 <= col < self.size:
                index = self.size * row + col
                self.board[index] = not self.board[index]
        done, reward = self.complete()
        return self.board.copy(), reward, done, 0

    def show_board(self):
        """Print a ``----`` separator followed by the board, one row per line."""
        print("----")
        for i in range(self.size):
            for j in range(self.size):
                print(self.board[self.size * i + j], end="\t")
            print()

    def action_to_coordinate(self, action):
        """Return the ``(row, col)`` of a flat action index.

        For an index outside ``[0, size * size)`` a message is printed and the
        sentinel ``-1`` is returned.
        """
        # Bound derived from the board size rather than a hard-coded 15.
        if 0 <= action < self.size * self.size:
            return self.actions[action]
        print("Actions is not present")
        return -1

    def action_space(self):
        """Fill ``self.actions`` with every flat index -> (row, col) pair."""
        for i in range(self.size):
            for j in range(self.size):
                self.actions[self.size*i + j] = (i, j)

    def complete(self):
        """Return ``(done, reward)``: ``(True, 0)`` if no cell is zero, else ``(False, -1)``."""
        if np.all(self.board):
            return True, 0
        return False, -1

    def set_state(self, state):
        """Replace the board with a copy of ``state``."""
        self.board = state.copy()

In [5]:
# Sanity check: scramble a board, then replay the recorded actions to solve it.
env = TacoYaki()
state, replay_buffer = env.sample_game(50)
# Board as left by the random scramble.
env.show_board()
for event in replay_buffer:
    # Only `action` is used here; the other fields are unpacked but ignored.
    state, action, next_state, score, done, reward = event
    env.step(action)
# Disabled debug print; `actor` is not defined in the visible cells.
# print(torch.argmax(actor.act(state)).item(), actor.act(state))
# Board after replaying every recorded action.
env.show_board()

----
1.0	0.0	1.0	1.0	
0.0	1.0	1.0	0.0	
0.0	0.0	0.0	0.0	
1.0	0.0	1.0	1.0	
----
1.0	1.0	1.0	1.0	
1.0	1.0	1.0	1.0	
1.0	1.0	1.0	1.0	
1.0	1.0	1.0	1.0	


In [8]:
class NeuralNet(nn.Module):
    """Fully connected network: two ReLU hidden layers and a linear output.

    Args:
        input_dims: size of the input feature vector.
        hidden_dims: width of both hidden layers.
        output_dims: number of output values.
    """

    def __init__(self, input_dims, hidden_dims, output_dims):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_dims, hidden_dims)
        self.fc2 = nn.Linear(hidden_dims, hidden_dims)
        self.fc3 = nn.Linear(hidden_dims, output_dims)

    def forward(self, state):
        """Run the network on ``state`` and return the raw output scores.

        Args:
            state: a NumPy array or a torch tensor.  It is cast to float32 and
                given a leading dimension of size 1.

        Returns:
            The output of the last linear layer; no softmax is applied.
        """
        # Use the device the weights actually live on instead of a notebook
        # global, so the module also works when no global `device` exists.
        target_device = self.fc1.weight.device
        # as_tensor accepts ndarrays (like from_numpy) and also tensors.
        x = torch.as_tensor(state).float().unsqueeze(0).to(target_device)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        actions = self.fc3(x)
        return actions

In [9]:
# Environment, hyper-parameters, loss functions, networks and optimizer.
env = TacoYaki()


# Of the values below, the visible cells read only learning_rate, episodes,
# action_num, hidden_dim and observation_dim; the rest are unreferenced there.
mem_size = 50000
learning_rate= 0.001
solved_score = 300
gamma = 0.99
batch_size = 10000
episodes = 10000
shuffles = 30

# 16 matches the 4x4 board's cell count and action count.
action_num = 16
# NOTE(review): 265 may be a typo for 256 -- confirm.
hidden_dim = 265
observation_dim = 16
eps_start = 1
eps_end = 0.01
eps_dec = 5e-4

CELoss = nn.CrossEntropyLoss()
MSELoss = nn.MSELoss()

# Three independently initialised networks with identical architecture.
policy_net = NeuralNet(observation_dim, hidden_dim, action_num).to(device)

target_net = NeuralNet(observation_dim, hidden_dim, action_num).to(device) 
behavior_net = NeuralNet(observation_dim, hidden_dim, action_num).to(device) 

# behavior_net.load_state_dict(target_net.state_dict())

# The optimizer updates target_net only; policy_net and behavior_net are not
# given to any optimizer in this cell.
optimizer= torch.optim.Adam(target_net.parameters(), lr=learning_rate)

In [None]:
# Implementation of deep value iteration: for every episode a short scramble is
# sampled and target_net is fitted, with cross-entropy, to predict the recorded
# action for each recorded state until the recent mean loss drops to 1 or below.
recent_losses = deque(maxlen = 100)
# NOTE(review): `testing` gains one entry per optimisation step and is not read
# in the visible cells -- confirm it is still needed.
testing = []

# One replay buffer per episode, kept for the evaluation cell.
buffers = []
for ep in range(episodes):
    state, replay_buffer = env.sample_game(5)
    # Sentinel above the threshold so the while loop runs at least once.
    mean_loss = 100
    buffers.append(replay_buffer)
    steps = 0
    while mean_loss > 1:
        state, action, next_state, score, done, reward = random.choice(replay_buffer)
        testing.append(state)
        # Call the module itself rather than .forward() so hooks are honoured.
        pred_actions = target_net(state)
        # Class-index target: same loss value as the one-hot row for this
        # action, and the form CrossEntropyLoss handles most efficiently.
        target = torch.tensor([action], device=pred_actions.device)
        loss = CELoss(pred_actions, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        step_loss = loss.item()
        if steps == 0:
            print(step_loss)

        recent_losses.append(step_loss)
        steps += 1

        # The stopping criterion is re-evaluated only at this cadence.
        if (steps+1) % 1000 == 0:
            mean_loss = np.array(recent_losses).mean()
            # Report the real window size (the deque holds at most 100 losses).
            print(f"mean step loss in last {len(recent_losses)} steps ", mean_loss)
    print("\n" + "New episode " + str(ep+1))


2.6806201934814453
mean step loss in last 1000 steps  0.18189659542797018

New episode 1
8.048510551452637
mean step loss in last 1000 steps  0.00019717803559615276

New episode 2
12.577062606811523
mean step loss in last 1000 steps  0.0003666203760076314

New episode 3
11.129538536071777
mean step loss in last 1000 steps  0.0006606661716068629

New episode 4
17.627931594848633
mean step loss in last 1000 steps  0.0004925557880778797

New episode 5
10.612971305847168
mean step loss in last 1000 steps  0.0004262690563336946

New episode 6
0.7864964008331299
mean step loss in last 1000 steps  0.0004729827679693699

New episode 7
10.260015487670898
mean step loss in last 1000 steps  0.0003028784567140974

New episode 8
7.6175971031188965
mean step loss in last 1000 steps  0.0004441319283796474

New episode 9
7.191539287567139
mean step loss in last 1000 steps  0.0003131633539305767

New episode 10
11.3236083984375
mean step loss in last 1000 steps  0.00044837791938334703

New episode 11
1

In [39]:
# For every recorded transition, print the recorded action next to the action
# target_net scores highest for that state.
# Inference only: no_grad avoids building an autograd graph per forward pass.
with torch.no_grad():
    for buffer in buffers:
        print()
        for event in buffer:
            state, action, next_state, score, done, reward = event
            act= torch.argmax(target_net(state))
            print(action, act)


10 tensor(8, device='cuda:0')
7 tensor(1, device='cuda:0')
11 tensor(3, device='cuda:0')
8 tensor(14, device='cuda:0')
12 tensor(1, device='cuda:0')
9 tensor(14, device='cuda:0')
1 tensor(8, device='cuda:0')
0 tensor(14, device='cuda:0')
6 tensor(3, device='cuda:0')
3 tensor(15, device='cuda:0')
12 tensor(15, device='cuda:0')
14 tensor(15, device='cuda:0')
12 tensor(15, device='cuda:0')
7 tensor(12, device='cuda:0')
11 tensor(12, device='cuda:0')
14 tensor(0, device='cuda:0')
0 tensor(12, device='cuda:0')
9 tensor(5, device='cuda:0')
11 tensor(6, device='cuda:0')
11 tensor(15, device='cuda:0')
4 tensor(6, device='cuda:0')
14 tensor(14, device='cuda:0')

9 tensor(3, device='cuda:0')
13 tensor(12, device='cuda:0')
3 tensor(5, device='cuda:0')
7 tensor(15, device='cuda:0')
8 tensor(15, device='cuda:0')
8 tensor(12, device='cuda:0')
9 tensor(15, device='cuda:0')
6 tensor(15, device='cuda:0')
4 tensor(15, device='cuda:0')
0 tensor(9, device='cuda:0')
12 tensor(1, device='cuda:0')
6 tensor(

7 tensor(5, device='cuda:0')
14 tensor(14, device='cuda:0')
9 tensor(5, device='cuda:0')
3 tensor(15, device='cuda:0')
6 tensor(9, device='cuda:0')
1 tensor(6, device='cuda:0')
14 tensor(14, device='cuda:0')
4 tensor(7, device='cuda:0')
1 tensor(3, device='cuda:0')

5 tensor(11, device='cuda:0')
2 tensor(12, device='cuda:0')
4 tensor(12, device='cuda:0')
5 tensor(3, device='cuda:0')
8 tensor(12, device='cuda:0')
10 tensor(12, device='cuda:0')
15 tensor(12, device='cuda:0')
4 tensor(15, device='cuda:0')
1 tensor(9, device='cuda:0')
6 tensor(15, device='cuda:0')
0 tensor(14, device='cuda:0')
2 tensor(14, device='cuda:0')
3 tensor(1, device='cuda:0')
8 tensor(6, device='cuda:0')
6 tensor(3, device='cuda:0')
2 tensor(14, device='cuda:0')
10 tensor(7, device='cuda:0')
6 tensor(12, device='cuda:0')
2 tensor(8, device='cuda:0')
1 tensor(1, device='cuda:0')
5 tensor(0, device='cuda:0')
12 tensor(12, device='cuda:0')
6 tensor(8, device='cuda:0')

8 tensor(6, device='cuda:0')
12 tensor(6, device

12 tensor(15, device='cuda:0')
15 tensor(12, device='cuda:0')
13 tensor(14, device='cuda:0')

0 tensor(5, device='cuda:0')
10 tensor(6, device='cuda:0')
13 tensor(15, device='cuda:0')
9 tensor(13, device='cuda:0')
2 tensor(15, device='cuda:0')
14 tensor(12, device='cuda:0')
12 tensor(12, device='cuda:0')
12 tensor(14, device='cuda:0')
10 tensor(12, device='cuda:0')
2 tensor(12, device='cuda:0')
12 tensor(15, device='cuda:0')

9 tensor(9, device='cuda:0')
0 tensor(14, device='cuda:0')
15 tensor(14, device='cuda:0')
2 tensor(12, device='cuda:0')
15 tensor(12, device='cuda:0')
14 tensor(14, device='cuda:0')
0 tensor(12, device='cuda:0')
0 tensor(7, device='cuda:0')

0 tensor(7, device='cuda:0')
13 tensor(3, device='cuda:0')
11 tensor(15, device='cuda:0')
8 tensor(9, device='cuda:0')
5 tensor(9, device='cuda:0')
1 tensor(12, device='cuda:0')
3 tensor(1, device='cuda:0')
0 tensor(14, device='cuda:0')
5 tensor(11, device='cuda:0')

4 tensor(15, device='cuda:0')
12 tensor(12, device='cuda:0')

In [None]:
# Implementation of Deep Q-learning
