In [1]:
import random
import torch
from torch import nn
from torch.distributions import Categorical
import torch.nn.functional as F
import numpy as np
from collections import deque
import gym

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
class TacoYaki():
    def __init__(self):
        self.size = 4
        self.board = np.ones(self.size*self.size)
        self.actions = {}
        self.action_space()
        self.encoded_actions = np.eye(self.size*self.size)
        
    def reset(self):
        self.board = np.ones(self.size*self.size)
        return self.board.copy()
    
    def sample_game(self, shuffles):
        self.reset()
        next_state = self.board.copy()
        action = random.choice(list(self.actions.keys()))
        state, _, _, _ = self.step(action)
        replay_buffer = []
        done, score, reward = True, 0, 0
        for i in range(random.randint(4, shuffles)):
            replay_buffer.append((state, action, next_state, score, done, reward))
            next_state = state
            action = random.choice(list(self.actions.keys()))
            state, reward, done, _ = self.step(action)
            score += reward
        replay_buffer.append((state, action, next_state, score, done, reward))
        return self.board.copy(), list(reversed(replay_buffer))
        
    def step(self, action):
        x, y = self.action_to_coordinate(action)
        self.board[action] = not self.board[action]
        if x - 1 >= 0:
            index = self.size * (x - 1) + y
            self.board[index] = not self.board[index]
        if y - 1 >= 0:
            index = self.size * x + y - 1
            self.board[index] = not self.board[index]
        if x + 1 < self.size:
            index = self.size * (x + 1) + y
            self.board[index] = not self.board[index]
        if y + 1 < self.size:
            index = self.size * x + y + 1
            self.board[index] = not self.board[index]
        done, reward = self.complete()
        return self.board.copy(), reward, done, 0    
            
    def show_board(self):
        print("----")
        for i in range(self.size):
            for j in range(self.size):
                print(self.board[self.size * i + j], end="\t")
            print()
            
    def action_to_coordinate(self, action):
        if action <= 15 and action >= 0:
            return self.actions[action]
        else:
            print("Actions is not present")
            return -1
            
    def action_space(self):
        for i in range(self.size):
            for j in range(self.size):
                self.actions[self.size*i + j] = (i, j)
                
    def complete(self):
        for cell in self.board:
            if cell == False:
                return False, -1
        return True, 0
    
    def set_state(self, state):
        self.board = state.copy()

In [4]:
env = TacoYaki()
state, replay_buffer = env.sample_game(50)
env.show_board()
for event in replay_buffer:
    state, action, next_state, score, done, reward = event
    env.step(action)
# print(torch.argmax(actor.act(state)).item(), actor.act(state))
env.show_board()

----
0.0	1.0	0.0	0.0	
1.0	0.0	0.0	0.0	
0.0	0.0	0.0	1.0	
0.0	1.0	0.0	0.0	
----
1.0	1.0	1.0	1.0	
1.0	1.0	1.0	1.0	
1.0	1.0	1.0	1.0	
1.0	1.0	1.0	1.0	


In [5]:
class NeuralNet(nn.Module):
    def __init__(self, input_dims, hidden_dims, output_dims):
        super(NeuralNet, self).__init__()

        """ CODE HERE:
                Implement the neural network here
        """
        self.fc1 = nn.Linear(input_dims, hidden_dims)
        self.fc2 = nn.Linear(hidden_dims, hidden_dims)
        self.fc3 = nn.Linear(hidden_dims, output_dims)

    def forward(self, x):
        """ CODE HERE:
                Implement the forward propagation
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [6]:
class Agent:
    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
                 max_mem_size=100000, eps_end=0.01, eps_dec=5e-4):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = [i for i in range(n_actions)]
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0
        self.iter_cntr = 0
        self.replace_target = 100

        self.Q_eval = NeuralNet(input_dims, 256, n_actions).to(device)
        self.optimizer = torch.optim.Adam(self.Q_eval.parameters(), lr=self.lr)
        self.loss = nn.MSELoss()
        
        self.state_memory = np.zeros((self.mem_size, input_dims),
                                     dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, input_dims),
                                         dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool)

    def store_transition(self, state, action, reward, state_, terminal):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = terminal

        self.mem_cntr += 1

    def get_action(self, state):
        if np.random.random() < self.epsilon:  # with probability eps, the agent selects a random action
            action = np.random.choice(self.action_space)
            return action
        else:  # with probability 1 - eps, the agent selects a greedy policy
            with torch.no_grad():
                state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(device)
                q_values = self.Q_eval(state_tensor)
                action = torch.argmax(q_values)
            return action.item()

    def learn(self):
        if self.mem_cntr < self.batch_size:
            return

        self.optimizer.zero_grad()

        max_mem = min(self.mem_cntr, self.mem_size)

        batch = np.random.choice(max_mem, self.batch_size, replace=False)
        batch_index = np.arange(self.batch_size, dtype=np.int32)

        state_batch = torch.tensor(self.state_memory[batch]).to(device)
        new_state_batch = torch.tensor(
                self.new_state_memory[batch]).to(device)
        action_batch = self.action_memory[batch]
        reward_batch = torch.tensor(
                self.reward_memory[batch]).to(device)
        terminal_batch = torch.tensor(
                self.terminal_memory[batch]).to(device)

        q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch]
        q_next = self.Q_eval.forward(new_state_batch)
        q_next[terminal_batch] = 0.0

        q_target = reward_batch + self.gamma*torch.max(q_next, dim=1)[0]

        loss = self.loss(q_target, q_eval)
        loss.backward()
        self.optimizer.step()

        self.iter_cntr += 1
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

In [7]:
env = gym.make("LunarLander-v2")
agent = Agent(gamma = 0.99, epsilon = 1.0, lr = 0.003, input_dims=8, batch_size = 64, n_actions= 4)

scores , eps_history = deque(maxlen = 100), deque(maxlen = 100)
n_games = 5000
for i in range(n_games):
    score = 0
    done = False
    observation = env.reset()
    while not done:
        action = agent.get_action(observation)
        observation_, reward, done, info = env.step(action)
        score += reward
        
        agent.store_transition(observation,  action, reward, observation_, done)
        agent.learn()
        observation = observation_
    scores.append(score)
    eps_history.append(agent.epsilon)
        
    avg_score = np.mean(scores)
        
    print('episode', i, 'avg score:', avg_score, 'epsilon:', agent.epsilon)

  deprecation(
  deprecation(
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool)


episode 0 avg score: -60.606841096082874 epsilon: 0.9980000000000002
episode 1 avg score: -137.6427314358209 epsilon: 0.9605000000000044
episode 2 avg score: -125.50977093894399 epsilon: 0.9240000000000084
episode 3 avg score: -112.63591761784154 epsilon: 0.8905000000000121
episode 4 avg score: -170.23707796393586 epsilon: 0.8145000000000204
episode 5 avg score: -188.57949826968377 epsilon: 0.755000000000027
episode 6 avg score: -168.50925875354636 epsilon: 0.6995000000000331
episode 7 avg score: -162.15045699763337 epsilon: 0.6565000000000378
episode 8 avg score: -147.8887411461602 epsilon: 0.6030000000000437
episode 9 avg score: -132.93227015100564 epsilon: 0.5595000000000485
episode 10 avg score: -124.83404153671391 epsilon: 0.48700000000005506
episode 11 avg score: -118.77524387152037 epsilon: 0.431500000000055
episode 12 avg score: -112.4127544181383 epsilon: 0.36800000000005495
episode 13 avg score: -106.45850511955585 epsilon: 0.2025000000000548
episode 14 avg score: -101.076982

KeyboardInterrupt: 

In [9]:
env = TacoYaki()


mem_size = 40000
learning_rate= 0.001
solved_score = 300
gamma = 0.99
batch_size = 10000
episodes = 10000
shuffles = 30

action_num = 16
hidden_dim = 265
observation_dim = 16

CELoss = nn.CrossEntropyLoss()

target_net = NeuralNet(observation_dim, hidden_dim, action_num).to(device)

# behavior_net.load_state_dict(target_net.state_dict())

optimizer= torch.optim.Adam(target_net.parameters(), lr=learning_rate)

In [17]:
# implementation of Deep value Iteration
recent_losses = deque(maxlen = 100)
testing = []

buffers = []
for ep in range(episodes):
    state, replay_buffer = env.sample_game(5)
    mean_loss = 100
    buffers.append(replay_buffer)
    steps = 0
    while mean_loss > 1:
        state, action, next_state, score, done, reward = random.choice(replay_buffer)
        state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(device)
        testing.append(state)
        pred_actions = target_net.forward(state_tensor)
        target = torch.from_numpy(env.encoded_actions[action]).reshape(1,-1).float().to(device)
#         print(pred_actions, target)
        loss = CELoss(pred_actions, target)
        
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if steps == 0:
            print(loss.item())
        
        recent_losses.append(loss.item())
        steps += 1
        
        if (steps+1) % 1000 == 0:
            mean_loss = np.array(recent_losses).mean()
            print("mean step loss in last 1000 steps ",mean_loss)
    print("\n" + "New episode " + str(ep+1))

# for buffer in buffers:
#     print()
#     for event in buffer:
#         state, action, next_state, score, done, reward = event
#         act= torch.argmax(target_net(state))
#         print(action, act)

8.326125144958496
mean step loss in last 1000 steps  0.0004837204302020837

New episode 1
17.278133392333984
mean step loss in last 1000 steps  0.000515870480958256

New episode 2
0.19495660066604614
mean step loss in last 1000 steps  0.00032464720989082706

New episode 3
0.5273777842521667
mean step loss in last 1000 steps  0.00011706209115800448

New episode 4
4.591670036315918
mean step loss in last 1000 steps  0.0002784270903657671

New episode 5
1.1491751670837402
mean step loss in last 1000 steps  0.0001620093733436079

New episode 6
5.705254077911377
mean step loss in last 1000 steps  0.33008355786427274

New episode 7
0.00014959646796341985
mean step loss in last 1000 steps  0.00011245207373121958

New episode 8
1.0728830375228426e-06
mean step loss in last 1000 steps  0.00017271583827096037

New episode 9
18.696380615234375
mean step loss in last 1000 steps  0.2521622660702269

New episode 10
11.186315536499023
mean step loss in last 1000 steps  0.2683446784286207

New episode

KeyboardInterrupt: 

In [15]:
state, _ = env.sample_game(5)
scores = []
for i in range(500):
    score = 0
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        state = next_state
        score += 1
    scores.append(score)
    print(np.mean(scores), len(scores))

5835.0 1
4518.5 2
3202.3333333333335 3
2550.25 4
2309.0 5
3471.8333333333335 6
3541.5714285714284 7
3099.125 8
3478.5555555555557 9
3130.9 10
2990.4545454545455 11
3010.25 12
3003.0 13
3624.9285714285716 14
4175.4 15
4190.6875 16
4780.529411764706 17
4962.055555555556 18
4768.473684210527 19
4552.65 20
4489.952380952381 21
4367.227272727273 22
4230.304347826087 23
4201.375 24
4057.32 25
3921.1153846153848 26
3807.962962962963 27
3801.1785714285716 28
3776.5172413793102 29
3666.3 30
3590.7419354838707 31
3564.78125 32
3456.818181818182 33
3452.1470588235293 34
3591.6285714285714 35
3494.75 36
3428.0810810810813 37
3341.7105263157896 38
3365.4102564102564 39
3372.175 40
3289.9756097560976 41
3309.2619047619046 42
3522.1627906976746 43
3564.431818181818 44
3485.266666666667 45
3478.0652173913045 46
3408.1063829787236 47
3446.9375 48
3437.3673469387754 49
3392.42 50
3325.9411764705883 51
3427.826923076923 52
3404.622641509434 53
3345.796296296296 54
3285.0 55
3262.625 56
3211.8070175438597

KeyboardInterrupt: 