#### Importing the packages

In [None]:
%run Packages/Libraries.ipynb

#### Setting the parameters

In [None]:
# --- Training schedule -------------------------------------------------------
N_GAMES = 50000 # number of games to be played
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' # device to be used
# NOTE: SYNC_TIME is only referenced by the (currently commented-out)
# target-network sync at the end of the training loop.
SYNC_TIME = 4 # the target network is synchronized every SYNC_TIME games

# --- Optimisation ------------------------------------------------------------
GAMMA = 0.95 # gamma parameter in the Bellman equation
LEARNING_RATE = 1e-4 # learning rate of the optimizer
BATCH_SIZE = 24 # number of elements sampled from the buffer per optimization step
N_ACTIONS = 4 # number of possible actions

# --- Exploration policy ------------------------------------------------------
EPSILON_START = 0.9 # start value of the epsilon parameter (both for epsilon-greedy and softmax policies)
EPSILON_END = 0.1 # end value of the epsilon parameter
EPSILON_GAMES = 1500 # number of games taken to go from start to end value for the epsilon parameter
USE_SOFTMAX = True # if True then softmax policy, if False then epsilon-greedy policy

# --- Maze --------------------------------------------------------------------
ROWS = 20 # number of rows of the maze
COLS = 20 # number of columns of the maze
MOD = 1 # 0 = narrow passages, 1 = wide passages

# --- Experience buffer -------------------------------------------------------
BUFFER_CAPACITY = 10000 # capacity of the buffer
BUFFER_START_SIZE = 2500 # the training starts after the buffer has at least BUFFER_START_SIZE elements

#### Defining the environment, buffer, agent and network

In [None]:
# Environment and experience buffer, sized from the parameters cell.
maze = Maze_env(ROWS, COLS, MOD)
buffer = ExperienceBuffer(BUFFER_CAPACITY)

# The agent starts with decay disabled; the training loop sets
# agent.enable_decay = True once the buffer holds BUFFER_START_SIZE elements.
agent = Agent(
    maze=maze,
    experience_buffer=buffer,
    epsilon=EPSILON_START,
    epsilon_end=EPSILON_END,
    epsilon_games=EPSILON_GAMES,
    use_softmax=USE_SOFTMAX,
    enable_decay=False,
)

# Two identically-shaped networks, built in this order:
#   net     -> network for the behaviour policy
#   tgt_net -> target network for the update policy
net, tgt_net = (
    Net_2(ROWS * COLS, N_ACTIONS, ROWS, COLS).to(DEVICE) for _ in range(2)
)

# Only the behaviour network's parameters are handed to the optimizer.
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

In [None]:
# Display the agent's maze, reshaped to ROWS x COLS, as an image.
plt.imshow(agent.maze.reshape(ROWS,COLS))

#### Training the Agent

In [None]:
import os

# Best single-episode reward seen so far (None until the first reward is reported).
max_reward = None
# Every reported episode reward, in the order the games finished.
tot_rewards = []

# torch.save does not create missing directories, so make sure the checkpoint
# folder exists before the first "best model" save.
os.makedirs("ckpt", exist_ok=True)

for game in range(N_GAMES):
    counter = 0  # number of agent.play calls made in the current game
    status = 0
    print(agent.epsilon)
    # Keep playing until agent.play reports a non-zero status.
    while status == 0:
        counter += 1
        episode_reward, status = agent.play(net, device=DEVICE)

        # agent.play returns a reward only on some calls (presumably when the
        # episode ends -- confirm against Agent.play); log and checkpoint then.
        if episode_reward is not None:
            print('Game %i, moves: %i' %(game, counter))
            tot_rewards.append(episode_reward)
            mean_reward = np.mean(tot_rewards[-25:])
            print('\tEpisode reward: %.5f' %(episode_reward))
            print('\tMean reward of last 25 episodes: %.5f' %(mean_reward))
            if max_reward is None or episode_reward > max_reward:
                if max_reward is None:
                    print('\tBest reward improvement at game %i: None --> %.5f' %(game, episode_reward))
                else:
                    print('\tBest reward improvement at game %i: %.5f --> %.5f' %(game, max_reward, episode_reward))
                max_reward = episode_reward
                torch.save(net.state_dict(), "ckpt/best.ckpt")
                print('\tModel saved!')

        # No optimisation step until the buffer holds BUFFER_START_SIZE elements.
        if len(buffer) < BUFFER_START_SIZE:
            continue

        # From the first optimisation step onwards, enable decay on the agent.
        agent.enable_decay = True

        # One optimisation step on a batch sampled from the buffer.
        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE, device=DEVICE)
        loss_t = calc_loss(batch, net, gamma=GAMMA, device=DEVICE)
        loss_t.backward()
        optimizer.step()
    # NOTE: the target-network sync below is disabled; calc_loss above is
    # called with `net` only, so `tgt_net` and SYNC_TIME are currently unused.
    #if (game+1)%SYNC_TIME==0:
    #    tgt_net.load_state_dict(net.state_dict())

#### Showing the solution (when the training is completed)

In [None]:
# Run play_game with the trained network on the maze.
# NOTE(review): `pos` is not defined in any cell visible here -- presumably a
# starting position; confirm it is provided by the %run'd notebook or define it
# before running this cell, otherwise this raises NameError.
play_game(net, maze, pos, device=DEVICE)

#### Showing the flow of the maze (when the training is completed)

In [None]:
# Show the flow of the maze using the trained network
# (presumably the action the network prefers in each cell -- confirm against show_flow).
show_flow(net, maze, DEVICE)