## Define Tetris Game

In [None]:
import numpy as np
import random

class Tetris:
    """Simplified Tetris played on a ``rows`` x ``cols`` board.

    Board cells hold ``Tetris.UNDEFINED`` when empty and ``1`` when occupied.
    Row 0 is the bottom of the board: tiles come to rest on the lowest free
    rows, and clearing a line shifts the rows above it down by one.

    Args:
        rows: Number of board rows.
        cols: Number of board columns.
        max_tiles: Number of tiles dealt before the game ends.
        random_seed: Seed of the predefined tile sequence. Pass ``False`` or
            ``None`` to draw every tile at random instead. Any other value,
            including ``0``, selects the seeded sequence.
    """

    UNDEFINED = -1

    # TILES[tile][orientation] holds one [bottom, top) pair per tile column,
    # i.e. the half-open range of rows (relative to the tile's own y
    # position) that the tile occupies in that column.
    TILES = [
        [
            [[0, 2]],  # Tile 0, orientation 0.
            [[0, 1], [0, 1]],  # Tile 0, orientation 1.
        ],
        [
            [[0, 1], [1, 2]],  # Tile 1, orientation 0.
            [[1, 2], [0, 1]],  # Tile 1, orientation 1.
        ],
        [
            [[0, 2], [1, 2]],  # Tile 2, orientation 0.
            [[0, 2], [0, 1]],  # Tile 2, orientation 1.
            [[0, 1], [0, 2]],  # Tile 2, orientation 2.
            [[1, 2], [0, 2]],  # Tile 2, orientation 3.
        ],
        [
            [[0, 2], [0, 2]],  # Tile 3, orientation 0.
        ],
    ]

    def __init__(self, rows, cols, max_tiles, random_seed):
        self.rows, self.cols = rows, cols
        self.max_tiles = max_tiles
        self.random_seed = random_seed

        self.restart()

    def _uses_fixed_sequence(self):
        """Return True when tiles come from the seeded, predefined sequence."""
        # Identity checks rather than truthiness, so that 0 is a valid seed.
        return self.random_seed is not None and self.random_seed is not False

    def restart(self):
        """Reset board, score and counters, then deal the first tile."""
        self.gameover = False
        self.tile_count = 0
        self.reward = 0
        self.board = np.full((self.rows, self.cols), Tetris.UNDEFINED)
        self.current_tile = Tetris.UNDEFINED
        self.tile_x = Tetris.UNDEFINED
        self.tile_y = Tetris.UNDEFINED
        self.tile_orientation = Tetris.UNDEFINED

        # Create the predefined tile sequence (used when a seed is given).
        # The global random state is saved and restored around the seeding so
        # that building the sequence does not disturb other users of `random`.
        rand_state = random.getstate()
        random.seed(self.random_seed)
        self.tile_sequence = [
            random.randint(0, len(Tetris.TILES) - 1)
            for _ in range(self.max_tiles)
        ]
        random.setstate(rand_state)

        self.next_tile()

    def next_tile(self):
        """Deal the next tile, or end the game once ``max_tiles`` were dealt."""
        if self.tile_count >= self.max_tiles:
            self.gameover = True
            return

        if self._uses_fixed_sequence():
            self.current_tile = self.tile_sequence[self.tile_count]
        else:
            self.current_tile = random.randint(0, len(Tetris.TILES) - 1)

        self.tile_x = self.cols // 2
        self.tile_y = self.rows
        self.tile_orientation = 0
        self.tile_count += 1

    def move_left(self):
        """Move the tile one column left; return False if it is at the edge."""
        if self.tile_x - 1 < 0:
            return False
        self.tile_x -= 1
        return True

    def move_right(self):
        """Move the tile one column right; return False if it would not fit."""
        tile_width = len(Tetris.TILES[self.current_tile][self.tile_orientation])
        if self.tile_x + 1 > self.cols - tile_width:
            return False
        self.tile_x += 1
        return True

    def rotate(self):
        """Switch to the next orientation; return False if it would not fit."""
        orientations = Tetris.TILES[self.current_tile]
        new_orientation = (self.tile_orientation + 1) % len(orientations)
        tile_width = len(orientations[new_orientation])
        if self.tile_x > self.cols - tile_width:
            return False
        self.tile_orientation = new_orientation
        return True

    def drop(self):
        """Drop the current tile straight down at its current x position.

        Returns:
            The reward earned by this drop: ``-100`` if the tile does not fit
            on the board (the game is then over), ``10 ** (n - 1)`` when ``n``
            full lines are removed, and ``0`` otherwise. The same value is
            added to ``self.reward``.
        """
        tile = Tetris.TILES[self.current_tile][self.tile_orientation]

        # Resting height: for every tile column, the tile must sit just above
        # the highest occupied cell of the matching board column. The largest
        # requirement over all columns wins; an empty column imposes none.
        landing_y = 0
        for dx, (bottom, _top) in enumerate(tile):
            occupied_rows = np.flatnonzero(self.board[:, self.tile_x + dx] > 0)
            if occupied_rows.size:
                landing_y = max(landing_y, int(occupied_rows[-1]) + 1 - bottom)
        self.tile_y = landing_y

        tile_height = max(top for _bottom, top in tile)
        if self.tile_y + tile_height > self.rows:
            # The tile sticks out above the board.
            self.gameover = True
            dreward = -100
        else:
            # Change board entries at the newly placed tile to occupied.
            for dx, (bottom, top) in enumerate(tile):
                self.board[self.tile_y + bottom:self.tile_y + top,
                           self.tile_x + dx] = 1

            # Remove full lines. Cells are -1 or 1, so a row sums to `cols`
            # only when every cell is occupied. The surviving rows are packed
            # to the bottom (in place) and the freed rows on top are emptied.
            full_rows = np.sum(self.board, axis=1) == self.cols
            removed_lines = int(np.count_nonzero(full_rows))
            if removed_lines:
                remaining = self.board[~full_rows]  # Boolean indexing copies.
                self.board[:self.rows - removed_lines, :] = remaining
                self.board[self.rows - removed_lines:, :] = Tetris.UNDEFINED

            dreward = 10 ** (removed_lines - 1) if removed_lines > 0 else 0

            # Choose the next tile.
            self.next_tile()

        self.reward += dreward

        return dreward

## Adaptation to Q-Learning

In [None]:
class QLTetris(Tetris):
    """Tetris variant in which a tile can be placed directly by an agent.

    Instead of stepping left/right and rotating, the agent picks the target
    column and orientation in a single call to `teleport`.
    """

    def __init__(self, rows, cols, max_tiles, random_seed):
        super().__init__(rows, cols, max_tiles, random_seed)

    def teleport(self, new_x, new_orientation):
        """Place the current tile at column ``new_x`` in ``new_orientation``.

        Returns:
            True if the orientation exists for the current tile and the tile
            fits between the board edges at ``new_x``; the tile position is
            then updated. False otherwise, leaving the tile untouched.
        """
        orientations = Tetris.TILES[self.current_tile]
        if not 0 <= new_orientation < len(orientations):
            return False  # No such orientation for this tile.

        rightmost_x = self.cols - len(orientations[new_orientation])
        if not 0 <= new_x <= rightmost_x:
            return False  # The tile would stick out of the board.

        self.tile_x = new_x
        self.tile_orientation = new_orientation
        return True

In reinforcement learning, the discount factor γ (gamma) is a crucial parameter that determines how future rewards are taken into account in the agent's decision-making process. Typical values for gamma generally range between 0 and 1, and the choice of this value depends on the specific characteristics of the problem and the desired balance between immediate and future rewards. Here's an overview of how different values of gamma affect the learning process:

1. **γ close to 0**: 
   - When gamma is close to 0, the agent tends to be short-sighted and prioritizes immediate rewards over future rewards. This is because future rewards are heavily discounted, making them less significant in the agent's decision-making process.
   - Use this approach in environments where immediate rewards are more important or when the future is highly uncertain.

2. **γ close to 1**:
   - When gamma is close to 1, the agent considers future rewards almost as important as immediate rewards. This encourages the agent to plan for the long term and can lead to more strategic behavior.
   - Use this approach in environments where it's important to consider the long-term consequences of actions and where future rewards are relatively certain.

3. **Intermediate values (e.g., 0.8, 0.9)**:
   - Intermediate values of gamma strike a balance between valuing immediate and future rewards. These values are often used in practice as they allow the agent to consider both short-term and long-term outcomes.
   - This approach is suitable for many standard reinforcement learning problems.

It's important to note that the optimal value of gamma can vary depending on the specific problem and the environment's dynamics. In practice, finding the best value for gamma might require experimentation and tuning. Additionally, a gamma value of exactly 1 can be used for undiscounted finite-horizon (episodic) problems, where every episode is guaranteed to end; for long or non-terminating tasks, a value slightly less than 1 is usually preferred to ensure convergence and stability.

In [None]:
class QLAgent:
    """Tabular Q-learning agent that plays a Tetris game.

    The state is the current tile id followed by the board occupancy, packed
    into a single integer that indexes the rows of ``Q_table``. An action is
    a (column, orientation) pair flattened into a single integer.

    Args:
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of a random action (epsilon-greedy policy).
        games: Number of games to play.
        tetris: Game instance. It must provide ``teleport(new_x,
            new_orientation)`` in addition to the ``Tetris`` interface
            (``QLTetris`` does).
    """

    def __init__(self, alpha, gamma, epsilon, games, tetris):
        self.alpha = alpha  # Alpha is the learning rate.
        self.gamma = gamma  # Discount factor.
        self.epsilon = epsilon  # Probability to choose a random action in the epsilon-greedy policy.
        self.games = games
        self.game = 0
        self.tetris = tetris

        self.rewards = np.zeros(games)

        # Bits needed to encode the largest tile id. bit_length() equals
        # 1 + floor(log2(n)) for n >= 1 and also handles a single-tile set.
        tile_bits = (len(Tetris.TILES) - 1).bit_length()
        self.state_size = self.tetris.cols * self.tetris.rows + tile_bits
        self.state_num = 2 ** self.state_size

        # A tile is positioned horizontally, so there is one candidate
        # position per board column.
        self.position_num = self.tetris.cols
        self.orientation_num = max(len(tile) for tile in Tetris.TILES)
        self.action_num = self.position_num * self.orientation_num

        self.Q_table = np.zeros((self.state_num, self.action_num))

        self.update_state()

    def update_state(self):
        """Recompute ``state_binary`` and ``state`` from the current game."""
        # Convert tile to binary list.
        tile = bin(self.tetris.current_tile)[2:]

        # Convert board to binary list (empty cells become 0).
        board = np.copy(self.tetris.board.reshape((-1,))).astype(int)
        board[board == Tetris.UNDEFINED] = 0

        # The tile bits are the most significant ones, so the missing leading
        # zeros of `tile` do not change the resulting integer.
        self.state_binary = np.append(tile, board)
        self.state = int("".join(str(i) for i in self.state_binary), 2)

    def next_turn(self):
        """Play one turn, or close the finished game and start the next one.

        Returns:
            False once all games have been played (the Q-table is then saved
            to ``Q_table.txt``), True otherwise.
        """
        if self.tetris.gameover:
            self.rewards[self.game] = self.tetris.reward
            if self.game % 100 == 0:
                # Average over the most recent (up to 100) finished games,
                # including the one that just ended, so the window is never
                # empty, not even at game 0.
                recent = self.rewards[max(0, self.game - 99):self.game + 1]
                av_reward = np.mean(recent)
                print(f"game {self.game}/{self.games} reward {av_reward}")

            self.game += 1
            if self.game < self.games:
                self.tetris.restart()
                # Refresh the state so that the first move of the new game is
                # not chosen from (and credited to) the previous game's final
                # state.
                self.update_state()
            else:
                np.savetxt('Q_table.txt', self.Q_table)
                return False  # Finish.
        else:
            old_state = self.state

            # Select action.
            if np.random.rand() < self.epsilon:
                action = np.random.randint(self.action_num)
            else:
                action = np.argmax(self.Q_table[old_state, :])

            # Extract rotation and movement from action parameter. Dividing
            # and taking the remainder by the same number makes the decoding
            # a bijection onto (column, orientation) pairs.
            new_x = action // self.orientation_num
            new_orientation = action % self.orientation_num

            # Execute action and drop tile.
            if self.tetris.teleport(new_x, new_orientation):
                reward = self.tetris.drop()

                # Update the state.
                self.update_state()
                new_state = self.state

                # A finished game has no future return, so nothing is
                # bootstrapped from the state reached by a terminal drop.
                if self.tetris.gameover:
                    future_value = 0.0
                else:
                    future_value = np.max(self.Q_table[new_state, :])

                # Update the Q-table using the old state and the reward.
                dQ = self.alpha * (
                    reward
                    + self.gamma * future_value
                    - self.Q_table[old_state, action]
                )

                # Update the Q-table.
                self.Q_table[old_state, action] += dQ
            else:  # Penalty for illegal move.
                self.Q_table[old_state, action] += -50

        return True  # Continue.

In [None]:
# QLAgent places tiles with teleport(), which QLTetris defines and the plain
# Tetris class does not, so the agent must be given a QLTetris instance.
tetris = QLTetris(rows=4, cols=4, max_tiles=50, random_seed=123456)
agent = QLAgent(alpha=0.2, gamma=1, epsilon=0, games=10000, tetris=tetris)

# Play until the agent reports that all games are finished.
while agent.next_turn():
    pass

In [None]:
import matplotlib.pyplot as plt

# Moving average of the per-game rewards over a 100-game window.
window = 100
kernel = np.ones(window) / window
smoothed_rewards = np.convolve(agent.rewards, kernel, mode='valid')

plt.plot(smoothed_rewards)
plt.ylabel('Reward')
plt.xlabel('Episode')


In [None]:
# Count the Q-table rows that contain at least one non-zero entry.
# Note how only a few rows are non-zero.
Q_table = agent.Q_table
non_zero_rows_mask = (Q_table != 0).any(axis=1)
num_non_zero_rows = non_zero_rows_mask.sum()

In [None]:
# Try also with epsilon = 0.001 and games = 100_000

# QLAgent places tiles with teleport(), which QLTetris defines and the plain
# Tetris class does not, so the agent must be given a QLTetris instance.
tetris = QLTetris(rows=4, cols=4, max_tiles=50, random_seed=123456)
agent = QLAgent(alpha=0.2, gamma=1, epsilon=0.001, games=100_000, tetris=tetris)

# Play until the agent reports that all games are finished.
while agent.next_turn():
    pass

In [None]:
import matplotlib.pyplot as plt

# Moving average of the per-game rewards over a 100-game window.
window = 100
kernel = np.ones(window) / window
smoothed_rewards = np.convolve(agent.rewards, kernel, mode='valid')

plt.plot(smoothed_rewards)
plt.ylabel('Reward')
plt.xlabel('Episode')

In [None]:
# Interesting to note that the number of non-zero rows increases in this case.

# Count the Q-table rows that contain at least one non-zero entry.
Q_table = agent.Q_table
non_zero_rows_mask = (Q_table != 0).any(axis=1)
num_non_zero_rows = non_zero_rows_mask.sum()


It is also interesting to note that, once the tile sequence is fixed, the agent learns to play it very quickly, and better than before.

## Q-Learning with a Random Tile Sequence

In [None]:
# QLAgent places tiles with teleport(), which QLTetris defines and the plain
# Tetris class does not, so the agent must be given a QLTetris instance.
# random_seed=False makes the game draw every tile at random.
tetris = QLTetris(rows=4, cols=4, max_tiles=50, random_seed=False)
agent = QLAgent(alpha=0.2, gamma=1, epsilon=0.001, games=1_000_000, tetris=tetris)

# Play until the agent reports that all games are finished.
while agent.next_turn():
    pass

In [None]:
import matplotlib.pyplot as plt

# Moving average of the per-game rewards over a 100-game window.
window = 100
kernel = np.ones(window) / window
smoothed_rewards = np.convolve(agent.rewards, kernel, mode='valid')

plt.plot(smoothed_rewards)
plt.ylabel('Reward')
plt.xlabel('Episode')

## Deep Q-Learning

In [None]:
import deeplay as dl
import torch

# Placeholder for the Deep Q-Learning agent: an empty subclass of
# dl.Application that adds no attributes or methods of its own yet.
class DQLAgent(dl.Application):
    pass


In Deep Q-Learning, Q-Network (QNet) and Target Network (TargetNet) are two key components, and they serve different purposes in the learning process:

1. **Q-Network (QNet):** 
    - **Primary Role:** QNet is the main neural network that is being trained to approximate the Q-value function. The Q-value function estimates the total expected reward that an agent can obtain, starting from a given state and performing a specific action. This network is actively updated at every step (or after a certain number of steps) during the training process.
    - **Learning:** QNet learns by continuously updating its weights based on the Temporal Difference (TD) error, which is the difference between the predicted Q-value and the target Q-value. The update is typically done using backpropagation and an optimization algorithm like Adam or RMSprop.

2. **Target Network (TargetNet):**
    - **Primary Role:** TargetNet is a separate neural network that has the same architecture as QNet but with a different set of weights. Its main role is to provide a stable target for the QNet to learn from. The weights of TargetNet are periodically updated with the weights of QNet.
    - **Stability:** The key reason for using a separate target network is to stabilize the learning process. In Q-learning, the target Q-value (used in calculating the TD error) depends on the Q-value estimates themselves. If the same network that is updated at every step is also used to compute these targets, the targets shift with every update, which can lead to instability and divergence. By using a separate TargetNet, whose weights are held fixed between periodic updates, to generate the target Q-values, the learning process becomes more stable.

In summary, QNet is the network that is actively learning and being updated continually, while TargetNet is used to generate stable target values for the QNet to learn from. The use of a TargetNet helps in stabilizing the training process by providing consistent targets for a period of time before being updated.