In [1]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# import datasets
# import transformers
import torch

from tqdm.notebook import trange
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

# from datasets import load_dataset
from sklearn.model_selection import train_test_split
import torch.nn as nn

from collections import defaultdict
import numpy as np

from utilities import get_dataloader_random_reshuffle

In [19]:
config = {
    "seed": 0, 
    "device": "cuda", 
    "features_dtype": torch.long,
    "ensemble_shape": (3,5), 
    "n_patches": [2, 4, 8],  # 3 values for n_patches
    "hidden_layer_dim": [12, 15, 18, 21, 24], # 5 values for hidden layer dimensions ( they are all divisible by 3 = n_head
    "n_heads": 3
}

In [3]:
torch.manual_seed(config["seed"])

<torch._C.Generator at 0xa371f30>

In [4]:
chess_features, chess_labels = torch.load('data/sample_dataset.pt')

  chess_features, chess_labels = torch.load('data/sample_dataset.pt')


In [5]:
features_train, features_valid, labels_train, labels_valid = train_test_split(
    chess_features, chess_labels, test_size=0.2, random_state=42
)

In [6]:
# We can write a function to check the number of classification classes a set of board games have
def get_out_d(lables): 
    unique_pairs = torch.unique(lables, dim=0)  # Find unique rows
    num_unique_pairs = unique_pairs.size(0)

    return num_unique_pairs


In [7]:
#patching the images
def patchify(images, n_patches):
    '''
    n is the number of images, 
    c is the number of channels, in our case it will be 9, 
    h is the height of the image and w is the width of the image, both be 8 in our case
    '''
    n, h, w, c = images.shape 

    assert h == w, "Patchify method is implemented for square images only"
    
    patches = torch.zeros(n, n_patches ** 2, h * w * c// n_patches ** 2, device=config['device'])
    patch_size = h // n_patches

    for idx, image in enumerate(images):
        for i in range(n_patches):
            for j in range(n_patches):
                patch = image[i * patch_size: (i + 1) * patch_size, j * patch_size: (j + 1) * patch_size, :]
                patches[idx, i * n_patches + j] = patch.flatten()
    return patches

In [8]:
#getting positional embeddings for each token. Here we used sin cos function (work by Vaswani et).
def get_positional_embeddings(sequence_length, d):
    result = torch.ones(sequence_length, d, device=config["device"])
    for i in range(sequence_length):
        for j in range(d):
            result[i][j] = np.sin(i / (10000 ** (j / d))) if j % 2 == 0 else np.cos(i / (10000 ** ((j - 1) / d)))
    return result

Idea for two move: encode can_move element of d vector to 0 (restricting movement to only piece selected)

We have two hyperparameters for processing the data, "n_patches" and "hidden_layer_dimension", we will attempt to try different combinations of them using ensemble. 

In [9]:
class PreViT(nn.Module):
  "Here we have initialization of the model and patching"
  def __init__(self, chw=(9, 8, 8), config = None):
    # Super constructor
    super(PreViT, self).__init__()

    assert config is not None, "Config must provide ensemble shape, n_patches, and hidden_layer_dim"
    # Attributes
    self.chw = chw # (C, H, W)
    self.ensemble_shape = config["ensemble_shape"]
    self.n_patches_values = config["n_patches"]  # List of 3 values for n_patches
    self.hidden_layer_dims = config["hidden_layer_dim"]

    assert len(self.n_patches_values) == self.ensemble_shape[0], "n_patches must have 3 values"
    assert len(self.hidden_layer_dims) == self.ensemble_shape[1], "hidden_layer_dim must have 5 values"

    for n_patches in self.n_patches_values:
      assert chw[1] % n_patches == 0, f"Height {chw[1]} is not divisible by n_patches={n_patches}"
      assert chw[2] % n_patches == 0, f"Width {chw[2]} is not divisible by n_patches={n_patches}"
    
    self.patch_size = (chw[1] / n_patches, chw[2] / n_patches)

    # Patching and Linear Mapping (to a vector of hidden_dim) "Tokenize"
    self.linear_mappers = nn.ModuleList([
            nn.ModuleList([
                nn.Linear(
                    int(chw[0] * (chw[1] / n_patches) * (chw[2] / n_patches)), hidden_dim
                )
                for hidden_dim in self.hidden_layer_dims
            ])
            for n_patches in self.n_patches_values
        ])
    
    # Add the special token for the start of each block
    self.class_tokens = nn.ParameterList([
            nn.ParameterList([
                nn.Parameter(torch.rand(1, hidden_dim))
                for hidden_dim in self.hidden_layer_dims
            ])
            for _ in self.n_patches_values
        ])
    
    # Add Positional Embeddings 
    self.pos_embeddings = nn.ParameterList([
    nn.ParameterList([
        nn.Parameter(
            torch.tensor(get_positional_embeddings(n_patches ** 2 + 1, hidden_dim))
        )
        for hidden_dim in self.hidden_layer_dims
    ])
    for n_patches in self.n_patches_values
])
    # Make sure that the Positional Embeddings are not learnable. 
    for param_list in self.pos_embeddings:
        for param in param_list:
            param.requires_grad = False
    
  def forward(self, images, ensemble_idx):
        """
            images: Input images of shape (# of games, C, H, W)
            ensemble_idx: A tuple (n_patches_idx, hidden_layer_dim_idx) indicating which ensemble configuration to use
        """
        n_patches_idx, hidden_layer_dim_idx = ensemble_idx
        n = images.shape[0]

        # Select the configuration
        n_patches = self.n_patches_values[n_patches_idx]
        linear_mapper = self.linear_mappers[n_patches_idx][hidden_layer_dim_idx]
        class_token = self.class_tokens[n_patches_idx][hidden_layer_dim_idx]
        pos_embedding = self.pos_embeddings[n_patches_idx][hidden_layer_dim_idx]

        # Patching the inputs
        patches = patchify(images, n_patches)  

        # Apply the linear mapper to the patches
        tokens = linear_mapper(patches)

        # Add the classification token
        tokens = torch.stack([torch.vstack((class_token, tokens[i])) for i in range(len(tokens))])

        # Add Positional Embeddings
        positional_embed = pos_embedding.repeat(n, 1, 1)
        out = tokens + positional_embed
        return out

In [27]:
# #Layer Normalziation which allows different hyperparameters to be applied per ensemble dimension or configuration. 
# class LayerNorm(nn.Module):
#     def __init__(
#         self,
#         config: dict,
#         normalized_shape: int | tuple[int],
#         bias=True,
#         elementwise_affine=True,
#         epsilon=1e-5,
#         normalized_offset=0
#     ):
#         super().__init__()

#         if hasattr(normalized_shape, "__int__"):
#             self.normalized_shape = (normalized_shape,)
#         else:
#             self.normalized_shape = normalized_shape

#         self.ensemble_shape = config["ensemble_shape"]
#         self.epsilon = epsilon
#         self.normalized_offset = normalized_offset

#         if elementwise_affine:
#             self.scale = torch.nn.Parameter(torch.ones(
#                 self.ensemble_shape + self.normalized_shape + (1,) * normalized_offset,
#                 device=config["device"],
#                 dtype=config["features_dtype"]
#             ))
#             if bias:
#                 self.bias = torch.nn.Parameter(torch.zeros_like(self.scale))
#             else:
#                 self.bias = None

#         else:
#             self.bias, self.scale = None, None


#     def forward(self, features: dict) -> dict:

#         ensemble_dim = len(self.ensemble_shape)
#         features = to_ensembled(self.ensemble_shape, features)

#         normalized_dim = len(self.normalized_shape)

#         batch_dim = len(features.shape) - ensemble_dim - normalized_dim - self.normalized_offset
#         normalized_range = tuple(range(
#             ensemble_dim,
#             ensemble_dim + batch_dim
#         )) + tuple(range(
#             -normalized_dim - self.normalized_offset,
#             -self.normalized_offset
#         ))

#         features = features - features.mean(dim=normalized_range, keepdim=True)
#         features = features / features.std(dim=normalized_range, keepdim=True)

#         if self.scale is not None:
#             scale = self.scale.unflatten(
#                 ensemble_dim,
#                 (1,) * batch_dim + self.normalized_shape[:1]
#             )

#             features = features * scale

#             if self.bias is not None:
#                 bias = self.bias.unflatten(
#                     ensemble_dim,
#                     (1,) * batch_dim + self.normalized_shape[:1]
#                 )
#                 features = features + bias

#         return features

In [11]:
# Here we apply multi-head self-attention to the treated tokens (input here has shape (N, 17, 18 = d))
class MyMSA(nn.Module):
    def __init__(self, config = None):
        super(MyMSA, self).__init__()

        assert config is not None, "Config dictionary must be provided"

        self.ensemble_shape = config["ensemble_shape"] 
        self.hidden_layer_dims = config["hidden_layer_dim"]
        self.n_heads = config["n_heads"]

        # Assert that every hidden_layer_dim is divisible by n_heads
        for hidden_dim in self.hidden_layer_dims:
            assert hidden_dim % self.n_heads == 0, f"Can't divide dimension {hidden_dim} into {self.n_heads} heads"

        self.ensemble_shape = config["ensemble_shape"]

            
        #creating que, key, and value mappings.
        self.q_mappings = nn.ModuleList([
            nn.ModuleList([nn.Linear(int(self.hidden_layer_dims[i] / self.n_heads), int(self.hidden_layer_dims[i] / self.n_heads)) for _ in range(self.n_heads)])
            for i in range(self.ensemble_shape[1])  # Create a separate set of heads per ensemble
        ])

        self.k_mappings = nn.ModuleList([
            nn.ModuleList([nn.Linear(int(self.hidden_layer_dims[i] / self.n_heads), int(self.hidden_layer_dims[i] / self.n_heads)) for _ in range(self.n_heads)])
            for i in range(self.ensemble_shape[1])  # Create a separate set of heads per ensemble
        ])

        self.v_mappings = nn.ModuleList([
            nn.ModuleList([nn.Linear(int(self.hidden_layer_dims[i] / self.n_heads), int(self.hidden_layer_dims[i] / self.n_heads)) for _ in range(self.n_heads)])
            for i in range(self.ensemble_shape[1])  # Create a separate set of heads per ensemble
        ])

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, sequences, ensemble_idx):
        # Sequences has shape (N, seq_length, token_dim)
        # Ensemble_idx has shape tuple (n_patches_idx, hidden_layer_dim_idx)
        # We go into shape    (N, seq_length, n_heads, token_dim / n_heads)
        # And come back to    (N, seq_length, item_dim)  (through concatenation)
        n_patches_idx, hidden_layer_dim_idx = ensemble_idx

        self.d_head = int(self.hidden_layer_dims[hidden_layer_dim_idx] / self.n_heads)

        result = []
        q_mapping_idx = self.q_mappings[hidden_layer_dim_idx]
        k_mapping_idx = self.k_mappings[hidden_layer_dim_idx]
        v_mapping_idx = self.v_mappings[hidden_layer_dim_idx]
        for sequence in sequences:
            seq_result = []
            for head in range(self.n_heads):
                q_mapping = q_mapping_idx[head]
                k_mapping = k_mapping_idx[head]
                v_mapping = v_mapping_idx[head]

                seq = sequence[:, head * self.d_head: (head + 1) * self.d_head]
                q, k, v = q_mapping(seq), k_mapping(seq), v_mapping(seq)

                attention = self.softmax(q @ k.T / (self.d_head ** 0.5))
                seq_result.append(attention @ v)
            result.append(torch.hstack(seq_result))
        return torch.cat([torch.unsqueeze(r, dim=0) for r in result])


In [12]:
class MyViTBlock(nn.Module):
    def __init__(self, mlp_ratio=4, config = None):
        super(MyViTBlock, self).__init__()

        assert config is not None, "Config dictionary must be provided"
        
        self.hidden_layer_dims = config["hidden_layer_dim"]
        self.n_heads = config["n_heads"]
        self.n_patches = config["n_patches"]
        self.ensemble_shape = config["ensemble_shape"]

        self.norm1 = nn.ModuleList([
            nn.LayerNorm(hidden_dim)
            for hidden_dim in self.hidden_layer_dims
        ])
        self.mhsa = MyMSA(config)
        self.norm2 = nn.ModuleList([
            nn.LayerNorm(hidden_dim)
            for hidden_dim in self.hidden_layer_dims
        ])

        self.dropout = nn.Dropout(.25)
        self.mlp = nn.ModuleList([
                nn.Sequential(
                    nn.Linear(hidden_dim, mlp_ratio* hidden_dim),  # MLP ratio fixed at 4
                    nn.GELU(),
                    nn.Linear(4 * hidden_dim, hidden_dim),
                )
            for hidden_dim in self.hidden_layer_dims
        ])

    def forward(self, x, ensemble_idx):
        n_patches_idx, hidden_layer_dim_idx = ensemble_idx
        norm1 = self.norm1[hidden_layer_dim_idx]
        norm2 = self.norm2[hidden_layer_dim_idx]
        mlp = self.mlp[hidden_layer_dim_idx]
        
        # Apply LayerNorm
        x_norm1 = norm1(x)

        # Multi-Head Self-Attention
        x_mhsa = self.mhsa(x_norm1, ensemble_idx)
        
        # Residual connection
        out = x + x_mhsa

        # Apply second LayerNorm
        x_norm2 = norm2(out)

        x_norm2 = self.dropout(x_norm2)

        # Apply MLP
        x_mlp = mlp(x_norm2)

        # Second residual connection
        out = out + x_mlp
        return out

In [13]:
n_blocks = 3

Having several heads per layer is similar to having several kernels in convolution.

Having several heads per layer allows one model to try out several pathways at once.

In [14]:
# n_blocks is the number of transformer blocks that this model want to include. here we will train with 3 blocks. 

class MyViT(nn.Module):
    def __init__(self, chw, config, ensemble_idx, labels, n_blocks=3, n_heads=3):
        # Super constructor
        super(MyViT, self).__init__()
        
        # Attributes
        self.chw = chw # ( C , H , W )
        self.config = config
        self.ensemble_idx = ensemble_idx
        self.n_blocks = n_blocks
        self.n_heads = n_heads
        self.hidden_layer_dims = config["hidden_layer_dim"]

        n_patches_idx, hidden_layer_dim_idx = ensemble_idx

        # Retrieve hidden_dim based on the ensemble configuration
        self.hidden_d = self.hidden_layer_dims[hidden_layer_dim_idx]

        # Dynamically determine the number of output classes
        self.out_d = get_out_d(labels)
        
        # 1) PreViT: Prepare Data (Tokenization, Positional Embeddings, and Classification Token)
        self.previt = PreViT(chw=chw, config=config)
        
        # 2) Transformer Encoder Blocks
        self.blocks = nn.ModuleList([
            MyViTBlock(config = config) for _ in range(n_blocks)
        ])
        
        # 3) Classification MLP
        self.mlp = nn.Sequential(
            nn.Linear(self.hidden_d, self.out_d),
            nn.Softmax(dim=-1)
        )

    def forward(self, images):
        """
        Args:
            images: Input tensor of shape (batch_size, C, H, W)
        """
        # PreViT: Tokenize input and add positional embeddings
        tokens = self.previt(images, self.ensemble_idx)
        
        # Transformer Blocks: Process the tokens
        for block in self.blocks:
            tokens = block(tokens, self.ensemble_idx)

        # Classification Token: Take the first token
        cls_token = tokens[:, 0]
        
        return self.mlp(cls_token) # Map to output dimension, output category distribution

In [32]:
# # Initialize the MyViT models
# config = {
#     "seed": 0, 
#     "device": "mps", 
#     "features_dtype": torch.float32,
#     "hidden_layer_dim" : 18, 
#     "ensemble_shape": (3,5), 
#     "n_patches": [2, 4, 8],  # 3 values for n_patches
#     "hidden_layer_dim": [12, 15, 18, 21, 24], # 5 values for hidden layer dimensions ( they are all divisible by 3 = n_head
#     "n_heads": 3
# }
# ensemble_idx = (1,2)
# chw = (9,8,8)
# model = MyViT(chw=chw, ensemble_idx=ensemble_idx, labels=labels_train, n_blocks=3, n_heads=3, config = config)

# # Forward pass
# output = model(features_train)

In [16]:
# config = {
#     "seed": 0, 
#     "device": "mps", 
#     "features_dtype": torch.float32,
#     "ensemble_shape": (3,5), 
#     "n_patches": [2, 4, 8],  # 3 values for n_patches
#     "hidden_layer_dim": [12, 15, 30, 21, 24], # 5 values for hidden layer dimensions ( they are all divisible by 3 = n_head
#     "n_heads": 3
# }
# # Iterate over all combinations of the ensemble configurations
# for n_patches_idx in range(config["ensemble_shape"][0]):  
#     for hidden_layer_dim_idx in range(config["ensemble_shape"][1]):  
#         # Define ensemble index
#         ensemble_idx = (n_patches_idx, hidden_layer_dim_idx)
#         chw = (9, 8, 8)
#         # Initialize the model for the current ensemble configuration
#         model = MyViT(
#             chw=chw,
#             config=config,
#             ensemble_idx=ensemble_idx,
#             labels=labels_train,
#             n_blocks=3,
#             n_heads=3
#         )
        
#         # Forward pass
#         output = model(features_train)
#         print(output.shape)

In [None]:

# Loading data

config = {
    "seed": 0, 
    "device": "cuda:0", 
    "features_dtype": torch.float32,
    "ensemble_shape": (1,1), 
    "n_patches": [1], 
      # 3 values for n_patches
    "hidden_layer_dim": [36],
      # 5 values for hidden layer dimensions 
      # ( they are all divisible by 3 = n_head
    "n_heads":3,
    "training_steps": 100,
    "minibatch_size": 128
    }

chess_features, chess_labels = torch.load('data/dataset.pt', map_location=config["device"])

features_train, features_valid, labels_train, labels_valid = train_test_split(
chess_features, chess_labels[:,0], test_size=0.2, random_state=42)

train_loader = get_dataloader_random_reshuffle(config, features_train, labels_train)
valid_loader = get_dataloader_random_reshuffle(config, features_valid, labels_valid)


# train_dataset = TensorDataset(features_train, labels_train)
# train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)

# valid_dataset = TensorDataset(features_valid, labels_valid)
# valid_loader = DataLoader(valid_dataset, batch_size=128, shuffle=False)
# Defining model and training options
        
device = config["device"]
N_EPOCHS = 5
LR = 0.001
# Here I want to find the best model out of the 15 models 
best_model = None
best_loss = float('inf')

# Training loop
for n_patches_idx in range(config["ensemble_shape"][0]):  
    for hidden_layer_dim_idx in range(config["ensemble_shape"][1]):  
        ensemble_idx = (n_patches_idx, hidden_layer_dim_idx)
        chw = (9, 8, 8)

        model = MyViT(
            chw=chw,
            config=config,
            ensemble_idx=ensemble_idx,
            labels=labels_train,
            n_blocks=3,
            n_heads=3).to(device)
        

        optimizer = Adam(model.parameters(), lr=LR)
        criterion = CrossEntropyLoss()

        
        for epoch in trange(N_EPOCHS, desc="Training"):
            train_loss = 0.0
            model.train()
            for step in trange(config['training_steps'], desc=f"Epoch {epoch + 1} in training", leave=False):
                features_train, labels_train = next(train_loader)
                x = features_train[0,0].to(device)
                y = labels_train[0, 0].to(device).to(torch.long)


                y_hat = model(x)
                # base = 100
                # y_1dim = y[:, 0] * base + y[:, 1]
                # y_1dim = y_1dim.to(torch.long)
                        
                loss = criterion(y_hat, y)

                train_loss = loss
                # train_loss += loss.detach().cuda().item() / len(features_train)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # print(f"Epoch {epoch + 1}/{N_EPOCHS} loss: {train_loss:}")

            # Validation phase
            val_loss = 0.0
            correct = 0
            total = 0
            model.eval()
            with torch.no_grad():
                # for batch in valid_loader:
                features_valid, labels_valid = next(valid_loader)
                x = features_valid[0,0].to(device)
                y = labels_valid[0,0].to(device).to(torch.long)

                y_hat = model(x)
                # base = 100
                # y_1dim = y[:, 0] * base + y[:, 1]  
                # loss = criterion(y_hat, y_1dim)

                val_loss += loss.item() / len(features_valid)
                _, predicted = torch.max(y_hat, 1)
                correct += (predicted == y).sum().item()
                total += y.size(0)

                print(f"Validation Loss for model {ensemble_idx}: {val_loss}")
                print(f"Validation Accuracy: {100 * correct / total}%")

            # Check if this model is the best
            if val_loss < best_loss:
                best_loss = val_loss
                best_model_state = model.state_dict()  # Save model state
                best_ensemble_idx = ensemble_idx  # Optionally track the configuration

print(f"Best model is from ensemble configuration with validation loss: {best_loss}")

  chess_features, chess_labels = torch.load('data/dataset.pt', map_location=config["device"])
  torch.tensor(get_positional_embeddings(n_patches ** 2 + 1, hidden_dim))


Training:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1 in training:   0%|          | 0/100 [00:00<?, ?it/s]

Validation Loss for model (0, 0): 4.094013214111328
Validation Accuracy: 8.59375%


Epoch 2 in training:   0%|          | 0/100 [00:00<?, ?it/s]

Validation Loss for model (0, 0): 4.040075302124023
Validation Accuracy: 10.15625%


Epoch 3 in training:   0%|          | 0/100 [00:00<?, ?it/s]

Validation Loss for model (0, 0): 4.052342414855957
Validation Accuracy: 10.9375%


Epoch 4 in training:   0%|          | 0/100 [00:00<?, ?it/s]

Validation Loss for model (0, 0): 4.073372840881348
Validation Accuracy: 10.9375%


Epoch 5 in training:   0%|          | 0/100 [00:00<?, ?it/s]

Validation Loss for model (0, 0): 4.039544582366943
Validation Accuracy: 11.71875%
Best model is from ensemble configuration with validation loss: 4.039544582366943


In [None]:
model.load_state_dict(best_model_state)
# Test loop (Run the best model on the validation dataset)
model.eval()
val_loss = 0.0
correct = 0
total = 0

with torch.no_grad():
    features_train, labels_train = next(train_loader)
    x = features_train[0,0].to(device)
    y = labels_train[0, 0].to(device).to(torch.long)
    y_hat = model(x)
    # base = 100
    # y_1dim = y[:, 0] * base + y[:, 1]       
    # loss = criterion(y_hat, y_1dim)
    val_loss += loss.item() / len(features_valid)

        # Accuracy
    _, predicted = torch.max(y_hat, 1) #finding the best class
    correct += (predicted == y).sum().item()
    total += y.size(0)
print(f"Best Model Validation Loss: {val_loss:}")
print(f"Best Model Validation Accuracy: {100 * correct / total}%")

Best Model Validation Loss: 4.039544582366943
Best Model Validation Accuracy: 15.625%


: 