# Attention Transformer NN for King Heritage Data

## Run 16

`Test_32*-Test_34*`

Same as Train_14, but with following changes:

- Train on all possible indices aka 2503 inds
- Data fixed so that first row/last col are NOT deleted and instead the filled matrix filling is done properly w.r.t. the indices
- Much bigger dataset, due to more indices being trained on, and also wanting more representations for each indice 
- Much bigger batch size, from having bigger dataset
- Slightly larger learning rate, `0.0001` -> `0.0003`
- Slightly larger embed size, 120 -> 200

Same max sequence length, validation %, padval. 

## Setup

In [1]:
#!conda activate jupyter_env
#!pip install -r "../requirements.txt"
# !pip install gputil

In [3]:
## Import meta setup

# In order to force reload any changes done to the models package files
%load_ext autoreload
%autoreload 2

# Allow import from our custom lib python files
import sys
import os

# module_path = os.path.abspath(os.path.join('../'))
module_path = os.path.abspath(os.path.join('../src/'))
if module_path not in sys.path:
    sys.path.append(module_path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Fix for working in TLJH context

Issue with multithreading spawning with the jupyter hub setup (The Littlest Jupyter Hub on Paperspace) and working with torch, and with the dataloader multithread loading. See the following for issue discussion and solution: 

https://github.com/pytorch/pytorch/issues/40403#issuecomment-1704178443

In [3]:
import torch.multiprocessing as mp 
mp.set_start_method('spawn')

In [30]:
import os
import json


from lib.params import * # device, use_cuda, Checkpoint, various saving strs
from lib.datasets import TokenizedKingDataset, TokenizedCollateFn
from lib.models import TokenizedInputTransformer
from lib.saveload import *
from lib.training import train_model_tokenized, tokenized_masked_loss
import lib.notebook_utils as custom_info

import dill
import pandas as pd
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader, random_split, Subset
import numpy as np

### Debug Machine Info

In [5]:
custom_info.print_python_info()
custom_info.print_imports(globals())
custom_info.print_machine_info()

Current Python executable: /opt/tljh/user/bin/python	3.10.10 3.10.10 | packaged by conda-forge | (main, Mar 24 2023, 20:08:06) [GCC 11.3.0]
Current Directory: /home/jupyter-bhavana/Notebooks
sys==Python BuiltIn
torch==2.1.0
pandas==2.0.0
numpy==1.24.4
os==unknown
json==unknown
datetime==unknown
lib==unknown
System: Linux
Node Name: psapmaq8sxf1
Release: 5.15.0-124-generic
Version: #134-Ubuntu SMP Fri Sep 27 20:20:17 UTC 2024
Machine: x86_64
Processor: x86_64
Physical cores: 8
Total cores: 8
Max Frequency: 0.00Mhz
Min Frequency: 0.00Mhz
Current Frequency: 3202.58Mhz
CPU Usage Per Core:
Core 0: 0.0%
Core 1: 0.0%
Core 2: 0.0%
Core 3: 0.0%
Core 4: 0.0%
Core 5: 0.0%
Core 6: 0.0%
Core 7: 0.0%
Total CPU Usage: 0.7%
Total: 44.07GB
Available: 42.53GB
Used: 1.07GB
Percentage: 3.5%
Total: 0.00B
Free: 0.00B
Used: 0.00B
Percentage: 0.0%
Partitions and Usage:
=== Device: /dev/mapper/ubuntu--vg-root ===
  Mountpoint: /
  File system type: ext4
  Total Size: 982.45GB
  Used: 170.79GB
  Free: 771.63GB


## Global parameters

In [33]:
runname = "test16"
machine = "Paperspace"
datapath = "../Data/king_matrix.csv"
outdir = os.path.join("../Output/Runs/", runname)
tensorboard_dir = "../Output/Tensorboard"
SEED = 42

if not os.path.exists(outdir):
        os.mkdir(outdir)
if not os.path.exists(tensorboard_dir):
    os.mkdir(tensorboard_dir)

print(f"Using {device} device")

Using cpu device


## Load Data


In [31]:
dsize = 40000
vstart = int(dsize * 0.8)
maxseqlen = 100
maxind = 2502
padval = -1
batchsize = 150

dataset = TokenizedKingDataset(datapath, filled=False, normalize=True, padval=padval, dsize=dsize, maxseqlen=maxseqlen, maxind=maxind, remove_starts=False)
dataset_filled = TokenizedKingDataset(datapath, filled=True, normalize=True, padval=padval, dsize=dsize, maxseqlen=maxseqlen, maxind=maxind, remove_starts=False)

# Don't need to random sample to make subsets since they're already pretty random, and want even distribution
# of representative indices in both train/test
train, test = Subset(dataset, range(vstart)), Subset(dataset, range(vstart, dsize))
train_filled, test_filled = Subset(dataset_filled, range(vstart)), Subset(dataset_filled, range(vstart, dsize))

dl_args = dict(batch_size=batchsize, shuffle=True, num_workers=6, collate_fn=TokenizedCollateFn(dataset.padind, dataset.padval).collate_fn)
train_dataloader, test_dataloader = DataLoader(train, **dl_args), DataLoader(test, **dl_args)
train_filled_dataloader, test_filled_dataloader = DataLoader(train_filled, **dl_args), DataLoader(test_filled, **dl_args)


print(dataset_filled.X, dataset.X)
print(len(dataset), len(dataset[-1][0]), dataset[0][0].shape, dataset[0][1].shape)
print(len(train_filled), len(train_filled[0][0]), train_filled[2501][0].shape, train_filled[2501])
# print(next(iter(test_filled_dataloader)))

All (shuffled) inds are repeated 818 times to get full dataset.
All (shuffled) inds are repeated 818 times to get full dataset.
tensor([[0.7205, 0.7139, 0.7188,  ..., 0.7002, 0.6957, 0.6951],
        [0.7139, 0.7205, 0.7202,  ..., 0.7157, 0.7259, 0.7094],
        [0.7188, 0.7202, 0.7205,  ..., 0.6952, 0.6962, 0.6999],
        ...,
        [0.7002, 0.7157, 0.6952,  ..., 0.7205, 0.7323, 0.7135],
        [0.6957, 0.7259, 0.6962,  ..., 0.7323, 0.7205, 0.7337],
        [0.6951, 0.7094, 0.6999,  ..., 0.7135, 0.7337, 0.7205]]) tensor([[0.7205, 0.7139, 0.7188,  ..., 0.7002, 0.6957, 0.6951],
        [0.7205, 0.7205, 0.7202,  ..., 0.7157, 0.7259, 0.7094],
        [0.7205, 0.7205, 0.7205,  ..., 0.6952, 0.6962, 0.6999],
        ...,
        [0.7205, 0.7205, 0.7205,  ..., 0.7205, 0.7323, 0.7135],
        [0.7205, 0.7205, 0.7205,  ..., 0.7205, 0.7205, 0.7337],
        [0.7205, 0.7205, 0.7205,  ..., 0.7205, 0.7205, 0.7205]])
40000 5 torch.Size([2]) torch.Size([2, 100])
32000 2 torch.Size([28]) (tenso

## Create Model(s)

All use tokenized input ("_t_")

32. Main model; one head
33. 2 heads attention
34. Non-filled input matrix

In [35]:
# Using random amt a bit greater than maxseqlen; and d_model = embed_size
d_model = 120

model_names = ["Test_32_t_f_1h", "Test_33_t_f_2h", "Test_34_t_1h"]
base_params = dict(d_model=d_model,
                    num_encoder_layers=3,
                    num_decoder_layers=2,
                    dim_feedforward=512,
                    activation=nn.Tanh(),
                    use_pe=True,
                    dropout_pe=0.0,
                    maxseqlen=maxseqlen, 
                    maxind=maxind
                  )

run_details = {"run_params": dict(
                    machine=machine,
                    epochs = 100,
                    checkpoint_at = 20,
                    load=False,
                    batch_pr=int(dsize / batchsize / 5), # Print/validate every 1/5 of epoch
                    runname=runname
                    ),
                model_names[0]: dict(
                    num_head=1,
                    name=model_names[0],
                    ) | base_params,
                model_names[1]: dict(
                    num_head=2,
                    name=model_names[1],
                    ) | base_params,
                model_names[2]: dict(
                    num_head=1,
                    name=model_names[2],
                    ) | base_params,
                }
models = [TokenizedInputTransformer(**run_details[m]).to(device) for m in model_names]

assert models[0].padind == dataset_filled.padind

print(models)

# Save details
with open(os.path.join(outdir, f"details_{runname}.json"), "w" ) as write:
    json.dump(run_details, write, indent=2, default=lambda x: f"nn.{x.__class__.__name__}")

[TokenizedInputTransformer(
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=120, out_features=120, bias=True)
        )
        (linear1): Linear(in_features=120, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=120, bias=True)
        (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (activation): Tanh()
      )
    )
    (norm): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x Transform

### Test model

In [22]:
# tstdata = next(iter(train_filled_dataloader))
# f = models[0](tstdata[0], tstdata[2])
# print(f.shape, f)

torch.Size([20, 98, 100]) tensor([[[ 0.5424, -0.5737,  0.3865,  ...,  0.3032, -0.1889, -0.0847],
         [ 0.2757,  0.7767, -0.7470,  ...,  0.9186,  0.5960,  0.9575],
         [-0.4169, -0.8034,  0.1595,  ...,  0.5477, -0.4676,  0.0498],
         ...,
         [-0.6248,  0.0357,  0.1504,  ...,  1.4623,  0.3608,  0.1969],
         [-0.0325, -0.0228,  0.3965,  ...,  0.9481,  0.1958, -0.7405],
         [-0.0075,  0.1558,  0.2931,  ...,  1.7159,  0.1304, -0.2127]],

        [[ 0.2671,  0.4824,  0.1180,  ..., -0.0022, -1.0225,  0.2591],
         [ 0.1399,  0.2514, -0.0843,  ...,  1.0780, -0.2568,  0.6659],
         [-0.4793, -0.0486, -0.1473,  ...,  0.2168,  0.1735, -0.0826],
         ...,
         [-0.4887, -0.3468, -0.7329,  ...,  0.5380, -0.2615,  0.4388],
         [ 0.6219, -0.4193, -0.5936,  ...,  0.0726, -0.0848,  0.2610],
         [ 0.3732, -0.1915, -1.0530,  ...,  0.0859,  0.1695,  0.5922]],

        [[-0.4227,  0.8049,  0.6250,  ...,  1.0032, -0.1554,  0.0846],
         [ 0.3641, 

## Train the Model(s)

In [36]:
# %%capture cap --no-stderr

loss_fcn = tokenized_masked_loss

for model, train_d, test_d in zip(models, 
                                  (train_filled_dataloader, train_filled_dataloader, train_dataloader), 
                                  (test_filled_dataloader, test_filled_dataloader, test_dataloader)):

    writer = SummaryWriter(os.path.join(tensorboard_dir, f'{machine}_{model.get_name()}_{runname}'))
    # Set foreach=False to avoid OOM
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0003, foreach=False)
    
    train_model_tokenized(model=model,
                optimizer=optimizer,
                train_data=train_d,
                validate_data=test_d,
                loss_fcn=loss_fcn,
                padval=dataset_filled.padval,
                output_run_dir=outdir,
                **run_details["run_params"],
                writer=writer,
                output_onnx=False
            )
    
    
    writer.close()


Training Test_32_t_f_1h


  return torch._native_multi_head_attention(


[0, 53] loss: 0.09224236686274691, validation loss: 0.004492702721445648, average train time (sec): 0.02510899011319435


KeyboardInterrupt: 