# Model study

In [1]:
import sys
from pathlib import Path

# Root of the project checkout; change this to match your machine. It is placed at
# the front of `sys.path` so that project-local packages such as `utils` can be
# imported by the cells below.
# The path is assembled with `Path` joins instead of hard-coded "\\" separators so
# the same cell works on Windows, Linux and macOS.
project_path = str(Path.home() / "Projects" / "rlgym_demo")

sys.path.insert(0, project_path)

In [2]:
import torch
from utils.algorithms import DeviceAlternatingPPO

In [3]:
# Checkpoint archive to inspect, given relative to the notebook's working directory.
checkpoint_path = "../models/NectoTest_Perceiver/model_417280000_steps.zip"
model = DeviceAlternatingPPO.load(checkpoint_path)

In [6]:
# The actor and critic sub-networks both hang off the policy's MLP extractor;
# resolve that shared parent once and pull the two attributes from it.
mlp_extractor = model.policy.mlp_extractor
actor, critic = mlp_extractor.actor, mlp_extractor.critic

## Actor

### Query

In [18]:
# Materialise the parameters of the actor's query pre-processing layer into a list
# so the individual tensors can be indexed (0 and 1) by the cells that follow.
actor_query_params = [*actor.query_preprocess.parameters()]

In [19]:
actor_query_params[0].shape

torch.Size([128, 32])

In [20]:
actor_query_params[1].shape

torch.Size([128])

In [24]:
actor_query_params[0].min()

tensor(-0.8223, device='cuda:0', grad_fn=<MinBackward1>)

In [23]:
actor_query_params[0].max()

tensor(0.7199, device='cuda:0', grad_fn=<MaxBackward1>)

### Key/value (KV)

In [25]:
# Materialise the parameters of the actor's `kv_preprocess` layer into a list so
# the individual tensors can be indexed (0 and 1) by the cells that follow.
actor_kv_params = [*actor.kv_preprocess.parameters()]

In [26]:
actor_kv_params[0].shape

torch.Size([128, 24])

In [27]:
actor_kv_params[1].shape

torch.Size([128])

In [28]:
actor_kv_params[0].min()

tensor(-0.6827, device='cuda:0', grad_fn=<MinBackward1>)

In [29]:
actor_kv_params[0].max()

tensor(0.6734, device='cuda:0', grad_fn=<MaxBackward1>)

### Blocks

In [31]:
# Short alias for the actor's Perceiver blocks; the sections below index it as
# actor_blocks[0] and actor_blocks[1].
actor_blocks = actor.perceiver_blocks

#### Block 0

In [37]:
actor_blocks[0]

PerceiverBlock(
  (cross_attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
  )
  (linear1): Linear(in_features=128, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=128, bias=True)
  (activation): ReLU()
)

##### Cross-attention (CA)

In [52]:
# Cross-attention sub-module of block 0 (shown as a MultiheadAttention in the
# block repr above).
actor_b0_ca = actor_blocks[0].cross_attention

In [56]:
# Materialise block 0's cross-attention parameters into a list; the cells below
# inspect indices 0-3 (recorded shapes: [384, 128], [384] and [128, 128] for 0-2).
actor_b0_ca_params = [*actor_b0_ca.parameters()]

In [57]:
actor_b0_ca_params[0].shape

torch.Size([384, 128])

In [62]:
actor_b0_ca_params[1].shape

torch.Size([384])

In [58]:
actor_b0_ca_params[0].min()

tensor(-0.5701, device='cuda:0', grad_fn=<MinBackward1>)

In [59]:
actor_b0_ca_params[0].max()

tensor(0.5690, device='cuda:0', grad_fn=<MaxBackward1>)

In [60]:
actor_b0_ca_params[1].min()

tensor(-0.3604, device='cuda:0', grad_fn=<MinBackward1>)

In [61]:
actor_b0_ca_params[1].max()

tensor(0.4111, device='cuda:0', grad_fn=<MaxBackward1>)

In [63]:
actor_b0_ca_params[2].shape

torch.Size([128, 128])

In [64]:
actor_b0_ca_params[2].min()

tensor(-0.6607, device='cuda:0', grad_fn=<MinBackward1>)

In [65]:
actor_b0_ca_params[2].max()

tensor(0.6100, device='cuda:0', grad_fn=<MaxBackward1>)

In [66]:
actor_b0_ca_params[3].min()

tensor(-0.1318, device='cuda:0', grad_fn=<MinBackward1>)

In [67]:
actor_b0_ca_params[3].max()

tensor(0.0228, device='cuda:0', grad_fn=<MaxBackward1>)

##### Linear1

In [70]:
# Parameters of block 0's first linear layer, as an indexable list.
actor_b0_l1_params = [*actor_blocks[0].linear1.parameters()]

In [74]:
actor_b0_l1_params[0].shape

torch.Size([128, 128])

In [76]:
actor_b0_l1_params[0].min()

tensor(-0.6955, device='cuda:0', grad_fn=<MinBackward1>)

In [77]:
actor_b0_l1_params[0].max()

tensor(0.6021, device='cuda:0', grad_fn=<MaxBackward1>)

##### Linear2

In [78]:
# Parameters of block 0's second linear layer, as an indexable list
# (indices 0 and 1 are inspected below).
actor_b0_l2_params = [*actor_blocks[0].linear2.parameters()]

In [79]:
actor_b0_l2_params[0].shape

torch.Size([128, 128])

In [81]:
actor_b0_l2_params[0].min()

tensor(-0.6539, device='cuda:0', grad_fn=<MinBackward1>)

In [82]:
actor_b0_l2_params[0].max()

tensor(0.6930, device='cuda:0', grad_fn=<MaxBackward1>)

In [83]:
actor_b0_l2_params[1].max()

tensor(0.0906, device='cuda:0', grad_fn=<MaxBackward1>)

In [84]:
actor_b0_l2_params[1].min()

tensor(-0.1285, device='cuda:0', grad_fn=<MinBackward1>)

#### Block 1

In [85]:
# Show the layer layout of block 1, the block examined in this section
# (every other cell here indexes actor_blocks[1]).
actor_blocks[1]

PerceiverBlock(
  (cross_attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
  )
  (linear1): Linear(in_features=128, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=128, bias=True)
  (activation): ReLU()
)

##### Cross-attention (CA)

In [86]:
# Cross-attention sub-module of block 1, mirroring the block 0 analysis.
actor_b1_ca = actor_blocks[1].cross_attention

In [87]:
# Materialise block 1's cross-attention parameters into a list; the cells below
# inspect indices 0-3 (recorded shapes: [384, 128], [384] and [128, 128] for 0-2).
actor_b1_ca_params = [*actor_b1_ca.parameters()]

In [88]:
actor_b1_ca_params[0].shape

torch.Size([384, 128])

In [89]:
actor_b1_ca_params[1].shape

torch.Size([384])

In [90]:
actor_b1_ca_params[0].min()

tensor(-0.5180, device='cuda:0', grad_fn=<MinBackward1>)

In [91]:
actor_b1_ca_params[0].max()

tensor(0.5331, device='cuda:0', grad_fn=<MaxBackward1>)

In [92]:
actor_b1_ca_params[1].min()

tensor(-0.2598, device='cuda:0', grad_fn=<MinBackward1>)

In [93]:
actor_b1_ca_params[1].max()

tensor(0.3051, device='cuda:0', grad_fn=<MaxBackward1>)

In [94]:
actor_b1_ca_params[2].shape

torch.Size([128, 128])

In [95]:
actor_b1_ca_params[2].min()

tensor(-0.5312, device='cuda:0', grad_fn=<MinBackward1>)

In [96]:
actor_b1_ca_params[2].max()

tensor(0.6229, device='cuda:0', grad_fn=<MaxBackward1>)

In [97]:
actor_b1_ca_params[3].min()

tensor(-0.1230, device='cuda:0', grad_fn=<MinBackward1>)

In [98]:
actor_b1_ca_params[3].max()

tensor(0.0861, device='cuda:0', grad_fn=<MaxBackward1>)

##### Linear1

In [99]:
# Parameters of block 1's first linear layer, as an indexable list.
actor_b1_l1_params = [*actor_blocks[1].linear1.parameters()]

In [100]:
actor_b1_l1_params[0].shape

torch.Size([128, 128])

In [101]:
actor_b1_l1_params[0].min()

tensor(-0.6809, device='cuda:0', grad_fn=<MinBackward1>)

In [102]:
actor_b1_l1_params[0].max()

tensor(0.6743, device='cuda:0', grad_fn=<MaxBackward1>)

##### Linear2

In [103]:
# Parameters of block 1's second linear layer, as an indexable list
# (indices 0 and 1 are inspected below).
actor_b1_l2_params = [*actor_blocks[1].linear2.parameters()]

In [104]:
actor_b1_l2_params[0].shape

torch.Size([128, 128])

In [105]:
actor_b1_l2_params[0].min()

tensor(-0.6620, device='cuda:0', grad_fn=<MinBackward1>)

In [106]:
actor_b1_l2_params[0].max()

tensor(0.7386, device='cuda:0', grad_fn=<MaxBackward1>)

In [107]:
actor_b1_l2_params[1].max()

tensor(0.0908, device='cuda:0', grad_fn=<MaxBackward1>)

In [108]:
actor_b1_l2_params[1].min()

tensor(-0.0910, device='cuda:0', grad_fn=<MinBackward1>)