In [1]:
import json
import os

from sparse_ffn.sparsity_types import SparsityType

# Baseline GPT

In [18]:
# Hyperparameters of the baseline GPT model (GPT-Neo style config keys).
config = dict(

    # EMBEDDING PARAMETERS
    vocab_size              = 10_000,   # number of tokens in the vocabulary
    hidden_size             = 256,      # embedding size (vector length) of each token
    max_position_embeddings = 512,      # maximum sequence length (context window)

    # BLOCKS (ATTN & FFN)
    num_layers          = 2,                          # number of transformer blocks
    attention_types     = [[["global", "local"], 1]], # (GPT-Neo-specific) global and local attention
    num_heads           = 4,                          # attention heads
    window_size         = 256,                        # (GPT-Neo-specific) for local attention
    intermediate_size   = 256 * 16,                   # size of 'up-projection' layer in FFN

    pad_token_id = 0,           # need to specify this for tokenizer interop between models
)

# Create the output directory first so this cell also works on a fresh
# checkout where "model_configs/" does not exist yet.
os.makedirs("model_configs", exist_ok=True)
with open("model_configs/gpt_baseline.json", "w") as outfile:
    json.dump(config, outfile)


# Baseline RoBERTa

In [19]:

# Hyperparameters of the baseline RoBERTa model.
config = dict(

    # EMBEDDING PARAMETERS
    vocab_size              = 10_000,   # number of tokens in the vocabulary
    hidden_size             = 256,      # embedding size (vector length) of each token
    # we add 1 as RoBERTa uses a special position embedding for the padding token (zero vector)
    max_position_embeddings = 512 + 1,

    # BLOCKS (of course naming is different in roberta :) )
    num_hidden_layers   = 2,            # number of transformer blocks
    num_attention_heads = 4,            # attention heads
    intermediate_size   = 256 * 16,     # size of 'up-projection' layer in FFN

    pad_token_id = 0,
)

# Create the output directory first so this cell also works on a fresh
# checkout where "model_configs/" does not exist yet.
os.makedirs("model_configs", exist_ok=True)
with open("model_configs/roberta_baseline.json", "w") as outfile:
    json.dump(config, outfile)

# MOE

In [17]:
# Writes one GPT and one RoBERTa config per MoE sparsity level to
# model_configs/{gpt,roberta}_moe_{low,medium,high}.json.
dim_in = 256
intermediate_factor = 16 # for baseline ffn: intermediate size = intermediate_factor * dim_in
sparsity_type = SparsityType.MOE
num_experts = 4
topks = [('low', 3), ('medium', 2), ('high', 1)]  # (sparsity-level label, topk)
# NOTE(review): presumably sized so that all experts together roughly match the
# baseline FFN of size intermediate_factor * dim_in -- TODO confirm.
intermediate_size = round( (intermediate_factor * dim_in - num_experts) / num_experts )
print("intermediate layer size is", intermediate_size)

# Create the output directory first so this cell also works on a fresh
# checkout where "model_configs/" does not exist yet.
os.makedirs("model_configs", exist_ok=True)

for st, topk in topks:
    print(st, topk)

    # Sparse-FFN settings shared by both architectures at this sparsity level.
    ffn_overrides = dict(
        sparsity_type=sparsity_type,
        num_experts=num_experts,
        topk=topk,
        intermediate_size=intermediate_size, # size of 'up-projection' layer in FFN
    )

    config_gpt = dict(

        # EMBEDDING PARAMETERS
        vocab_size              = 10_000,   # number of tokens in the vocabulary
        hidden_size             = dim_in,   # embedding size (vector length) of each token
        max_position_embeddings = 512,      # maximum sequence length (context window)

        # BLOCKS (ATTN)
        num_layers      = 2,                          # number of transformer blocks. div by 2 for attention_types
        attention_types = [[["global", "local"], 1]], # (GPT-Neo-specific) global and local attention
        num_heads       = 4,                          # attention heads
        window_size     = 256,                        # (GPT-Neo-specific) for local attention

        # kept identical to the GPT baseline config, which specifies this for
        # tokenizer interop between models
        pad_token_id = 0,

        **ffn_overrides
    )

    with open(f"model_configs/gpt_moe_{st}.json", "w") as outfile:
        json.dump(config_gpt, outfile)

    config_rob = dict(

        # EMBEDDING PARAMETERS
        vocab_size              = 10_000,
        hidden_size             = dim_in,   # same width as the GPT variant (was a hard-coded 256)
        # we add 1 as RoBERTa uses a special position embedding for the padding token (zero vector)
        max_position_embeddings = 512 + 1,

        # BLOCKS (of course naming is different in roberta :) )
        num_hidden_layers   = 2,
        num_attention_heads = 4,
        pad_token_id        = 0,

        **ffn_overrides
    )

    with open(f"model_configs/roberta_moe_{st}.json", "w") as outfile:
        json.dump(config_rob, outfile)

intermediate layer size is 1023
low 3
medium 2
high 1


# CNT

In [21]:
# Shared settings for the CNT configs generated in the next cell.
dim_in = 256
# for baseline ffn: intermediate size = intermediate_factor * dim_in
intermediate_factor = 16
sparsity_type = SparsityType.CNT
# (sparsity-level label, num_blocks) pairs
sparsity_levels = [
    ('high', 16),
    ('medium', 32),
    ('low', 64),
]



In [24]:
# Writes one GPT and one RoBERTa config per CNT sparsity level to
# model_configs/{gpt,roberta}_cnt_{high,medium,low}.json.
# Relies on dim_in, intermediate_factor, sparsity_type and sparsity_levels
# from the CNT setup cell above.

# Create the output directory first so this cell also works on a fresh
# checkout where "model_configs/" does not exist yet.
os.makedirs("model_configs", exist_ok=True)

for st, num_blocks in sparsity_levels:
    dim_lowrank = round(dim_in/num_blocks)
    # NOTE(review): presumably chosen to keep the parameter count close to the
    # baseline FFN of size intermediate_factor * dim_in -- TODO confirm.
    intermediate_size = dim_in * (2 * intermediate_factor * num_blocks  - 1) / (2*num_blocks+1)
    intermediate_size = num_blocks * round(intermediate_size/ num_blocks) # round to nearest multiple of num blocks
    print("intermediate layer size is", intermediate_size)

    # Sparse-FFN settings shared by both architectures at this sparsity level.
    ffn_overrides = dict(
        sparsity_type=sparsity_type,
        num_blocks = num_blocks,
        dim_lowrank = dim_lowrank,
        intermediate_size=intermediate_size, # size of 'up-projection' layer in FFN
    )

    config_gpt = dict(

        # EMBEDDING PARAMETERS
        vocab_size              = 10_000,   # number of tokens in the vocabulary
        hidden_size             = dim_in,   # embedding size (vector length) of each token
        max_position_embeddings = 512,      # maximum sequence length (context window)

        # BLOCKS (ATTN)
        num_layers      = 2,                          # number of transformer blocks. div by 2 for attention_types
        attention_types = [[["global", "local"], 1]], # (GPT-Neo-specific) global and local attention
        num_heads       = 4,                          # attention heads
        window_size     = 256,                        # (GPT-Neo-specific) for local attention

        # kept identical to the GPT baseline config, which specifies this for
        # tokenizer interop between models
        pad_token_id = 0,

        **ffn_overrides
    )
    with open(f"model_configs/gpt_cnt_{st}.json", "w") as outfile:
        json.dump(config_gpt, outfile)

    config_rob = dict(

        # EMBEDDING PARAMETERS
        vocab_size              = 10_000,
        hidden_size             = dim_in,   # same width as the GPT variant (was a hard-coded 256)
        # we add 1 as RoBERTa uses a special position embedding for the padding token (zero vector)
        max_position_embeddings = 512 + 1,

        # BLOCKS (of course naming is different in roberta :) )
        num_hidden_layers   = 2,
        num_attention_heads = 4,
        pad_token_id        = 0,

        **ffn_overrides
    )

    with open(f"model_configs/roberta_cnt_{st}.json", "w") as outfile:
        json.dump(config_rob, outfile)

intermediate layer size is 3968
intermediate layer size is 4032
intermediate layer size is 4032


# PKM

In [25]:
# Shared settings for the PKM configs generated in the next cell.
dim_in = 256
intermediate_factor = 4
intermediate_size = dim_in * intermediate_factor
sparsity_type = SparsityType.PKM
dim_key = intermediate_size/2
num_query_heads = 4

# With a = num_query_heads * dim_key / dim_in, this is the larger root of
#   s**2 + 2*a*s + 2*(num_query_heads * dim_key - 16 * dim_in) = 0,
# rounded to the nearest integer afterwards.
num_subkeys = (
    -(num_query_heads * dim_key / dim_in)
    + (
        (num_query_heads * dim_key / dim_in)**2
        - 2 * (num_query_heads * dim_key - 16 * dim_in)
    )**0.5
)
num_subkeys = round(num_subkeys)

# (sparsity-level label, topk): topk is 25% / 50% / 75% of num_subkeys.
sparsity_levels = [
    (label, round(fraction * num_subkeys))
    for label, fraction in (('high', 0.25), ('medium', 0.5), ('low', 0.75))
]

print("number of subkeys", num_subkeys)


number of subkeys 56


In [26]:
# Writes one GPT and one RoBERTa config per PKM sparsity level to
# model_configs/{gpt,roberta}_pkm_{high,medium,low}.json.
# Relies on dim_in, intermediate_size, sparsity_type, num_query_heads,
# num_subkeys and sparsity_levels from the PKM setup cell above.

# Create the output directory first so this cell also works on a fresh
# checkout where "model_configs/" does not exist yet.
os.makedirs("model_configs", exist_ok=True)

for st, topk in sparsity_levels:

    # Sparse-FFN settings shared by both architectures at this sparsity level.
    ffn_overrides = dict(
        sparsity_type=sparsity_type,
        num_query_heads = num_query_heads,
        num_subkeys = num_subkeys,
        topk = topk,
        intermediate_size=intermediate_size, # size of 'up-projection' layer in FFN
    )

    config_gpt = dict(

        # EMBEDDING PARAMETERS
        vocab_size              = 10_000,   # number of tokens in the vocabulary
        hidden_size             = dim_in,   # embedding size (vector length) of each token
        max_position_embeddings = 512,      # maximum sequence length (context window)

        # BLOCKS (ATTN)
        num_layers      = 2,                          # number of transformer blocks. div by 2 for attention_types
        attention_types = [[["global", "local"], 1]], # (GPT-Neo-specific) global and local attention
        num_heads       = 4,                          # attention heads
        window_size     = 256,                        # (GPT-Neo-specific) for local attention

        # kept identical to the GPT baseline config, which specifies this for
        # tokenizer interop between models
        pad_token_id = 0,

        **ffn_overrides
    )
    with open(f"model_configs/gpt_pkm_{st}.json", "w") as outfile:
        json.dump(config_gpt, outfile)

    config_rob = dict(

        # EMBEDDING PARAMETERS
        vocab_size              = 10_000,
        hidden_size             = dim_in,   # same width as the GPT variant (was a hard-coded 256)
        # we add 1 as RoBERTa uses a special position embedding for the padding token (zero vector)
        max_position_embeddings = 512 + 1,

        # BLOCKS (of course naming is different in roberta :) )
        num_hidden_layers   = 2,
        num_attention_heads = 4,
        pad_token_id        = 0,

        **ffn_overrides
    )
    with open(f"model_configs/roberta_pkm_{st}.json", "w") as outfile:
        json.dump(config_rob, outfile)