# CFG
核心问题：是否存在一种实验设定，能够帮助我们理解语言模型是如何完成涉及深层逻辑、推理或计算链的复杂任务的？
* 论文提出使用上下文无关文法（CFG）来合成语言
* CFG 由终结符（T）、非终结符（NT）、根符号以及产生式规则组成，能够按层次生成高度结构化的表达式

文中的规则
![rules](./learn_CFGs/rules.png)

In [None]:
import random

class CFG:
    """Random string generator for a context-free grammar.

    ``rules`` maps each non-terminal symbol to a list of alternative
    productions, and every production is a sequence of symbols.  A symbol
    that has no entry in ``rules`` is treated as a terminal and is emitted
    unchanged.
    """

    def __init__(self, rules, start_symbol, rng=None):
        """Store the grammar.

        Args:
            rules: Mapping ``symbol -> list of productions``.
            start_symbol: Symbol expanded when ``generate`` is called
                without an argument.
            rng: Optional object exposing a ``choice`` method (for example
                ``random.Random(seed)``) used to pick productions so that
                samples are reproducible.  When omitted, the module-level
                ``random.choice`` is used.
        """
        self.rules = rules
        self.start_symbol = start_symbol
        self.rng = rng

    def generate(self, symbol=None):
        """Return one randomly derived string.

        Args:
            symbol: Symbol to expand.  ``None`` means the start symbol.

        Returns:
            The concatenation of the terminals reached by repeatedly
            replacing each non-terminal with a randomly chosen production.
            If ``symbol`` itself has no rule it is returned unchanged.
        """
        # No symbol given: start from the root of the grammar.
        if symbol is None:
            symbol = self.start_symbol

        rules = self.rules
        # A symbol without a rule is a terminal: nothing to expand.
        if symbol not in rules:
            return symbol

        choose = random.choice if self.rng is None else self.rng.choice

        # Expand with an explicit stack instead of recursion so that deep or
        # recursive grammars cannot exhaust the interpreter's call stack.
        # Children are pushed in reverse, so symbols are still expanded
        # left-to-right and depth-first.
        emitted = []
        pending = [symbol]
        while pending:
            current = pending.pop()
            if current not in rules:
                emitted.append(current)
                continue
            production = choose(rules[current])
            pending.extend(reversed(tuple(production)))
        return ''.join(emitted)
    
# Production rules transcribed from the rules figure referenced above.
# Each key is a non-terminal; its value lists the alternative productions,
# and each production is a list of symbols expanded left to right.
# Symbols '1', '2' and '3' have no entry here, so CFG.generate emits them
# as terminals.
# NOTE(review): '17', '18' and '19' are used in the productions of 'root',
# '20' and '21' but have no entry of their own, so they are also emitted
# literally (e.g. "18171919" in the sample output). This looks like missing
# rules -- confirm against the original figure.
rules = {
    'root': [['20', '21'], ['20', '19', '21'], ['21', '19', '19']],
    '20': [['16', '16'], ['16', '17'], ['17', '16', '18']],
    '21': [['18', '17'], ['17', '16'], ['16', '17', '18'], ['16', '18']],
    '16': [['15', '15'], ['13', '15', '13'], ['14', '13'], ['14', '14']],
    '15': [['10', '11', '11'], ['11', '11', '10'], ['10', '10'], ['12', '12', '11']],
    '14': [['10', '12'], ['12', '10', '12'], ['12', '11'], ['10', '12', '12']],
    '13': [['11', '12'], ['12', '11', '12'], ['10', '12', '11']],
    '12': [['7', '9', '7'], ['9', '8'], ['8', '8', '9']],
    '11': [['8', '8'], ['9', '7'], ['9', '7', '7']],
    '10': [['8', '9', '9'], ['9', '7', '9'], ['7', '9', '9']],
    '9': [['1', '2', '1'], ['3', '3'], ['1', '1']],
    '8': [['3', '1', '1'], ['1', '2'], ['3', '3', '1']],
    '7': [['2', '2', '1'], ['3', '2', '2'], ['3', '1', '2'], ['3', '2']]
}

# Grammar whose derivations start from the 'root' symbol.
cfg = CFG(rules, 'root')

# Sample and print 20 sentences from the grammar.
for _ in range(20):
    print(cfg.generate())

31112112132233322322121322331311311331331311333322133332213332211322331311111912132111213211312331133133111718
1732121322113212131232233133111183211322331311123311211212112111322321122132113313313111131118
1132211123312111321112122112117181919
18171919
1711221311121212211213331112133121322121113111233118193221111323331231133111331331111121718
121331333223113113331211333313313313113221132211221323223322117191817
173311212131233333121213121112123313233321832211113221131211221322112213331131133312113123331233322113223121718
121211121323223111233121312312121323111213311311121312322111217181919
173121133311331111132221123311233132211121123313312122211212211819171212111111233112111111233133111
1733112111213223121132211221221311311111919
1122133321132333222213332113232231111111132211311331111213217191817
332211213322112131131133312121111171917311333333331113121133131111322121322
173311212132233111133112111213223331218113311233121322121221312113221212331331331718
312121121121331123311333113332

In [1]:
from transformers import GPT2Config, GPT2LMHeadModel

# GPT-2 architecture: 768-dimensional hidden states, 12 attention heads,
# 12 transformer layers.
gpt2_config = GPT2Config(n_embd=768, n_head=12, n_layer=12)

# Build the language-model-head variant directly from the config; this cell
# does not call from_pretrained.
gpt2_model = GPT2LMHeadModel(gpt2_config)

# Display the module hierarchy of the freshly built model.
print(gpt2_model)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [2]:
from transformers import DebertaConfig, DebertaModel

# DeBERTa architecture: 12 encoder layers, 12 attention heads,
# 768-dimensional hidden states.
deberta_config = DebertaConfig(
    num_hidden_layers=12,
    num_attention_heads=12,
    hidden_size=768,
)

# Build the bare encoder directly from the config; this cell does not call
# from_pretrained.
deberta_model = DebertaModel(deberta_config)

# Display the module hierarchy of the freshly built model.
print(deberta_model)

DebertaModel(
  (embeddings): DebertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): DebertaLayerNorm()
    (dropout): StableDropout()
  )
  (encoder): DebertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x DebertaLayer(
        (attention): DebertaAttention(
          (self): DisentangledSelfAttention(
            (in_proj): Linear(in_features=768, out_features=2304, bias=False)
            (dropout): StableDropout()
          )
          (output): DebertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): DebertaLayerNorm()
            (dropout): StableDropout()
          )
        )
        (intermediate): DebertaIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): DebertaOutput(
          (dense): Linear(in_features=3072, 