In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [16]:
import torch.nn as nn

decoder_layer = model.model.layers[2]

for param in decoder_layer.parameters():
    nn.init.constant_(param, 0)

In [28]:
class IdentityLayer(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.linear = nn.Linear(hidden_size, hidden_size)
        # 初始化为等价映射
        nn.init.eye_(self.linear.weight)
        nn.init.constant_(self.linear.bias, 0)

    def forward(self, x):
        return self.linear(x)
    
hidden_size = model.config.hidden_size

model.model.layers[1] = IdentityLayer(hidden_size)

print(model.model.layers[1])

IdentityLayer(
  (linear): Linear(in_features=4096, out_features=4096, bias=True)
)


In [27]:
for param in decoder_layer.parameters():
    print(param)

Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], requires_grad=True)
Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], requires_grad=True)
Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], requires_grad=True)
Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0

In [29]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
      (1): IdentityL

In [30]:
del model.model.layers[30]

model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
      (1): IdentityL

In [31]:
class CNNLayer(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.conv = nn.Conv1d(in_channels=hidden_size, out_channels=hidden_size, kernel_size=3, padding=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        # 转换形状以适应 Conv1d 输入
        x = x.transpose(1, 2)
        x = self.conv(x)
        x = self.relu(x)
        # 转换回原始形状
        x = x.transpose(1, 2)
        return x
    
hidden_size = model.config.hidden_size
cnn_layer = CNNLayer(hidden_size)

model.model.layers.insert(26, cnn_layer)

In [32]:
model


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
      (1): IdentityL

In [34]:
import torch

# 将模型的前半部分移动到 GPU 6
device0 = torch.device('cuda:6')
model.model.embed_tokens = model.model.embed_tokens.to(device0)
for i in range(len(model.model.layers) // 2):
    model.model.layers[i] = model.model.layers[i].to(device0)

# 将模型的后半部分移动到 GPU 7
device1 = torch.device('cuda:7')
for i in range(len(model.model.layers) // 2, len(model.model.layers)):
    model.model.layers[i] = model.model.layers[i].to(device1)
model.model.norm = model.model.norm.to(device1)
model.lm_head = model.lm_head.to(device1)

# 修改前向传播函数
class SplitModel(nn.Module):
    def __init__(self, model, device0, device1):
        super().__init__()
        self.model = model
        self.device0 = device0
        self.device1 = device1

    def forward(self, input_ids, attention_mask=None):
        # 将输入移动到 device0
        input_ids = input_ids.to(self.device0)
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.device0)

        # 前半部分在 device0 上计算
        hidden_states = self.model.model.embed_tokens(input_ids)
        for i in range(len(self.model.model.layers) // 2):
            hidden_states = self.model.model.layers[i](hidden_states, attention_mask=attention_mask)

        # 将中间结果移动到 device1
        hidden_states = hidden_states.to(self.device1)
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.device1)

        # 后半部分在 device1 上计算
        for i in range(len(self.model.model.layers) // 2, len(self.model.model.layers)):
            hidden_states = self.model.model.layers[i](hidden_states, attention_mask=attention_mask)
        hidden_states = self.model.model.norm(hidden_states)
        logits = self.model.lm_head(hidden_states)

        return logits

# 创建 SplitModel 实例
split_model = SplitModel(model, device0, device1)

# 示例输入
input_text = "Hello, how are you?"
inputs = tokenizer(input_text, return_tensors="pt")

# 生成输出
outputs = split_model(**inputs)

# 解码输出
output_text = tokenizer.decode(outputs.argmax(dim=-1)[0], skip_special_tokens=True)
print(output_text)

The attention layers in this model are transitioning from computing the RoPE embeddings internally through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed `position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be removed and `position_embeddings` will be mandatory.


AttributeError: 'NoneType' object has no attribute 'shape'

In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM

# 加载模型和分词器
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")

# 将模型的前半部分移动到 GPU 6
device0 = torch.device('cuda:6')
model.model.embed_tokens = model.model.embed_tokens.to(device0)
for i in range(len(model.model.layers) // 2):
    model.model.layers[i] = model.model.layers[i].to(device0)

# 将模型的后半部分移动到 GPU 7
device1 = torch.device('cuda:7')
for i in range(len(model.model.layers) // 2, len(model.model.layers)):
    model.model.layers[i] = model.model.layers[i].to(device1)
model.model.norm = model.model.norm.to(device1)
model.lm_head = model.lm_head.to(device1)


# 修改前向传播函数
class SplitModel(nn.Module):
    def __init__(self, model, device0, device1):
        super().__init__()
        self.model = model
        self.device0 = device0
        self.device1 = device1

    def forward(self, input_ids, attention_mask=None, position_embeddings=None):
        # 将输入移动到 device0
        input_ids = input_ids.to(self.device0)
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.device0)
        if position_embeddings is not None:
            position_embeddings = (position_embeddings[0].to(self.device0), position_embeddings[1].to(self.device0))

        # 前半部分在 device0 上计算
        hidden_states = self.model.model.embed_tokens(input_ids)
        for i in range(len(self.model.model.layers) // 2):
            hidden_states = self.model.model.layers[i](hidden_states, attention_mask=attention_mask, position_embeddings=position_embeddings)

        # 将中间结果移动到 device1
        hidden_states = hidden_states.to(self.device1)
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.device1)
        if position_embeddings is not None:
            position_embeddings = (position_embeddings[0].to(self.device1), position_embeddings[1].to(self.device1))

        # 后半部分在 device1 上计算
        for i in range(len(self.model.model.layers) // 2, len(self.model.model.layers)):
            hidden_states = self.model.model.layers[i](hidden_states, attention_mask=attention_mask, position_embeddings=position_embeddings)
        hidden_states = self.model.model.norm(hidden_states)
        logits = self.model.lm_head(hidden_states)

        return logits

# 创建 SplitModel 实例
split_model = SplitModel(model, device0, device1)

# 统计模型参数数量的函数
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# 打印参数数量
print(f"模型参数总数: {count_parameters(split_model)}")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

模型参数总数: 8030261248


In [2]:
import torch

# 准备输入数据
input_text = "你好，世界！"
inputs = tokenizer(input_text, return_tensors="pt")

# 将输入数据移动到 device0
input_ids = inputs["input_ids"].to(device0)
attention_mask = inputs["attention_mask"].to(device0)

# 使用 SplitModel 进行推理
with torch.no_grad():
    outputs = split_model(input_ids, attention_mask=attention_mask)

# 获取预测的 token
predicted_token_ids = torch.argmax(outputs, dim=-1)

# 将预测的 token 转换为文本
predicted_text = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)

# 打印输出结果
print(f"输入文本: {input_text}")
print(f"预测文本: {predicted_text}")

The attention layers in this model are transitioning from computing the RoPE embeddings internally through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed `position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be removed and `position_embeddings` will be mandatory.


AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time

# 加载模型和 tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct").to("cuda:6")

# 准备输入数据
input_text = "给我写一个1000字的小故事。"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda:6")

# 不使用 KV-Cache 进行推理
model.config.use_cache = False
torch.cuda.reset_peak_memory_stats()
start_time = time.time()
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=1000)
end_time = time.time()
no_cache_time = end_time - start_time
no_cache_memory = torch.cuda.max_memory_allocated()

# 打印不使用 KV-Cache 的生成结果
no_cache_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"不使用 KV-Cache 的生成结果: {no_cache_text}")

# 使用 KV-Cache 进行推理
model.config.use_cache = True
torch.cuda.reset_peak_memory_stats()
start_time = time.time()
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=1000)
end_time = time.time()
cache_time = end_time - start_time
cache_memory = torch.cuda.max_memory_allocated()

# 打印使用 KV-Cache 的生成结果
cache_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"使用 KV-Cache 的生成结果: {cache_text}")

# 打印时间和显存使用结果
print(f"不使用 KV-Cache 的推理时间: {no_cache_time:.4f} 秒")
print(f"使用 KV-Cache 的推理时间: {cache_time:.4f} 秒")
print(f"不使用 KV-Cache 的显存使用: {no_cache_memory / 1024 ** 2:.2f} MB")
print(f"使用 KV-Cache 的显存使用: {cache_memory / 1024 ** 2:.2f} MB")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]