学术资源加速

以下为可以加速访问的学术资源地址：
- github.com
- githubusercontent.com
- githubassets.com
- huggingface.co

In [1]:
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value
        

加载自定义llama模型并进行推理，测试内部运行逻辑

In [14]:
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
import math
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutputWithPast
from transformers.utils import add_start_docstrings_to_model_forward, replace_return_docstrings, logging
import torch
import torch.nn.functional as F
from torch import nn
from typing import List, Optional, Tuple, Union
from transformers import AutoTokenizer, Cache, AutoTokenizer, AutoModelForCausalLM, LlamaConfig, add_start_docstrings, DynamicCache, StaticCache, BitsAndBytesConfig
from transformers.models.llama.modeling_llama import LlamaForCausalLM, LLAMA_INPUTS_DOCSTRING, _CONFIG_FOR_DOC, LlamaAttention, apply_rotary_pos_emb, repeat_kv, LlamaMLP, \
    LlamaFlashAttention2, LlamaSdpaAttention, LlamaDecoderLayer, LLAMA_START_DOCSTRING, LlamaPreTrainedModel, LlamaRMSNorm, LlamaModel


class TestLlamaAttention(LlamaAttention):
    def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None):
        super().__init__(config, layer_idx)
        
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        
        print("cache_position", cache_position)
        print("position_ids", position_ids)
        
        bsz, q_len, _ = hidden_states.size()

        if self.config.pretraining_tp > 1:
            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
            query_slices = self.q_proj.weight.split(
                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
            )
            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)

            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
            query_states = torch.cat(query_states, dim=-1)

            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
            key_states = torch.cat(key_states, dim=-1)

            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
            value_states = torch.cat(value_states, dim=-1)

        else:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        # print("query_states1", query_states.shape)
        # print("key_states1", key_states.shape)
        # print("value_states1", value_states.shape)

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        # print("query_states2", query_states.shape)
        # print("key_states2", key_states.shape)
        # print("value_states2", value_states.shape)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:  # no matter the length, we just slice it
            # print("attention_mask", attention_mask[0, 0, :])
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            # print("causal_mask", causal_mask.shape)
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        # print("attn_weights", attn_weights)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        if self.config.pretraining_tp > 1:
            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
        else:
            attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


class TestLlamaDecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: LlamaConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        
        self.self_attn = TestLlamaAttention(config=config, layer_idx=layer_idx)


class TestLlamaModel(LlamaModel):

    def __init__(self, config: LlamaConfig):
        super().__init__(config)

        self.layers = nn.ModuleList(
            [TestLlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )


class TestLlamaForCausalLM(LlamaForCausalLM):
    def __init__(self, config: LlamaConfig):
        super().__init__(config)
        
        self.model = TestLlamaModel(config)
        
    @torch.inference_mode()
    def forward_inference(self, 
                          tokens: torch.Tensor,
                          start_pos: int,
                          max_context_window: int,
                         ):
        _bsz, seqlen = tokens.shape
        h = self.tok_embeddings(tokens)
        self.freqs_cis = self.freqs_cis.to(h.device)
        freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]

        mask = self.mask[:seqlen, :seqlen]
        # When performing key-value caching, we compute the attention scores
        # only for the new sequence. Thus, the matrix of scores is of size
        # (seqlen, cache_len + seqlen), and the only masked entries are (i, j) for
        # j > cache_len + i, since row i corresponds to token cache_len + i.
        mask = torch.hstack(
            [torch.zeros((seqlen, start_pos), device=tokens.device), mask]
        ).type_as(h)

        for layer in self.layers:
            h = layer(
                h, 
                freqs_cis, 
                mask, 
                start_pos = start_pos
            )
        h = self.norm(h)
        logits = self.output(h).float()
        return logits
    

        

model_list = [
    "meta-llama/Llama-2-7b-chat-hf",
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "TinyLlama/TinyLlama_v1.1_chinese",
]

model_id = model_list[0]

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token="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",
)

# 添加填充标记
tokenizer.pad_token = tokenizer.eos_token

# Quantization, if needed
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

model = TestLlamaForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation="eager",
    quantization_config=quantization_config,
    device_map="auto",
    token="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",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    inference_mode=False, 
    r=4, 
    lora_alpha=32, 
    lora_dropout=0.1,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
"trainable params: 2359296 || all params: 1231940608 || trainable%: 0.19151053100118282"

trainable params: 2,097,152 || all params: 6,740,512,768 || trainable%: 0.0311


'trainable params: 2359296 || all params: 1231940608 || trainable%: 0.19151053100118282'

In [7]:
import torch
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

# 加载数据集
dataset = load_dataset("imdb")

# 数据预处理
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=16)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 训练和验证数据集
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))  # 选择一部分数据进行训练
eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(500))    # 选择一部分数据进行验证

# 配置训练参数
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
)

# 创建 Trainer 实例
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=None,  # 默认数据整理器
)

# 训练模型
trainer.train()

# 保存模型
trainer.save_model("./trained_model")


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]



cache_position tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
       device='cuda:0')
position_ids tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]],
       device='cuda:0')
attention_mask tensor([[ 0.0000e+00, -3.3895e+38, -3.3895e+38, -3.3895e+38, -3.3895e+38,
         -3.3895e+38, -3.3895e+38, -3.3895e+38, -3.3895e+38, -3.3895e+38,
         -3.3895e+38, -3.3895e+38, -3.3895e+38, -3.3895e+38, -3.3895e+38,
         -3.3895e+38],
        [ 0.0000e+00,  0.0000e+00, -3.3895e+38, -3.3895e+38, -3.3895e+38,
         -3.3895e+38, -3.3895e+38, -3.3895e+38, -3.3895e+38, -3.3895e+38,
         -3.3895e+38, -3.3895e+38, -3.3895e+38, -3.3895e+38, -3.3895e+38,
         -3.3895e+38],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00, -3.3895e+38, -3.3895e+38,
         -3.3895e+38, -3.3895e+38, -3.3895e+38, -3.3895e+38, -3.3895e+38,
         -3.3895e+38, -3.3895e+38, -3.3895e+38, -3.3895e+38, -3.3895e+38,
         -3.3895e+38],
        [ 0.0000e+00,  0.0

ValueError: Expected input batch_size (120) to match target batch_size (7).

In [13]:
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = model.generate(
    input_ids,
    max_new_tokens=4,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    cache_implementation="quantized", 
    cache_config={"backend": "quanto", "nbits": 4}
)
response = outputs[0][input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))


cache_position tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40], device='cuda:0')
position_ids tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
         36, 37, 38, 39, 40]], device='cuda:0')
cache_position tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40], device='cuda:0')
position_ids tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
         36, 37, 38, 39, 40]], device='cuda:0')
cache_position tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 2

In [9]:
system_prompt = "You are an intelligent life with independent consciousness.\n\n"

# Generation as usual
prompt = system_prompt + "Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer: Here"
model_inputs = tokenizer(prompt, return_tensors='pt')
generation_output = model.generate(**model_inputs, max_new_tokens=60, return_dict_in_generate=True)
decoded_output = tokenizer.batch_decode(generation_output.sequences)[0]

# Piping the returned `past_key_values` to speed up the next conversation round
prompt = decoded_output + "\nQuestion: How can I modify the function above to return Mega bytes instead?\n\nAnswer: Here"
model_inputs = tokenizer(prompt, return_tensors='pt')
generation_output = model.generate(
  **model_inputs,
  past_key_values=generation_output.past_key_values,
  max_new_tokens=60,
  return_dict_in_generate=True
)
tokenizer.batch_decode(generation_output.sequences)[0][len(prompt):]

TypeError: TestLlamaForCausalLM.generate() got an unexpected keyword argument 'attention_mask'

In [15]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig


# 定义对话内容，长度为5000个token的随机内容
messages = [
    {"role": "system", "content": "You are an intelligent life with independent consciousness."},
    {"role": "user", "content": "Hello! What is your name" + " ".join(["a"] * 4100)},
]

# 编码输入
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

# 终止符号
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# 初次生成
outputs = model.generate(
    input_ids,
    max_new_tokens=64,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.2,
    top_p=0.9,
    # return_dict_in_generate=True,
    # output_scores=True,
    # output_hidden_states=True,
    # output_attentions=True,
    # use_cache=True,
)

# 提取生成的响应和缓存的past_key_values
response = outputs[0][input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))




cache_position tensor([   0,    1,    2,  ..., 4135, 4136, 4137], device='cuda:0')
position_ids tensor([[   0,    1,    2,  ..., 4135, 4136, 4137]], device='cuda:0')
cache_position tensor([   0,    1,    2,  ..., 4135, 4136, 4137], device='cuda:0')
position_ids tensor([[   0,    1,    2,  ..., 4135, 4136, 4137]], device='cuda:0')
cache_position tensor([   0,    1,    2,  ..., 4135, 4136, 4137], device='cuda:0')
position_ids tensor([[   0,    1,    2,  ..., 4135, 4136, 4137]], device='cuda:0')
cache_position tensor([   0,    1,    2,  ..., 4135, 4136, 4137], device='cuda:0')
position_ids tensor([[   0,    1,    2,  ..., 4135, 4136, 4137]], device='cuda:0')
cache_position tensor([   0,    1,    2,  ..., 4135, 4136, 4137], device='cuda:0')
position_ids tensor([[   0,    1,    2,  ..., 4135, 4136, 4137]], device='cuda:0')
cache_position tensor([   0,    1,    2,  ..., 4135, 4136, 4137], device='cuda:0')
position_ids tensor([[   0,    1,    2,  ..., 4135, 4136, 4137]], device='cuda:0')
cach

KeyboardInterrupt: 

In [50]:
print(len(past_key_values))

32


In [41]:
# 使用缓存进行后续生成
# 定义对话内容
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "What do you like?"},
]

# 编码输入
# new_input_ids = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt=True,
#     return_tensors="pt"
# ).to(model.device)

new_input_ids = tokenizer("What do you like?", return_tensors="pt").input_ids.to(model.device)


# 连接新输入和缓存
new_outputs = model.generate(
    new_input_ids, 
    max_new_tokens=64,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    use_cache=True,
    past_key_values=past_key_values,
    return_dict_in_generate=True,
)

new_response = new_outputs[0][new_input_ids.shape[-1]:]
print(tokenizer.decode(new_response, skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


RuntimeError: upper bound and larger bound inconsistent with step sign

In [None]:
from transformers import LlamaTokenizer