In [None]:
from transformers import DataCollatorForLanguageModeling, AutoTokenizer
from datasets import load_dataset

# Load the tokenizer of the distilgpt2 checkpoint and the validation split
# of the codeparrot dataset used in the cells below.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")

In [None]:
def tokenize(element, context_length=128):
    """Tokenize a batch of documents into fixed-length chunks of token ids.

    Args:
        element: Batch passed by ``Dataset.map(..., batched=True)``; only its
            ``"content"`` entry is read.
        context_length: Chunk size in tokens. It is used both as the
            tokenizer's ``max_length`` and as the length a chunk must have to
            be kept, so the two values can no longer drift apart.

    Returns:
        A dict ``{"input_ids": [...]}`` holding only the chunks whose reported
        length equals ``context_length``.
    """
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # Chunks shorter than context_length are dropped instead of being padded.
    input_batch = [
        input_ids
        for length, input_ids in zip(outputs["length"], outputs["input_ids"])
        if length == context_length
    ]
    return {"input_ids": input_batch}


# Apply `tokenize` batch-wise; the original columns are removed so the result
# only carries the "input_ids" produced by `tokenize`.
tokenized_datasets = ds_valid.map(
    tokenize, batched=True, remove_columns=ds_valid.column_names
)
# Display the resulting dataset summary.
tokenized_datasets

In [None]:
# Inspect the token ids of the first tokenized example.
tokenized_datasets[0]['input_ids']

In [None]:
from transformers import DataCollatorForLanguageModeling

# Reuse the EOS token as the padding token so the collator has one to pad with.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: collate for causal rather than masked language modelling.
# Every kept example has exactly 128 tokens (see `tokenize`), which is already
# a multiple of 16.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=16)
# Collate the first five examples into one batch and list the returned fields.
out = data_collator([tokenized_datasets[i] for i in range(5)])
out.keys()

In [None]:
# Display the first tokenized example.
tokenized_datasets[0]

In [None]:
# Last ten input ids of the first collated sequence.
out['input_ids'][0][-10:]

In [None]:
# Last ten labels of the first collated sequence, for comparison with the input ids.
out['labels'][0][-10:]

In [None]:
# Inspect the attention mask of the collated batch. The batch is bound to
# `out`; `outputs` only exists as a local inside `tokenize` at this point.
out['attention_mask']

In [None]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from peft.utils.config import TaskType

In [None]:
# NOTE(review): `model` is first assigned in the next cell, so on a fresh
# kernel this display cell raises NameError; it relies on out-of-order execution.
model

In [None]:
# Load the TinyLlama x2 MoE checkpoint by hub id with a causal-LM head.
model = AutoModelForCausalLM.from_pretrained(
    "AIChenKai/TinyLlama-1.1B-Chat-v1.0-x2-MoE"
)

In [None]:
import torch
# NOTE(review): the value returned by torch.compile is not assigned, so it is
# only displayed as cell output and `model` keeps referring to the original
# object — confirm whether `model = torch.compile(model)` was intended.
torch.compile(model)

In [None]:
# Wrap the base model with a LoRA adapter that targets only the modules named
# "q_proj", with dropout disabled and lora_alpha set to 1.
p_model = get_peft_model(
    model,
    LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        lora_dropout=0,
        lora_alpha=1,
        target_modules=["q_proj"],
    ),
)

In [None]:
# Report how many parameters of the wrapped model are trainable.
p_model.print_trainable_parameters()

In [None]:
# Forward a single two-token sequence and display the resulting logits.
p_model(torch.tensor([[1,2]]))['logits']

In [None]:
import torch

In [None]:
# Smoke-test the backward pass through the PEFT-wrapped model on one
# two-token sequence. `labels` uses the same (batch, sequence) shape as
# `input_ids`; it was previously a 1-D tensor.
p_model(
    input_ids=torch.tensor([[1, 2]]),
    attention_mask=torch.tensor([[1, 1]]),
    labels=torch.tensor([[1, 2]]),
).loss.backward()

In [None]:
# Check whether the wrapped model's lm_head submodule is in training mode.
dict(p_model.named_modules())["base_model.model.lm_head"].training

In [None]:
# NOTE(review): `teacher_model` is first assigned in the next cell, so on a
# fresh kernel this raises NameError; it relies on out-of-order execution.
vars(teacher_model)

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from textbrewer import DistillationConfig, TrainingConfig, GeneralDistiller
from peft import EVELoraConfig, TaskType, get_peft_model
from merge_methods import keep1

import textbrewer
import torch

# EVE-LoRA adapter configuration using the `keep1` merge method.
# NOTE(review): lora_alpha=0 — in standard LoRA the alpha value scales the
# adapter output, so 0 would presumably disable it; confirm EVELoraConfig semantics.
eve_config = EVELoraConfig(task_type=TaskType.CAUSAL_LM, merge_method=keep1,
                           lora_dropout=0, lora_alpha=0)
# Teacher: the local TinyLlama x2 MoE checkpoint, loaded in bfloat16.
teacher_model = AutoModelForCausalLM.from_pretrained(
    "./models/TinyLlama-1.1B-Chat-v1.0-x2-MoE/", torch_dtype=torch.bfloat16
)
# Student creation is disabled here; a later cell assigns `student_model`.
#student_model = get_peft_model(teacher_model, eve_config)
#student_model.print_trainable_parameters()

# No `split` is passed, so every available split of the dataset is loaded.
dataset = load_dataset("JeanKaddour/minipile", keep_in_memory=True)

In [None]:
# The dataset has to be tokenized first and then passed in.
# callback(model, step)

In [None]:
# `dataset` was loaded without `split`, so it maps split names to datasets;
# the DataLoader has to wrap a single split rather than the whole mapping.
# NOTE(review): the split still holds untokenized examples — tokenize/collate
# before feeding batches to a model.
dataloader = torch.utils.data.DataLoader(
    dataset["train"], batch_size=8, shuffle=False, num_workers=4, pin_memory=True
)
# NOTE(review): `student_model` is assigned in a later cell of this notebook.
optimizer = torch.optim.AdamW(student_model.parameters(), lr=5e-5)

In [None]:
# NOTE(review): a raw string is passed here; generation presumably needs
# tokenized tensor inputs, and no tokenizer for this checkpoint is loaded in
# the notebook — confirm and tokenize the prompt first.
student_model.generate("2")

In [None]:
def simple_adaptor(batch, model_outputs):
    """Pick the fields the distiller consumes out of a model's outputs.

    Args:
        batch: The input batch; not used.
        model_outputs: Indexable model outputs; positions 1 and 2 are read.

    Returns:
        A dict with ``"logits"`` (position 1) and ``"hidden"`` (position 2).
    """
    logits = model_outputs[1]
    hidden = model_outputs[2]
    return dict(logits=logits, hidden=hidden)


# Default training configuration.
train_config = TrainingConfig()
# Distillation configuration
# Matching different layers of the student and the teacher
# We match 0-0 and 8-8 (teacher-student) here for demonstration
distill_config = DistillationConfig(
    hard_label_weight=0.2, temperature=2,
    intermediate_matches=[
        {'layer_T': 0, 'layer_S': 0, 'feature': 'hidden',
            'loss': 'hidden_mse', 'weight': 1},
        {'layer_T': 8, 'layer_S': 8, 'feature': 'hidden', 
         'loss': 'hidden_mse', 'weight': 1}])

# Build distiller; teacher and student share the same adaptor function.
distiller = GeneralDistiller(
    train_config=train_config, distill_config=distill_config,
    model_T=teacher_model, model_S=student_model,
    adaptor_T=simple_adaptor, adaptor_S=simple_adaptor)

# Start! One epoch over `dataloader`, with no callback.
with distiller:
    distiller.train(optimizer, dataloader, num_epochs=1,
                    callback=None)

In [None]:
from torch.utils.data.dataloader import DataLoader

In [None]:
# NOTE(review): called without a dataset argument, which presumably raises
# TypeError — looks like leftover signature exploration; confirm and remove.
DataLoader()

In [None]:
# Put the student in training mode.
# NOTE(review): `student_model` is assigned in a later cell.
student_model.train()

In [None]:
# `AutoModel` is not imported by any earlier cell, so import it here to keep
# this cell runnable on a fresh kernel.
from transformers import AutoModel

# Reload the teacher from the local checkpoint in bfloat16 (headless AutoModel
# this time, unlike the earlier AutoModelForCausalLM load).
teacher_model = AutoModel.from_pretrained(
    "./models/TinyLlama-1.1B-Chat-v1.0-x2-MoE/", torch_dtype=torch.bfloat16
)
# NOTE(review): the student wraps the very same object as `teacher_model`;
# confirm whether the adapter injection leaves the teacher untouched, since
# distillation normally needs an independent teacher.
student_model = get_peft_model(teacher_model, eve_config)

In [None]:
# Report how many parameters of the student are trainable.
student_model.print_trainable_parameters()

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel

# Load only the train split. The result is not bound to a name, so it is only
# shown as this cell's output.
load_dataset("JeanKaddour/minipile", split="train", keep_in_memory=True)

In [None]:

# The annotations below are evaluated when the function is defined, so `nn`
# must be importable here; no earlier cell imports it.
from torch import nn


def first_expert(experts: nn.ModuleList, lora_experts: nn.ModuleList):
    """Return the first entry of ``experts``.

    Args:
        experts: Indexable collection of expert modules; must be non-empty.
        lora_experts: Accepted but not used.

    Returns:
        ``experts[0]``.
    """
    return experts[0]


In [None]:
from transformers import AutoTokenizer, SwitchTransformersForConditionalGeneration,AutoModel, AutoConfig
from torch import nn
import torch
from torch.nn.functional import cosine_similarity
import types
import matplotlib.pyplot as plt
from peft import get_peft_model, EVELoraConfig, TaskType
# Load the local TinyLlama x2 MoE checkpoint without a task head.
model = AutoModel.from_pretrained(
    "models/TinyLlama-1.1B-Chat-v1.0-x2-MoE"
)
# EVE-LoRA configuration with library defaults for everything but the task type.
eve_config = EVELoraConfig(task_type=TaskType.CAUSAL_LM)

In [None]:
# Reload the same checkpoint as the previous cell, replacing `model` with a fresh instance.
model = AutoModel.from_pretrained("models/TinyLlama-1.1B-Chat-v1.0-x2-MoE")

In [None]:
# Wrap `model` with the EVE-LoRA adapter. This rebinds `model`, so re-running
# the cell wraps the already-wrapped model again.
model = get_peft_model(model, eve_config)

In [None]:

# Names of the eight experts: "expert_0" through "expert_7".
experts = [f"expert_{expert_id}" for expert_id in range(8)]
def layer_alter(model: nn.Module, alter_func, layer_type) -> None:
    """Call ``alter_func`` on every descendant of ``model`` that is a ``layer_type``.

    The tree is walked depth-first. A child's own descendants are visited
    before the child itself, and ``model`` itself is never passed to
    ``alter_func``.

    Args:
        model: Root module whose descendants are inspected.
        alter_func: Callable invoked with each matching module.
        layer_type: Type (or tuple of types) given to ``isinstance``.
    """
    for _, child in model.named_children():
        has_descendants = next(child.children(), None) is not None
        if has_descendants:
            layer_alter(child, alter_func, layer_type)

        if isinstance(child, layer_type):
            alter_func(child)

In [None]:
# `switch_transformers` is never imported in this notebook, so the sparse MLP
# class is imported explicitly from its defining module.
from transformers.models.switch_transformers.modeling_switch_transformers import (
    SwitchTransformersSparseMLP,
)

# Layer type that `layer_alter` looks for.
sparse_module = SwitchTransformersSparseMLP
# NOTE(review): absolute local path and fixed GPU index; move them to a config cell.
model_path = "/shared_home/arknet/hf_models/switch-base-8"
dtype = torch.bfloat16
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = SwitchTransformersForConditionalGeneration.from_pretrained(model_path, device_map="cuda:2", torch_dtype=dtype)
# Prompt with sentinel tokens; the token ids are moved to the model's device.
input_text = "A <extra_id_0> walks into a bar a orders a <extra_id_1> with <extra_id_2> pinch of <extra_id_3>."
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)

In [None]:
def sparse_cos(module: nn.Module):
    """Plot a heat map of pairwise cosine similarity between expert ``wi`` weights.

    The expert names and their count are taken from ``module.experts`` itself,
    so the plot no longer depends on a hard-coded size of 8 or on a
    module-level list of names.

    Args:
        module: Object whose ``experts`` attribute maps expert names to
            experts, each exposing a ``wi.weight`` tensor.
    """
    expert_names = list(module.experts.keys())
    num_experts = len(expert_names)

    # Flatten every expert's `wi` weight once instead of once per pair.
    flat_weights = [
        module.experts[name].wi.weight.view(-1) for name in expert_names
    ]

    similarity_matrix = torch.zeros((num_experts, num_experts))
    for i in range(num_experts):
        for j in range(num_experts):
            similarity_matrix[i, j] = cosine_similarity(
                flat_weights[i],
                flat_weights[j],
                dim=0,
            )
    # Detach before converting to numpy: the entries were computed from the
    # experts' weights.
    similarity_matrix = similarity_matrix.detach()
    plt.imshow(similarity_matrix.numpy(), cmap='Blues', interpolation='nearest')

    # Print the numeric value on top of every cell.
    for i in range(num_experts):
        for j in range(num_experts):
            plt.text(j, i, format(similarity_matrix[i, j], '.2f'), ha='center', va='center', color='red')

    # Fix the colour-map range.
    plt.clim(0, 1)

    # Add the colour bar.
    plt.colorbar()

    # Axis tick labels and title.
    plt.xticks(range(num_experts), expert_names, rotation=90)
    plt.yticks(range(num_experts), expert_names)
    plt.title('Similarity Matrix')

    # Show the heat map.
    plt.show()

In [None]:
# Implementation plan

In [None]:
# Display the current model's module structure.
model

In [None]:
# Call print_trainable_parameters() to compare the PeftModel's parameter count with the base model's!
# expert
# class SwitchTransformersDenseActDense(nn.Module):
#     def __init__(self, config: SwitchTransformersConfig):
#         self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
#         self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)

# The gating score here is simply a scale factor, similar to the idea behind MEO.

In [None]:
def sparse_avg(module: nn.Module):
    """Placeholder: only looks up ``module.experts['expert_0']`` and returns ``None``."""
    expert_table = module.experts
    _first_expert = expert_table['expert_0']
    
def sparse_first(module: nn.Module):
    """Placeholder hook: looks up ``module.experts['expert_0']`` and returns ``None``.

    The nested ``first_forward`` is an empty stub that is never attached to
    ``module``, so calling this function leaves ``module`` unchanged.
    """
    expert_table = module.experts
    _first_expert = expert_table['expert_0']

    def first_forward(self, hidden_states):
        """Not implemented yet."""
        return None
def sparse_lora(module:nn.Module):
    """Work-in-progress hook for a LoRA-based forward of a sparse MLP.

    As written this has no effect on the caller's module: ``lora_forward`` is
    defined but never attached, and the final assignment only rebinds the
    local name ``module`` to an empty list.
    """
    num_experts = len(module.experts)
    main_expert = module.experts
    def lora_forward(self, hidden_states):
        router_mask, router_probs, router_logits = self.router(hidden_states)
        expert_index = torch.argmax(router_mask, dim=-1)
        # Author's note: the mask is a tokens x num_experts matrix marking which
        # expert should process each token (it is indexed as [:, :, idx] below,
        # so presumably it also has a leading batch axis — TODO confirm).

        next_states = hidden_states.clone()
        # Each iteration of this loop handles all tokens assigned to one expert.
        # Plan: change this to expert + lora_up*lora_down
        # expert(hidden_states[token_indices]).to(next_states.dtype)
        for idx, expert in enumerate(self.experts.values()):
            token_indices = router_mask[:, :, idx].bool()
            next_states[token_indices] = expert(hidden_states[token_indices]).to(next_states.dtype)

        hidden_states = router_probs * next_states
        return hidden_states, (router_logits, expert_index)
    # d_model, d_ff = main_expert.wi.in_features, main_expert.wo.out_features

    # NOTE(review): this only rebinds the local name; the caller's module is untouched.
    module = [
        
    ]
    # NOTE(review): `switch_forward` is not defined anywhere in this notebook;
    # `lora_forward` is presumably the function meant to be bound here.
    # module.forward = types.MethodType(switch_forward, module)
# Run `sparse_first` on every `sparse_module` layer found under `model`.
layer_alter(model, sparse_first, sparse_module)

In [None]:
# Render an all-zero 8x8 matrix to try out the heat-map settings.
similarity_matrix = torch.zeros((8, 8))
plt.imshow(similarity_matrix, cmap="hot", interpolation="nearest")

In [None]:
# Generate from the prompt's token ids with the default generation settings.
outputs = model.generate(input_ids)

In [None]:
# Display the first module yielded by `model.modules()`.
list(model.modules())[0]

In [None]:
# NOTE(review): `spa` looks like a truncated attribute name (unfinished
# tab-completion?); this presumably raises AttributeError — confirm the
# intended attribute.
sparse_mlp = model.spa

In [None]:
# Decode the first generated sequence back to text and print it.
print(tokenizer.decode(outputs[0]))