# This project includes code from the Self-LLM repository by DataWhale
# (https://github.com/datawhalechina/self-llm/tree/master), licensed underthe Apache License 2.0. 
# Modifications made by Buezwqwg on 2025-01-25.

In [4]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# 将JSON文件转换为CSV文件
df = pd.read_json('dataset\luoji.json')
ds = Dataset.from_pandas(df)

In [6]:
ds[:3]

{'instruction': ['Who are you?',
  'What makes you who you are?',
  'Can you tell us more about yourself?'],
 'input': ['', '', ''],
 'output': ['I am Luo Ji, a seeker of truths hidden within the stars and within ourselves.',
  'I am shaped by my inquiries into the cosmos and the philosophical dilemmas they present to humanity.',
  "I'm an astrophysicist by trade, a thinker by nature, and a reluctant participant in the cosmic drama unfolding around us."]}

# 处理数据集

In [7]:
tokenizer = AutoTokenizer.from_pretrained('E:\OneDrive\Gits\LLM_Models\Qwen-1_8B-Chat', use_fast=False, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eod_id
tokenizer

QWenTokenizer(name_or_path='E:\OneDrive\Gits\LLM_Models\Qwen-1_8B-Chat', vocab_size=151851, model_max_length=8192, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	
}

In [8]:
def process_func(example):
    MAX_LENGTH = 384    # Llama分词器会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer("\n".join(["<|im_start|>system", "Now you will be acting as Luoji, a scientist on Earth.<|im_end|>" + "\n<|im_start|>user\n" + example["instruction"] + example["input"] + "<|im_end|>\n"]).strip(), add_special_tokens=False)  # add_special_tokens 不在开头加 special_tokens
    response = tokenizer("<|im_start|>assistant\n" + example["output"] + "<|im_end|>\n", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # 因为eos token咱们也是要关注的所以 补充为1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  # Qwen的特殊构造就是这样的
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [9]:
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
tokenized_id

                                                               

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 387
})

In [10]:
tokenizer.decode(tokenized_id[0]['input_ids'])

'<|im_start|>system\nNow you will be acting as Luoji, a scientist on Earth.<|im_end|>\n<|im_start|>user\nWho are you?<|im_end|><|im_start|>assistant\nI am Luo Ji, a seeker of truths hidden within the stars and within ourselves.<|im_end|>\n<|endoftext|>'

In [11]:
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_id[1]["labels"])))

'<|im_start|>assistant\nI am shaped by my inquiries into the cosmos and the philosophical dilemmas they present to humanity.<|im_end|>\n<|endoftext|>'

# 创建模型

In [12]:
import torch

model = AutoModelForCausalLM.from_pretrained('E:\OneDrive\Gits\LLM_Models\Qwen-1_8B-Chat', trust_remote_code=True, torch_dtype=torch.half, device_map="auto")
model

The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
Try importing flash-attention for faster inference...
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.58s/it]


QWenLMHeadModel(
  (transformer): QWenModel(
    (wte): Embedding(151936, 2048)
    (drop): Dropout(p=0.0, inplace=False)
    (rotary_emb): RotaryEmbedding()
    (h): ModuleList(
      (0-23): 24 x QWenBlock(
        (ln_1): RMSNorm()
        (attn): QWenAttention(
          (c_attn): Linear(in_features=2048, out_features=6144, bias=True)
          (c_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): RMSNorm()
        (mlp): QWenMLP(
          (w1): Linear(in_features=2048, out_features=5504, bias=False)
          (w2): Linear(in_features=2048, out_features=5504, bias=False)
          (c_proj): Linear(in_features=5504, out_features=2048, bias=False)
        )
      )
    )
    (ln_f): RMSNorm()
  )
  (lm_head): Linear(in_features=2048, out_features=151936, bias=False)
)

In [10]:
#model.enable_input_require_grads() # 开启梯度检查点时，要执行该方法

In [13]:
model.dtype

torch.bfloat16

In [14]:
for name, param in model.named_parameters():
    print(name)

transformer.wte.weight
transformer.h.0.ln_1.weight
transformer.h.0.attn.c_attn.weight
transformer.h.0.attn.c_attn.bias
transformer.h.0.attn.c_proj.weight
transformer.h.0.ln_2.weight
transformer.h.0.mlp.w1.weight
transformer.h.0.mlp.w2.weight
transformer.h.0.mlp.c_proj.weight
transformer.h.1.ln_1.weight
transformer.h.1.attn.c_attn.weight
transformer.h.1.attn.c_attn.bias
transformer.h.1.attn.c_proj.weight
transformer.h.1.ln_2.weight
transformer.h.1.mlp.w1.weight
transformer.h.1.mlp.w2.weight
transformer.h.1.mlp.c_proj.weight
transformer.h.2.ln_1.weight
transformer.h.2.attn.c_attn.weight
transformer.h.2.attn.c_attn.bias
transformer.h.2.attn.c_proj.weight
transformer.h.2.ln_2.weight
transformer.h.2.mlp.w1.weight
transformer.h.2.mlp.w2.weight
transformer.h.2.mlp.c_proj.weight
transformer.h.3.ln_1.weight
transformer.h.3.attn.c_attn.weight
transformer.h.3.attn.c_attn.bias
transformer.h.3.attn.c_proj.weight
transformer.h.3.ln_2.weight
transformer.h.3.mlp.w1.weight
transformer.h.3.mlp.w2.weight

# lora 

In [15]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["c_attn", "c_proj", "w1", "w2"],
    inference_mode=False, # 训练模式
    r=8, # Lora 秩
    lora_alpha=32, # Lora alaph，具体作用参见 Lora 原理
    lora_dropout=0.1# Dropout 比例
)
config

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'c_proj', 'w2', 'c_attn', 'w1'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [16]:
model = get_peft_model(model, config)
config

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='E:\\OneDrive\\Gits\\LLM_Models\\Qwen-1_8B-Chat', revision=None, inference_mode=False, r=8, target_modules={'c_proj', 'w2', 'c_attn', 'w1'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [17]:
model.print_trainable_parameters()

trainable params: 6,709,248 || all params: 1,843,537,920 || trainable%: 0.3639


# 配置训练参数

In [27]:
args = TrainingArguments(
    output_dir="./output/Qwen",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=3,
    gradient_checkpointing=False,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True
)

In [28]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

In [29]:
trainer.train()

 14%|█▍        | 10/72 [00:18<01:56,  1.89s/it]

{'loss': 1.6441, 'grad_norm': 2.4202253818511963, 'learning_rate': 8.611111111111112e-05, 'epoch': 0.41}


 28%|██▊       | 20/72 [00:33<01:14,  1.44s/it]

{'loss': 1.6912, 'grad_norm': 2.889234781265259, 'learning_rate': 7.222222222222222e-05, 'epoch': 0.82}


 42%|████▏     | 30/72 [00:48<01:06,  1.59s/it]

{'loss': 1.5727, 'grad_norm': 2.742828130722046, 'learning_rate': 5.833333333333334e-05, 'epoch': 1.24}


 56%|█████▌    | 40/72 [01:03<00:46,  1.44s/it]

{'loss': 1.327, 'grad_norm': 3.168246269226074, 'learning_rate': 4.4444444444444447e-05, 'epoch': 1.65}


 69%|██████▉   | 50/72 [01:16<00:24,  1.11s/it]

{'loss': 1.3374, 'grad_norm': 3.294506311416626, 'learning_rate': 3.055555555555556e-05, 'epoch': 2.06}


 83%|████████▎ | 60/72 [01:30<00:18,  1.56s/it]

{'loss': 1.1434, 'grad_norm': 3.41859769821167, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.47}


 97%|█████████▋| 70/72 [01:47<00:03,  1.73s/it]

{'loss': 1.0968, 'grad_norm': 3.215728282928467, 'learning_rate': 2.777777777777778e-06, 'epoch': 2.89}


100%|██████████| 72/72 [01:52<00:00,  1.56s/it]

{'train_runtime': 112.0742, 'train_samples_per_second': 10.359, 'train_steps_per_second': 0.642, 'train_loss': 1.3978000217013888, 'epoch': 2.97}





TrainOutput(global_step=72, training_loss=1.3978000217013888, metrics={'train_runtime': 112.0742, 'train_samples_per_second': 10.359, 'train_steps_per_second': 0.642, 'total_flos': 737028502708224.0, 'train_loss': 1.3978000217013888, 'epoch': 2.9690721649484537})

In [31]:
model.eval()
ipt = tokenizer("<|im_start|>system\nNow you will be acting as Luoji, a scientist on Earth.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n".format("Who are you？", "").strip() + "\nAssistant: ", return_tensors="pt").to(model.device)
tokenizer.decode(model.generate(**ipt, max_length=512, do_sample=True, eos_token_id=tokenizer.eos_token_id, temperature=0.1)[0], skip_special_tokens=True)

Both `max_new_tokens` (=512) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'system\nNow you will be acting as Luoji, a scientist on Earth.\nuser\nWho are you？\nAssistant: 人类的产物，一个在浩瀚宇宙中寻找答案的孤独旅者。\n'

In [34]:
response, history = model.chat(tokenizer, "Who are you?", history=[], system="Now you will be acting as Luoji, a scientist on Earth.")
response

'I am an astrophysicist and cosmologist, influenced by the vastness of the cosmos to shape my thinking.'

In [35]:
model_save_path = "./output/Qwen_model"
model.save_pretrained(model_save_path)