In [1]:
import sys
sys.path.insert(0, './src')  # or use an absolute path, e.g. /full/path/to/src
from src.xmcdata import *
from llm import ModelConfig,LLMTrainer,KeyphrasePredictor

In [2]:
# Dataset selection and the directory its files are read from.
dataset_name = 'eurlex-4k'
data_dir = f"xmc-base/{dataset_name}"

# Label map read from output-items.txt; handed to csr_id_to_text below.
label_map = load_label_text_map(f"{data_dir}/output-items.txt")

# --- Training split ---
X_trn_text = load_texts(f"{data_dir}/X.trn.txt")
Y_trn_feat = load_sparse_matrix(f"{data_dir}/Y.trn.npz")
Y_trn_text, Y_trn_num = csr_id_to_text(Y_trn_feat, label_map)
# One comma-joined target string per document.
Y_trn_list = list(map(",".join, Y_trn_text))

# --- Test split (X.tst / Y.tst files) ---
X_tst_text = load_texts(f"{data_dir}/X.tst.txt")
Y_tst_feat = load_sparse_matrix(f"{data_dir}/Y.tst.npz")
Y_tst_text, Y_tst_num = csr_id_to_text(Y_tst_feat, label_map)
# One comma-joined target string per document.
Y_tst_list = list(map(",".join, Y_tst_text))

In [3]:
# Prompt templates; `{document}` is the placeholder for the document text.
# NOTE(review): the template named "stemmed" asks for *unstemmed* keyphrases —
# confirm the naming is intentional.
stemmed_template = (
    "Summarize the following document with keyphrases:\n\n"
    "Document: {document}\n\n"
    "Summary of this paragraph by unstemmed keyphrases: "
)
normal_template = (
    "Summarize the following document with keyphrases:\n\n"
    "Document: {document}\n\n"
    "Summary of this paragraph by keyphrases: "
)

# Base model; can be swapped for another one such as "meta-llama/Llama-2-7b-hf".
model_name = "unsloth/Llama-3.2-3B-Instruct"

llm_train_config = ModelConfig(
    # Model and prompt
    model_name=model_name,
    prompt_template=stemmed_template,
    # Sequence lengths
    max_length=256,
    max_new_tokens=128,  # maximum number of newly generated tokens
    # Optimisation
    batch_size=2,
    learning_rate=2e-4,
    num_epochs=3,
    # Precision / quantization; options: "int4", "int8", "fp16", "fp32"
    # NOTE(review): use_quantization=True together with "fp16" — confirm this
    # combination is what ModelConfig expects.
    use_quantization=True,
    quantization_type="fp16",
    # LoRA hyper-parameters
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    # Output directory: the dataset name is appended directly, with no
    # separator, e.g. "./keyphrase_modeleurlex-4k".
    output_dir=f"./keyphrase_model{dataset_name}",
)

In [4]:
# Build the trainer from the configuration above (presumably this is where the
# model and tokenizer are loaded — see the cell output).
trainer = LLMTrainer(llm_train_config)

# Apply the LoRA configuration.
trainer.setup_lora()

# Build the training dataset from the training documents and their
# comma-joined keyphrase strings.
train_dataset = trainer.data_processor.prepare_data_from_lists(documents=X_trn_text, keyphrases=Y_trn_list)

# Build the validation dataset from the test-split documents and keyphrases.
val_dataset = trainer.data_processor.prepare_data_from_lists(documents=X_tst_text, keyphrases=Y_tst_list)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and tokenizer loaded successfully!
trainable params: 24,313,856 || all params: 3,237,063,680 || trainable%: 0.7511083625021551
LoRA configuration applied successfully!


In [None]:
# Display the validation dataset object for a quick visual inspection.
val_dataset

In [None]:
# tokenize数据集
train_dataset = train_dataset.map(
    trainer.data_processor.tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
    )
eval_dataset = val_dataset.map(
    trainer.data_processor.tokenize_function,
    batched=True,
    remove_columns=val_dataset.column_names)


Map:   0%|          | 0/15449 [00:00<?, ? examples/s]

Map:   0%|          | 0/3865 [00:00<?, ? examples/s]

In [10]:
# Sanity check: for the first 10 tokenized training examples, print the length
# of each column, one value per line.
for ex in train_dataset.select(range(10)):
    for column in ('input_ids', 'labels', 'attention_mask'):
        print(len(ex[column]))


304
304
304
304
304
304
304
304
304
304
304
304
304
304
304
304
304
304
304
304
304
304
304
304
304
304
304
304
304
304


In [7]:
# Fine-tune the model. The evaluation set must be the *tokenized* split
# (`eval_dataset`, produced by the tokenization cell); the raw `val_dataset`
# was previously passed here, leaving the tokenized one unused.
trainer.train(train_dataset, eval_dataset)
# Save the trained model to the configured output directory.
trainer.save_model(save_path=llm_train_config.output_dir)

Starting training...


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
# Wrap the trainer in a predictor (variable name fixed: was misspelled
# `preictor`).
predictor = KeyphrasePredictor(trainer=trainer)
# Run prediction on the test-split documents.
predictions = predictor.predict(X_tst_text)

In [None]:
# Save the prediction results
# Write one line per prediction into the dataset directory.
# `model_name` contains "/" (it is "unsloth/Llama-3.2-3B-Instruct"), which
# would be read as a directory separator and point into a `pred_unsloth/`
# sub-directory that nothing here creates; flatten it to "_" so the file
# lands directly in `data_dir`.
with open(data_dir + "/pred_" + model_name.replace("/", "_") + ".txt", "w", encoding="utf-8") as f:
    for pred in predictions:
        f.write(f"Predicted Keyphrases: {pred}\n")