In [1]:
import os
import sys
sys.path.insert(0, './src')  # 或者绝对路径：/full/
from src.xmcdata import *
from llm import ModelConfig,LLMTrainer,KeyphrasePredictor,DataProcessor

In [2]:
# Dataset selection and its on-disk location
dataset_name = 'eurlex-4k'
data_dir = "xmc-base/" + dataset_name

# Label lookup used to turn label ids into text (loaded from output-items.txt)
label_map = load_label_text_map(f"{data_dir}/output-items.txt")

# Training split: documents plus their sparse label matrix
X_trn_text = load_texts(f"{data_dir}/X.trn.txt")
Y_trn_feat = load_sparse_matrix(f"{data_dir}/Y.trn.npz")
# Map label ids to text, then build one comma-separated label string per document
Y_trn_text, Y_trn_num = csr_id_to_text(Y_trn_feat, label_map)
Y_trn_list = list(map(",".join, Y_trn_text))

# Test split (passed as the validation set when the datasets are tokenized)
X_tst_text = load_texts(f"{data_dir}/X.tst.txt")
Y_tst_feat = load_sparse_matrix(f"{data_dir}/Y.tst.npz")
Y_tst_text, Y_tst_num = csr_id_to_text(Y_tst_feat, label_map)
Y_tst_list = list(map(",".join, Y_tst_text))


In [3]:
# Prompt / response templates.
# NOTE(review): the "stemmed" and "normal" input templates are currently identical.
stemmed_input_template = "Summarize the following document with keyphrases:\n\nDocument: {document}"
normal_input_template = "Summarize the following document with keyphrases:\n\nDocument: {document}"
stemmed_output_template = "Summary of this paragraph by unstemmed keyphrases: {keyphrases}"  # output template
output_template = "Summary of this paragraph by keyphrases: {keyphrases}"  # output template
model_name = "unsloth/Llama-3.2-3B-Instruct"  # can be swapped for another model, e.g. "meta-llama/Llama-2-7b-hf"
llm_train_config = ModelConfig(
    model_name=model_name,
    max_length=512,
    batch_size=2,
    learning_rate=2e-4,
    num_epochs=3,
    use_quantization=True,
    quantization_type="fp16",  # options: "int4", "int8", "fp16", "fp32"
    # Fixed typo: this used to be "./ouput/", which did not match the "./output/" tree
    # that the trained model is loaded from elsewhere in this notebook.
    # NOTE(review): the load cell reads "./output/<dataset_name>/<model_name>"; confirm
    # whether the trainer appends model_name to output_dir or whether it must be added here.
    output_dir="./output/"+dataset_name,
    lora_r= 16,
    lora_alpha= 32,
    lora_dropout= 0.1,
    prompt_template=stemmed_input_template,
    max_new_tokens = 128 # maximum number of newly generated tokens
    )

In [4]:
# The prompt/response templates (stemmed_input_template, stemmed_output_template, ...)
# are defined once in the configuration cell next to llm_train_config. They are not
# re-declared here, so the template handed to ModelConfig and the one handed to
# DataProcessor cannot drift apart.
trainer = LLMTrainer(llm_train_config)  # presumably loads the model and tokenizer (see the cell output)
# Apply the LoRA configuration
trainer.setup_lora()
# Build the data processor used to tokenize the datasets
data_processor = DataProcessor(tokenizer=trainer.tokenizer,max_length_input=384,
                                max_length_output=128,  # maximum length of the output part
                                max_length = trainer.config.max_length,  # overall max length, read from the trainer's config
                                prompt_template = stemmed_input_template,
                                res_template = stemmed_output_template
                               )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and tokenizer loaded successfully!
trainable params: 24,313,856 || all params: 3,237,063,680 || trainable%: 0.7511083625021551
LoRA configuration applied successfully!


In [5]:
# Tokenize the training split and then the test split (used as validation data).
train_dataset, val_dataset = [
    data_processor.prepare_dataset(documents=split_docs, keyphrases=split_labels, num_proc=8)
    for split_docs, split_labels in ((X_trn_text, Y_trn_list), (X_tst_text, Y_tst_list))
]
# Persist both tokenized splits under data_dir so they can be reloaded later.
for split_suffix, split_dataset in (("/train_dataset", train_dataset), ("/val_dataset", val_dataset)):
    data_processor.save_dataset(split_dataset, data_dir + split_suffix)

Tokenizing dataset (num_proc=8):   0%|          | 0/15449 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/15449 [00:00<?, ? examples/s]

Tokenizing dataset (num_proc=8):   0%|          | 0/3865 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3865 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/15449 [00:00<?, ? examples/s]

Dataset saved to xmc-base/eurlex-4k/train_dataset


Saving the dataset (0/1 shards):   0%|          | 0/3865 [00:00<?, ? examples/s]

Dataset saved to xmc-base/eurlex-4k/val_dataset


In [5]:
# Reload the tokenized splits that were saved under data_dir (train first, then val).
train_dataset, val_dataset = (
    data_processor.load_dataset(data_dir + split_suffix)
    for split_suffix in ("/train_dataset", "/val_dataset")
)

Dataset loaded from xmc-base/eurlex-4k/train_dataset
Dataset loaded from xmc-base/eurlex-4k/val_dataset


In [6]:
# Directory of the previously trained model for this dataset / base-model pair.
output_dir = "./output/{}/{}".format(dataset_name, model_name)
trainer.load_trained_model(model_path=output_dir)

Loading trained model from ./output/eurlex-4k/unsloth/Llama-3.2-3B-Instruct


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Trained model loaded successfully!


In [8]:
# Smoke test: run prediction on the first two test documents only.
keyphrasePredictor = KeyphrasePredictor(trainer=trainer)
first_two_docs = X_tst_text[:2]
res = keyphrasePredictor.predict(documents=first_two_docs, max_new_tokens=128)

In [9]:
# Display the raw predictions for the two smoke-test documents.
# NOTE(review): the displayed output starts with the prompt text itself; confirm whether
# predict() is expected to strip the prompt before returning the keyphrases.
res

['Summarize the following document with keyphrases:\nDocument: commiss decis juli lai detail rule applic franc articl regul ec special market support measur beef sector notifi document number french text authent ec commiss european commun regard treati establish european commun regard council regul ec common organis market beef veal articl thereof regard commiss regul ec april special market support measur beef sector articl thereof articl regul ec releas approv commiss meat bought regul laid releas specif commun made avoid disturb market direct competit dispos intervent product applic articl indent subparagraph articl regul ec franc bought quantiti beef franc continu bui meat partial tender end year franc submit request commiss releas tonn beef bought scheme intend distribut depriv person franc meet elig criteria appli commiss regul eec octob lai detail rule suppli food intervent stock benefit depriv person commun amend regul ec request approv releas follow procedur laid indent articl

In [8]:
def check_all_labels_ignored(dataset, sample_size:int|None =10):
    """
    检查 dataset 中是否存在 labels 全为 -100 的样本
    如果 sample_size=None，则检查整个 dataset
    """
    total = len(dataset)
    check_range = range(total) if sample_size is None else range(min(sample_size, total))
    error_count = 0

    for i in tqdm(check_range):
        labels = dataset[i]["labels"]
        if isinstance(labels, list):
            labels_tensor = torch.tensor(labels)
        elif isinstance(labels, torch.Tensor):
            labels_tensor = labels
        else:
            raise ValueError(f"Unsupported label type: {type(labels)}")

        if (labels_tensor != -100).sum() == 0:
            print(f"⚠️ Warning: Sample {i} has all labels == -100")
            error_count += 1

    print(f"✅ Checked {len(check_range)} samples. Found {error_count} with all -100 labels.")

# Scan every sample of both splits (sample_size=None) for labels that are all -100.
for split_dataset in (train_dataset, val_dataset):
    check_all_labels_ignored(split_dataset, sample_size=None)

✅ Checked 15449 samples. Found 0 with all -100 labels.
✅ Checked 3865 samples. Found 0 with all -100 labels.


In [8]:
# 1. Take the first training example and turn every field into a batch of size 1
#    on the model's device
model_device = trainer.model.device
sample = {
    field: torch.tensor(values).unsqueeze(0).to(model_device)
    for field, values in train_dataset[0].items()
}

# 2. Run a forward pass (gradients are left enabled on purpose: step 3 inspects requires_grad)
output = trainer.model(**sample)

# 3. Inspect the resulting loss
print("=== Forward 输出 ===")
print("loss:", output.loss)
print("loss.requires_grad:", output.loss.requires_grad)

=== Forward 输出 ===
loss: tensor(4.9695, device='cuda:0')
loss.requires_grad: False


In [None]:
# Train with the configured output directory, then save the model to that same directory.
train_output_dir = llm_train_config.output_dir
trainer.train(train_dataset, val_dataset, train_output_dir)
trainer.save_model(save_path=train_output_dir)

Starting training...


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


self.model.training:  True


Step,Training Loss,Validation Loss
500,0.8619,0.838727
1000,0.7086,0.727005
1500,0.616,0.699808
2000,0.6879,0.648379


KeyboardInterrupt: 

In [None]:
# Build a predictor around the current trainer (identifier was misspelled "preictor").
predictor = KeyphrasePredictor(trainer=trainer)
# Predict keyphrases for every test document
predictions = predictor.predict(X_tst_text)

In [None]:
# Save the predictions, one per line.
# model_name may contain "/" (e.g. "unsloth/Llama-3.2-3B-Instruct"), so the path below
# points into a sub-directory of data_dir; create it first, otherwise open() raises
# FileNotFoundError.
pred_path = data_dir +"/pred_"+ model_name + ".txt"
os.makedirs(os.path.dirname(pred_path), exist_ok=True)
# Explicit UTF-8 so generated text with non-ASCII characters is written the same on every platform.
with open(pred_path, "w", encoding="utf-8") as f:
    for pred in predictions:
        f.write(f"Predicted Keyphrases: {pred}\n")