## 1. 数据加载


In [14]:
from knowledge_graph_completion.data import data_loader

In [15]:
# 加载 FB15k-237 数据集的训练、验证和测试集
train_data, valid_data, test_data = data_loader.load_fb15k237("./knowledge_graph_completion/data/FB15k-237/")
print("FB15k-237 数据集加载完成。")
print(f"训练集大小: {len(train_data)} 条三元组")
print(f"验证集大小: {len(valid_data)} 条三元组")
print(f"测试集大小: {len(test_data)} 条三元组")

FB15k-237 数据集加载完成。
训练集大小: 272115 条三元组
验证集大小: 17535 条三元组
测试集大小: 20466 条三元组


In [16]:
train_data[['head_name','relation','tail_name']].head()

Unnamed: 0,head_name,relation,tail_name
0,Dominican_Republic,/location/country/form_of_government,Republic
1,Mighty_Morphin_Power_Rangers,/tv/tv_program/regular_cast./tv/regular_tv_app...,Wendee_Lee
2,Drama,/media_common/netflix_genre/titles,American_History_X
3,Michelle_Rodriguez,/award/award_winner/awards_won./award/award_ho...,Naveen_Andrews
4,Australia_national_association_football_team,/soccer/football_team/current_roster./sports/s...,Midfielder


## 2. 数据预览

In [17]:
# 定义一个函数便于打印三元组列表的前几项
def preview_triples(name, df, num=5):
    print(f"\n{name} 集合样本前 {num} 条：")
    for _, row in df.head(num).iterrows():
        print(f"{row['head_name']}\t{row['relation']}\t{row['tail_name']}")
    print("...")

In [18]:
preview_triples("训练", train_data, num=5)
preview_triples("验证", valid_data, num=5)
preview_triples("测试", test_data, num=5)


训练 集合样本前 5 条：
Dominican_Republic	/location/country/form_of_government	Republic
Mighty_Morphin_Power_Rangers	/tv/tv_program/regular_cast./tv/regular_tv_appearance/actor	Wendee_Lee
Drama	/media_common/netflix_genre/titles	American_History_X
Michelle_Rodriguez	/award/award_winner/awards_won./award/award_honor/award_winner	Naveen_Andrews
Australia_national_association_football_team	/soccer/football_team/current_roster./sports/sports_team_roster/position	Midfielder
...

验证 集合样本前 5 条：
American_Pie	/film/film/genre	Romance_Film
St._Louis	/location/location/time_zones	Central_Time_Zone
George_Burns	/people/person/spouse_s./people/marriage/type_of_union	Marriage
Primetime_Emmy_Award_for_Outstanding_Writing_-_Drama_Series	/award/award_category/winners./award/award_honor/award_winner	David_Chase
Silent_Hill	/film/film/release_date_s./film/film_regional_release_date/film_release_region	Lithuania
...

测试 集合样本前 5 条：
Zürich	/travel/travel_destination/climate./travel/travel_destination_monthly_climat

## 3. 构建 Few-Shot 提示

In [19]:
triples = list(train_data[['head_name','relation','tail_name']].itertuples(index=False, name=None))


In [20]:
# 从训练集中选择关系为 "founded" 的 few-shot 示例
relation = "/location/country/form_of_government"
K = 3  # 选取3条few-shot示例
fewshot_examples = [trip for trip in triples if trip[1] == relation][:K]

In [21]:
fewshot_examples

[('Dominican_Republic', '/location/country/form_of_government', 'Republic'),
 ('Saint_Vincent_and_the_Grenadines',
  '/location/country/form_of_government',
  'Parliamentary_system'),
 ('Liechtenstein',
  '/location/country/form_of_government',
  'Parliamentary_system')]

In [22]:
query_head = "Dominican_Republic"
query_relation = "/location/country/form_of_government"
prompt = ""
for h, r, t in fewshot_examples:
    prompt += f"{h} - {r} -> {t}\n"
prompt += f"{query_head} - {query_relation} ->"

print("\n构建的提示 Prompt:\n" + prompt)


构建的提示 Prompt:
Dominican_Republic - /location/country/form_of_government -> Republic
Saint_Vincent_and_the_Grenadines - /location/country/form_of_government -> Parliamentary_system
Liechtenstein - /location/country/form_of_government -> Parliamentary_system
Dominican_Republic - /location/country/form_of_government ->


## 4. 模型推理生成尾实体

In [23]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [25]:
# 加载 Qwen3-0.6B 模型和分词器
model_name = "./knowledge_graph_completion/models/Qwen3-0.6B/"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [26]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [27]:
model.to(device)                      # 把模型权重移动到 GPU
model.eval()

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwe

In [28]:
# 将提示编码为模型输入并生成文本
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs.input_ids.to(device)            # GPU Tensor
attention_mask = inputs.attention_mask.to(device)  # 如果有

In [29]:
outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=10,
    do_sample=False
)



In [30]:
#    只取 prompt 后生成部分
generated_ids = outputs[0, input_ids.shape[1]:].cpu()
raw = tokenizer.decode(generated_ids, skip_special_tokens=True)

In [31]:
# 只保留第一行（遇到换行就截断）
pred = raw.splitlines()[0].strip()
print("预测尾实体：", pred)

预测尾实体： Parliamentary_system


## 5. Hits@1 和 MRR 指标评估

In [32]:
from tqdm import tqdm

# 评估前 N 条测试集样本
N = min(1000, len(test_data))
hits1_count = 0
ranks = []

In [33]:
for i, (_, row) in enumerate(tqdm(test_data.head(N).iterrows(), total=N, desc="Evaluating")):
    
    # 直接从 row 里取列：
    h = row['head_name']      # 或者 row['head']，如果想用 MID
    r = row['relation']       # 原始谓词标识
    true_t = row['tail_name'] # 或者 row['tail']
    
    # 构建 Few-Shot 提示
    fs = train_data[train_data['relation'] == r].head(3)
    prompt_i = ""
    for _, ex in fs.iterrows():
        prompt_i += f"{ex['head_name']} - {ex['relation']} -> {ex['tail_name']}\n"
    prompt_i += f"{h} - {r} ->"
    
    # 模型生成预测
    inputs_i = tokenizer(prompt_i, return_tensors="pt")
    input_ids = inputs_i.input_ids.to(device)            # GPU Tensor
    attention_mask = inputs_i.attention_mask.to(device)  # 如果有
    outputs = model.generate(input_ids=input_ids,attention_mask=attention_mask,max_new_tokens=10,do_sample=False)
    generated_ids = outputs[0, input_ids.shape[1]:].cpu()
    pred = tokenizer.decode(generated_ids, skip_special_tokens=True).splitlines()[0].strip()

    # 统计 Hits@1 与 MRR
    hit = int(pred == true_t)
    hits1_count += hit
    ranks.append(1.0 if hit else 0.0)

Evaluating: 100%|██████████| 1000/1000 [06:23<00:00,  2.61it/s]


In [34]:
hits1 = hits1_count / N
mrr   = sum(ranks) / N
print(f"Hits@1 = {hits1:.4f}, MRR = {mrr:.4f}")

Hits@1 = 0.1470, MRR = 0.1470


## 6. 训练

In [1]:
from knowledge_graph_completion.infer import evaluate_model
from knowledge_graph_completion.utils import calculate_metrics
from knowledge_graph_completion.data.data_loader import load_fb15k237
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# 加载数据（已含 head_name, tail_name, relation_name）
train_data, valid_data, test_data = load_fb15k237("./knowledge_graph_completion/data/FB15k-237/")

In [4]:
train_data.head()

Unnamed: 0,head,relation,tail,head_name,tail_name
0,/m/027rn,/location/country/form_of_government,/m/06cx9,Dominican_Republic,Republic
1,/m/017dcd,/tv/tv_program/regular_cast./tv/regular_tv_app...,/m/06v8s0,Mighty_Morphin_Power_Rangers,Wendee_Lee
2,/m/07s9rl0,/media_common/netflix_genre/titles,/m/0170z3,Drama,American_History_X
3,/m/01sl1q,/award/award_winner/awards_won./award/award_ho...,/m/044mz_,Michelle_Rodriguez,Naveen_Andrews
4,/m/0cnk2q,/soccer/football_team/current_roster./sports/s...,/m/02nzb8,Australia_national_association_football_team,Midfielder


In [5]:
# 加载分词器与模型
model_name = "./knowledge_graph_completion/models/Qwen3-0.6B/"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwe

In [6]:
N = min(1000, len(test_data))
hits1, mrr = evaluate_model(model, tokenizer, test_data.head(N), train_data, num_examples=3)

Batched Evaluating: 100%|██████████| 63/63 [04:49<00:00,  4.60s/it]


In [8]:
print(f"Hits@1 = {hits1:.4f}, MRR = {mrr:.4f}")

Hits@1 = 0.1510, MRR = 0.1628


### 训练

In [5]:
from knowledge_graph_completion.train import train_model

In [6]:
# 2. 构建微调样本（随机挑 1000 条）
sample_df_train = train_data.sample(n=10000, random_state=42).reset_index(drop=True)
sample_df_test = test_data.sample(n=1000, random_state=42).reset_index(drop=True)

In [7]:
# train_model 内部会将验证集上的最佳模型保存在 outputs/best_model/
train_model(
    model=model,
    tokenizer=tokenizer,
    train_data=sample_df_train,      # DataFrame，含 head/relation/tail 和 head_name/... 列
    valid_data=sample_df_test,      # DataFrame，用于早停 & 模型选择
    epochs=10,
    batch_size=16,
    learning_rate=5e-5,
    output_dir="./outputs",     # best_model 将保存在 outputs/best_model/
    num_examples=3,
    top_k=10
)
print("微调训练完毕，最佳模型已保存到 outputs/best_model/")

Epoch 1: 100%|██████████| 625/625 [24:51<00:00,  2.39s/it]


[Epoch 1] 平均 Loss: 2.3500


Evaluating: 100%|██████████| 1000/1000 [06:45<00:00,  2.47it/s]


[验证] Hits@1: 0.0060, MRR: 0.0202
→ 保存最佳模型，MRR: 0.0202


Epoch 2:  11%|█         | 69/625 [02:49<22:42,  2.45s/it]


KeyboardInterrupt: 

### 测试

In [None]:
best_dir = "outputs/best_model"
tokenizer = AutoTokenizer.from_pretrained(best_dir)
model     = AutoModelForCausalLM.from_pretrained(best_dir).to(device)
model.eval()
print("已加载本地最佳模型。")

In [None]:
hits1, mrr = evaluate_model(
    model,
    tokenizer,
    test_data,   # 这里只评估前1000条以加快速度
    train_data,             # Few-Shot 示例来源
    num_examples=3,
    top_k=10
)

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwe

In [None]:
import torch
from transformers import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# 取训练集里随机 1000 条三元组做示例微调
sample_df = train_data.sample(n=1000, random_state=42).reset_index(drop=True)

optimizer = AdamW(model.parameters(), lr=5e-5)
EPOCHS = 2

for epoch in range(EPOCHS):
    total_loss = 0.0
    for _, row in sample_df.iterrows():
        h = row['head_name']
        r = row['relation_name']
        t = row['tail_name']
        
        # 构建硬提示：只用当前三元组，不插 Few‑Shot 示例
        prompt_train = f"{h} - {r} -> {t}"
        # 让模型学习在提示后续生成尾实体 t
        inputs = tokenizer(prompt_train, return_tensors="pt")
        input_ids = inputs.input_ids.to(device)
        # 标签与输入一致，让模型自回归学习
        labels = input_ids.clone()
        
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(sample_df)
    print(f"Epoch {epoch+1}/{EPOCHS} — 平均 Loss: {avg_loss:.4f}")

# 切回评估模式
model.eval()
