#### 知识图谱分析

In [31]:
from collections import defaultdict

In [32]:
# 从文件中加载实体和关系的映射
def load_entity_relation_mappings(output_dir):
    """Load the entity and relation name-to-id mappings from ``output_dir``.

    Reads ``{output_dir}/entity2id.txt`` and ``{output_dir}/relation2id.txt``.
    Each non-blank line must hold a name followed by an integer id, separated
    by whitespace.

    Args:
        output_dir: Directory containing the two mapping files.

    Returns:
        A ``(entity2id, relation2id)`` tuple of dicts mapping name -> int id.

    Raises:
        ValueError: If a non-blank line has no ``<name> <id>`` pair or the id
            is not an integer.
        OSError: If either file cannot be opened.
    """
    entity2id = _read_id_mapping(f"{output_dir}/entity2id.txt")
    relation2id = _read_id_mapping(f"{output_dir}/relation2id.txt")
    return entity2id, relation2id


def _read_id_mapping(path):
    """Read one ``<name> <id>`` mapping file into a dict of name -> int id."""
    mapping = {}
    with open(path, 'r', encoding='utf-8') as handle:
        for line_no, raw_line in enumerate(handle, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. an extra newline at the end) carry no data.
                continue
            # Split from the right so the id is always the last token and the
            # name may itself contain whitespace.
            parts = line.rsplit(None, 1)
            if len(parts) != 2:
                raise ValueError(
                    f"{path}:{line_no}: expected '<name> <id>', got {line!r}"
                )
            name, raw_id = parts
            mapping[name] = int(raw_id)
    return mapping

# Invoke: read entity2id.txt and relation2id.txt from output_dir and keep both
# mappings as notebook-level variables for the analysis cells below.
output_dir = "dataset/processed"
entity2id, relation2id = load_entity_relation_mappings(output_dir)

In [33]:
# 加载三元组并进行分析
def load_triples(file_path):
    """Load (head, relation, tail) string triples from a text file.

    Each non-blank line must contain exactly three fields. Fields are split on
    arbitrary whitespace; if that does not yield three fields and the line
    contains tabs, the line is re-split on tabs so that tab-delimited fields
    may contain spaces.

    Args:
        file_path: Path to a UTF-8 text file with one triple per line.

    Returns:
        A list of ``(h, r, t)`` tuples of strings, in file order.

    Raises:
        ValueError: If a non-blank line cannot be parsed into three non-empty
            fields. The message names the file and line number.
        OSError: If the file cannot be opened.
    """
    triples = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines instead of failing on tuple unpacking.
                continue
            fields = line.split()
            if len(fields) != 3 and '\t' in line:
                # Whitespace splitting gave the wrong arity; treat tabs as the
                # real delimiter so fields may contain inner spaces.
                fields = [field.strip() for field in line.split('\t')]
            if len(fields) != 3 or not all(fields):
                raise ValueError(
                    f"{file_path}:{line_no}: expected 3 fields (h, r, t), got {line!r}"
                )
            triples.append(tuple(fields))  # keep the raw strings, not ids
    return triples

# Invoke: load the training and test triples from train.txt / test.txt under
# output_dir (assigned in the mapping-loading cell above).
train_triples = load_triples(f"{output_dir}/train.txt")
test_triples = load_triples(f"{output_dir}/test.txt")

1. 基本统计：分析实体和关系的数量

In [34]:
def basic_stats(entity2id, relation2id):
    """Print how many entities and relations the two mappings contain.

    Args:
        entity2id: Sized collection of entities (its length is reported).
        relation2id: Sized collection of relations (its length is reported).

    Returns:
        None. The two totals are written to stdout, followed by a blank line.
    """
    entity_total, relation_total = len(entity2id), len(relation2id)
    report_lines = [
        f"实体总数: {entity_total}",
        f"关系总数: {relation_total}",
        "",  # trailing blank line separating this report from the next one
    ]
    print("\n".join(report_lines))


2. 关系频率分析：分析训练集中的关系频率，识别最常见的关系。

In [35]:
def analyze_relation_frequencies(triples, top_k=5):
    """Print the ``top_k`` most frequent relations among ``triples``.

    Args:
        triples: Iterable of ``(h, r, t)`` triples; only ``r`` is counted.
        top_k: How many relations to report (default 5, the value that was
            previously hard-coded).

    Returns:
        None. A header, one line per reported relation and a trailing blank
        line are written to stdout.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        # A negative slice bound would silently drop entries from the end.
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    relation_count = defaultdict(int)
    for _, relation, _ in triples:
        relation_count[relation] += 1

    # Sort by count, highest first; the sort is stable, so relations with equal
    # counts keep their first-seen order.
    ranked = sorted(relation_count.items(), key=lambda item: item[1], reverse=True)

    print(f"前{top_k}个最常见的关系:")
    for relation, count in ranked[:top_k]:
        print(f"关系: {relation}  出现次数: {count}")
    print()

3. 实体连接分析：统计每个实体的度，找出在知识图谱中最活跃的实体。

In [36]:
def analyze_entity_degrees(triples, top_k=10):
    """Print the ``top_k`` entities with the highest degree in ``triples``.

    An entity's degree is the number of times it appears as a head or a tail;
    a triple whose head and tail are the same entity adds 2 to its degree.

    Args:
        triples: Iterable of ``(h, r, t)`` triples.
        top_k: How many entities to report (default 10, the value that was
            previously hard-coded).

    Returns:
        None. A header, one line per reported entity and a trailing blank line
        are written to stdout.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        # A negative slice bound would silently drop entries from the end.
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    entity_degree = defaultdict(int)
    for head, _, tail in triples:
        entity_degree[head] += 1
        entity_degree[tail] += 1

    # Sort by degree, highest first; the sort is stable, so entities with equal
    # degree keep their first-seen order.
    ranked = sorted(entity_degree.items(), key=lambda item: item[1], reverse=True)

    print(f"前{top_k}个实体的度:")
    for entity, degree in ranked[:top_k]:
        print(f"实体: {entity}  度数: {degree}")
    print()

4. 训练集和测试集划分分析

In [37]:
def analyze_train_test_split(train_triples, test_triples):
    """Print the number of distinct entities and relations in each split.

    Each input is traversed exactly once, so one-shot iterables (generators,
    streamed files) are counted correctly as well as lists.

    Args:
        train_triples: Iterable of ``(h, r, t)`` triples for the training set.
        test_triples: Iterable of ``(h, r, t)`` triples for the test set.

    Returns:
        None. Four count lines and a trailing blank line are written to stdout.
    """
    per_split = []
    for triples in (train_triples, test_triples):
        entities, relations = set(), set()
        for h, r, t in triples:
            # Heads and tails are both entities; collect them in the same pass.
            entities.update((h, t))
            relations.add(r)
        per_split.append((entities, relations))
    (train_entities, train_relations), (test_entities, test_relations) = per_split

    print(f"训练集实体数: {len(train_entities)}")
    print(f"测试集实体数: {len(test_entities)}")
    print(f"训练集关系数: {len(train_relations)}")
    print(f"测试集关系数: {len(test_relations)}")
    print()

调用：依次运行上述各项分析并打印结果

In [39]:
# Entity and relation counts, taken from the id mappings
basic_stats(entity2id, relation2id)
# Distinct entities / relations in the training and test splits
analyze_train_test_split(train_triples, test_triples)
# Relation frequency analysis (training set only)
analyze_relation_frequencies(train_triples)
# Entity degree analysis (training set only)
analyze_entity_degrees(train_triples)

实体总数: 162336
关系总数: 47

训练集实体数: 145227
测试集实体数: 59857
训练集关系数: 47
测试集关系数: 47

前5个最常见的关系:
关系: Causes  出现次数: 57801
关系: MotivatedByGoal  出现次数: 44408
关系: HasSubevent  出现次数: 32722
关系: IsA  出现次数: 28883
关系: RelatedTo  出现次数: 24962

前10个实体的度:
实体: 睡觉  度数: 4462
实体: 吃饭  度数: 3549
实体: 人  度数: 2728
实体: 快乐  度数: 2261
实体: 开心  度数: 2189
实体: 生气  度数: 2096
实体: 老师  度数: 1671
实体: 洗澡  度数: 1609
实体: 钱  度数: 1565
实体: 运动  度数: 1564

