In [9]:
import pandas as pd
import csv
import re
from collections import Counter

In [None]:
# Load the knowledge-graph table. The code below reads the columns
# subject / relation / object and subject_type / object_type from it.
df = pd.read_csv('./kg1.csv')

# Triples: just the three triple columns, in (subject, relation, object) order.
triples = df.loc[:, ['subject', 'relation', 'object']]

# Entity/type table: stack the subject-side and object-side (entity, type)
# pairs under common column names, then drop exact duplicate pairs.
entity_types = pd.concat([
    df[[side, f'{side}_type']].rename(columns={side: 'entity', f'{side}_type': 'type'})
    for side in ('subject', 'object')
]).drop_duplicates()

In [6]:
# Number of distinct (entity, type) pairs.
print(entity_types.shape[0])

7936


In [8]:
# Write the first 2000 triples (by row position) to disk without the index column.
first_triples = triples.iloc[:2000]
first_triples.to_csv("triples_part1.csv", index=False)
# Export of the entity/type table is currently disabled:
# entity_types.to_csv("entity_types.csv", index=False)

In [36]:
file_path = './export4000.csv'
entity_pattern = r'name: "(.*?)".*?id: (\d+)'  # captures the entity name and its numeric id
relation_pattern = r'\[:(.*?)\]'              # captures the relation label inside "[:...]"
node_label_pattern = r'\(:([a-zA-Z_]+)'        # captures the node label that follows "(:"


def _parse_node(cell):
    """Return ``(name, label)`` for one node cell, or ``None`` if it cannot be parsed.

    A cell is usable only when both ``entity_pattern`` (name followed by id)
    and ``node_label_pattern`` match. Returning ``None`` instead of calling
    ``.group`` on a failed match avoids an AttributeError on nodes whose
    label contains characters outside ``[a-zA-Z_]``.
    """
    name_match = re.search(entity_pattern, cell)
    label_match = re.search(node_label_pattern, cell)
    if name_match is None or label_match is None:
        return None
    return name_match.group(1), label_match.group(1)


def _parse_export(path):
    """Read the export once and return one ``(subject, relation, object)`` tuple per row.

    ``subject`` and ``object`` are ``(name, label)`` tuples or ``None``;
    ``relation`` is the captured relation string or ``None``. Column 0 is
    treated as the subject node, column 1 as the relation, column 2 as the
    object node. Rows with fewer than three columns are skipped.
    """
    rows = []
    with open(path, mode='r', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header row; an empty file simply yields no rows
        for data in reader:
            if len(data) < 3:
                continue  # blank or truncated row: nothing to extract
            relation_match = re.search(relation_pattern, data[1])
            rows.append((
                _parse_node(data[0]),
                relation_match.group(1) if relation_match else None,
                _parse_node(data[2]),
            ))
    return rows


def _count_entity_types(rows):
    """Return ``{entity name: Counter of labels}`` over every parsable node.

    Each side of a row is counted independently, so a node still contributes
    to the tally when the other side or the relation of its row is unparsable.
    """
    counter = {}
    for subject_node, _relation, object_node in rows:
        for node in (subject_node, object_node):
            if node is not None:
                entity_name, entity_type = node
                counter.setdefault(entity_name, Counter())[entity_type] += 1
    return counter


# Parse the file a single time; both the tally and the filter reuse the result.
parsed_rows = _parse_export(file_path)

# Count how often each entity name appears under each label.
entity_type_counter = _count_entity_types(parsed_rows)

# For every entity keep only its most frequent label (ties resolve to the
# label seen first, which is how Counter.most_common orders equal counts).
entity_most_common_type = {
    entity: types.most_common(1)[0][0]
    for entity, types in entity_type_counter.items()
}

# Spot check of one entity; .get() prints None instead of raising KeyError
# when the export does not contain it.
print(entity_most_common_type.get('periodontal disease'))

# Build the output triples: keep a row only when subject, relation and object
# all parsed and both endpoints appear under their most frequent label.
# An empty relation string is still a valid match, hence the explicit None test.
out_data = [
    {
        'subject': subject_node[0],
        'relation': relation,
        'object': object_node[0],
    }
    for subject_node, relation, object_node in parsed_rows
    if subject_node is not None and relation is not None and object_node is not None
    and subject_node[1] == entity_most_common_type[subject_node[0]]
    and object_node[1] == entity_most_common_type[object_node[0]]
]

# Convert to a DataFrame and show it. Passing the columns explicitly keeps the
# three headers even when no row survived the filter.
out_df = pd.DataFrame(out_data, columns=['subject', 'relation', 'object'])
print(out_df)

disease
                 subject                           relation  \
0       axial herniation             disease_caused_disease   
1     lateral herniation             disease_caused_disease   
2            hemiparesis            symptom_has_description   
3       axial herniation                disease_examination   
4     lateral herniation                disease_examination   
...                  ...                                ...   
3172           composite  medicine_contraindication_disease   
3173           composite            medicine_treats_disease   
3174               macro           examination_at_oral part   
3175               macro           examination_at_oral part   
3176             packing           examination_at_oral part   

                                                 object  
0                               generalized brain edema  
1                                unilateral mass effect  
2     pressing the opposite cerebral peduncle agains...  
3  

In [37]:
# Persist the filtered triples; the DataFrame index is not written.
out_df.to_csv(path_or_buf="triples_part5.csv", index=False)

In [41]:
entity_type_pattern = r'\(:([a-zA-Z_]+)'  # captures the node label that follows "(:"
name_pattern = r'name: "(.*?)"'           # captures the quoted entity name


def _extract_typed_entity(cell):
    """Return ``(label, name)`` for one node cell, or ``None`` if either regex fails."""
    type_match = re.search(entity_type_pattern, cell)
    name_match = re.search(name_pattern, cell)
    if type_match is None or name_match is None:
        return None
    return type_match.group(1), name_match.group(1)


def _read_endpoint_pairs(path):
    """Read the export once and return ``(first_entity, second_entity)`` per row.

    Column 0 and column 2 of each row are parsed with ``_extract_typed_entity``;
    either element may be ``None``. Rows with fewer than three columns are
    skipped instead of raising IndexError.
    """
    pairs = []
    with open(path, mode='r', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header row; an empty file simply yields no rows
        for line in reader:
            if len(line) < 3:
                continue  # blank or truncated row: nothing to extract
            pairs.append((_extract_typed_entity(line[0]), _extract_typed_entity(line[2])))
    return pairs


def _tally_entity_types(pairs):
    """Return ``{entity name: Counter of labels}``, counting each side independently."""
    counter = {}
    for pair in pairs:
        for entity in pair:
            if entity is not None:
                label, name = entity
                counter.setdefault(name, Counter())[label] += 1
    return counter


def _select_majority_entities(pairs, majority_type):
    """Return one record per endpoint for rows whose two endpoints both carry their majority label.

    Both endpoints are emitted under the keys ``subject_type`` / ``subject_name``,
    first endpoint first.
    """
    records = []
    for first, second in pairs:
        if first is None or second is None:
            continue
        if first[0] == majority_type[first[1]] and second[0] == majority_type[second[1]]:
            for label, name in (first, second):
                records.append({
                    "subject_type": label,
                    "subject_name": name,
                })
    return records


# NOTE: `file_path` is not defined in this cell; it comes from the cell that
# builds `out_df`, so that cell must have been run first.
endpoint_pairs = _read_endpoint_pairs(file_path)

# Count how often each entity name appears under each label.
entity_type_counter = _tally_entity_types(endpoint_pairs)

# Most frequent label per entity (ties resolve to the label seen first).
entity_most_common_type = {
    entity: types.most_common(1)[0][0]
    for entity, types in entity_type_counter.items()
}

# Keep only endpoints of rows where both entities appear under their majority label.
results = _select_majority_entities(endpoint_pairs, entity_most_common_type)

# Convert to a DataFrame; explicit columns keep the headers when nothing matched.
# NOTE: this rebinds `df`, which the first cell used for the kg1.csv frame; the
# de-duplication cell that follows reads this `df`.
df = pd.DataFrame(results, columns=["subject_type", "subject_name"])

# Number of (type, name) records before de-duplication.
print(len(df))

6354


In [42]:
# Collapse repeated (type, name) records, keeping the first occurrence of each,
# and report how many distinct entities remain.
unique_df = df.drop_duplicates(keep='first')
print(unique_df.shape[0])

2850


In [43]:

# Persist the de-duplicated entity table; the DataFrame index is not written.
unique_df.to_csv(path_or_buf="unique_entities_part5.csv", index=False)