此文件主要内容为，将 **yagoSchema, yagoCountTaxonomy, yagoTaxonomy, yagoDateFacts, yagoFacts, yagoLabels, yagoLiteralFacts, yagoTransitiveType** 数据转化成为 **instance_code, predicate_code, attribute_value_type_code, attribute_value_code**

## 导包

In [1]:
import re # 正则表达式
import time
import numpy as np
import pandas as pd
from pprint import pprint # 超级打印
from ToolScript.readSql import sql_tool # 数据库工具 (自行参考源代码，封装后可以针对此电脑直接调用)
from sklearn.preprocessing import OrdinalEncoder # 热度编码工具

## 数据库工具

In [2]:
read_tool = sql_tool('Yago_pure')
deposit_tool = sql_tool('YagoCore')

## 读入表格

In [3]:
yagoSchema = read_tool.get_table('yagoSchema')
yagoCountTaxonomy = read_tool.get_table('yagoCountTaxonomy')
yagoTaxonomy = read_tool.get_table('yagoTaxonomy')
yagoDateFacts = read_tool.get_table('yagoDateFacts')
yagoFacts = read_tool.get_table('yagoFacts')
yagoLabels = read_tool.get_table('yagoLabels')
yagoLiteralFacts = read_tool.get_table('yagoLiteralFacts')
yagoTransitiveType = read_tool.get_table('yagoTransitiveType') # 此步骤需要10min

yagoSchema表读出成功!
yagoCountTaxonomy表读出成功!
yagoTaxonomy表读出成功!
yagoDateFacts表读出成功!
yagoFacts表读出成功!
yagoLabels表读出成功!
yagoLiteralFacts表读出成功!
yagoTransitiveType表读出成功!


**注：** 如果该步骤爆内存，将上面每行代码分成到多个cell中依次读入。

## instance_code 表的制作

In [4]:
# 此处的目的是将 table_list 中所有表的 subject列取出，并进行独热编码

table_list = [yagoTransitiveType, yagoDateFacts, yagoFacts, yagoLabels, yagoLiteralFacts] # 这里是将需要独热编码的table装在list里面，方便遍历

subject_ar = np.array([])

for table in table_list:
    # 取出来每个 table 中的 subject 装入 subject_ar 中
    subject_ar = np.concatenate((subject_ar, table['subject'].values), axis=0)
    
# 利用set() 函数进行 去重
pure_subject = np.array(list(set(subject_ar)))

# 对pure_subject进行独热编码，具体的用法请参考sklearn的官方文档
enc = OrdinalEncoder()

enc.fit(pure_subject.reshape(-1, 1)) # reshape的原因在于sklearn的接口限制，请参考sklearn的fit方法

encode_subject = enc.transform(pure_subject.reshape(-1, 1))
encode_subject = encode_subject + 1 # 下标从1开始，sklearn的独热编码是从0开始的，另外，可以用过广播机制让每个元素+1

# 将独热编码后的结果封装成DataFrame，以方便后面的操作
subject_df = pd.DataFrame({'instance': pure_subject, 'code': encode_subject.reshape(-1)})
subject_df.head() # 展示

Unnamed: 0,instance,code
0,<Monostiolum_harryleei>,2832350.0
1,<pl/Bart?omiej_Czychy>,6092769.0
2,<de/FC_Erfurt_Nord>,4724046.0
3,<The_Moors_Murderers>,4055215.0
4,<Newtown_Creek_Bridge>,2970688.0


In [5]:
deposit_table_name = 'instance_code'
deposit_tool.big_to_sql(subject_df, deposit_table_name)

  0%|          | 0/635 [00:00<?, ?it/s]

## predicate_code 表的制作

In [6]:
# 将 table_list 中的所有表中的 predicates进行独热编码
# 此cell的代码过程与之前解释过的几乎完全一致，不在过多解释。

table_list = [yagoDateFacts, yagoFacts, yagoLabels, yagoLiteralFacts]

predicates_ar = np.array([])

for table in table_list:
    predicates_ar = np.concatenate((predicates_ar, table['predicates'].values), axis=0)
    
pure_predicates = np.array(list(set(predicates_ar)))

enc = OrdinalEncoder()

enc.fit(pure_predicates.reshape(-1, 1))
encode_predicates = enc.transform(pure_predicates.reshape(-1, 1))
encode_predicates = encode_predicates + 1

# 其中 predicate 和 code 为列名
predicates_df = pd.DataFrame({'predicate': pure_predicates, 'code': encode_predicates.reshape(-1)})
predicates_df.head()

Unnamed: 0,predicate,code
0,<graduatedFrom>,9.0
1,<isInterestedIn>,58.0
2,<diedIn>,4.0
3,<hasISBN>,30.0
4,<exports>,8.0


In [7]:
deposit_table_name = 'predicate_code'
deposit_tool.big_to_sql(predicates_df, deposit_table_name)

  0%|          | 0/1 [00:00<?, ?it/s]

## attribute_value_type_code 表的制作

In [8]:
# 提取yagoSchema中的 predicates 为 'rdfs:range' 的所有行
yagoSchema = yagoSchema.loc[yagoSchema['predicates'] == 'rdfs:range', :]

# 针对表3
# 与前面的类似，不解释

table_list = [yagoTransitiveType, yagoSchema]

object_3 = pd.DataFrame([], columns=['object'])

for table in table_list:
    object_3 = pd.concat([object_3, table[['object']]])
    
# 提取了所有表的object后，使用duplicated()方法去重
object_3 = object_3.loc[object_3.duplicated() != True, :]
object_3 = object_3['object'].values

enc = OrdinalEncoder()

enc.fit(object_3.reshape(-1, 1))
encode_object_3 = enc.transform(object_3.reshape(-1, 1))
encode_object_3 = encode_object_3 + 1 # 使用np.array的广播机制，让所有值+1
object_3_df = pd.DataFrame({'attribute_value_type': object_3, 'code': encode_object_3.reshape(-1)})
object_3_df.head()

Unnamed: 0,attribute_value_type,code
0,<wikicat_Baltimore_Orioles_seasons>,89306.0
1,<wikicat_Major_League_Baseball_teams_seasons>,348349.0
2,<wikicat_St._Louis_Browns_seasons>,585038.0
3,<wordnet_abstraction_100002137>,665215.0
4,<wordnet_fundamental_quantity_113575869>,668856.0


In [9]:
deposit_table_name = 'attribute_value_type_code'
deposit_tool.big_to_sql(object_3_df, deposit_table_name)

  0%|          | 0/68 [00:00<?, ?it/s]

## attribute_value_code 表的制作

In [10]:
# 提取table_list 中所有表的 object ，进行独热编码，针对表4
# 此cell与前面的代码用法类似，不解释。

table_list = [yagoDateFacts, yagoFacts, yagoLabels, yagoLiteralFacts]

object_4 = pd.DataFrame([], columns=['object'])

for table in table_list:
    object_4 = pd.concat([object_4, table[['object']]])

# 从四个表中得到的所有的object进行去重复，并获取其值(得到的为np.array类型)
object_4 = object_4.loc[object_4['object'].duplicated() != True, 'object'].values

enc = OrdinalEncoder()

enc.fit(object_4.reshape(-1, 1))
encode_object_4 = enc.transform(object_4.reshape(-1, 1))
encode_object_4 = encode_object_4 + 1 # 下标从1开始
object_4_df = pd.DataFrame({'attribute_value': object_4, 'code': encode_object_4.reshape(-1)})
object_4_df.head()

Unnamed: 0,attribute_value,code
0,"""1919__##__##""^^xsd:date",650384.0
1,"""2010__##__##""^^xsd:date",1010266.0
2,"""2012__03__23""^^xsd:date",1038370.0
3,"""1990__##__##""^^xsd:date",819979.0
4,"""1983__02__03""^^xsd:date",791584.0


In [11]:
deposit_table_name = 'attribute_value_code'
deposit_tool.big_to_sql(object_4_df, deposit_table_name)

  0%|          | 0/4729 [00:00<?, ?it/s]

## 完成