In [1]:
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc
import pyarrow.dataset as ds
import pandas as pd
import duckdb
import numpy as np
import altair as alt
import json
from urllib.request import urlopen
import time
import os
import plotly.express as px

In [2]:
poet = pd.read_csv('../../CCGIV/datasets/WomenWriting/poet.csv')
poem = pd.read_csv('../../CCGIV/datasets/WomenWriting/poem.csv')
poempoetlinks = pd.read_csv('../../CCGIV/datasets/WomenWriting/poempoetlinks.csv')
subwork = pd.read_csv('../../CCGIV/datasets/WomenWriting/subwork.csv')
subworkpoemlinks = pd.read_csv('../../CCGIV/datasets/WomenWriting/subworkpoemlinks.csv')
subworkpoetlinks = pd.read_csv('../../CCGIV/datasets/WomenWriting/subworkpoetlinks.csv')
work = pd.read_csv('../../CCGIV/datasets/WomenWriting/work.csv')
workpoemlinks = pd.read_csv('../../CCGIV/datasets/WomenWriting/workpoemlinks.csv')
workpoetlinks = pd.read_csv('../../CCGIV/datasets/WomenWriting/workpoetlinks.csv')
poetregionlinks = pd.read_csv('../../CCGIV/datasets/WomenWriting/poetregionlinks.csv')

con = duckdb.connect()
con.register('poet', poet)
con.register('poem', poem)
con.register('poempoetlinks', poempoetlinks)
con.register('subwork', subwork)
con.register('subworkpoemlinks', subworkpoemlinks)
con.register('subworkpoetlinks', subworkpoetlinks)
con.register('work', work)
con.register('workpoemlinks', workpoemlinks)
con.register('workpoetlinks', workpoetlinks)
con.register('poetregionlinks', poetregionlinks)

<duckdb.duckdb.DuckDBPyConnection at 0x10f34afb0>

# 1. 集（work）的 Importance：题辞作者数量、序作者数量、跋作者数量、收录的品（poem）的数量

## 1.1 题辞作者、序作者、跋作者不去重

### 1.1.1 题辞作者数量

In [3]:
ticiSQL = '''
SELECT workID,count(DISTINCT poetID) as ticicount
FROM workpoetlinks 
WHERE role IN ('題辭')
Group By workID
ORDER BY ticicount DESC
'''
tici = duckdb.query(ticiSQL).df()
tici.set_index('workID', inplace=True)

tici

Unnamed: 0_level_0,ticicount
workID,Unnamed: 1_level_1
125,119
223,96
218,73
352,49
112,43
...,...
390,1
61,1
427,1
299,1


### 1.1.2 序作者数量

In [4]:
xuSQL = '''
SELECT workID,count(DISTINCT poetID) as xucount
FROM workpoetlinks 
WHERE role IN ('序作者')
Group By workID
ORDER BY xucount DESC
'''
xu = duckdb.query(xuSQL).df()
xu.set_index('workID', inplace=True)
xu

Unnamed: 0_level_0,xucount
workID,Unnamed: 1_level_1
158,17
167,14
57,9
118,9
283,8
...,...
293,1
201,1
164,1
98,1


### 1.1.3 跋作者数量

In [5]:
baSQL = '''
SELECT workID,count(DISTINCT poetID) as bacount
FROM workpoetlinks 
WHERE role IN ('跋作者')
Group By workID
ORDER BY bacount DESC
'''
ba = duckdb.query(baSQL).df()
ba.set_index('workID', inplace=True)
ba

Unnamed: 0_level_0,bacount
workID,Unnamed: 1_level_1
186,5
204,4
15,4
335,3
275,3
...,...
353,1
435,1
334,1
322,1


### 1.1.4 题辞、序、跋汇总

In [6]:
from functools import reduce

TiciXuBa = reduce(lambda left, right: pd.merge(left, right, on='workID', how='outer'), [tici, xu, ba]).fillna(0)
TiciXuBa

Unnamed: 0_level_0,ticicount,xucount,bacount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9,0.0,3.0,1.0
10,2.0,2.0,0.0
11,0.0,3.0,0.0
12,1.0,5.0,1.0
13,0.0,1.0,0.0
...,...,...,...
476,1.0,4.0,0.0
477,14.0,5.0,1.0
479,1.0,4.0,1.0
480,3.0,5.0,0.0


## 1.2 题辞作者、序作者、跋作者去重

问题：在同一个集里有多重角色的作者，应该计数到哪种角色中？比如既是题辞作者又是序作者，算到哪类？

## 1.3 集收录的作品数

In [7]:
includedWorkSQL = '''
SELECT workID,count(poemID) as includedcount
FROM workpoemlinks 
Group By workID
ORDER BY includedcount DESC
'''
includedWork = duckdb.query(includedWorkSQL).df()
includedWork.set_index('workID', inplace=True)

includedWork

Unnamed: 0_level_0,includedcount
workID,Unnamed: 1_level_1
61,5075
120,3771
38,2478
63,2145
88,2052
...,...
341,9
358,7
225,7
373,3


## 1.4 集的题辞作者数、序作者数、跋作者数、收录作品数汇总

In [8]:
jiRawData = reduce(lambda left, right: pd.merge(left, right, on='workID', how='outer'), [tici, xu, ba, includedWork]).fillna(0)
jiRawData

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,0.0,3.0,1.0,986.0
10,2.0,2.0,0.0,404.0
11,0.0,3.0,0.0,137.0
12,1.0,5.0,1.0,126.0
13,0.0,1.0,0.0,252.0
...,...,...,...,...
478,0.0,0.0,0.0,206.0
479,1.0,4.0,1.0,310.0
480,3.0,5.0,0.0,82.0
481,0.0,0.0,0.0,885.0


## 1.5 归一化

### 1.5.1 Min-Max归一化

In [9]:
def min_max_normalize(series):
    return (series - series.min()) / (series.max() - series.min()) * 100

In [10]:
ji_min_max_normalize = jiRawData.copy()
for column in ji_min_max_normalize.columns:
    if ji_min_max_normalize[column].dtype in ['float64', 'int64']:  # 检查数据类型是否为数值类型
        ji_min_max_normalize[column] = min_max_normalize(ji_min_max_normalize[column])
ji_min_max_normalize

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,0.000000,17.647059,20.0,19.428571
10,1.680672,11.764706,0.0,7.960591
11,0.000000,17.647059,0.0,2.699507
12,0.840336,29.411765,20.0,2.482759
13,0.000000,5.882353,0.0,4.965517
...,...,...,...,...
478,0.000000,0.000000,0.0,4.059113
479,0.840336,23.529412,20.0,6.108374
480,2.521008,29.411765,0.0,1.615764
481,0.000000,0.000000,0.0,17.438424


### 1.5.2 对数归一化

In [42]:
ji_log_normalize = jiRawData.copy()

for column in ji_log_normalize.columns:  
    if ji_log_normalize[column].dtype in ['float64', 'int64']:
        max_val = ji_log_normalize[column].max()
        if max_val > 0:
            # 应用对数归一化
            ji_log_normalize[column] = np.log(ji_log_normalize[column] + 1) / np.log(max_val + 1)
        # 对数据加1以避免对0取对数的问题
        # ji_log_normalize[column] = np.log(ji_log_normalize[column] + 1)

ji_log_normalize

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,0.000000,0.479625,0.386853,0.808069
10,0.229476,0.380094,0.000000,0.703667
11,0.000000,0.479625,0.000000,0.577484
12,0.144783,0.619906,0.386853,0.567748
13,0.000000,0.239812,0.000000,0.648524
...,...,...,...,...
478,0.000000,0.000000,0.000000,0.625005
479,0.144783,0.556827,0.386853,0.672715
480,0.289566,0.619906,0.000000,0.517897
481,0.000000,0.000000,0.000000,0.795417


### 1.5.3 把题辞、序、跋统一归一化

#### 1.5.3.1 min-max

In [43]:
ji_combine_normalize = jiRawData.copy()

all_counts = np.concatenate((ji_combine_normalize['ticicount'], ji_combine_normalize['xucount'],ji_combine_normalize['bacount']))

min_val = all_counts.min()
max_val = all_counts.max()
ji_combine_normalize['normalized_bacount'] = (ji_combine_normalize['bacount'] - min_val) / (max_val - min_val)
ji_combine_normalize['normalized_ticicount'] = (ji_combine_normalize['ticicount'] - min_val) / (max_val - min_val)
ji_combine_normalize['normalized_xucount'] = (ji_combine_normalize['xucount'] - min_val) / (max_val - min_val)

def min_max_normalize_ori(series):
    return (series - series.min()) / (series.max() - series.min())
# 收录的作品数单独归一化
if ji_combine_normalize['includedcount'].dtype in ['float64', 'int64']:  # 检查数据类型是否为数值类型
        ji_combine_normalize['includedcount'] = min_max_normalize_ori(ji_combine_normalize['includedcount'])

ji_combine_normalize

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,normalized_bacount,normalized_ticicount,normalized_xucount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9,0.0,3.0,1.0,0.194286,0.008403,0.000000,0.025210
10,2.0,2.0,0.0,0.079606,0.000000,0.016807,0.016807
11,0.0,3.0,0.0,0.026995,0.000000,0.000000,0.025210
12,1.0,5.0,1.0,0.024828,0.008403,0.008403,0.042017
13,0.0,1.0,0.0,0.049655,0.000000,0.000000,0.008403
...,...,...,...,...,...,...,...
478,0.0,0.0,0.0,0.040591,0.000000,0.000000,0.000000
479,1.0,4.0,1.0,0.061084,0.008403,0.008403,0.033613
480,3.0,5.0,0.0,0.016158,0.000000,0.025210,0.042017
481,0.0,0.0,0.0,0.174384,0.000000,0.000000,0.000000


#### 1.5.3.2 对数归一化

In [44]:
ji_combine_log_normalize = jiRawData.copy()

all_counts = np.concatenate((ji_combine_log_normalize['ticicount'], ji_combine_log_normalize['xucount'],ji_combine_log_normalize['bacount']))

min_val = all_counts.min()
max_val = all_counts.max()

if max_val > 0:
    # ji_log_normalize[column] = np.log10(ji_log_normalize[column] + 1) / np.log10(max_val + 1)
    ji_combine_log_normalize['normalized_bacount'] = np.log10(ji_combine_log_normalize['bacount'] + 1) / np.log10(max_val + 1)
    ji_combine_log_normalize['normalized_ticicount'] = np.log10(ji_combine_log_normalize['ticicount'] + 1) / np.log10(max_val + 1)
    ji_combine_log_normalize['normalized_xucount'] = np.log10(ji_combine_log_normalize['xucount'] + 1) / np.log10(max_val + 1)

def min_max_normalize_ori(series):
    return (series - series.min()) / (series.max() - series.min())
# 收录的作品数单独归一化
if ji_combine_log_normalize['includedcount'].dtype in ['float64', 'int64']:  # 检查数据类型是否为数值类型
    max_val = ji_combine_log_normalize['includedcount'].max()
    if max_val > 0:
        ji_combine_log_normalize['includedcount'] = np.log10(ji_combine_log_normalize['includedcount'] + 1) / np.log10(max_val + 1)
        # ji_combine_log_normalize['includedcount'] = min_max_normalize_ori(ji_combine_log_normalize['includedcount'])

ji_combine_log_normalize

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,normalized_bacount,normalized_ticicount,normalized_xucount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9,0.0,3.0,1.0,0.808069,0.144783,0.000000,0.289566
10,2.0,2.0,0.0,0.703667,0.000000,0.229476,0.229476
11,0.0,3.0,0.0,0.577484,0.000000,0.000000,0.289566
12,1.0,5.0,1.0,0.567748,0.144783,0.144783,0.374258
13,0.0,1.0,0.0,0.648524,0.000000,0.000000,0.144783
...,...,...,...,...,...,...,...
478,0.0,0.0,0.0,0.625005,0.000000,0.000000,0.000000
479,1.0,4.0,1.0,0.672715,0.144783,0.144783,0.336176
480,3.0,5.0,0.0,0.517897,0.000000,0.289566,0.374258
481,0.0,0.0,0.0,0.795417,0.000000,0.000000,0.000000


#### 1.5.3.3 min-max和对数归一化一起

In [45]:
ji_combine_two_normalize = jiRawData.copy()

all_counts = np.concatenate((ji_combine_two_normalize['ticicount'], ji_combine_two_normalize['xucount'],ji_combine_two_normalize['bacount']))

min_val = all_counts.min()
max_val = all_counts.max()
ji_combine_two_normalize['normalized_bacount'] = (ji_combine_two_normalize['bacount'] - min_val) / (max_val - min_val)
ji_combine_two_normalize['normalized_ticicount'] = (ji_combine_two_normalize['ticicount'] - min_val) / (max_val - min_val)
ji_combine_two_normalize['normalized_xucount'] = (ji_combine_two_normalize['xucount'] - min_val) / (max_val - min_val)


# 收录的作品数单独归一化
if ji_combine_two_normalize['includedcount'].dtype in ['float64', 'int64']:  # 检查数据类型是否为数值类型
    max_val = ji_combine_two_normalize['includedcount'].max()
    if max_val > 0:
        ji_combine_two_normalize['includedcount'] = np.log10(ji_combine_two_normalize['includedcount'] + 1) / np.log10(max_val + 1)

ji_combine_two_normalize

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,normalized_bacount,normalized_ticicount,normalized_xucount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9,0.0,3.0,1.0,0.808069,0.008403,0.000000,0.025210
10,2.0,2.0,0.0,0.703667,0.000000,0.016807,0.016807
11,0.0,3.0,0.0,0.577484,0.000000,0.000000,0.025210
12,1.0,5.0,1.0,0.567748,0.008403,0.008403,0.042017
13,0.0,1.0,0.0,0.648524,0.000000,0.000000,0.008403
...,...,...,...,...,...,...,...
478,0.0,0.0,0.0,0.625005,0.000000,0.000000,0.000000
479,1.0,4.0,1.0,0.672715,0.008403,0.008403,0.033613
480,3.0,5.0,0.0,0.517897,0.000000,0.025210,0.042017
481,0.0,0.0,0.0,0.795417,0.000000,0.000000,0.000000


## 1.6 重要性计算

### 1.6.1 权重设置

In [46]:
xuweight = 0.25
baweight = 0.25
ticiweight = 0.25
includedweight = 0.25

### 1.6.2 Min-Max归一化的结果计算

In [47]:
ji_min_max = ji_min_max_normalize.copy()
ji_min_max['totalWeight'] = xuweight*ji_min_max['xucount'] + baweight*ji_min_max['bacount'] + ticiweight*ji_min_max['ticicount'] + includedweight*ji_min_max['includedcount']
ji_min_max

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,totalWeight
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,0.000000,17.647059,20.0,19.428571,14.268908
10,1.680672,11.764706,0.0,7.960591,5.351492
11,0.000000,17.647059,0.0,2.699507,5.086642
12,0.840336,29.411765,20.0,2.482759,13.183715
13,0.000000,5.882353,0.0,4.965517,2.711968
...,...,...,...,...,...
478,0.000000,0.000000,0.0,4.059113,1.014778
479,0.840336,23.529412,20.0,6.108374,12.619531
480,2.521008,29.411765,0.0,1.615764,8.387134
481,0.000000,0.000000,0.0,17.438424,4.359606


### 1.6.3 对数归一化的结果计算

In [48]:
ji_log = ji_log_normalize.copy()

ji_log['totalWeight'] = xuweight*ji_log['xucount'] + baweight*ji_log['bacount'] + ticiweight*ji_log['ticicount'] + includedweight*ji_log['includedcount']
ji_log

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,totalWeight
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,0.000000,0.479625,0.386853,0.808069,0.418637
10,0.229476,0.380094,0.000000,0.703667,0.328309
11,0.000000,0.479625,0.000000,0.577484,0.264277
12,0.144783,0.619906,0.386853,0.567748,0.429823
13,0.000000,0.239812,0.000000,0.648524,0.222084
...,...,...,...,...,...
478,0.000000,0.000000,0.000000,0.625005,0.156251
479,0.144783,0.556827,0.386853,0.672715,0.440295
480,0.289566,0.619906,0.000000,0.517897,0.356842
481,0.000000,0.000000,0.000000,0.795417,0.198854



### 1.6.4 合并后的归一化结果计算

#### 1.6.4.1 min-max(best)

In [18]:
ji_combine = ji_combine_normalize.copy()

ji_combine['totalWeight'] = xuweight*ji_combine['normalized_xucount'] + baweight*ji_combine['normalized_bacount'] + ticiweight*ji_combine['normalized_ticicount'] + includedweight*ji_combine['includedcount']
ji_combine

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,normalized_bacount,normalized_ticicount,normalized_xucount,totalWeight
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9,0.0,3.0,1.0,0.194286,0.008403,0.000000,0.025210,0.056975
10,2.0,2.0,0.0,0.079606,0.000000,0.016807,0.016807,0.028305
11,0.0,3.0,0.0,0.026995,0.000000,0.000000,0.025210,0.013051
12,1.0,5.0,1.0,0.024828,0.008403,0.008403,0.042017,0.020913
13,0.0,1.0,0.0,0.049655,0.000000,0.000000,0.008403,0.014515
...,...,...,...,...,...,...,...,...
478,0.0,0.0,0.0,0.040591,0.000000,0.000000,0.000000,0.010148
479,1.0,4.0,1.0,0.061084,0.008403,0.008403,0.033613,0.027876
480,3.0,5.0,0.0,0.016158,0.000000,0.025210,0.042017,0.020846
481,0.0,0.0,0.0,0.174384,0.000000,0.000000,0.000000,0.043596


#### 1.6.4.2 对数归一化结果计算

In [19]:
ji_combine_log = ji_combine_log_normalize.copy()

ji_combine_log['totalWeight'] = xuweight*ji_combine_log['normalized_xucount'] + baweight*ji_combine_log['normalized_bacount'] + ticiweight*ji_combine_log['normalized_ticicount'] + includedweight*ji_combine_log['includedcount']
ji_combine_log

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,normalized_bacount,normalized_ticicount,normalized_xucount,totalWeight
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9,0.0,3.0,1.0,0.808069,0.144783,0.000000,0.289566,0.310604
10,2.0,2.0,0.0,0.703667,0.000000,0.229476,0.229476,0.290655
11,0.0,3.0,0.0,0.577484,0.000000,0.000000,0.289566,0.216762
12,1.0,5.0,1.0,0.567748,0.144783,0.144783,0.374258,0.307893
13,0.0,1.0,0.0,0.648524,0.000000,0.000000,0.144783,0.198327
...,...,...,...,...,...,...,...,...
478,0.0,0.0,0.0,0.625005,0.000000,0.000000,0.000000,0.156251
479,1.0,4.0,1.0,0.672715,0.144783,0.144783,0.336176,0.324614
480,3.0,5.0,0.0,0.517897,0.000000,0.289566,0.374258,0.295430
481,0.0,0.0,0.0,0.795417,0.000000,0.000000,0.000000,0.198854


### 把集的册书也考虑进来呢

In [20]:
workCeSQL = '''
SELECT workID, NumberCe
FROM work 
ORDER BY NumberCe DESC
'''
workCe = duckdb.query(workCeSQL).df()
workCe.set_index('workID', inplace=True)

workCe

Unnamed: 0_level_0,NumberCe
workID,Unnamed: 1_level_1
460,24.0
61,20.0
195,20.0
88,18.0
193,16.0
...,...
211,
274,
344,
363,


In [21]:
jiRawDataWithCe = reduce(lambda left, right: pd.merge(left, right, on='workID', how='outer'), [jiRawData,workCe]).fillna(0)
jiRawDataWithCe

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,NumberCe
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,0.0,3.0,1.0,986.0,6.0
10,2.0,2.0,0.0,404.0,4.0
11,0.0,3.0,0.0,137.0,3.0
12,1.0,5.0,1.0,126.0,2.0
13,0.0,1.0,0.0,252.0,2.0
...,...,...,...,...,...
478,0.0,0.0,0.0,206.0,1.0
479,1.0,4.0,1.0,310.0,1.0
480,3.0,5.0,0.0,82.0,1.0
481,0.0,0.0,0.0,885.0,2.0


In [22]:
ji_ce_log_normalize = jiRawDataWithCe.copy()

for column in ji_ce_log_normalize.columns:  
    if ji_ce_log_normalize[column].dtype in ['float64', 'int64']:
        # 对数据加1以避免对0取对数的问题
        ji_ce_log_normalize[column] = np.log(ji_ce_log_normalize[column] + 1)

ji_ce_log_normalize

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,NumberCe
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,0.000000,1.386294,0.693147,6.894670,1.945910
10,1.098612,1.098612,0.000000,6.003887,1.609438
11,0.000000,1.386294,0.000000,4.927254,1.386294
12,0.693147,1.791759,0.693147,4.844187,1.098612
13,0.000000,0.693147,0.000000,5.533389,1.098612
...,...,...,...,...,...
478,0.000000,0.000000,0.000000,5.332719,0.693147
479,0.693147,1.609438,0.693147,5.739793,0.693147
480,1.386294,1.791759,0.000000,4.418841,0.693147
481,0.000000,0.000000,0.000000,6.786717,1.098612


In [23]:
xuweight1 = 0.2
baweight1 = 0.2
ticiweight1 = 0.2
includedweight1 = 0.2
ceWeight = 0.2

ji_ce_log = ji_ce_log_normalize.copy()

ji_ce_log['totalWeight'] = xuweight1*ji_ce_log['xucount'] + baweight1*ji_ce_log['bacount'] + ticiweight1*ji_ce_log['ticicount'] + includedweight1*ji_ce_log['includedcount'] + ceWeight*ji_ce_log['NumberCe']

ji_ce_log

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,NumberCe,totalWeight
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9,0.000000,1.386294,0.693147,6.894670,1.945910,2.184004
10,1.098612,1.098612,0.000000,6.003887,1.609438,1.962110
11,0.000000,1.386294,0.000000,4.927254,1.386294,1.539968
12,0.693147,1.791759,0.693147,4.844187,1.098612,1.824171
13,0.000000,0.693147,0.000000,5.533389,1.098612,1.465030
...,...,...,...,...,...,...
478,0.000000,0.000000,0.000000,5.332719,0.693147,1.205173
479,0.693147,1.609438,0.693147,5.739793,0.693147,1.885734
480,1.386294,1.791759,0.000000,4.418841,0.693147,1.658008
481,0.000000,0.000000,0.000000,6.786717,1.098612,1.577066


# 2. 作者（poet）的Importance：参与制作书的次数（包括被收录和参与编辑）、写像赞的次数、被写像赞的次数、收到唱和诗及书信的次数、被讨论的次数

## 2.1 参与制作书的次数

In [24]:
PoetParticipateWorkSQL = '''SELECT poetID, COUNT(DISTINCT workID) AS participate_count
    FROM workpoetlinks
    GROUP BY poetID
    ORDER BY participate_count DESC'''

PoetParticipateWork = duckdb.query(PoetParticipateWorkSQL).df()
PoetParticipateWork.set_index('poetID', inplace=True)

PoetParticipateWork

Unnamed: 0_level_0,participate_count
poetID,Unnamed: 1_level_1
263,13
308,9
199,9
202,7
4636,7
...,...
6980,1
6024,1
6042,1
8316,1


## 2.2 写像赞的次数

In [25]:
xiangzanauthorSQL = '''
SELECT poempoetlinks.poetID,COUNT(DISTINCT poem.poemID) AS xiangzanauthor_count
FROM poem,poempoetlinks
WHERE poem.GenreHZ='文﹕像贊' AND poem.poemID = poempoetlinks.poemID
GROUP BY poempoetlinks.poetID
ORDER BY xiangzanauthor_count
'''
xiangzanauthor = duckdb.query(xiangzanauthorSQL).df()
xiangzanauthor.set_index('poetID', inplace=True)
xiangzanauthor

Unnamed: 0_level_0,xiangzanauthor_count
poetID,Unnamed: 1_level_1
7279,1
6482,1
1977,1
3813,1
244,1
7006,1
6430,1
6494,1
4424,1
225,2


## 2.3 被写像赞的次数

In [26]:
bexiangzanSQL = '''
SELECT poetassubjectID AS poetID,COUNT(DISTINCT poemID) AS bexiangzansubjectCount
FROM poem
WHERE poem.GenreHZ='文﹕像贊' AND poetassubjectID !=0
GROUP BY poetassubjectID
'''
bexiangzan = duckdb.query(bexiangzanSQL).df()
bexiangzan.set_index('poetID', inplace=True)

bexiangzan

Unnamed: 0_level_0,bexiangzansubjectCount
poetID,Unnamed: 1_level_1
5,1
4939,1


## 2.4 被讨论的次数

In [27]:
discussedSQL = '''
SELECT poetassubjectID AS poetID, COUNT(DISTINCT poemID) AS discussedCount
FROM poem
WHERE GenreHZ!='文﹕像贊' AND poetassubjectID != 0
GROUP BY poetassubjectID
'''
discussed = duckdb.query(discussedSQL).df()
discussed.set_index('poetID', inplace=True)

discussed

Unnamed: 0_level_0,discussedCount
poetID,Unnamed: 1_level_1
1986,4
3244,1
1380,14
2003,4
2066,3
...,...
5414,1
5458,1
5479,1
5029,1


## 2.5 收到唱和书信的次数

In [28]:
changheshiSQL = '''
SELECT changheshuxinpoetID AS poetID, COUNT(DISTINCT poemID) AS changheshiCount
FROM poem
WHERE changheshuxinpoetID !=0
GROUP BY changheshuxinpoetID
'''
changheshi = duckdb.query(changheshiSQL).df()
changheshi.set_index('poetID', inplace=True)

changheshi

Unnamed: 0_level_0,changheshiCount
poetID,Unnamed: 1_level_1
1260,1
92,10
15,8
32,7
1380,55
...,...
7101,1
529,1
4797,1
5416,1


## 2.6 初始数据汇总

In [29]:
poetRawData = reduce(lambda left, right: pd.merge(left, right, on='poetID', how='outer'), [PoetParticipateWork, xiangzanauthor, bexiangzan,discussed, changheshi]).fillna(0)
poetRawData

Unnamed: 0_level_0,participate_count,xiangzanauthor_count,bexiangzansubjectCount,discussedCount,changheshiCount
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,1.0,0.0,1.0,38.0,0.0
6,1.0,0.0,0.0,0.0,0.0
8,1.0,0.0,0.0,2.0,0.0
11,1.0,0.0,0.0,13.0,0.0
12,2.0,0.0,0.0,28.0,0.0
...,...,...,...,...,...
8457,1.0,0.0,0.0,0.0,0.0
8458,1.0,0.0,0.0,0.0,0.0
8459,1.0,0.0,0.0,0.0,0.0
8460,1.0,0.0,0.0,0.0,0.0


## 2.7 归一化

### 2.7.1 Min-Max归一化

In [30]:
poet_min_max_normalize = poetRawData.copy()
for column in poet_min_max_normalize.columns:
    if poet_min_max_normalize[column].dtype in ['float64', 'int64']:  # 检查数据类型是否为数值类型
        poet_min_max_normalize[column] = min_max_normalize(poet_min_max_normalize[column])
poet_min_max_normalize

Unnamed: 0_level_0,participate_count,xiangzanauthor_count,bexiangzansubjectCount,discussedCount,changheshiCount
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,7.692308,0.0,100.0,49.350649,0.0
6,7.692308,0.0,0.0,0.000000,0.0
8,7.692308,0.0,0.0,2.597403,0.0
11,7.692308,0.0,0.0,16.883117,0.0
12,15.384615,0.0,0.0,36.363636,0.0
...,...,...,...,...,...
8457,7.692308,0.0,0.0,0.000000,0.0
8458,7.692308,0.0,0.0,0.000000,0.0
8459,7.692308,0.0,0.0,0.000000,0.0
8460,7.692308,0.0,0.0,0.000000,0.0


### 2.7.2 对数归一化

In [50]:
poet_log_normalize = poetRawData.copy()

for column in poet_log_normalize.columns:  
    if poet_log_normalize[column].dtype in ['float64', 'int64']:
        # 对数据加1以避免对0取对数的问题
        max_val = poet_log_normalize[column].max()
        if max_val > 0:
            # 应用对数归一化
            poet_log_normalize[column] = np.log(poet_log_normalize[column] + 1) / np.log(max_val + 1)
poet_log_normalize

Unnamed: 0_level_0,participate_count,xiangzanauthor_count,bexiangzansubjectCount,discussedCount,changheshiCount
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,0.26265,0.0,1.0,0.840901,0.0
6,0.26265,0.0,0.0,0.000000,0.0
8,0.26265,0.0,0.0,0.252166,0.0
11,0.26265,0.0,0.0,0.605746,0.0
12,0.41629,0.0,0.0,0.772899,0.0
...,...,...,...,...,...
8457,0.26265,0.0,0.0,0.000000,0.0
8458,0.26265,0.0,0.0,0.000000,0.0
8459,0.26265,0.0,0.0,0.000000,0.0
8460,0.26265,0.0,0.0,0.000000,0.0


## 2.8 重要性计算

### 2.8.1 权重设置
poet4258姚倚雲 写了很多唱和诗给丈夫 poet7909范當世，导致范當世收到唱和书信的次数很高，需要降低该项权重

In [51]:
participateWeight = 0.4
writeXZWeight = 0.3
inXZWeight = 0.1
bediscussedWeight = 0.2
changheWeight = 0.1

### 2.8.2 Min-Max归一化结果计算

In [52]:
poet_min_max = poet_min_max_normalize.copy()

poet_min_max['totalWeight'] = participateWeight*poet_min_max['participate_count'] + writeXZWeight*poet_min_max['xiangzanauthor_count']+inXZWeight*poet_min_max['bexiangzansubjectCount']+bediscussedWeight*poet_min_max['discussedCount']+changheWeight*poet_min_max['changheshiCount']

poet_min_max

Unnamed: 0_level_0,participate_count,xiangzanauthor_count,bexiangzansubjectCount,discussedCount,changheshiCount,totalWeight
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,7.692308,0.0,100.0,49.350649,0.0,22.947053
6,7.692308,0.0,0.0,0.000000,0.0,3.076923
8,7.692308,0.0,0.0,2.597403,0.0,3.596404
11,7.692308,0.0,0.0,16.883117,0.0,6.453546
12,15.384615,0.0,0.0,36.363636,0.0,13.426573
...,...,...,...,...,...,...
8457,7.692308,0.0,0.0,0.000000,0.0,3.076923
8458,7.692308,0.0,0.0,0.000000,0.0,3.076923
8459,7.692308,0.0,0.0,0.000000,0.0,3.076923
8460,7.692308,0.0,0.0,0.000000,0.0,3.076923


### 2.8.3 对数归一化结果计算

In [53]:
poet_log = poet_log_normalize.copy()

poet_log['totalWeight'] = participateWeight*poet_log['participate_count'] + writeXZWeight*poet_log['xiangzanauthor_count']+inXZWeight*poet_log['bexiangzansubjectCount']+bediscussedWeight*poet_log['discussedCount']+changheWeight*poet_log['changheshiCount']

poet_log

Unnamed: 0_level_0,participate_count,xiangzanauthor_count,bexiangzansubjectCount,discussedCount,changheshiCount,totalWeight
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,0.26265,0.0,1.0,0.840901,0.0,0.373240
6,0.26265,0.0,0.0,0.000000,0.0,0.105060
8,0.26265,0.0,0.0,0.252166,0.0,0.155493
11,0.26265,0.0,0.0,0.605746,0.0,0.226209
12,0.41629,0.0,0.0,0.772899,0.0,0.321096
...,...,...,...,...,...,...
8457,0.26265,0.0,0.0,0.000000,0.0,0.105060
8458,0.26265,0.0,0.0,0.000000,0.0,0.105060
8459,0.26265,0.0,0.0,0.000000,0.0,0.105060
8460,0.26265,0.0,0.0,0.000000,0.0,0.105060


In [54]:
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
import requests

app = dash.Dash(__name__,suppress_callback_exceptions=True)

app.layout = html.Div([
    html.Div([
        dcc.Graph(id='work-distribution-chart', style={'height': '95vh', 'width': '100vw'}),
        html.Div(id='work-name-output')
    ], style={'height': '100vh'}),
    html.Div([
        dcc.Graph(id='poet-distribution-chart', style={'height': '95vh', 'width': '100vw'}),
        html.Div(id='poet-info-output')
    ], style={'height': '100vh'})
])

ji_log_reset = ji_log.reset_index()
poet_log_reset = poet_log.reset_index()

jidata = ji_log['totalWeight'].value_counts(sort=True,ascending=True).reset_index().sort_values(by='totalWeight', ascending=False)
poetdata = poet_log['totalWeight'].value_counts(sort=True,ascending=True).reset_index().sort_values(by='totalWeight', ascending=False)


# API 调用函数
def fetch_work_from_api(id):
    response = requests.get(f"http://localhost:8000/work/{id}")
    print(response)
    if response.status_code == 200 and len(response.json())>0:
        return str(response.json()[0]['workID']) + str(response.json()[0]['TitleHZ'])
    else:
        return f"Work{id} not found"
    
def fetch_poet_from_api(id):
    response = requests.get(f"http://localhost:8000/poet/{id}")
    print(response)
    if response.status_code == 200 and len(response.json())>0:
        return str(response.json()[0]['poetID']) + str(response.json()[0]['NameHZ'])
    else:
        return f"Poet{id} not found"

# 更新显示的数据
@app.callback(
    Output('work-distribution-chart', 'figure'),
    Output('work-name-output', 'children'),
    Input('work-distribution-chart', 'hoverData')
)
def update_graph_and_display_name(hoverData):
    # 绘制 Total 值的分布
    fig = px.line(jidata, x='totalWeight', y='count', text='count', title="Distribution of work Total Values")
   
    work_name = "Hover over a bar to see the work's name."
    if hoverData:
        total_value = float(hoverData['points'][0]['x'])
        print(total_value)
        ids = ji_log_reset[ji_log_reset['totalWeight'] == total_value]['workID'].tolist()
        work_names = "｜ ".join([fetch_work_from_api(id) for id in ids])
        
        work_name = f"work: {work_names}"
        fig.update_layout(annotations=[dict(x=total_value, y=hoverData['points'][0]['y'],
                                                text=work_names, showarrow=True, arrowhead=1)])
    return fig, work_name

@app.callback(
    Output('poet-distribution-chart', 'figure'),
    Output('poet-info-output', 'children'),
    Input('poet-distribution-chart', 'hoverData')
)

def update_poet_graph(hoverData):
    fig = px.line(poetdata, x='totalWeight', y='count', text='count', title="Distribution of Poet Weights")
    
    poet_name = "Hover over a bar to see poet details."
    if hoverData:
        total_value = float(hoverData['points'][0]['x'])
        ids = poet_log_reset[poet_log_reset['totalWeight'] == total_value]['poetID'].tolist()
        poet_names = "｜".join([fetch_poet_from_api(id) for id in ids])
        
        poet_name = f"Poet: {poet_names}"
        fig.update_layout(annotations=[dict(x=total_value, y=hoverData['points'][0]['y'],
                                                text=poet_names, showarrow=True, arrowhead=1)])
    return fig, poet_name

# 运行应用
if __name__ == '__main__':
    app.run_server(debug=True)

In [62]:
poetregionSQL = '''
SELECT poet.*, poetregionlinks.regionID
FROM poet
LEFT JOIN poetregionlinks ON poet.poetID = poetregionlinks.poetID
'''
poetregion = duckdb.query(poetregionSQL).df()
poetregion

Unnamed: 0,poetID,NameHZ,NamePY,HaoHZ,HaoPY,ZiHZ,ZiPY,OtherZiHaoHZ,OtherZiHaoPY,MaritalStatus,...,StartYear,EndYear,EthnicGroup,MainWorks,ispoet,HuWenKai,zhuLu,xuZuoZhe,baZuoZhe,regionID
0,5,甘立媃,Gan Lirou,,,如玉,Ruyu,,,正室﹐寡婦,...,1743,1819,漢,"《咏雪樓稿 : 5卷, 卷首1卷, 附1卷》",1,267,江西通志,宋鎔﹐王若閎序﹐甘立媃自序﹐劉彬士﹑顧皋作墓志詺﹐茹棻作像贊,徐心田跋,31.0
1,6,劉慧娟,Liu Huijuan,幻花女史,Huanhuanüshi,湘舲,Xiangling,,,正室﹐寡婦,...,1830,1880,漢,《曇花閣詩鈔四卷》,1,719,廣東女子藝文考,戴鴻慈﹐梁煦南序﹐劉慧娟自序,,189.0
2,8,馮思慧,Feng Sihui,,,睿之,Ruizhi,駱思慧,Luo Sihui,正室,...,1748,1774,漢,《繡餘吟六卷附錄一卷》,1,654,山西通志﹐正始續集﹐明媛詩話﹐擷芳集,劉秉恬序,,50.0
3,11,屠鏡心,Tu Jingxin,掃花主人,Saohuazhuren,,,,,正室,...,1796,1860,漢,《玩月軒詩草》《爨餘吟二卷》,1,627,清代閨閣詩人徵略﹐吳氏小殘卷齋數目,張之縉﹐文廉﹐耀曾序,"任珮瑛,萬秉,任凱,任治,任謙吉跋",140.0
4,12,鄭蘭孫,Zheng Lansun,蘅洲,Hengzhou,娛清,Yuqing,,,正室,...,1814,1861,漢,《蓮因室詩集二卷詞集一卷》﹐《都梁香閣詩詞集》,1,744,杭州府志﹐小檀欒室彙刻百家閨秀詞﹐小黛軒論詩詩﹐閨秀詞話,徐鴻謨﹐錢士杓序﹐鄭蘭孫自序﹐俞繡孫﹐孫因培﹐張煒﹐顧琇瑩﹐鍾維則﹐秦緗業﹐楊昌濬﹐孫念培﹐...,許樾身跋﹐徐琪附記,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7746,8419,陳希文,Chen Xiwen,,,,,,,正室,...,1769,1848,漢,,1,,,,,
7747,8424,潘宗秇,Pan Zongyi,,,小江,Xiaojiang,,,,...,1736,1850,漢,,1,,,,,
7748,8425,陳邦泰,Chen Bangtai,,,,,,,,...,1736,1850,漢,,1,,,,,
7749,8430,程雲,Cheng Yun,,,頑石,Wanshi,,,,...,1796,1861,漢,,0,,,,,


In [123]:
columns_to_display = [col for col in poetregion.columns if 'PY' not in col]

alt.renderers.enable('default')
alt.data_transformers.disable_max_rows()

poet_long = poetregion.reset_index().melt(id_vars='poetID', value_vars=columns_to_display)
poet_long['non_null'] = poet_long['value'].notna()
poet_long['value_or_null'] = poet_long['value'].fillna('Empty') 

chart = alt.Chart(poet_long).mark_rect().encode(
    x=alt.X('variable:N', title='Column'),
    y=alt.Y('poetID:O', title='PoetID'),
    color=alt.Color('non_null:N', legend=None, scale=alt.Scale(domain=[True, False], range=['green', 'red'])),
    tooltip=[alt.Tooltip('poetID:N', title='Poet ID'), alt.Tooltip('variable:N', title='Column'), alt.Tooltip('value_or_null:N', title='Value')]
).properties(
    width=2000,
    height=30000
).configure_axis(
    grid=False
)

# 显示图表
chart.save('poetwithregion.html')

In [124]:
columns_to_display = [col for col in work.columns if 'PY' not in col]

alt.renderers.enable('default')
alt.data_transformers.disable_max_rows()

work_long = work.reset_index().melt(id_vars='workID', value_vars=columns_to_display)
work_long['non_null'] = work_long['value'].notna()
work_long['value_or_null'] = work_long['value'].fillna('Empty') 

chart = alt.Chart(work_long).mark_rect().encode(
    x=alt.X('variable:N', title='Column'),
    y=alt.Y('workID:O', title='WorkID'),
    color=alt.Color('non_null:N', legend=None, scale=alt.Scale(domain=[True, False], range=['green', 'red'])),
    tooltip=[alt.Tooltip('workID:N', title='WorkID'), alt.Tooltip('variable:N', title='Column'), alt.Tooltip('value_or_null:N', title='Value')]
).properties(
    width=2000,
    height=5000
).configure_axis(
    grid=False
)

# 显示图表
chart.save('work.html')

In [125]:
columns_to_display = [col for col in poem.columns if 'PY' not in col]

alt.renderers.enable('default')
alt.data_transformers.disable_max_rows()

poem_long = poem.reset_index().melt(id_vars='poemID',value_vars=columns_to_display)
poem_long['non_null'] = poem_long['value'].notna()
poem_long['value_or_null'] = poem_long['value'].fillna('Empty') 

chart = alt.Chart(poem_long).mark_rect().encode(
    x=alt.X('variable:N', title='Column'),
    y=alt.Y('poemID:O', title='PoemID'),
    color=alt.Color('non_null:N', legend=None, scale=alt.Scale(domain=[True, False], range=['green', 'red'])),
    tooltip=[alt.Tooltip('poemID:N', title='PoemID'), alt.Tooltip('variable:N', title='Column'), alt.Tooltip('value_or_null:N', title='Value')]
).properties(
    width=2000,
    height=30000
).configure_axis(
    grid=False
)

# 显示图表
chart.save('poem.html')

# 品（poem）重要性：所在集（work）的重要性、作者（poet）的重要性

In [36]:
workWeight = 0.5
poetWeight = 0.5

poemWorkPoetSQL = '''
SELECT poem.poemID, workpoemlinks.workID,poempoetlinks.poetID
FROM poem
LEFT JOIN workpoemlinks ON poem.poemID = workpoemlinks.poemID
LEFT JOIN poempoetlinks ON poem.poemID = poempoetlinks.poemID
'''

poemWorkPoet = duckdb.query(poemWorkPoetSQL).df()
# poemWorkPoet.set_index('poemID', inplace=True)
poemWorkPoet

Unnamed: 0,poemID,workID,poetID
0,1176,38.0,213.0
1,778,38.0,922.0
2,1175,38.0,213.0
3,6,38.0,494.0
4,7,38.0,495.0
...,...,...,...
92602,3314,,5.0
92603,44503,,5583.0
92604,52397,,6038.0
92605,52407,,6038.0


In [37]:
poemWorkPoet['workID'] = poemWorkPoet['workID'].fillna(0).astype(int).astype(str)
# poemWorkPoet['workID'] = poemWorkPoet['workID'].astype(str)
# ji_combine.index = ji_combine.index.astype(str)
weight_mapping = ji_combine['totalWeight'].reindex(poemWorkPoet['workID']).reset_index(drop=True)

poemWorkPoet = pd.concat([poemWorkPoet.reset_index(drop=True), weight_mapping.rename('workImportance')], axis=1)


poemWorkPoet

Unnamed: 0,poemID,workID,poetID,workImportance
0,1176,38,213.0,
1,778,38,922.0,
2,1175,38,213.0,
3,6,38,494.0,
4,7,38,495.0,
...,...,...,...,...
92602,3314,0,5.0,
92603,44503,0,5583.0,
92604,52397,0,6038.0,
92605,52407,0,6038.0,


In [38]:
poemWorkPoet['poetID'] = poemWorkPoet['poetID'].fillna(0).astype(int)
# poemWorkPoet['workID'] = poemWorkPoet['workID'].astype(str)
# ji_combine.index = ji_combine.index.astype(str)
weight_mapping = poet_log['totalWeight'].reindex(poemWorkPoet['poetID']).reset_index(drop=True)

poemWorkPoet = pd.concat([poemWorkPoet.reset_index(drop=True), weight_mapping.rename('poetImportance')], axis=1)


poemWorkPoet

Unnamed: 0,poemID,workID,poetID,workImportance,poetImportance
0,1176,38,213,,0.350102
1,778,38,922,,0.198336
2,1175,38,213,,0.350102
3,6,38,494,,0.031820
4,7,38,495,,0.031820
...,...,...,...,...,...
92602,3314,0,5,,0.373240
92603,44503,0,5583,,0.105060
92604,52397,0,6038,,0.120144
92605,52407,0,6038,,0.120144


In [249]:
workWeight = 0.5
poetWeight = 0.5

In [39]:
poetImportance = poemWorkPoet.copy()

poetImportance['totalWeight'] = workWeight * poetImportance['workImportance'] + poetWeight * poetImportance['poetImportance']

poetImportance

Unnamed: 0,poemID,workID,poetID,workImportance,poetImportance,totalWeight
0,1176,38,213,,0.350102,
1,778,38,922,,0.198336,
2,1175,38,213,,0.350102,
3,6,38,494,,0.031820,
4,7,38,495,,0.031820,
...,...,...,...,...,...,...
92602,3314,0,5,,0.373240,
92603,44503,0,5583,,0.105060,
92604,52397,0,6038,,0.120144,
92605,52407,0,6038,,0.120144,
