In [3]:
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc
import pyarrow.dataset as ds
import pandas as pd
import duckdb
import numpy as np
import altair as alt
import json
from urllib.request import urlopen
import time
import os
import plotly.express as px

In [4]:
poet = pd.read_csv('../../CCGIV/datasets/WomenWriting/poet.csv')
poem = pd.read_csv('../../CCGIV/datasets/WomenWriting/poem.csv')
poempoetlinks = pd.read_csv('../../CCGIV/datasets/WomenWriting/poempoetlinks.csv')
subwork = pd.read_csv('../../CCGIV/datasets/WomenWriting/subwork.csv')
subworkpoemlinks = pd.read_csv('../../CCGIV/datasets/WomenWriting/subworkpoemlinks.csv')
subworkpoetlinks = pd.read_csv('../../CCGIV/datasets/WomenWriting/subworkpoetlinks.csv')
work = pd.read_csv('../../CCGIV/datasets/WomenWriting/work.csv')
workpoemlinks = pd.read_csv('../../CCGIV/datasets/WomenWriting/workpoemlinks.csv')
workpoetlinks = pd.read_csv('../../CCGIV/datasets/WomenWriting/workpoetlinks.csv')
poetregionlinks = pd.read_csv('../../CCGIV/datasets/WomenWriting/poetregionlinks.csv')

con = duckdb.connect()
con.register('poet', poet)
con.register('poem', poem)
con.register('poempoetlinks', poempoetlinks)
con.register('subwork', subwork)
con.register('subworkpoemlinks', subworkpoemlinks)
con.register('subworkpoetlinks', subworkpoetlinks)
con.register('work', work)
con.register('workpoemlinks', workpoemlinks)
con.register('workpoetlinks', workpoetlinks)
con.register('poetregionlinks', poetregionlinks)

<duckdb.duckdb.DuckDBPyConnection at 0x13e7666b0>

# 1. 集（work）的 Importance：题辞作者数量、序作者数量、跋作者数量、收录的品（poem）的数量

## 1.1 题辞作者、序作者、跋作者不去重

### 1.1.1 题辞作者数量

In [5]:
ticiSQL = '''
SELECT workID,count(DISTINCT poetID) as ticicount
FROM workpoetlinks 
WHERE role IN ('題辭')
Group By workID
ORDER BY ticicount DESC
'''
tici = duckdb.query(ticiSQL).df()
tici.set_index('workID', inplace=True)

tici

Unnamed: 0_level_0,ticicount
workID,Unnamed: 1_level_1
125,119
223,96
218,73
352,49
112,43
...,...
12,1
215,1
294,1
142,1


### 1.1.2 序作者数量

In [6]:
xuSQL = '''
SELECT workID,count(DISTINCT poetID) as xucount
FROM workpoetlinks 
WHERE role IN ('序作者')
Group By workID
ORDER BY xucount DESC
'''
xu = duckdb.query(xuSQL).df()
xu.set_index('workID', inplace=True)
xu

Unnamed: 0_level_0,xucount
workID,Unnamed: 1_level_1
158,17
167,14
118,9
57,9
283,8
...,...
227,1
346,1
400,1
276,1


### 1.1.3 跋作者数量

In [7]:
baSQL = '''
SELECT workID,count(DISTINCT poetID) as bacount
FROM workpoetlinks 
WHERE role IN ('跋作者')
Group By workID
ORDER BY bacount DESC
'''
ba = duckdb.query(baSQL).df()
ba.set_index('workID', inplace=True)
ba

Unnamed: 0_level_0,bacount
workID,Unnamed: 1_level_1
186,5
15,4
204,4
458,3
50,3
...,...
379,1
287,1
75,1
462,1


### 1.1.4 题辞、序、跋汇总

In [8]:
from functools import reduce

TiciXuBa = reduce(lambda left, right: pd.merge(left, right, on='workID', how='outer'), [tici, xu, ba]).fillna(0)
TiciXuBa

Unnamed: 0_level_0,ticicount,xucount,bacount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9,0.0,3.0,1.0
10,2.0,2.0,0.0
11,0.0,3.0,0.0
12,1.0,5.0,1.0
13,0.0,1.0,0.0
...,...,...,...
476,1.0,4.0,0.0
477,14.0,5.0,1.0
479,1.0,4.0,1.0
480,3.0,5.0,0.0


## 1.2 题辞作者、序作者、跋作者去重

问题：在同一个集里有多重角色的作者，应该计数到哪种角色中？比如既是题辞作者又是序作者，算到哪类？

## 1.3 集收录的作品数

In [9]:
includedWorkSQL = '''
SELECT workID,count(poemID) as includedcount
FROM workpoemlinks 
Group By workID
ORDER BY includedcount DESC
'''
includedWork = duckdb.query(includedWorkSQL).df()
includedWork.set_index('workID', inplace=True)

includedWork

Unnamed: 0_level_0,includedcount
workID,Unnamed: 1_level_1
61,5075
120,3771
38,2478
63,2145
88,2052
...,...
341,9
358,7
225,7
373,3


## 1.4 集的题辞作者数、序作者数、跋作者数、收录作品数汇总

In [10]:
jiRawData = reduce(lambda left, right: pd.merge(left, right, on='workID', how='outer'), [tici, xu, ba, includedWork]).fillna(0)
jiRawData

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,0.0,3.0,1.0,986.0
10,2.0,2.0,0.0,404.0
11,0.0,3.0,0.0,137.0
12,1.0,5.0,1.0,126.0
13,0.0,1.0,0.0,252.0
...,...,...,...,...
478,0.0,0.0,0.0,206.0
479,1.0,4.0,1.0,310.0
480,3.0,5.0,0.0,82.0
481,0.0,0.0,0.0,885.0



## 1.5 归一化

### 1.5.1 Min-Max归一化

In [11]:
def min_max_normalize(series):
    return (series - series.min()) / (series.max() - series.min()) * 100

In [12]:
ji_min_max_normalize = jiRawData.copy()
for column in ji_min_max_normalize.columns:
    if ji_min_max_normalize[column].dtype in ['float64', 'int64']:  # 检查数据类型是否为数值类型
        ji_min_max_normalize[column] = min_max_normalize(ji_min_max_normalize[column])
ji_min_max_normalize

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,0.000000,17.647059,20.0,19.428571
10,1.680672,11.764706,0.0,7.960591
11,0.000000,17.647059,0.0,2.699507
12,0.840336,29.411765,20.0,2.482759
13,0.000000,5.882353,0.0,4.965517
...,...,...,...,...
478,0.000000,0.000000,0.0,4.059113
479,0.840336,23.529412,20.0,6.108374
480,2.521008,29.411765,0.0,1.615764
481,0.000000,0.000000,0.0,17.438424


### 1.5.2 对数归一化

In [13]:
ji_log_normalize = jiRawData.copy()

for column in ji_log_normalize.columns:  
    if ji_log_normalize[column].dtype in ['float64', 'int64']:
        max_val = ji_log_normalize[column].max()
        if max_val > 0:
            # 应用对数归一化
            ji_log_normalize[column] = np.log(ji_log_normalize[column] + 1) / np.log(max_val + 1)
        # 对数据加1以避免对0取对数的问题
        # ji_log_normalize[column] = np.log(ji_log_normalize[column] + 1)

ji_log_normalize

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,0.000000,0.479625,0.386853,0.808069
10,0.229476,0.380094,0.000000,0.703667
11,0.000000,0.479625,0.000000,0.577484
12,0.144783,0.619906,0.386853,0.567748
13,0.000000,0.239812,0.000000,0.648524
...,...,...,...,...
478,0.000000,0.000000,0.000000,0.625005
479,0.144783,0.556827,0.386853,0.672715
480,0.289566,0.619906,0.000000,0.517897
481,0.000000,0.000000,0.000000,0.795417


### 1.5.3 把题辞、序、跋统一归一化

#### 1.5.3.1 min-max

In [14]:
ji_combine_normalize = jiRawData.copy()

all_counts = np.concatenate((ji_combine_normalize['ticicount'], ji_combine_normalize['xucount'],ji_combine_normalize['bacount']))

min_val = all_counts.min()
max_val = all_counts.max()
ji_combine_normalize['normalized_bacount'] = (ji_combine_normalize['bacount'] - min_val) / (max_val - min_val)
ji_combine_normalize['normalized_ticicount'] = (ji_combine_normalize['ticicount'] - min_val) / (max_val - min_val)
ji_combine_normalize['normalized_xucount'] = (ji_combine_normalize['xucount'] - min_val) / (max_val - min_val)

def min_max_normalize_ori(series):
    return (series - series.min()) / (series.max() - series.min())
# 收录的作品数单独归一化
if ji_combine_normalize['includedcount'].dtype in ['float64', 'int64']:  # 检查数据类型是否为数值类型
        ji_combine_normalize['includedcount'] = min_max_normalize_ori(ji_combine_normalize['includedcount'])

ji_combine_normalize

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,normalized_bacount,normalized_ticicount,normalized_xucount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9,0.0,3.0,1.0,0.194286,0.008403,0.000000,0.025210
10,2.0,2.0,0.0,0.079606,0.000000,0.016807,0.016807
11,0.0,3.0,0.0,0.026995,0.000000,0.000000,0.025210
12,1.0,5.0,1.0,0.024828,0.008403,0.008403,0.042017
13,0.0,1.0,0.0,0.049655,0.000000,0.000000,0.008403
...,...,...,...,...,...,...,...
478,0.0,0.0,0.0,0.040591,0.000000,0.000000,0.000000
479,1.0,4.0,1.0,0.061084,0.008403,0.008403,0.033613
480,3.0,5.0,0.0,0.016158,0.000000,0.025210,0.042017
481,0.0,0.0,0.0,0.174384,0.000000,0.000000,0.000000


#### 1.5.3.2 对数归一化

In [15]:
ji_combine_log_normalize = jiRawData.copy()

all_counts = np.concatenate((ji_combine_log_normalize['ticicount'], ji_combine_log_normalize['xucount'],ji_combine_log_normalize['bacount']))

min_val = all_counts.min()
max_val = all_counts.max()

if max_val > 0:
    # ji_log_normalize[column] = np.log10(ji_log_normalize[column] + 1) / np.log10(max_val + 1)
    ji_combine_log_normalize['normalized_bacount'] = np.log10(ji_combine_log_normalize['bacount'] + 1) / np.log10(max_val + 1)
    ji_combine_log_normalize['normalized_ticicount'] = np.log10(ji_combine_log_normalize['ticicount'] + 1) / np.log10(max_val + 1)
    ji_combine_log_normalize['normalized_xucount'] = np.log10(ji_combine_log_normalize['xucount'] + 1) / np.log10(max_val + 1)

def min_max_normalize_ori(series):
    return (series - series.min()) / (series.max() - series.min())
# 收录的作品数单独归一化
if ji_combine_log_normalize['includedcount'].dtype in ['float64', 'int64']:  # 检查数据类型是否为数值类型
    max_val = ji_combine_log_normalize['includedcount'].max()
    if max_val > 0:
        ji_combine_log_normalize['includedcount'] = np.log10(ji_combine_log_normalize['includedcount'] + 1) / np.log10(max_val + 1)
        # ji_combine_log_normalize['includedcount'] = min_max_normalize_ori(ji_combine_log_normalize['includedcount'])

ji_combine_log_normalize

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,normalized_bacount,normalized_ticicount,normalized_xucount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9,0.0,3.0,1.0,0.808069,0.144783,0.000000,0.289566
10,2.0,2.0,0.0,0.703667,0.000000,0.229476,0.229476
11,0.0,3.0,0.0,0.577484,0.000000,0.000000,0.289566
12,1.0,5.0,1.0,0.567748,0.144783,0.144783,0.374258
13,0.0,1.0,0.0,0.648524,0.000000,0.000000,0.144783
...,...,...,...,...,...,...,...
478,0.0,0.0,0.0,0.625005,0.000000,0.000000,0.000000
479,1.0,4.0,1.0,0.672715,0.144783,0.144783,0.336176
480,3.0,5.0,0.0,0.517897,0.000000,0.289566,0.374258
481,0.0,0.0,0.0,0.795417,0.000000,0.000000,0.000000


#### 1.5.3.3 min-max和对数归一化一起

In [16]:
ji_combine_two_normalize = jiRawData.copy()

all_counts = np.concatenate((ji_combine_two_normalize['ticicount'], ji_combine_two_normalize['xucount'],ji_combine_two_normalize['bacount']))

min_val = all_counts.min()
max_val = all_counts.max()
ji_combine_two_normalize['normalized_bacount'] = (ji_combine_two_normalize['bacount'] - min_val) / (max_val - min_val)
ji_combine_two_normalize['normalized_ticicount'] = (ji_combine_two_normalize['ticicount'] - min_val) / (max_val - min_val)
ji_combine_two_normalize['normalized_xucount'] = (ji_combine_two_normalize['xucount'] - min_val) / (max_val - min_val)


# 收录的作品数单独归一化
if ji_combine_two_normalize['includedcount'].dtype in ['float64', 'int64']:  # 检查数据类型是否为数值类型
    max_val = ji_combine_two_normalize['includedcount'].max()
    if max_val > 0:
        ji_combine_two_normalize['includedcount'] = np.log10(ji_combine_two_normalize['includedcount'] + 1) / np.log10(max_val + 1)

ji_combine_two_normalize

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,normalized_bacount,normalized_ticicount,normalized_xucount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9,0.0,3.0,1.0,0.808069,0.008403,0.000000,0.025210
10,2.0,2.0,0.0,0.703667,0.000000,0.016807,0.016807
11,0.0,3.0,0.0,0.577484,0.000000,0.000000,0.025210
12,1.0,5.0,1.0,0.567748,0.008403,0.008403,0.042017
13,0.0,1.0,0.0,0.648524,0.000000,0.000000,0.008403
...,...,...,...,...,...,...,...
478,0.0,0.0,0.0,0.625005,0.000000,0.000000,0.000000
479,1.0,4.0,1.0,0.672715,0.008403,0.008403,0.033613
480,3.0,5.0,0.0,0.517897,0.000000,0.025210,0.042017
481,0.0,0.0,0.0,0.795417,0.000000,0.000000,0.000000


## 1.6 重要性计算

### 1.6.1 权重设置

In [26]:
xuweight = 0.33
baweight = 0.33
ticiweight = 0.33
includedweight = 0.0

### 1.6.2 Min-Max归一化的结果计算

In [27]:
ji_min_max = ji_min_max_normalize.copy()
ji_min_max['totalWeight'] = xuweight*ji_min_max['xucount'] + baweight*ji_min_max['bacount'] + ticiweight*ji_min_max['ticicount'] + includedweight*ji_min_max['includedcount']
ji_min_max

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,totalWeight
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,0.000000,17.647059,20.0,19.428571,12.423529
10,1.680672,11.764706,0.0,7.960591,4.436975
11,0.000000,17.647059,0.0,2.699507,5.823529
12,0.840336,29.411765,20.0,2.482759,16.583193
13,0.000000,5.882353,0.0,4.965517,1.941176
...,...,...,...,...,...
478,0.000000,0.000000,0.0,4.059113,0.000000
479,0.840336,23.529412,20.0,6.108374,14.642017
480,2.521008,29.411765,0.0,1.615764,10.537815
481,0.000000,0.000000,0.0,17.438424,0.000000


### 1.6.3 对数归一化的结果计算

In [28]:
ji_log = ji_log_normalize.copy()

ji_log['totalWeight'] = xuweight*ji_log['xucount'] + baweight*ji_log['bacount'] + ticiweight*ji_log['ticicount'] + includedweight*ji_log['includedcount']
ji_log

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,totalWeight
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,0.000000,0.479625,0.386853,0.808069,0.285938
10,0.229476,0.380094,0.000000,0.703667,0.201158
11,0.000000,0.479625,0.000000,0.577484,0.158276
12,0.144783,0.619906,0.386853,0.567748,0.380009
13,0.000000,0.239812,0.000000,0.648524,0.079138
...,...,...,...,...,...
478,0.000000,0.000000,0.000000,0.625005,0.000000
479,0.144783,0.556827,0.386853,0.672715,0.359193
480,0.289566,0.619906,0.000000,0.517897,0.300126
481,0.000000,0.000000,0.000000,0.795417,0.000000



### 1.6.4 合并后的归一化结果计算

#### 1.6.4.1 min-max(best)

In [29]:
ji_combine = ji_combine_normalize.copy()

ji_combine['totalWeight'] = xuweight*ji_combine['normalized_xucount'] + baweight*ji_combine['normalized_bacount'] + ticiweight*ji_combine['normalized_ticicount'] + includedweight*ji_combine['includedcount']
ji_combine

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,normalized_bacount,normalized_ticicount,normalized_xucount,totalWeight
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9,0.0,3.0,1.0,0.194286,0.008403,0.000000,0.025210,0.011092
10,2.0,2.0,0.0,0.079606,0.000000,0.016807,0.016807,0.011092
11,0.0,3.0,0.0,0.026995,0.000000,0.000000,0.025210,0.008319
12,1.0,5.0,1.0,0.024828,0.008403,0.008403,0.042017,0.019412
13,0.0,1.0,0.0,0.049655,0.000000,0.000000,0.008403,0.002773
...,...,...,...,...,...,...,...,...
478,0.0,0.0,0.0,0.040591,0.000000,0.000000,0.000000,0.000000
479,1.0,4.0,1.0,0.061084,0.008403,0.008403,0.033613,0.016639
480,3.0,5.0,0.0,0.016158,0.000000,0.025210,0.042017,0.022185
481,0.0,0.0,0.0,0.174384,0.000000,0.000000,0.000000,0.000000


In [30]:
import altair as alt
import pandas as pd

# 假设 ji_min_max 和 work 数据已加载，以下是处理步骤：

# 复制 ji_min_max 数据，包含指定列
df = ji_min_max[['totalWeight', 'ticicount', 'xucount', 'bacount', 'includedcount']].copy()
df['workID'] = df.index

# 检查 work 表中的 workID 列是否存在
if 'workID' not in work.columns:
    work['workID'] = work.index

# 定义一个函数，计算给定 workID 的缺失值数量
def count_missing_values(work_id):
    # 指定不检查的列
    columns_to_exclude = ['Summary', 'UniformTitle', 'VariantTitle']  # 替换为不需要检查的实际列名
    columns_to_exclude += [col for col in work.columns if 'PY' in col]
    
    # 获取除去排除列的所有列
    columns_to_check = [col for col in work.columns if col not in columns_to_exclude]

    # 查找对应的 workID 行，并计算除去排除列后的缺失值数目
    work_row = work[work['workID'] == work_id]
    if work_row.empty:
        return len(columns_to_check)  
    
    # print(columns_to_check)
    return work_row[columns_to_check].isnull().sum(axis=1).values[0]



# 计算缺失值数量，并将其添加到 df 中
df['missing_count'] = df['workID'].apply(count_missing_values)



# 按 totalWeight 排序
df = df.sort_values(by='totalWeight', ascending=False)

df = df.reset_index(drop=True)  # 重置索引并丢弃原来的索引
df['workID'] = df['workID'].astype(str) + "_" + (df.index + 1).astype(str)

# 提取 missing_count 列，用于创建柱状图
missing_counts = df[['workID', 'missing_count']]


# 移除 totalWeight 列，确保不在图中显示
df = df.drop(columns=['totalWeight'])

# 使用 melt 转换数据格式
df_melted = df.melt(id_vars=['workID'], var_name='category', value_name='value')

# 创建堆叠的 Streamgraph 图层
streamgraph = alt.Chart(df_melted).mark_area().encode(
    x=alt.X('workID:O', sort=None, axis=alt.Axis(title='Work ID', labelAngle=45, labelFontSize=10)),
    y=alt.Y('value:Q', stack='zero', axis=alt.Axis(title='Value')),  # 正常堆叠
    color=alt.Color('category:N', legend=alt.Legend(title="Category", orient="left")),
    tooltip=['workID:O', 'category:N', 'value:Q']  # 添加 tooltip 显示详细信息
).properties(
    width=3600,
    height=300
)

# 创建柱状图图层显示 missing_count
missing_bar = alt.Chart(missing_counts).mark_bar(color='red', opacity=0.6).encode(
    x=alt.X('workID:O', sort=None, axis=alt.Axis(title='Work ID', labelAngle=45, labelFontSize=10, labels=True)),
    y=alt.Y('missing_count:Q', axis=alt.Axis(title='Missing Count')),
    tooltip=['workID:O', 'missing_count:Q']
)

# 将两层图表合并
chart = alt.layer(streamgraph, missing_bar).resolve_scale(
    y='independent'  # 独立的 Y 轴，以便展示两个不同的度量
).properties(
    title="Streamgraph with Missing Count across Work IDs (Sorted by Total Weight)"
)

chart

In [233]:
# 获取前100行数据并重置索引
df_top_name = df.copy().reset_index()

# 将 df_top_name 和 work 的 workID 转换为字符串
df_top_name['workID'] = df_top_name['workID'].astype(str)
work['workID'] = work['workID'].astype(str)

# 去掉 workID 中 "_" 后面的内容
df_top_name['workID'] = df_top_name['workID'].str.split('_').str[0]

# 进行合并
df_top_name = df_top_name.merge(work[['workID', 'TitleHZ','worktype']], on='workID', how='left')

# 删除不再需要的索引列并重新设置索引
df_top_name = df_top_name.set_index('workID')

# 选择需要的列
df_top_name = df_top_name[['index', 'TitleHZ','worktype']].copy()

df_top_name

Unnamed: 0_level_0,index,TitleHZ,worktype
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
158,0,餐菊軒詩草: 一卷(清伍淡如撰）,別集
125,1,雅安書屋文集﹕二卷(清汪嫈撰),別集
186,2,棣華館詩課:十二卷(清張晉禮輯),總集
167,3,綠雲山房詩草:二卷(清勞蓉君撰),別集
223,4,琴韻樓詩:二卷(清胡緣撰),別集
...,...,...,...
232,428,張淑蓮詩稿:一卷(清張淑蓮撰),別集
247,429,繡餘草:一卷(清趙玉樓撰),別集
225,430,澹仙賦鈔:一卷(清熊璉撰),別集
373,431,穗惟淚草:一卷(清佟佳氏撰),別集


In [234]:
df_top_10_percent = df_top_name.head(int(len(df) * 0.1)).copy()
df_top_10_percent = df_top_10_percent[df_top_10_percent['worktype'] == '別集'].copy()

# 从前 10% 的数据中随机抽取 30 条
sample_30_from_top_10_percent = df_top_10_percent.sample(n=30, random_state=42)

# 查看结果
sample_30_from_top_10_percent = sample_30_from_top_10_percent.sort_values(by='index')

sample_30_from_top_10_percent

Unnamed: 0_level_0,index,TitleHZ,worktype
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
158,0,餐菊軒詩草: 一卷(清伍淡如撰）,別集
125,1,雅安書屋文集﹕二卷(清汪嫈撰),別集
167,3,綠雲山房詩草:二卷(清勞蓉君撰),別集
223,4,琴韻樓詩:二卷(清胡緣撰),別集
352,6,壽筠簃詩稿:一卷(清陳麗芳撰),別集
112,7,冷吟仙舘詩稿﹕八卷﹐詩餘一卷﹐文存一卷(淸左錫嘉撰),別集
15,9,爨餘吟﹕二卷(清屠鏡心撰),別集
458,11,"鴻雪樓詩選初集:四卷,鴻雪樓外集:一卷(清沈善寶撰)",別集
204,12,紉餘漫草:一卷(清王慧增撰),別集
443,14,"不櫛吟:三卷,續刻:一卷(清潘素心撰)",別集


In [235]:
df_bottom_10_percent = df_top_name.tail(int(len(df) * 0.1)).copy()
df_bottom_10_percent = df_bottom_10_percent[df_bottom_10_percent['worktype'] == '別集'].copy()

# 从前 10% 的数据中随机抽取 30 条
sample_30_from_bottom_10_percent = df_bottom_10_percent.sample(n=30, random_state=42)

# 查看结果
sample_30_from_bottom_10_percent = sample_30_from_bottom_10_percent.sort_values(by='index')

sample_30_from_bottom_10_percent

Unnamed: 0_level_0,index,TitleHZ,worktype
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
284,391,昭如女子詩鈔:一卷(清王麟書撰),別集
81,392,海棠居詩集(清姚淑撰),別集
381,393,藤花閣詩草:一卷(清虞叶蘩撰),別集
142,394,聞妙香室詞:一卷(清陸珊撰）,別集
397,395,梅修館詩存:一卷(清陸韻珊撰),別集
422,396,"合存詩抄:三卷(清宮淡亭,思栢撰)",別集
279,397,含芳館詩草:一卷(清嚴澂華撰),別集
290,399,眷仙樓遺稿:一卷(清章韻清撰),別集
198,400,寫韻樓詩草:一卷(清吳瓊仙撰),別集
330,402,崦樓遺稿:二卷(清沈鵲應撰),別集


In [236]:
df_10_percent = df_top_name.copy()
df_10_percent = df_10_percent[df_10_percent['worktype'] == '別集'].copy()

sample_30_from_10_percent = df_10_percent.sample(n=30)

# 查看结果
sample_30_from_10_percent = sample_30_from_10_percent.sort_values(by='index')

sample_30_from_10_percent

Unnamed: 0_level_0,index,TitleHZ,worktype
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
167,3,綠雲山房詩草:二卷(清勞蓉君撰),別集
352,6,壽筠簃詩稿:一卷(清陳麗芳撰),別集
466,29,"明秋館集:詩一卷, 詞一卷(清裘凌仙撰)",別集
277,39,三省樓剩稿:一卷(清張婉撰),別集
12,50,佩秋閣遺稿﹕二卷﹐詞稿﹕一卷﹐文稿﹕一卷(清吳茝撰),別集
262,111,鄰雲友月之居詩初稿:四卷(清張紈英撰),別集
480,116,繡餘草:一卷(清陶先畹撰),別集
436,125,彝罌詞:一卷(清溫匋撰),別集
49,134,冷紅軒詩集﹕二卷﹐附詞(清百保友蘭撰),別集
153,140,話雨樓詩草: 三卷(清言忠貞撰）,別集


#### 1.6.4.2 对数归一化结果计算

In [20]:
ji_combine_log = ji_combine_log_normalize.copy()

ji_combine_log['totalWeight'] = xuweight*ji_combine_log['normalized_xucount'] + baweight*ji_combine_log['normalized_bacount'] + ticiweight*ji_combine_log['normalized_ticicount'] + includedweight*ji_combine_log['includedcount']
ji_combine_log

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,normalized_bacount,normalized_ticicount,normalized_xucount,totalWeight
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9,0.0,3.0,1.0,0.808069,0.144783,0.000000,0.289566,0.310604
10,2.0,2.0,0.0,0.703667,0.000000,0.229476,0.229476,0.290655
11,0.0,3.0,0.0,0.577484,0.000000,0.000000,0.289566,0.216762
12,1.0,5.0,1.0,0.567748,0.144783,0.144783,0.374258,0.307893
13,0.0,1.0,0.0,0.648524,0.000000,0.000000,0.144783,0.198327
...,...,...,...,...,...,...,...,...
478,0.0,0.0,0.0,0.625005,0.000000,0.000000,0.000000,0.156251
479,1.0,4.0,1.0,0.672715,0.144783,0.144783,0.336176,0.324614
480,3.0,5.0,0.0,0.517897,0.000000,0.289566,0.374258,0.295430
481,0.0,0.0,0.0,0.795417,0.000000,0.000000,0.000000,0.198854


### 把集的册书也考虑进来呢

In [64]:
workCeSQL = '''
SELECT workID, NumberCe
FROM work 
ORDER BY NumberCe DESC
'''
workCe = duckdb.query(workCeSQL).df()
workCe.set_index('workID', inplace=True)

workCe

Unnamed: 0_level_0,NumberCe
workID,Unnamed: 1_level_1
460,24.0
61,20.0
195,20.0
88,18.0
193,16.0
...,...
211,
274,
344,
363,


In [65]:
jiRawDataWithCe = reduce(lambda left, right: pd.merge(left, right, on='workID', how='outer'), [jiRawData,workCe]).fillna(0)
jiRawDataWithCe

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,NumberCe
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,0.0,3.0,1.0,986.0,6.0
10,2.0,2.0,0.0,404.0,4.0
11,0.0,3.0,0.0,137.0,3.0
12,1.0,5.0,1.0,126.0,2.0
13,0.0,1.0,0.0,252.0,2.0
...,...,...,...,...,...
478,0.0,0.0,0.0,206.0,1.0
479,1.0,4.0,1.0,310.0,1.0
480,3.0,5.0,0.0,82.0,1.0
481,0.0,0.0,0.0,885.0,2.0


In [66]:
ji_ce_log_normalize = jiRawDataWithCe.copy()

for column in ji_ce_log_normalize.columns:  
    if ji_ce_log_normalize[column].dtype in ['float64', 'int64']:
        # 对数据加1以避免对0取对数的问题
        ji_ce_log_normalize[column] = np.log(ji_ce_log_normalize[column] + 1)

ji_ce_log_normalize

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,NumberCe
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,0.000000,1.386294,0.693147,6.894670,1.945910
10,1.098612,1.098612,0.000000,6.003887,1.609438
11,0.000000,1.386294,0.000000,4.927254,1.386294
12,0.693147,1.791759,0.693147,4.844187,1.098612
13,0.000000,0.693147,0.000000,5.533389,1.098612
...,...,...,...,...,...
478,0.000000,0.000000,0.000000,5.332719,0.693147
479,0.693147,1.609438,0.693147,5.739793,0.693147
480,1.386294,1.791759,0.000000,4.418841,0.693147
481,0.000000,0.000000,0.000000,6.786717,1.098612


In [67]:
xuweight1 = 0.2
baweight1 = 0.2
ticiweight1 = 0.2
includedweight1 = 0.2
ceWeight = 0.2

ji_ce_log = ji_ce_log_normalize.copy()

ji_ce_log['totalWeight'] = xuweight1*ji_ce_log['xucount'] + baweight1*ji_ce_log['bacount'] + ticiweight1*ji_ce_log['ticicount'] + includedweight1*ji_ce_log['includedcount'] + ceWeight*ji_ce_log['NumberCe']

ji_ce_log

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,NumberCe,totalWeight
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9,0.000000,1.386294,0.693147,6.894670,1.945910,2.184004
10,1.098612,1.098612,0.000000,6.003887,1.609438,1.962110
11,0.000000,1.386294,0.000000,4.927254,1.386294,1.539968
12,0.693147,1.791759,0.693147,4.844187,1.098612,1.824171
13,0.000000,0.693147,0.000000,5.533389,1.098612,1.465030
...,...,...,...,...,...,...
478,0.000000,0.000000,0.000000,5.332719,0.693147,1.205173
479,0.693147,1.609438,0.693147,5.739793,0.693147,1.885734
480,1.386294,1.791759,0.000000,4.418841,0.693147,1.658008
481,0.000000,0.000000,0.000000,6.786717,1.098612,1.577066


# 2. 作者（poet）的Importance：参与制作书的次数（包括被收录和参与编辑）、写像赞的次数、被写像赞的次数、收到唱和诗及书信的次数、被讨论的次数

## 2.1 参与制作书的次数

In [31]:
PoetParticipateWorkSQL = '''SELECT poetID, COUNT(DISTINCT workID) AS participate_count
    FROM workpoetlinks
    GROUP BY poetID
    ORDER BY participate_count DESC'''

PoetParticipateWork = duckdb.query(PoetParticipateWorkSQL).df()
PoetParticipateWork.set_index('poetID', inplace=True)

PoetParticipateWork

Unnamed: 0_level_0,participate_count
poetID,Unnamed: 1_level_1
263,13
308,9
199,9
167,7
92,7
...,...
7433,1
974,1
8445,1
8451,1


## 2.2 写像赞的次数

In [32]:
xiangzanauthorSQL = '''
SELECT poempoetlinks.poetID,COUNT(DISTINCT poem.poemID) AS xiangzanauthor_count
FROM poem,poempoetlinks
WHERE poem.GenreHZ='文﹕像贊' AND poem.poemID = poempoetlinks.poemID
GROUP BY poempoetlinks.poetID
ORDER BY xiangzanauthor_count
'''
xiangzanauthor = duckdb.query(xiangzanauthorSQL).df()
xiangzanauthor.set_index('poetID', inplace=True)
xiangzanauthor

Unnamed: 0_level_0,xiangzanauthor_count
poetID,Unnamed: 1_level_1
6494,1
6430,1
4424,1
1977,1
7279,1
6482,1
244,1
7006,1
3813,1
225,2


## 2.3 被写像赞的次数

In [33]:
bexiangzanSQL = '''
SELECT poetassubjectID AS poetID,COUNT(DISTINCT poemID) AS bexiangzansubjectCount
FROM poem
WHERE poem.GenreHZ='文﹕像贊' AND poetassubjectID !=0
GROUP BY poetassubjectID
'''
bexiangzan = duckdb.query(bexiangzanSQL).df()
bexiangzan.set_index('poetID', inplace=True)

bexiangzan

Unnamed: 0_level_0,bexiangzansubjectCount
poetID,Unnamed: 1_level_1
4939,1
5,1


## 2.4 被讨论的次数

In [34]:
discussedSQL = '''
SELECT poetassubjectID AS poetID, COUNT(DISTINCT poemID) AS discussedCount
FROM poem
WHERE GenreHZ!='文﹕像贊' AND poetassubjectID != 0
GROUP BY poetassubjectID
'''
discussed = duckdb.query(discussedSQL).df()
discussed.set_index('poetID', inplace=True)

discussed

Unnamed: 0_level_0,discussedCount
poetID,Unnamed: 1_level_1
2793,5
2232,2
2971,5
1383,3
3571,2
...,...
4789,1
5073,1
5400,1
5497,1


In [49]:
discussedSQL = '''
SELECT poetassubjectID AS poetID, COUNT(DISTINCT poemID) AS discussedCount
FROM poem
WHERE GenreHZ IN ('文﹕略傳','文﹕詩話','文﹕傳') AND poetassubjectID != 0
GROUP BY poetassubjectID
'''
discussed = duckdb.query(discussedSQL).df()
discussed.set_index('poetID', inplace=True)

discussed

Unnamed: 0_level_0,discussedCount
poetID,Unnamed: 1_level_1
2099,3
2191,4
984,4
3349,1
3424,1
...,...
5504,1
7932,1
322,1
7874,1


## 2.5 收到唱和书信的次数

In [35]:
changheshiSQL = '''
SELECT changheshuxinpoetID AS poetID, COUNT(DISTINCT poemID) AS changheshiCount
FROM poem
WHERE changheshuxinpoetID !=0
GROUP BY changheshuxinpoetID
'''
changheshi = duckdb.query(changheshiSQL).df()
changheshi.set_index('poetID', inplace=True)

changheshi

Unnamed: 0_level_0,changheshiCount
poetID,Unnamed: 1_level_1
124,5
2966,1
251,11
3929,4
1544,22
...,...
3482,1
4615,1
6449,1
8241,1


## 2.6 初始数据汇总

In [50]:
poetRawData = reduce(lambda left, right: pd.merge(left, right, on='poetID', how='outer'), [PoetParticipateWork, xiangzanauthor, bexiangzan,discussed, changheshi]).fillna(0)
poetRawData

Unnamed: 0_level_0,participate_count,xiangzanauthor_count,bexiangzansubjectCount,discussedCount,changheshiCount
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,1.0,0.0,1.0,0.0,0.0
6,1.0,0.0,0.0,0.0,0.0
8,1.0,0.0,0.0,1.0,0.0
11,1.0,0.0,0.0,0.0,0.0
12,2.0,0.0,0.0,3.0,0.0
...,...,...,...,...,...
8457,1.0,0.0,0.0,0.0,0.0
8458,1.0,0.0,0.0,0.0,0.0
8459,1.0,0.0,0.0,0.0,0.0
8460,1.0,0.0,0.0,0.0,0.0


## 2.7 归一化

### 2.7.1 Min-Max归一化

In [51]:
poet_min_max_normalize = poetRawData.copy()
for column in poet_min_max_normalize.columns:
    if poet_min_max_normalize[column].dtype in ['float64', 'int64']:  # 检查数据类型是否为数值类型
        poet_min_max_normalize[column] = min_max_normalize(poet_min_max_normalize[column])
poet_min_max_normalize

Unnamed: 0_level_0,participate_count,xiangzanauthor_count,bexiangzansubjectCount,discussedCount,changheshiCount
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,7.692308,0.0,100.0,0.000000,0.0
6,7.692308,0.0,0.0,0.000000,0.0
8,7.692308,0.0,0.0,7.692308,0.0
11,7.692308,0.0,0.0,0.000000,0.0
12,15.384615,0.0,0.0,23.076923,0.0
...,...,...,...,...,...
8457,7.692308,0.0,0.0,0.000000,0.0
8458,7.692308,0.0,0.0,0.000000,0.0
8459,7.692308,0.0,0.0,0.000000,0.0
8460,7.692308,0.0,0.0,0.000000,0.0


### 2.7.2 对数归一化

In [52]:
poet_log_normalize = poetRawData.copy()

for column in poet_log_normalize.columns:  
    if poet_log_normalize[column].dtype in ['float64', 'int64']:
        # 对数据加1以避免对0取对数的问题
        max_val = poet_log_normalize[column].max()
        if max_val > 0:
            # 应用对数归一化
            poet_log_normalize[column] = np.log(poet_log_normalize[column] + 1) / np.log(max_val + 1)
poet_log_normalize

Unnamed: 0_level_0,participate_count,xiangzanauthor_count,bexiangzansubjectCount,discussedCount,changheshiCount
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,0.26265,0.0,1.0,0.000000,0.0
6,0.26265,0.0,0.0,0.000000,0.0
8,0.26265,0.0,0.0,0.262650,0.0
11,0.26265,0.0,0.0,0.000000,0.0
12,0.41629,0.0,0.0,0.525299,0.0
...,...,...,...,...,...
8457,0.26265,0.0,0.0,0.000000,0.0
8458,0.26265,0.0,0.0,0.000000,0.0
8459,0.26265,0.0,0.0,0.000000,0.0
8460,0.26265,0.0,0.0,0.000000,0.0


## 2.8 重要性计算

### 2.8.1 权重设置
poet4258姚倚雲 写了很多唱和诗给丈夫 poet7909范當世，导致范當世收到唱和书信的次数很高，需要降低该项权重

In [53]:
participateWeight = 0.5
writeXZWeight = 0
inXZWeight = 0
bediscussedWeight = 0.5
changheWeight = 0

### 2.8.2 Min-Max归一化结果计算

In [54]:
poet_min_max = poet_min_max_normalize.copy()

poet_min_max['totalWeight'] = participateWeight*poet_min_max['participate_count'] + writeXZWeight*poet_min_max['xiangzanauthor_count']+inXZWeight*poet_min_max['bexiangzansubjectCount']+bediscussedWeight*poet_min_max['discussedCount']+changheWeight*poet_min_max['changheshiCount']

poet_min_max

Unnamed: 0_level_0,participate_count,xiangzanauthor_count,bexiangzansubjectCount,discussedCount,changheshiCount,totalWeight
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,7.692308,0.0,100.0,0.000000,0.0,3.846154
6,7.692308,0.0,0.0,0.000000,0.0,3.846154
8,7.692308,0.0,0.0,7.692308,0.0,7.692308
11,7.692308,0.0,0.0,0.000000,0.0,3.846154
12,15.384615,0.0,0.0,23.076923,0.0,19.230769
...,...,...,...,...,...,...
8457,7.692308,0.0,0.0,0.000000,0.0,3.846154
8458,7.692308,0.0,0.0,0.000000,0.0,3.846154
8459,7.692308,0.0,0.0,0.000000,0.0,3.846154
8460,7.692308,0.0,0.0,0.000000,0.0,3.846154


### 2.8.3 对数归一化结果计算

In [55]:
poet_log = poet_log_normalize.copy()

poet_log['totalWeight'] = participateWeight*poet_log['participate_count'] + writeXZWeight*poet_log['xiangzanauthor_count']+inXZWeight*poet_log['bexiangzansubjectCount']+bediscussedWeight*poet_log['discussedCount']+changheWeight*poet_log['changheshiCount']

poet_log

Unnamed: 0_level_0,participate_count,xiangzanauthor_count,bexiangzansubjectCount,discussedCount,changheshiCount,totalWeight
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,0.26265,0.0,1.0,0.000000,0.0,0.131325
6,0.26265,0.0,0.0,0.000000,0.0,0.131325
8,0.26265,0.0,0.0,0.262650,0.0,0.262650
11,0.26265,0.0,0.0,0.000000,0.0,0.131325
12,0.41629,0.0,0.0,0.525299,0.0,0.470794
...,...,...,...,...,...,...
8457,0.26265,0.0,0.0,0.000000,0.0,0.131325
8458,0.26265,0.0,0.0,0.000000,0.0,0.131325
8459,0.26265,0.0,0.0,0.000000,0.0,0.131325
8460,0.26265,0.0,0.0,0.000000,0.0,0.131325


In [56]:
poetregionSQL = '''
SELECT poet.*, poetregionlinks.regionID
FROM poet
LEFT JOIN poetregionlinks ON poet.poetID = poetregionlinks.poetID
'''
poetregion = duckdb.query(poetregionSQL).df()
poetregion

Unnamed: 0,poetID,NameHZ,NamePY,HaoHZ,HaoPY,ZiHZ,ZiPY,OtherZiHaoHZ,OtherZiHaoPY,MaritalStatus,...,StartYear,EndYear,EthnicGroup,MainWorks,ispoet,HuWenKai,zhuLu,xuZuoZhe,baZuoZhe,regionID
0,5,甘立媃,Gan Lirou,,,如玉,Ruyu,,,正室﹐寡婦,...,1743,1819,漢,"《咏雪樓稿 : 5卷, 卷首1卷, 附1卷》",1,267,江西通志,宋鎔﹐王若閎序﹐甘立媃自序﹐劉彬士﹑顧皋作墓志詺﹐茹棻作像贊,徐心田跋,31.0
1,6,劉慧娟,Liu Huijuan,幻花女史,Huanhuanüshi,湘舲,Xiangling,,,正室﹐寡婦,...,1830,1880,漢,《曇花閣詩鈔四卷》,1,719,廣東女子藝文考,戴鴻慈﹐梁煦南序﹐劉慧娟自序,,189.0
2,8,馮思慧,Feng Sihui,,,睿之,Ruizhi,駱思慧,Luo Sihui,正室,...,1748,1774,漢,《繡餘吟六卷附錄一卷》,1,654,山西通志﹐正始續集﹐明媛詩話﹐擷芳集,劉秉恬序,,50.0
3,11,屠鏡心,Tu Jingxin,掃花主人,Saohuazhuren,,,,,正室,...,1796,1860,漢,《玩月軒詩草》《爨餘吟二卷》,1,627,清代閨閣詩人徵略﹐吳氏小殘卷齋數目,張之縉﹐文廉﹐耀曾序,"任珮瑛,萬秉,任凱,任治,任謙吉跋",140.0
4,12,鄭蘭孫,Zheng Lansun,蘅洲,Hengzhou,娛清,Yuqing,,,正室,...,1814,1861,漢,《蓮因室詩集二卷詞集一卷》﹐《都梁香閣詩詞集》,1,744,杭州府志﹐小檀欒室彙刻百家閨秀詞﹐小黛軒論詩詩﹐閨秀詞話,徐鴻謨﹐錢士杓序﹐鄭蘭孫自序﹐俞繡孫﹐孫因培﹐張煒﹐顧琇瑩﹐鍾維則﹐秦緗業﹐楊昌濬﹐孫念培﹐...,許樾身跋﹐徐琪附記,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7746,8419,陳希文,Chen Xiwen,,,,,,,正室,...,1769,1848,漢,,1,,,,,
7747,8424,潘宗秇,Pan Zongyi,,,小江,Xiaojiang,,,,...,1736,1850,漢,,1,,,,,
7748,8425,陳邦泰,Chen Bangtai,,,,,,,,...,1736,1850,漢,,1,,,,,
7749,8430,程雲,Cheng Yun,,,頑石,Wanshi,,,,...,1796,1861,漢,,0,,,,,


In [43]:
import altair as alt
import pandas as pd

# alt.data_transformers.enable("vegafusion")

df = poet_min_max.copy()
df['poetID'] = df.index

if 'poetID' not in poetregion.columns:
    poet['poetID'] = poetregion.index

# 定义一个函数，计算给定 workID 的缺失值数量
def count_missing_values(poet_id):
    # 指定不检查的列
    columns_to_exclude = ['HuWenKai', 'xuZuoZhe', 'baZuoZhe']  # 替换为不需要检查的实际列名
    columns_to_exclude += [col for col in poetregion.columns if 'PY' in col]
    
     # 获取除去排除列的所有列
    columns_to_check = [col for col in poetregion.columns if col not in columns_to_exclude]
    
    print(len(columns_to_check) )

    # 查找对应的 workID 行，并计算除去排除列后的缺失值数目
    poet_row = poetregion[poetregion['poetID'] == poet_id]
    if poet_row.empty:
        return len(columns_to_check) 
    
   
    # print(columns_to_check)
    return poet_row[columns_to_check].isnull().sum(axis=1).values[0]



# 计算缺失值数量，并将其添加到 df 中
df['missing_count'] = df['poetID'].apply(count_missing_values)
# print(df)


# 按 totalWeight 排序
df = df.sort_values(by='totalWeight', ascending=False)

df = df.reset_index(drop=True)  # 重置索引并丢弃原来的索引
df['poetID'] = df['poetID'].astype(str) + "_" + (df.index + 1).astype(str)

# 提取 missing_count 列，用于创建柱状图
# missing_counts = df[['poetID', 'missing_count']][:500]
missing_counts = df[['poetID', 'missing_count']]

# 移除 totalWeight 列，确保不在图中显示
# df = df.drop(columns=['totalWeight'])[:500]
df = df.drop(columns=['totalWeight'])

# 使用 melt 转换数据格式
df_melted = df.melt(id_vars=['poetID'], var_name='category', value_name='value')

# 创建堆叠的 Streamgraph 图层
streamgraph = alt.Chart(df_melted).mark_area().encode(
    x=alt.X('poetID:O', sort=None, axis=alt.Axis(title='Poet ID', labelAngle=45, labelFontSize=10)),
    y=alt.Y('value:Q', stack='zero', axis=alt.Axis(title='Value')),  # 正常堆叠
    color=alt.Color('category:N', legend=alt.Legend(title="Category", orient="left")),
    tooltip=['poetID:O', 'category:N', 'value:Q']  # 添加 tooltip 显示详细信息
).properties(
    width=3600,
    height=300
)

# 创建柱状图图层显示 missing_count
missing_bar = alt.Chart(missing_counts).mark_bar(color='red', opacity=0.6).encode(
    x=alt.X('poetID:O', sort=None, axis=alt.Axis(title='Poet ID', labelAngle=45, labelFontSize=10, labels=True)),
    y=alt.Y('missing_count:Q', axis=alt.Axis(title='Missing Count')),
    tooltip=['poetID:O', 'missing_count:Q']
)

# 将两层图表合并
chart = alt.layer(streamgraph, missing_bar).resolve_scale(
    y='independent'  # 独立的 Y 轴，以便展示两个不同的度量
).properties(
    title="Streamgraph with Missing Count across Poet IDs (Sorted by Total Weight)"
)

# chart

16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
1

In [57]:
df

Unnamed: 0,participate_count,xiangzanauthor_count,bexiangzansubjectCount,discussedCount,changheshiCount,poetID,missing_count
0,23.076923,0.0,0.0,100.000000,27.551020,1917_1,1
1,23.076923,0.0,0.0,88.311688,37.755102,99_2,1
2,23.076923,0.0,0.0,87.012987,2.040816,207_3,3
3,30.769231,0.0,0.0,74.025974,1.020408,911_4,1
4,100.000000,0.0,0.0,1.298701,14.285714,263_5,2
...,...,...,...,...,...,...,...
7129,0.000000,0.0,0.0,0.000000,4.081633,8302_7130,16
7130,0.000000,0.0,0.0,0.000000,11.224490,8304_7131,16
7131,0.000000,0.0,0.0,0.000000,4.081633,8305_7132,16
7132,0.000000,0.0,0.0,0.000000,4.081633,8306_7133,16


In [58]:
# 获取前100行数据
# df_top_name = df.copy()[:25]

df_top_name = df.copy()

# 将索引 'workID' 重置为列（便于与 work 表合并）
df_top_name = df_top_name.reset_index()
df_top_name['poetID'] = df_top_name['poetID'].str.split('_').str[0]
df_top_name['poetID'] = df_top_name['poetID'].astype(str)
poet['poetID'] = poet['poetID'].astype(str)

# 将 df_top_name 和 work 表按 'workID' 合并，添加 TitleHZ 列
df_top_name = df_top_name.merge(poet[['poetID', 'NameHZ','Sex']], on='poetID', how='left')

# 删除不再需要的索引列
df_top_name = df_top_name.set_index('poetID')
df_top_name = df_top_name[['index','NameHZ','Sex']].copy()
df_top_name

Unnamed: 0_level_0,index,NameHZ,Sex
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1917,0,張䌌英,女
99,1,鮑之蕙,女
207,2,凌祉媛,女
911,3,顧太清,女
263,4,袁枚,男
...,...,...,...
8302,7129,,
8304,7130,,
8305,7131,,
8306,7132,,


In [46]:
df_top_10_percent = df_top_name.head(int(len(df) * 0.1)).copy()
df_top_10_percent = df_top_10_percent[df_top_10_percent['Sex'] == '女'].copy()


# 从前 10% 的数据中随机抽取 30 条
sample_30_from_top_10_percent = df_top_10_percent.sample(n=30, random_state=40)

# 查看结果
sample_30_from_top_10_percent = sample_30_from_top_10_percent.sort_values(by='index')

sample_30_from_top_10_percent

Unnamed: 0_level_0,index,NameHZ,Sex
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
167,10,歸懋儀,女
165,31,王倩,女
132,62,張襄,女
229,65,鄧瑜,女
177,76,翁端恩,女
5807,103,許禧身,女
922,188,陸費湘于,女
87,193,許燕珍,女
429,202,陳芸,女
80,203,高景芳,女


In [47]:
df_bottom_10_percent = df_top_name.tail(int(len(df) * 0.1)).copy()
df_bottom_10_percent = df_bottom_10_percent[df_bottom_10_percent['Sex'] == '女'].copy()

# 从前 10% 的数据中随机抽取 30 条
sample_30_from_bottom_10_percent = df_bottom_10_percent.sample(n=30, random_state=42)

# 查看结果
sample_30_from_bottom_10_percent = sample_30_from_bottom_10_percent.sort_values(by='index')

sample_30_from_bottom_10_percent

Unnamed: 0_level_0,index,NameHZ,Sex
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5520,6427,李慶英,女
5487,6431,蔣愛,女
5558,6477,寇白,女
5526,6492,阮月卿,女
5533,6498,周祐,女
5536,6500,范隆坤,女
5539,6503,朱淑姬,女
5544,6508,章韻先,女
4658,6554,紫微王夫人,女
2744,6559,湘妃,女


In [48]:
df_10_percent = df_top_name.copy()
df_10_percent = df_10_percent[df_10_percent['Sex'] == '女'].copy()

# 从前 10% 的数据中随机抽取 30 条
sample_30_from_10_percent = df_10_percent.sample(n=30,random_state=42)

# 查看结果
sample_30_from_10_percent = sample_30_from_10_percent.sort_values(by='index')

sample_30_from_10_percent

Unnamed: 0_level_0,index,NameHZ,Sex
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5936,108,王采蘋,女
235,132,錢孟鈿,女
80,203,高景芳,女
1745,406,張淑蓮,女
249,527,李道清,女
929,572,潘煥吉,女
2962,657,景翩翩,女
6832,2383,王玲,女
7502,2476,潘本溫,女
2200,3023,鄭奎妻,女


In [None]:
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
import requests

app = dash.Dash(__name__,suppress_callback_exceptions=True)

app.layout = html.Div([
    html.Div([
        dcc.Graph(id='work-distribution-chart', style={'height': '95vh', 'width': '100vw'}),
        html.Div(id='work-name-output')
    ], style={'height': '100vh'}),
    html.Div([
        dcc.Graph(id='poet-distribution-chart', style={'height': '95vh', 'width': '100vw'}),
        html.Div(id='poet-info-output')
    ], style={'height': '100vh'})
])

ji_log_reset = ji_log.reset_index()
poet_log_reset = poet_log.reset_index()

jidata = ji_log['totalWeight'].value_counts(sort=True,ascending=True).reset_index().sort_values(by='totalWeight', ascending=False)
poetdata = poet_log['totalWeight'].value_counts(sort=True,ascending=True).reset_index().sort_values(by='totalWeight', ascending=False)


# API 调用函数
def fetch_work_from_api(id):
    response = requests.get(f"http://localhost:8000/work/{id}")
    print(response)
    if response.status_code == 200 and len(response.json())>0:
        return str(response.json()[0]['workID']) + str(response.json()[0]['TitleHZ'])
    else:
        return f"Work{id} not found"
    
def fetch_poet_from_api(id):
    response = requests.get(f"http://localhost:8000/poet/{id}")
    print(response)
    if response.status_code == 200 and len(response.json())>0:
        return str(response.json()[0]['poetID']) + str(response.json()[0]['NameHZ'])
    else:
        return f"Poet{id} not found"

# 更新显示的数据
@app.callback(
    Output('work-distribution-chart', 'figure'),
    Output('work-name-output', 'children'),
    Input('work-distribution-chart', 'hoverData')
)
def update_graph_and_display_name(hoverData):
    # 绘制 Total 值的分布
    fig = px.line(jidata, x='totalWeight', y='count', text='count', title="Distribution of work Total Values")
   
    work_name = "Hover over a bar to see the work's name."
    if hoverData:
        total_value = float(hoverData['points'][0]['x'])
        print(total_value)
        ids = ji_log_reset[ji_log_reset['totalWeight'] == total_value]['workID'].tolist()
        work_names = "｜ ".join([fetch_work_from_api(id) for id in ids])
        
        work_name = f"work: {work_names}"
        fig.update_layout(annotations=[dict(x=total_value, y=hoverData['points'][0]['y'],
                                                text=work_names, showarrow=True, arrowhead=1)])
    return fig, work_name

@app.callback(
    Output('poet-distribution-chart', 'figure'),
    Output('poet-info-output', 'children'),
    Input('poet-distribution-chart', 'hoverData')
)

def update_poet_graph(hoverData):
    fig = px.line(poetdata, x='totalWeight', y='count', text='count', title="Distribution of Poet Weights")
    
    poet_name = "Hover over a bar to see poet details."
    if hoverData:
        total_value = float(hoverData['points'][0]['x'])
        ids = poet_log_reset[poet_log_reset['totalWeight'] == total_value]['poetID'].tolist()
        poet_names = "｜".join([fetch_poet_from_api(id) for id in ids])
        
        poet_name = f"Poet: {poet_names}"
        fig.update_layout(annotations=[dict(x=total_value, y=hoverData['points'][0]['y'],
                                                text=poet_names, showarrow=True, arrowhead=1)])
    return fig, poet_name

# 运行应用
if __name__ == '__main__':
    app.run_server(debug=True)

In [136]:
poetregionSQL = '''
SELECT poet.*, poetregionlinks.regionID
FROM poet
LEFT JOIN poetregionlinks ON poet.poetID = poetregionlinks.poetID
'''
poetregion = duckdb.query(poetregionSQL).df()
poetregion

Unnamed: 0,poetID,NameHZ,NamePY,HaoHZ,HaoPY,ZiHZ,ZiPY,OtherZiHaoHZ,OtherZiHaoPY,MaritalStatus,...,StartYear,EndYear,EthnicGroup,MainWorks,ispoet,HuWenKai,zhuLu,xuZuoZhe,baZuoZhe,regionID
0,5,甘立媃,Gan Lirou,,,如玉,Ruyu,,,正室﹐寡婦,...,1743,1819,漢,"《咏雪樓稿 : 5卷, 卷首1卷, 附1卷》",1,267,江西通志,宋鎔﹐王若閎序﹐甘立媃自序﹐劉彬士﹑顧皋作墓志詺﹐茹棻作像贊,徐心田跋,31.0
1,6,劉慧娟,Liu Huijuan,幻花女史,Huanhuanüshi,湘舲,Xiangling,,,正室﹐寡婦,...,1830,1880,漢,《曇花閣詩鈔四卷》,1,719,廣東女子藝文考,戴鴻慈﹐梁煦南序﹐劉慧娟自序,,189.0
2,8,馮思慧,Feng Sihui,,,睿之,Ruizhi,駱思慧,Luo Sihui,正室,...,1748,1774,漢,《繡餘吟六卷附錄一卷》,1,654,山西通志﹐正始續集﹐明媛詩話﹐擷芳集,劉秉恬序,,50.0
3,11,屠鏡心,Tu Jingxin,掃花主人,Saohuazhuren,,,,,正室,...,1796,1860,漢,《玩月軒詩草》《爨餘吟二卷》,1,627,清代閨閣詩人徵略﹐吳氏小殘卷齋數目,張之縉﹐文廉﹐耀曾序,"任珮瑛,萬秉,任凱,任治,任謙吉跋",140.0
4,12,鄭蘭孫,Zheng Lansun,蘅洲,Hengzhou,娛清,Yuqing,,,正室,...,1814,1861,漢,《蓮因室詩集二卷詞集一卷》﹐《都梁香閣詩詞集》,1,744,杭州府志﹐小檀欒室彙刻百家閨秀詞﹐小黛軒論詩詩﹐閨秀詞話,徐鴻謨﹐錢士杓序﹐鄭蘭孫自序﹐俞繡孫﹐孫因培﹐張煒﹐顧琇瑩﹐鍾維則﹐秦緗業﹐楊昌濬﹐孫念培﹐...,許樾身跋﹐徐琪附記,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7746,8419,陳希文,Chen Xiwen,,,,,,,正室,...,1769,1848,漢,,1,,,,,
7747,8424,潘宗秇,Pan Zongyi,,,小江,Xiaojiang,,,,...,1736,1850,漢,,1,,,,,
7748,8425,陳邦泰,Chen Bangtai,,,,,,,,...,1736,1850,漢,,1,,,,,
7749,8430,程雲,Cheng Yun,,,頑石,Wanshi,,,,...,1796,1861,漢,,0,,,,,


In [81]:
columns_to_display = [col for col in poetregion.columns if 'PY' not in col]

alt.renderers.enable('default')
alt.data_transformers.disable_max_rows()

poet_long = poetregion.reset_index().melt(id_vars='poetID', value_vars=columns_to_display)
poet_long['non_null'] = poet_long['value'].notna()
poet_long['value_or_null'] = poet_long['value'].fillna('Empty') 

chart = alt.Chart(poet_long).mark_rect().encode(
    x=alt.X('variable:N', title='Column'),
    y=alt.Y('poetID:O', title='PoetID'),
    color=alt.Color('non_null:N', legend=None, scale=alt.Scale(domain=[True, False], range=['green', 'red'])),
    tooltip=[alt.Tooltip('poetID:N', title='Poet ID'), alt.Tooltip('variable:N', title='Column'), alt.Tooltip('value_or_null:N', title='Value')]
).properties(
    width=2000,
    height=30000
).configure_axis(
    grid=False
)

# 显示图表
chart.save('poetwithregion.html')

In [82]:
columns_to_display = [col for col in work.columns if 'PY' not in col]

alt.renderers.enable('default')
alt.data_transformers.disable_max_rows()

work_long = work.reset_index().melt(id_vars='workID', value_vars=columns_to_display)
work_long['non_null'] = work_long['value'].notna()
work_long['value_or_null'] = work_long['value'].fillna('Empty') 

chart = alt.Chart(work_long).mark_rect().encode(
    x=alt.X('variable:N', title='Column'),
    y=alt.Y('workID:O', title='WorkID'),
    color=alt.Color('non_null:N', legend=None, scale=alt.Scale(domain=[True, False], range=['green', 'red'])),
    tooltip=[alt.Tooltip('workID:N', title='WorkID'), alt.Tooltip('variable:N', title='Column'), alt.Tooltip('value_or_null:N', title='Value')]
).properties(
    width=2000,
    height=5000
).configure_axis(
    grid=False
)

# 显示图表
chart.save('work.html')

In [83]:
columns_to_display = [col for col in poem.columns if 'PY' not in col]

alt.renderers.enable('default')
alt.data_transformers.disable_max_rows()

poem_long = poem.reset_index().melt(id_vars='poemID',value_vars=columns_to_display)
poem_long['non_null'] = poem_long['value'].notna()
poem_long['value_or_null'] = poem_long['value'].fillna('Empty') 

chart = alt.Chart(poem_long).mark_rect().encode(
    x=alt.X('variable:N', title='Column'),
    y=alt.Y('poemID:O', title='PoemID'),
    color=alt.Color('non_null:N', legend=None, scale=alt.Scale(domain=[True, False], range=['green', 'red'])),
    tooltip=[alt.Tooltip('poemID:N', title='PoemID'), alt.Tooltip('variable:N', title='Column'), alt.Tooltip('value_or_null:N', title='Value')]
).properties(
    width=2000,
    height=30000
).configure_axis(
    grid=False
)

# 显示图表
chart.save('poem.html')

# 品（poem）重要性：所在集（work）的重要性、作者（poet）的重要性

In [131]:
workWeight = 0.5
poetWeight = 0.5

poemWorkPoetSQL = '''
SELECT poem.poemID, workpoemlinks.workID,poempoetlinks.poetID
FROM poem
LEFT JOIN workpoemlinks ON poem.poemID = workpoemlinks.poemID
LEFT JOIN poempoetlinks ON poem.poemID = poempoetlinks.poemID
'''

poemWorkPoet = duckdb.query(poemWorkPoetSQL).df()
# poemWorkPoet.set_index('poemID', inplace=True)
poemWorkPoet

Unnamed: 0,poemID,workID,poetID
0,1176,38.0,213.0
1,778,38.0,922.0
2,1175,38.0,213.0
3,6,38.0,494.0
4,7,38.0,495.0
...,...,...,...
92602,3314,,5.0
92603,44503,,5583.0
92604,52397,,6038.0
92605,52407,,6038.0


In [132]:
poemWorkPoet['workID'] = poemWorkPoet['workID'].fillna(0).astype(int)
# poemWorkPoet['workID'] = poemWorkPoet['workID'].astype(str)
# ji_combine.index = ji_combine.index.astype(str)
weight_mapping = ji_min_max['totalWeight'].reindex(poemWorkPoet['workID']).reset_index(drop=True)

poemWorkPoet = pd.concat([poemWorkPoet.reset_index(drop=True), weight_mapping.rename('workImportance')], axis=1)


poemWorkPoet

Unnamed: 0,poemID,workID,poetID,workImportance
0,1176,38,213.0,26.618661
1,778,38,922.0,26.618661
2,1175,38,213.0,26.618661
3,6,38,494.0,26.618661
4,7,38,495.0,26.618661
...,...,...,...,...
92602,3314,0,5.0,
92603,44503,0,5583.0,
92604,52397,0,6038.0,
92605,52407,0,6038.0,


In [133]:
poemWorkPoet['poetID'] = poemWorkPoet['poetID'].fillna(0).astype(int)
# poemWorkPoet['workID'] = poemWorkPoet['workID'].astype(str)
# ji_combine.index = ji_combine.index.astype(str)
weight_mapping = poet_log['totalWeight'].reindex(poemWorkPoet['poetID']).reset_index(drop=True)

poemWorkPoet = pd.concat([poemWorkPoet.reset_index(drop=True), weight_mapping.rename('poetImportance')], axis=1)


poemWorkPoet

Unnamed: 0,poemID,workID,poetID,workImportance,poetImportance
0,1176,38,213,26.618661,0.350102
1,778,38,922,26.618661,0.198336
2,1175,38,213,26.618661,0.350102
3,6,38,494,26.618661,0.031820
4,7,38,495,26.618661,0.031820
...,...,...,...,...,...
92602,3314,0,5,,0.373240
92603,44503,0,5583,,0.105060
92604,52397,0,6038,,0.120144
92605,52407,0,6038,,0.120144


In [134]:
workWeight = 0.5
poetWeight = 0.5

In [135]:
poetImportance = poemWorkPoet.copy()

poetImportance['totalWeight'] = workWeight * poetImportance['workImportance'] + poetWeight * poetImportance['poetImportance']

poetImportance

Unnamed: 0,poemID,workID,poetID,workImportance,poetImportance,totalWeight
0,1176,38,213,26.618661,0.350102,13.484382
1,778,38,922,26.618661,0.198336,13.408498
2,1175,38,213,26.618661,0.350102,13.484382
3,6,38,494,26.618661,0.031820,13.325241
4,7,38,495,26.618661,0.031820,13.325241
...,...,...,...,...,...,...
92602,3314,0,5,,0.373240,
92603,44503,0,5583,,0.105060,
92604,52397,0,6038,,0.120144,
92605,52407,0,6038,,0.120144,


In [250]:
import altair as alt
import pandas as pd

# alt.data_transformers.enable("vegafusion")

df = poetImportance.copy()
df['poemID'] = df.index

if 'poemID' not in poem.columns:
    poem['poemID'] = poem.index

# 定义一个函数，计算给定 workID 的缺失值数量
def count_missing_values(poem_id):
    # 指定不检查的列
    columns_to_exclude = ['TunePatternSubtitle', 'TunePatternSubtitlePY', 'Refs']  # 替换为不需要检查的实际列名
    columns_to_exclude += [col for col in poem.columns if 'PY' in col]
    
    columns_to_check = [col for col in poem.columns if col not in columns_to_exclude]
    print(len(columns_to_check))

    # 查找对应的 workID 行，并计算除去排除列后的缺失值数目
    poem_row = poem[poem['poemID'] == poem_id]
    
    if poem_row.empty:
        return len(columns_to_check)
    
    # 获取除去排除列的所有列
    
    
    return poem_row[columns_to_check].isnull().sum(axis=1).values[0]



# 计算缺失值数量，并将其添加到 df 中
df['missing_count'] = df['poemID'].apply(count_missing_values)



# 按 totalWeight 排序
df = df.sort_values(by='totalWeight', ascending=False)

df = df.reset_index(drop=True)  # 重置索引并丢弃原来的索引
df['poemID'] = df['poemID'].astype(str) + "_" + (df.index + 1).astype(str)

# 提取 missing_count 列，用于创建柱状图
missing_counts = df[['poemID', 'missing_count']][:500]


# 移除 totalWeight 列，确保不在图中显示
df = df.drop(columns=['totalWeight','poetID','workID'])[:500]

# 使用 melt 转换数据格式
df_melted = df.melt(id_vars=['poemID'], var_name='category', value_name='value')

# 创建堆叠的 Streamgraph 图层
streamgraph = alt.Chart(df_melted).mark_area().encode(
    x=alt.X('poemID:O', sort=None, axis=alt.Axis(title='Poem ID', labelAngle=45, labelFontSize=10)),
    y=alt.Y('value:Q', stack='zero', axis=alt.Axis(title='Value')),  # 正常堆叠
    color=alt.Color('category:N', legend=alt.Legend(title="Category", orient="left")),
    tooltip=['poemID:O', 'category:N', 'value:Q']  # 添加 tooltip 显示详细信息
).properties(
    width=3600,
    height=300
)

# 创建柱状图图层显示 missing_count
missing_bar = alt.Chart(missing_counts).mark_bar(color='red', opacity=0.6).encode(
    x=alt.X('poemID:O', sort=None, axis=alt.Axis(title='Poem ID', labelAngle=45, labelFontSize=10, labels=True)),
    y=alt.Y('missing_count:Q', axis=alt.Axis(title='Missing Count')),
    tooltip=['poemID:O', 'missing_count:Q']
)

# 将两层图表合并
chart = alt.layer(streamgraph, missing_bar).resolve_scale(
    y='independent'  # 独立的 Y 轴，以便展示两个不同的度量
).properties(
    title="Streamgraph with Missing Count across Poem IDs (Sorted by Total Weight)"
)

chart

8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8


In [256]:
poemcount = workpoemlinks.groupby('workID')['poemID'].count()
poemcount

workID
9      986
10     404
11     137
12     126
13     252
      ... 
478    206
479    310
480     82
481    885
482    162
Name: poemID, Length: 432, dtype: int64