In [1]:
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc
import pyarrow.dataset as ds
import pandas as pd
import duckdb
import numpy as np
import altair as alt
import json
from urllib.request import urlopen
import time
import os
import plotly.express as px

In [2]:
# Folder with the WomenWriting CSV exports, relative to this notebook.
WOMEN_WRITING_DIR = '../../CCGIV/datasets/WomenWriting'


def _read_table(table_name):
    """Read `<table_name>.csv` from WOMEN_WRITING_DIR into a DataFrame."""
    return pd.read_csv(f'{WOMEN_WRITING_DIR}/{table_name}.csv')


# One DataFrame per CSV; the SQL cells below refer to these names.
poet = _read_table('poet')
poem = _read_table('poem')
poempoetlinks = _read_table('poempoetlinks')
subwork = _read_table('subwork')
subworkpoemlinks = _read_table('subworkpoemlinks')
subworkpoetlinks = _read_table('subworkpoetlinks')
work = _read_table('work')
workpoemlinks = _read_table('workpoemlinks')
workpoetlinks = _read_table('workpoetlinks')

# Expose every frame as a view on a dedicated in-memory DuckDB connection.
# register() hands back the connection, so the calls can be chained and the
# cell still evaluates to the connection object.
con = duckdb.connect()
(
    con.register('poet', poet)
    .register('poem', poem)
    .register('poempoetlinks', poempoetlinks)
    .register('subwork', subwork)
    .register('subworkpoemlinks', subworkpoemlinks)
    .register('subworkpoetlinks', subworkpoetlinks)
    .register('work', work)
    .register('workpoemlinks', workpoemlinks)
    .register('workpoetlinks', workpoetlinks)
)

<duckdb.duckdb.DuckDBPyConnection at 0x13068ac30>

# 1. 集（work）的 Importance：题辞作者数量、序作者数量、跋作者数量、收录的品（poem）的数量

## 1.1 题辞作者、序作者、跋作者不去重（跨角色不去重；每种角色内按 poetID 去重计数）

### 1.1.1 题辞作者数量

In [13]:
# Per work: number of distinct poets linked with the role '題辭',
# ordered from the largest count down.
ticiSQL = '''
SELECT workID,count(DISTINCT poetID) as ticicount
FROM workpoetlinks 
WHERE role IN ('題辭')
Group By workID
ORDER BY ticicount DESC
'''
# Materialise the result and key it by workID in one chained expression.
tici = duckdb.query(ticiSQL).df().set_index('workID')

tici

Unnamed: 0_level_0,ticicount
workID,Unnamed: 1_level_1
125,119
223,96
218,73
352,49
112,43
...,...
455,1
21,1
400,1
157,1


### 1.1.2 序作者数量

In [14]:
# Per work: number of distinct poets linked with the role '序作者',
# ordered from the largest count down.
xuSQL = '''
SELECT workID,count(DISTINCT poetID) as xucount
FROM workpoetlinks 
WHERE role IN ('序作者')
Group By workID
ORDER BY xucount DESC
'''
# Materialise the result and key it by workID in one chained expression.
xu = duckdb.query(xuSQL).df().set_index('workID')
xu

Unnamed: 0_level_0,xucount
workID,Unnamed: 1_level_1
158,17
167,14
57,9
118,9
283,8
...,...
108,1
375,1
190,1
471,1


### 1.1.3 跋作者数量

In [ ]:
# Per work: number of distinct poets linked with the role '跋作者',
# ordered from the largest count down.
baSQL = '''
SELECT workID,count(DISTINCT poetID) as bacount
FROM workpoetlinks 
WHERE role IN ('跋作者')
Group By workID
ORDER BY bacount DESC
'''
# Materialise the result and key it by workID in one chained expression.
ba = duckdb.query(baSQL).df().set_index('workID')
ba

### 1.1.4 题辞、序、跋汇总

In [16]:
from functools import reduce

# Fold the three per-role count tables into one frame with successive outer
# merges on workID; a work missing from a table gets 0 for that count.
TiciXuBa = reduce(
    lambda merged, counts: merged.merge(counts, on='workID', how='outer'),
    [tici, xu, ba],
).fillna(0)
TiciXuBa

Unnamed: 0_level_0,ticicount,xucount,bacount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9,0.0,3.0,1.0
10,2.0,2.0,0.0
11,0.0,3.0,0.0
12,1.0,5.0,1.0
13,0.0,1.0,0.0
...,...,...,...
476,1.0,4.0,0.0
477,14.0,5.0,1.0
479,1.0,4.0,1.0
480,3.0,5.0,0.0


## 1.2 题辞作者、序作者、跋作者去重

问题：在同一个集里有多重角色的作者，应该计数到哪种角色中？比如既是题辞作者又是序作者，算到哪类？

## 1.3 集收录的作品数

In [17]:
# Per work: number of workpoemlinks rows with a non-null poemID
# (count() here is not DISTINCT), ordered from the largest count down.
includedWorkSQL = '''
SELECT workID,count(poemID) as includedcount
FROM workpoemlinks 
Group By workID
ORDER BY includedcount DESC
'''
# Materialise the result and key it by workID in one chained expression.
includedWork = duckdb.query(includedWorkSQL).df().set_index('workID')

includedWork

Unnamed: 0_level_0,includedcount
workID,Unnamed: 1_level_1
61,5075
120,3771
38,2478
63,2145
88,2052
...,...
341,9
358,7
225,7
373,3


## 1.4 集的题辞作者数、序作者数、跋作者数、收录作品数汇总

In [18]:
# Outer-merge the three role-count tables and the included-poem counts on
# workID; counts that are absent for a work become 0.
jiRawData = reduce(
    lambda merged, counts: merged.merge(counts, on='workID', how='outer'),
    [tici, xu, ba, includedWork],
).fillna(0)
jiRawData

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,0.0,3.0,1.0,986.0
10,2.0,2.0,0.0,404.0
11,0.0,3.0,0.0,137.0
12,1.0,5.0,1.0,126.0
13,0.0,1.0,0.0,252.0
...,...,...,...,...
478,0.0,0.0,0.0,206.0
479,1.0,4.0,1.0,310.0
480,3.0,5.0,0.0,82.0
481,0.0,0.0,0.0,885.0


## 1.5 归一化

### 1.5.1 Min-Max归一化

In [19]:
def min_max_normalize(series):
    """Linearly rescale `series` so its minimum maps to 0 and its maximum to 100.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        Rescaled values with the same index. When every value is identical
        the range is zero; the result is then all zeros instead of the NaN
        produced by a 0/0 division.
    """
    lowest = series.min()
    span = series.max() - lowest
    if span == 0:
        # Constant column: every value sits at the minimum, i.e. at 0.
        return (series - lowest).astype(float)
    return (series - lowest) / span * 100

In [27]:
ji_min_max_normalize = jiRawData.copy()

# Only columns whose dtype is float64 or int64 are rescaled.
numeric_columns = [
    name for name in ji_min_max_normalize.columns
    if ji_min_max_normalize[name].dtype in ['float64', 'int64']
]
for name in numeric_columns:
    ji_min_max_normalize[name] = min_max_normalize(ji_min_max_normalize[name])
ji_min_max_normalize

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,0.000000,17.647059,20.0,19.428571
10,1.680672,11.764706,0.0,7.960591
11,0.000000,17.647059,0.0,2.699507
12,0.840336,29.411765,20.0,2.482759
13,0.000000,5.882353,0.0,4.965517
...,...,...,...,...
478,0.000000,0.000000,0.0,4.059113
479,0.840336,23.529412,20.0,6.108374
480,2.521008,29.411765,0.0,1.615764
481,0.000000,0.000000,0.0,17.438424


### 1.5.2 对数归一化

In [28]:
ji_log_normalize = jiRawData.copy()

# Only columns whose dtype is float64 or int64 are transformed.
numeric_columns = [
    name for name in ji_log_normalize.columns
    if ji_log_normalize[name].dtype in ['float64', 'int64']
]
for name in numeric_columns:
    # Add 1 before taking the log so that zero counts do not hit log(0).
    ji_log_normalize[name] = np.log(ji_log_normalize[name] + 1)

ji_log_normalize

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,0.000000,1.386294,0.693147,6.894670
10,1.098612,1.098612,0.000000,6.003887
11,0.000000,1.386294,0.000000,4.927254
12,0.693147,1.791759,0.693147,4.844187
13,0.000000,0.693147,0.000000,5.533389
...,...,...,...,...
478,0.000000,0.000000,0.000000,5.332719
479,0.693147,1.609438,0.693147,5.739793
480,1.386294,1.791759,0.000000,4.418841
481,0.000000,0.000000,0.000000,6.786717


## 1.6 重要性计算

### 1.6.1 权重设置

In [69]:
# Weights applied to each normalised work column when building totalWeight:
# the three author-role counts share one value, the included-poem count
# gets the remainder.
ticiweight, xuweight, baweight = 0.3, 0.3, 0.3
includedweight = 0.1

### 1.6.2 Min-Max归一化的结果计算

In [70]:
# Copy of the min-max scaled table with the weighted sum added as 'totalWeight'.
ji_min_max = ji_min_max_normalize.assign(
    totalWeight=lambda scaled: (
        xuweight * scaled['xucount']
        + baweight * scaled['bacount']
        + ticiweight * scaled['ticicount']
        + includedweight * scaled['includedcount']
    )
)
ji_min_max

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,totalWeight
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,0.000000,17.647059,20.0,19.428571,13.236975
10,1.680672,11.764706,0.0,7.960591,4.829673
11,0.000000,17.647059,0.0,2.699507,5.564068
12,0.840336,29.411765,20.0,2.482759,15.323906
13,0.000000,5.882353,0.0,4.965517,2.261258
...,...,...,...,...,...
478,0.000000,0.000000,0.0,4.059113,0.405911
479,0.840336,23.529412,20.0,6.108374,13.921762
480,2.521008,29.411765,0.0,1.615764,9.741408
481,0.000000,0.000000,0.0,17.438424,1.743842


### 1.6.3 对数归一化的结果计算

In [71]:
# Copy of the log-scaled table with the weighted sum added as 'totalWeight'.
ji_log = ji_log_normalize.assign(
    totalWeight=lambda scaled: (
        xuweight * scaled['xucount']
        + baweight * scaled['bacount']
        + ticiweight * scaled['ticicount']
        + includedweight * scaled['includedcount']
    )
)
ji_log

Unnamed: 0_level_0,ticicount,xucount,bacount,includedcount,totalWeight
workID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,0.000000,1.386294,0.693147,6.894670,1.313299
10,1.098612,1.098612,0.000000,6.003887,1.259556
11,0.000000,1.386294,0.000000,4.927254,0.908614
12,0.693147,1.791759,0.693147,4.844187,1.437835
13,0.000000,0.693147,0.000000,5.533389,0.761283
...,...,...,...,...,...
478,0.000000,0.000000,0.000000,5.332719,0.533272
479,0.693147,1.609438,0.693147,5.739793,1.472699
480,1.386294,1.791759,0.000000,4.418841,1.395300
481,0.000000,0.000000,0.000000,6.786717,0.678672


# 2. 作者（poet）的Importance：参与制作书的次数（包括被收录和参与编辑）、写像赞的次数、被写像赞的次数、收到唱和诗及书信的次数、被讨论的次数

## 2.1 参与制作书的次数

In [39]:
# Per poet: number of distinct works the poet is linked to in workpoetlinks
# (any role), ordered from the largest count down.
PoetParticipateWorkSQL = '''SELECT poetID, COUNT(DISTINCT workID) AS participate_count
    FROM workpoetlinks
    GROUP BY poetID
    ORDER BY participate_count DESC'''

# Materialise the result and key it by poetID in one chained expression.
PoetParticipateWork = duckdb.query(PoetParticipateWorkSQL).df().set_index('poetID')

PoetParticipateWork

Unnamed: 0_level_0,participate_count
poetID,Unnamed: 1_level_1
263,13
308,9
199,9
92,7
167,7
...,...
6155,1
6138,1
6063,1
6198,1


## 2.2 写像赞的次数

In [44]:
# Per poet: number of distinct poems of genre '文﹕像贊' that poempoetlinks
# ties to the poet. Unlike the other queries, rows come back in ascending
# count order.
xiangzanauthorSQL = '''
SELECT poempoetlinks.poetID,COUNT(DISTINCT poem.poemID) AS xiangzanauthor_count
FROM poem,poempoetlinks
WHERE poem.GenreHZ='文﹕像贊' AND poem.poemID = poempoetlinks.poemID
GROUP BY poempoetlinks.poetID
ORDER BY xiangzanauthor_count
'''
# Materialise the result and key it by poetID in one chained expression.
xiangzanauthor = duckdb.query(xiangzanauthorSQL).df().set_index('poetID')
xiangzanauthor

Unnamed: 0_level_0,xiangzanauthor_count
poetID,Unnamed: 1_level_1
244,1
7006,1
3813,1
6482,1
6494,1
7279,1
4424,1
6430,1
1977,1
42,2


## 2.3 被写像赞的次数

In [52]:
# Per subject poet: number of distinct '文﹕像贊' poems whose poetassubjectID
# points at that poet. Rows with poetassubjectID = 0 are skipped
# (presumably 0 marks "no subject" — confirm against the dataset).
bexiangzanSQL = '''
SELECT poetassubjectID AS poetID,COUNT(DISTINCT poemID) AS bexiangzansubjectCount
FROM poem
WHERE poem.GenreHZ='文﹕像贊' AND poetassubjectID !=0
GROUP BY poetassubjectID
'''
# Materialise the result and key it by poetID in one chained expression.
bexiangzan = duckdb.query(bexiangzanSQL).df().set_index('poetID')

bexiangzan

Unnamed: 0_level_0,bexiangzansubjectCount
poetID,Unnamed: 1_level_1
4939,1
5,1


## 2.4 被讨论的次数

In [53]:
# Per subject poet: number of distinct poems, of any genre other than
# '文﹕像贊', whose poetassubjectID points at that poet (0 is skipped).
# NOTE(review): `GenreHZ != ...` is not true for NULL genres, so poems with
# a missing GenreHZ are left out — confirm that is intended.
discussedSQL = '''
SELECT poetassubjectID AS poetID, COUNT(DISTINCT poemID) AS discussedCount
FROM poem
WHERE GenreHZ!='文﹕像贊' AND poetassubjectID != 0
GROUP BY poetassubjectID
'''
# Materialise the result and key it by poetID in one chained expression.
discussed = duckdb.query(discussedSQL).df().set_index('poetID')

discussed

Unnamed: 0_level_0,discussedCount
poetID,Unnamed: 1_level_1
2009,4
1380,14
2069,3
1392,2
3372,1
...,...
5894,1
7937,1
7969,1
7877,1


## 2.5 收到唱和书信的次数

In [56]:
# Per poet: number of distinct poems whose changheshuxinpoetID points at
# that poet; rows with changheshuxinpoetID = 0 are skipped.
changheshiSQL = '''
SELECT changheshuxinpoetID AS poetID, COUNT(DISTINCT poemID) AS changheshiCount
FROM poem
WHERE changheshuxinpoetID !=0
GROUP BY changheshuxinpoetID
'''
# Materialise the result and key it by poetID in one chained expression.
changheshi = duckdb.query(changheshiSQL).df().set_index('poetID')

changheshi

Unnamed: 0_level_0,changheshiCount
poetID,Unnamed: 1_level_1
1882,1
789,3
16,4
32,7
1380,55
...,...
1361,2
8427,1
8355,2
1097,1


## 2.6 初始数据汇总

In [59]:
# Outer-merge the five per-poet count tables on poetID; a poet absent from
# a table gets 0 for that count.
poetRawData = reduce(
    lambda merged, counts: merged.merge(counts, on='poetID', how='outer'),
    [PoetParticipateWork, xiangzanauthor, bexiangzan, discussed, changheshi],
).fillna(0)
poetRawData

Unnamed: 0_level_0,participate_count,xiangzanauthor_count,bexiangzansubjectCount,discussedCount,changheshiCount
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,1.0,0.0,1.0,38.0,0.0
6,1.0,0.0,0.0,0.0,0.0
8,1.0,0.0,0.0,2.0,0.0
11,1.0,0.0,0.0,13.0,0.0
12,2.0,0.0,0.0,28.0,0.0
...,...,...,...,...,...
8457,1.0,0.0,0.0,0.0,0.0
8458,1.0,0.0,0.0,0.0,0.0
8459,1.0,0.0,0.0,0.0,0.0
8460,1.0,0.0,0.0,0.0,0.0


## 2.7 归一化

### 2.7.1 Min-Max归一化

In [61]:
poet_min_max_normalize = poetRawData.copy()

# Only columns whose dtype is float64 or int64 are rescaled.
numeric_columns = [
    name for name in poet_min_max_normalize.columns
    if poet_min_max_normalize[name].dtype in ['float64', 'int64']
]
for name in numeric_columns:
    poet_min_max_normalize[name] = min_max_normalize(poet_min_max_normalize[name])
poet_min_max_normalize

Unnamed: 0_level_0,participate_count,xiangzanauthor_count,bexiangzansubjectCount,discussedCount,changheshiCount
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,7.692308,0.0,100.0,49.350649,0.0
6,7.692308,0.0,0.0,0.000000,0.0
8,7.692308,0.0,0.0,2.597403,0.0
11,7.692308,0.0,0.0,16.883117,0.0
12,15.384615,0.0,0.0,36.363636,0.0
...,...,...,...,...,...
8457,7.692308,0.0,0.0,0.000000,0.0
8458,7.692308,0.0,0.0,0.000000,0.0
8459,7.692308,0.0,0.0,0.000000,0.0
8460,7.692308,0.0,0.0,0.000000,0.0


### 2.7.2 对数归一化

In [62]:
poet_log_normalize = poetRawData.copy()

# Only columns whose dtype is float64 or int64 are transformed.
numeric_columns = [
    name for name in poet_log_normalize.columns
    if poet_log_normalize[name].dtype in ['float64', 'int64']
]
for name in numeric_columns:
    # Add 1 before taking the log so that zero counts do not hit log(0).
    poet_log_normalize[name] = np.log(poet_log_normalize[name] + 1)

poet_log_normalize

Unnamed: 0_level_0,participate_count,xiangzanauthor_count,bexiangzansubjectCount,discussedCount,changheshiCount
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,0.693147,0.0,0.693147,3.663562,0.0
6,0.693147,0.0,0.000000,0.000000,0.0
8,0.693147,0.0,0.000000,1.098612,0.0
11,0.693147,0.0,0.000000,2.639057,0.0
12,1.098612,0.0,0.000000,3.367296,0.0
...,...,...,...,...,...
8457,0.693147,0.0,0.000000,0.000000,0.0
8458,0.693147,0.0,0.000000,0.000000,0.0
8459,0.693147,0.0,0.000000,0.000000,0.0
8460,0.693147,0.0,0.000000,0.000000,0.0


## 2.8 重要性计算

### 2.8.1 权重设置
poet4258姚倚雲 写了很多唱和诗给丈夫 poet7909范當世，导致范當世收到唱和书信的次数很高，需要降低该项权重

In [72]:
# Weights applied to each normalised poet column when building totalWeight.
# The two portrait-encomium signals weigh most; participation and received
# exchange poems/letters weigh least.
writeXZWeight, inXZWeight = 0.3, 0.3
bediscussedWeight = 0.2
participateWeight, changheWeight = 0.1, 0.1

### 2.8.2 Min-Max归一化结果计算

In [73]:
# Copy of the min-max scaled table with the weighted sum added as 'totalWeight'.
poet_min_max = poet_min_max_normalize.assign(
    totalWeight=lambda scaled: (
        participateWeight * scaled['participate_count']
        + writeXZWeight * scaled['xiangzanauthor_count']
        + inXZWeight * scaled['bexiangzansubjectCount']
        + bediscussedWeight * scaled['discussedCount']
        + changheWeight * scaled['changheshiCount']
    )
)

poet_min_max

Unnamed: 0_level_0,participate_count,xiangzanauthor_count,bexiangzansubjectCount,discussedCount,changheshiCount,totalWeight
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,7.692308,0.0,100.0,49.350649,0.0,40.639361
6,7.692308,0.0,0.0,0.000000,0.0,0.769231
8,7.692308,0.0,0.0,2.597403,0.0,1.288711
11,7.692308,0.0,0.0,16.883117,0.0,4.145854
12,15.384615,0.0,0.0,36.363636,0.0,8.811189
...,...,...,...,...,...,...
8457,7.692308,0.0,0.0,0.000000,0.0,0.769231
8458,7.692308,0.0,0.0,0.000000,0.0,0.769231
8459,7.692308,0.0,0.0,0.000000,0.0,0.769231
8460,7.692308,0.0,0.0,0.000000,0.0,0.769231


### 2.8.3 对数归一化结果计算

In [74]:
# Copy of the log-scaled table with the weighted sum added as 'totalWeight'.
poet_log = poet_log_normalize.assign(
    totalWeight=lambda scaled: (
        participateWeight * scaled['participate_count']
        + writeXZWeight * scaled['xiangzanauthor_count']
        + inXZWeight * scaled['bexiangzansubjectCount']
        + bediscussedWeight * scaled['discussedCount']
        + changheWeight * scaled['changheshiCount']
    )
)

poet_log

Unnamed: 0_level_0,participate_count,xiangzanauthor_count,bexiangzansubjectCount,discussedCount,changheshiCount,totalWeight
poetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,0.693147,0.0,0.693147,3.663562,0.0,1.009971
6,0.693147,0.0,0.000000,0.000000,0.0,0.069315
8,0.693147,0.0,0.000000,1.098612,0.0,0.289037
11,0.693147,0.0,0.000000,2.639057,0.0,0.597126
12,1.098612,0.0,0.000000,3.367296,0.0,0.783320
...,...,...,...,...,...,...
8457,0.693147,0.0,0.000000,0.000000,0.0,0.069315
8458,0.693147,0.0,0.000000,0.000000,0.0,0.069315
8459,0.693147,0.0,0.000000,0.000000,0.0,0.069315
8460,0.693147,0.0,0.000000,0.000000,0.0,0.069315


In [81]:
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
import requests

app = dash.Dash(__name__,suppress_callback_exceptions=True)


def _chart_section(graph_id, output_id):
    """Build one full-viewport section: a graph with a text container under it."""
    return html.Div(
        [
            dcc.Graph(id=graph_id, style={'height': '95vh', 'width': '100vw'}),
            html.Div(id=output_id),
        ],
        style={'height': '100vh'},
    )


# Two stacked sections: works first, poets second.
app.layout = html.Div([
    _chart_section('work-distribution-chart', 'work-name-output'),
    _chart_section('poet-distribution-chart', 'poet-info-output'),
])

def _weight_distribution(scores):
    """Count how many rows of `scores` share each exact 'totalWeight' value.

    Returns a DataFrame with columns 'totalWeight' and 'count', sorted by
    'totalWeight' in descending order. The column names are set explicitly
    because the names produced by `value_counts().reset_index()` differ
    between pandas 1.x and 2.x, and the charts read the 'count' column.
    """
    return (
        scores['totalWeight']
        .value_counts(sort=True, ascending=True)
        .rename_axis('totalWeight')
        .reset_index(name='count')
        .sort_values(by='totalWeight', ascending=False)
    )


# Flat copies (ID as a regular column) used by the callbacks to look up IDs.
ji_log_reset = ji_log.reset_index()
poet_log_reset = poet_log.reset_index()

# Frequency tables plotted by the two charts.
jidata = _weight_distribution(ji_log)
poetdata = _weight_distribution(poet_log)


def normalize_data_to_100(series):
    """Linearly rescale `series` so its minimum maps to 0 and its maximum to 100.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        Rescaled values with the same index. When every value is identical
        the range is zero; the result is then all zeros instead of the NaN
        produced by a 0/0 division.
    """
    lowest = series.min()
    span = series.max() - lowest
    if span == 0:
        # Constant input: every value sits at the minimum, i.e. at 0.
        return (series - lowest).astype(float)
    return (series - lowest) / span * 100


# API helper functions
def fetch_work_from_api(id, timeout=5.0):
    """Look up a work by ID on the local API and return a display label.

    Parameters
    ----------
    id : work identifier, interpolated into the request URL.
    timeout : seconds to wait for the local service before giving up.

    Returns
    -------
    str
        The first record's 'workID' and 'TitleHZ' concatenated, or
        "Work<id> not found" when the service is unreachable, answers with
        a non-200 status, returns a non-JSON body, or returns no records.
    """
    not_found = f"Work{id} not found"
    try:
        response = requests.get(f"http://localhost:8000/work/{id}", timeout=timeout)
    except requests.RequestException:
        # Service down or timed out: degrade to the fallback label rather
        # than raising inside a Dash callback.
        return not_found
    if response.status_code != 200:
        return not_found
    try:
        # Parse the body once and reuse it.
        records = response.json()
    except ValueError:
        return not_found
    if len(records) > 0:
        first = records[0]
        return str(first['workID']) + str(first['TitleHZ'])
    return not_found
    
def fetch_poet_from_api(id, timeout=5.0):
    """Look up a poet by ID on the local API and return a display label.

    Parameters
    ----------
    id : poet identifier, interpolated into the request URL.
    timeout : seconds to wait for the local service before giving up.

    Returns
    -------
    str
        The first record's 'poetID' and 'NameHZ' concatenated, or
        "Poet<id> not found" when the service is unreachable, answers with
        a non-200 status, returns a non-JSON body, or returns no records.
    """
    not_found = f"Poet{id} not found"
    try:
        response = requests.get(f"http://localhost:8000/poet/{id}", timeout=timeout)
    except requests.RequestException:
        # Service down or timed out: degrade to the fallback label rather
        # than raising inside a Dash callback.
        return not_found
    if response.status_code != 200:
        return not_found
    try:
        # Parse the body once and reuse it.
        records = response.json()
    except ValueError:
        return not_found
    if len(records) > 0:
        first = records[0]
        return str(first['poetID']) + str(first['NameHZ'])
    return not_found

# Draw the work chart and keep the label under it in sync with the hover
@app.callback(
    Output('work-distribution-chart', 'figure'),
    Output('work-name-output', 'children'),
    Input('work-distribution-chart', 'hoverData')
)
def update_graph_and_display_name(hoverData):
    """Render the work totalWeight distribution and label the hovered point.

    Without hover data the figure is built and returned with a placeholder
    label. With hover data only the label changes: the figure output is
    `dash.no_update`, so the chart is not rebuilt and re-sent to the browser
    on every hover event.
    """
    if not hoverData:
        # Plot the distribution of totalWeight values
        fig = px.line(jidata, x='totalWeight', y='count', text='count', title="Distribution of work Total Values")
        return fig, "Hover over a bar to see the work's name."

    total_value = float(hoverData['points'][0]['x'])
    # The hovered x value has been through the browser and back, so match it
    # with a tight absolute tolerance rather than exact float equality.
    is_hovered = np.isclose(ji_log_reset['totalWeight'], total_value, rtol=0.0, atol=1e-9)
    ids = ji_log_reset.loc[is_hovered, 'workID'].tolist()
    work_names = "｜ ".join([fetch_work_from_api(work_id) for work_id in ids])

    return dash.no_update, f"work: {work_names}"

@app.callback(
    Output('poet-distribution-chart', 'figure'),
    Output('poet-info-output', 'children'),
    Input('poet-distribution-chart', 'hoverData')
)
def update_poet_graph(hoverData):
    """Render the poet totalWeight distribution and label the hovered point.

    Without hover data the figure is built and returned with a placeholder
    label. With hover data only the label changes: the figure output is
    `dash.no_update`, so the chart is not rebuilt and re-sent to the browser
    on every hover event.
    """
    if not hoverData:
        fig = px.line(poetdata, x='totalWeight', y='count', text='count', title="Distribution of Poet Weights")
        return fig, "Hover over a bar to see poet details."

    total_value = float(hoverData['points'][0]['x'])
    # The hovered x value has been through the browser and back, so match it
    # with a tight absolute tolerance rather than exact float equality.
    is_hovered = np.isclose(poet_log_reset['totalWeight'], total_value, rtol=0.0, atol=1e-9)
    ids = poet_log_reset.loc[is_hovered, 'poetID'].tolist()
    poet_names = "｜".join([fetch_poet_from_api(poet_id) for poet_id in ids])

    return dash.no_update, f"Poet: {poet_names}"

# Start the Dash development server
if __name__ == '__main__':
    # Newer Dash releases expose `run` and have retired `run_server`;
    # fall back to `run_server` only where `run` does not exist.
    if hasattr(app, 'run'):
        app.run(debug=True)
    else:
        app.run_server(debug=True)