In [30]:
import pandas as pd
import numpy as np

def customized_pagerank(relations_df, max_iterations=100, alpha=0.85, tolerance=1e-6,
                        trusted_docs=None, include_negative=False):
    """
    Propagate trust scores over a signed, weighted document-relation graph.

    Every document starts at a neutral score of 0.5 (1.0 for trusted seeds).
    On each round a document's score becomes a blend of its prior score and
    the weighted average score of the documents that point at it:

        s = (1 - alpha) * s0 + alpha * ((P_pos - N_neg) + 1) / 2

    Parameters
    ----------
    relations_df : pandas.DataFrame
        Must provide the columns 'doc_id1' (source), 'doc_id2' (target) and
        'normalized_relation' (signed edge weight). Rows with a zero (or NaN)
        weight contribute nothing.
    max_iterations : int
        Upper bound on the number of update rounds.
    alpha : float
        Damping factor: weight given to the propagated influence versus the
        prior score s0.
    tolerance : float
        Convergence threshold on the L1 norm of the change between rounds.
    trusted_docs : iterable or None
        Document ids whose prior (and initial) score is 1.0 instead of 0.5.
        Ids that do not occur in relations_df are ignored. None (the default)
        means no trusted seeds.
    include_negative : bool
        When False (the default) negative relations are ignored and only
        positive influence is propagated. When True, the weighted average
        score of negatively-related sources is subtracted from the positive
        influence.

    Returns
    -------
    dict
        Maps each document id to its final trust score.
    """
    # Collect every document id that appears on either end of a relation.
    document_ids = sorted(set(relations_df['doc_id1']).union(relations_df['doc_id2']))
    doc_index = {doc_id: idx for idx, doc_id in enumerate(document_ids)}
    N = len(document_ids)

    # Prior trust vector s0: neutral everywhere, fully trusted for the seeds.
    s0 = np.full(N, 0.5)
    if trusted_docs is not None:
        for doc_id in trusted_docs:
            idx = doc_index.get(doc_id)
            if idx is not None:
                s0[idx] = 1.0
    s = s0.copy()

    # Positive and negative adjacency matrices; entry [i, j] is the weight of
    # the relation from document i to document j (stored as a magnitude).
    W_plus = np.zeros((N, N))
    W_minus = np.zeros((N, N))
    edges = zip(relations_df['doc_id1'],
                relations_df['doc_id2'],
                relations_df['normalized_relation'])
    for src, dst, weight in edges:
        i, j = doc_index[src], doc_index[dst]
        if weight > 0:
            W_plus[i, j] += weight
        elif weight < 0:
            W_minus[i, j] -= weight

    # Total incoming positive / negative weight per document (column sums).
    W_plus_sum = W_plus.sum(axis=0)
    W_minus_sum = W_minus.sum(axis=0)

    for iteration in range(max_iterations):
        s_prev = s.copy()

        # Weighted average score of positive in-neighbours; documents with no
        # incoming positive weight keep an influence of 0.
        P_pos = np.divide(W_plus.T @ s_prev, W_plus_sum,
                          out=np.zeros(N), where=W_plus_sum > 0)
        if include_negative:
            N_neg = np.divide(W_minus.T @ s_prev, W_minus_sum,
                              out=np.zeros(N), where=W_minus_sum > 0)
        else:
            N_neg = np.zeros(N)

        # Net influence, then the affine map (I + 1) / 2 before blending
        # with the prior.
        I = P_pos - N_neg
        f_I = (I + 1) / 2
        s = (1 - alpha) * s0 + alpha * f_I

        # Convergence check on the L1 change.
        delta = np.linalg.norm(s - s_prev, ord=1)
        if delta < tolerance:
            print(f'Converged at round {iteration+1}，quantity of change: {delta}')
            break
    else:
        # The loop ran out of rounds without meeting the tolerance.
        print(f'Did not converge within {max_iterations} rounds')

    return dict(zip(document_ids, s))


In [None]:
def claim_relations_to_doc_relations(claim_relations_df, doc_id_length=4):
    """
    Aggregate claim-level relations into normalized document-level relations.

    The document id of a claim is taken to be the first `doc_id_length`
    characters of its claim id. For every (doc_id1, doc_id2) pair the number
    of opposing claim relations (relation == -1) is subtracted from the number
    of supporting ones (relation == 1), and the result is divided by the
    largest absolute net value over all pairs.

    The input frame is not modified.

    Parameters
    ----------
    claim_relations_df : pandas.DataFrame
        Must provide string columns 'id1' and 'id2' and a 'relation' column.
    doc_id_length : int
        Number of leading characters of a claim id that form the document id.

    Returns
    -------
    pandas.DataFrame
        Columns 'doc_id1', 'doc_id2', 'normalized_relation', one row per
        document pair. 'normalized_relation' is 0 for every row when no pair
        has a non-zero net relation.
    """
    relation = claim_relations_df['relation']

    # Work on a fresh frame so the caller's DataFrame keeps its original columns.
    pairs = pd.DataFrame({
        'doc_id1': claim_relations_df['id1'].str[:doc_id_length],
        'doc_id2': claim_relations_df['id2'].str[:doc_id_length],
        'support_count': (relation == 1).astype(int),
        'oppose_count': (relation == -1).astype(int),
    })

    doc_relation_counts = (
        pairs.groupby(['doc_id1', 'doc_id2'])[['support_count', 'oppose_count']]
        .sum()
        .reset_index()
    )

    net_relation = doc_relation_counts['support_count'] - doc_relation_counts['oppose_count']
    max_net_relation = net_relation.abs().max()

    # max() of an empty column is NaN; treat that like "nothing to scale by".
    if pd.isna(max_net_relation) or max_net_relation == 0:
        doc_relation_counts['normalized_relation'] = 0.0
    else:
        doc_relation_counts['normalized_relation'] = net_relation / max_net_relation

    return doc_relation_counts[['doc_id1', 'doc_id2', 'normalized_relation']].copy()

In [None]:
# Start from a clean checkout: delete any existing ClaimRank directory,
# then clone the repository that holds the dataset CSV files read below.
!rm -rf ClaimRank
!git clone https://github.com/Averyyy/ClaimRank

Cloning into 'ClaimRank'...
remote: Enumerating objects: 340, done.[K
remote: Counting objects: 100% (63/63), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 340 (delta 31), reused 39 (delta 15), pack-reused 277 (from 1)[K
Receiving objects: 100% (340/340), 54.16 MiB | 21.91 MiB/s, done.
Resolving deltas: 100% (148/148), done.


In [31]:
import pandas as pd

# Read the claim-level relations once, keeping only the three columns that are
# used. Claim ids are loaded as strings because the document id is taken from
# their leading characters (e.g. '0005').
df = pd.read_csv('ClaimRank/dataset/relations_latest.csv', usecols=['id1', 'id2', 'relation'], dtype={'id1': str, 'id2': str})
print(df.columns)

# Collapse claim-level relations into one normalized weight per document pair.
document_relations_df = claim_relations_to_doc_relations(df)
print(document_relations_df.columns)

# Propagate trust over the document graph and report every document's score.
scores = customized_pagerank(document_relations_df, max_iterations=100)
print(len(scores))
for doc_id, score in scores.items():
    print(f'Document {doc_id} \'s score: {score:.4f}')


Index(['id1', 'id2', 'relation'], dtype='object')
Index(['doc_id1', 'doc_id2', 'normalized_relation'], dtype='object')
Converged at round 22，quantity of change: 8.371551583064019e-07
372
Document 0000 's score: 0.8696
Document 0001 's score: 0.8696
Document 0002 's score: 0.8696
Document 0003 's score: 0.8696
Document 0004 's score: 0.8696
Document 0005 's score: 0.5000
Document 0006 's score: 0.8696
Document 0007 's score: 0.8696
Document 0008 's score: 0.8696
Document 0009 's score: 0.8696
Document 0010 's score: 0.8696
Document 0012 's score: 0.8696
Document 0013 's score: 0.8696
Document 0014 's score: 0.5000
Document 0015 's score: 0.8696
Document 0016 's score: 0.5000
Document 0017 's score: 0.8696
Document 0018 's score: 0.8696
Document 0020 's score: 0.8696
Document 0021 's score: 0.8696
Document 0022 's score: 0.8696
Document 0023 's score: 0.5000
Document 0024 's score: 0.7125
Document 0025 's score: 0.5000
Document 0026 's score: 0.8696
Document 0027 's score: 0.8696
Documen

In [None]:
# List and count the documents whose score is still the neutral value 0.5.
# The scores come out of floating-point arithmetic, so compare with a tight
# absolute tolerance instead of exact equality.
neutral_tolerance = 1e-9
n = 0
for doc_id, score in scores.items():
    if abs(score - 0.5) <= neutral_tolerance:
        n += 1
        print(f'文档 {doc_id} 的信任得分为 {score:.4f}')
print(n)

文档 0005 的信任得分为 0.5000
文档 0014 的信任得分为 0.5000
文档 0016 的信任得分为 0.5000
文档 0023 的信任得分为 0.5000
文档 0025 的信任得分为 0.5000
文档 0038 的信任得分为 0.5000
文档 0041 的信任得分为 0.5000
文档 0042 的信任得分为 0.5000
文档 0046 的信任得分为 0.5000
文档 0048 的信任得分为 0.5000
文档 0055 的信任得分为 0.5000
文档 0066 的信任得分为 0.5000
文档 0081 的信任得分为 0.5000
文档 0083 的信任得分为 0.5000
文档 0084 的信任得分为 0.5000
文档 0094 的信任得分为 0.5000
文档 0127 的信任得分为 0.5000
文档 0138 的信任得分为 0.5000
文档 0722 的信任得分为 0.5000
19


In [33]:
import pandas as pd

# Load the document metadata; ids are read as strings so they can be joined
# against the string document ids used as keys of `scores`.
filtered_data_df = pd.read_csv('ClaimRank/dataset/Filtered_data.csv', dtype={'id': str}, encoding='latin1')
validity_counts = filtered_data_df['validity'].value_counts()

print(f"Validity 为 1 的文档数量: {validity_counts.get(1, 0)}")
print(f"Validity 为 0 的文档数量: {validity_counts.get(0, 0)}")

scores_df = pd.DataFrame(list(scores.items()), columns=['id', 'score'])

# Keep the documents whose score is the neutral value 0.5. The scores are
# computed in floating point, so use a tight absolute tolerance rather than
# exact equality.
neutral_tolerance = 1e-9
is_neutral = (scores_df['score'].astype(float) - 0.5).abs() <= neutral_tolerance
filtered_scores_df = scores_df[is_neutral]

# NOTE(review): an inner merge yields one row per matching pair, so if
# Filtered_data.csv contains repeated ids the count printed below can exceed
# the number of distinct documents — confirm that 'id' is unique there.
merged_df = pd.merge(filtered_scores_df, filtered_data_df, on='id', how='inner')
validity_is_one_df = merged_df[merged_df['validity'] == 1]

# Report the result.
print(f"得分为 0.5 且 validity 为 1 的文档数量: {len(validity_is_one_df)}")
print(validity_is_one_df[['id', 'title', 'validity']])


Validity 为 1 的文档数量: 541
Validity 为 0 的文档数量: 273
得分为 0.5 且 validity 为 1 的文档数量: 28
      id                                              title  validity
0   0005  U.S. environmental agency to offer buyouts to ...         1
1   0014  Treasury chief 'confident' Congress will raise...         1
2   0016  Trump budget chief under fire over attack on C...         1
3   0023  FCC to vote on revising rules on identifying t...         1
4   0025  Top House Intel panel Democrat rebukes chairma...         1
5   0038  Trump administration approves tougher visa vet...         1
6   0041  White House unveils list of ex-lobbyists grant...         1
7   0042  Former Vice President Biden to announce politi...         1
8   0046  Clinton says Trump campaign likely guided Russ...         1
9   0048  Trump expected to delay U.S. embassy move to J...         1
10  0054  White House declines to say if Trump has made ...         1
11  0055  VA chief presses Congress to make it easier to...         1
12  0066 