# Harmonic Centrality

## Comments

In [1]:
from pathlib import Path

import igraph as ig
import numpy as np
import pandas as pd
from igraph import Graph

In [2]:
# Read both ID columns as text. Letting pandas infer a numeric dtype and casting
# to str afterwards turns an ID into "123.0" whenever the column contains a NaN
# (float upcast), and such IDs no longer match the string IDs read from user.csv.
df = pd.read_csv("Data/comments.csv", dtype={'博主id': str, '评论者id': str})

# Drop rows with missing IDs
df = df.dropna(subset=['博主id', '评论者id'])

# Strip stray whitespace so the same account always maps to a single vertex
df['博主id'] = df['博主id'].astype(str).str.strip()
df['评论者id'] = df['评论者id'].astype(str).str.strip()

# One row per (commenter, blogger) pair; weight = number of rows for that pair
edge_weights = df.groupby(['评论者id', '博主id']).size().reset_index(name='weight')

# Create a list of unique node IDs (commenters first, then bloggers not seen yet)
nodes = pd.Index(pd.concat([edge_weights['评论者id'], edge_weights['博主id']])).unique().tolist()

# Map node IDs to integer indices
id_to_index = {node_id: idx for idx, node_id in enumerate(nodes)}

# Vectorised ID -> index lookup instead of a row-by-row iterrows() loop;
# .tolist() yields plain Python ints for igraph.
edges = list(zip(
    edge_weights['评论者id'].map(id_to_index).tolist(),
    edge_weights['博主id'].map(id_to_index).tolist(),
))
weights = edge_weights['weight'].tolist()

# Directed graph: each edge points from the commenter to the blogger
comments = Graph(directed=True)
comments.add_vertices(len(nodes))
comments.add_edges(edges)
comments.es['weight'] = weights
comments.vs['name'] = nodes

print(comments.summary())

IGRAPH DNW- 20433 22346 -- 
+ attr: name (v), weight (e)


In [3]:
def compute_harmonic_centrality(g):
    """Compute a weighted harmonic centrality score for every vertex of ``g``.

    Each edge gets the length ``1 / weight`` (a zero weight becomes an
    infinite length). For every vertex, the score is the sum of the
    reciprocals of the distances returned by ``g.distances`` for that source,
    ignoring the vertex itself and any target whose distance is infinite or
    not positive.

    Args:
        g: Graph-like object exposing ``es['weight']``, ``vcount()``,
            ``distances(source=..., weights=...)`` and assignable ``vs[...]``.

    Returns:
        List of scores indexed by vertex id. The same list is also stored on
        the graph as the ``'harmonic_centrality'`` vertex attribute.
    """
    unreachable = float('inf')
    edge_lengths = [unreachable if w == 0 else 1.0 / w for w in g.es['weight']]

    def _score(vertex):
        # Distances from `vertex` to every vertex, using the inverted weights.
        row = g.distances(source=vertex, weights=edge_lengths)[0]
        total = 0
        for target, dist in enumerate(row):
            if target == vertex or dist == unreachable or not (dist > 0):
                continue
            total += 1.0 / dist
        return total

    scores = [_score(vertex) for vertex in range(g.vcount())]
    g.vs['harmonic_centrality'] = scores
    return scores

In [4]:
# Score every vertex of the comments graph. compute_harmonic_centrality issues
# one shortest-path query per vertex and also stores the scores on the graph
# as the 'harmonic_centrality' vertex attribute.
comments_hc_scores = compute_harmonic_centrality(comments)

In [5]:
# Vertex indices ordered from highest to lowest score; sorted() is stable, so
# vertices with equal scores stay in ascending index order.
top_indices = sorted(
    range(len(comments_hc_scores)),
    key=comments_hc_scores.__getitem__,
    reverse=True,
)

# Print the 1000 best-ranked users of the comments graph
for i in top_indices[:1000]:
    print(f"User ID: {comments.vs[i]['name']}, Harmonic Centrality: {comments_hc_scores[i]:.3f}")

User ID: 6405768113, Harmonic Centrality: 47.000
User ID: 7871051600, Harmonic Centrality: 46.000
User ID: 6430287371, Harmonic Centrality: 40.000
User ID: 2800381265, Harmonic Centrality: 38.917
User ID: 2182315120, Harmonic Centrality: 32.000
User ID: 7767236233, Harmonic Centrality: 32.000
User ID: 7270364772, Harmonic Centrality: 27.000
User ID: 7317045662, Harmonic Centrality: 26.000
User ID: 5878659096, Harmonic Centrality: 25.000
User ID: 6390507591, Harmonic Centrality: 24.000
User ID: 7396704374, Harmonic Centrality: 24.000
User ID: 7775885745, Harmonic Centrality: 24.000
User ID: 7337305167, Harmonic Centrality: 23.000
User ID: 1965249101, Harmonic Centrality: 21.000
User ID: 5189509833, Harmonic Centrality: 21.000
User ID: 5304594084, Harmonic Centrality: 21.000
User ID: 6733786895, Harmonic Centrality: 21.000
User ID: 7744586537, Harmonic Centrality: 21.000
User ID: 5340885955, Harmonic Centrality: 20.000
User ID: 6322835903, Harmonic Centrality: 20.000
User ID: 5690140790,

## Likes

In [6]:
# Read both ID columns as text. Letting pandas infer a numeric dtype and casting
# to str afterwards turns an ID into "123.0" whenever the column contains a NaN
# (float upcast), and such IDs no longer match the string IDs read from user.csv.
df = pd.read_csv("Data/likes.csv", dtype={'博主id': str, '点赞者id': str})

# Drop rows with missing IDs
df = df.dropna(subset=['博主id', '点赞者id'])

# Strip stray whitespace so the same account always maps to a single vertex
df['博主id'] = df['博主id'].astype(str).str.strip()
df['点赞者id'] = df['点赞者id'].astype(str).str.strip()

# One row per (liker, blogger) pair; weight = number of rows for that pair
edge_weights = df.groupby(['点赞者id', '博主id']).size().reset_index(name='weight')

# Create a list of unique node IDs (likers first, then bloggers not seen yet)
nodes = pd.Index(pd.concat([edge_weights['点赞者id'], edge_weights['博主id']])).unique().tolist()

# Map node IDs to integer indices
id_to_index = {node_id: idx for idx, node_id in enumerate(nodes)}

# Vectorised ID -> index lookup instead of a row-by-row iterrows() loop;
# .tolist() yields plain Python ints for igraph.
edges = list(zip(
    edge_weights['点赞者id'].map(id_to_index).tolist(),
    edge_weights['博主id'].map(id_to_index).tolist(),
))
weights = edge_weights['weight'].tolist()

# Directed graph: each edge points from the liker to the blogger
likes = Graph(directed=True)
likes.add_vertices(len(nodes))
likes.add_edges(edges)
likes.es['weight'] = weights
likes.vs['name'] = nodes

print(likes.summary())

IGRAPH DNW- 59904 72199 -- 
+ attr: name (v), weight (e)


In [7]:
# Score every vertex of the likes graph. compute_harmonic_centrality issues
# one shortest-path query per vertex and also stores the scores on the graph
# as the 'harmonic_centrality' vertex attribute.
likes_hc_scores = compute_harmonic_centrality(likes)

In [8]:
# Vertex indices ordered from highest to lowest score; sorted() is stable, so
# vertices with equal scores stay in ascending index order.
top_indices = sorted(
    range(len(likes_hc_scores)),
    key=likes_hc_scores.__getitem__,
    reverse=True,
)

# Print the 1000 best-ranked users of the likes graph
for i in top_indices[:1000]:
    print(f"User ID: {likes.vs[i]['name']}, Harmonic Centrality: {likes_hc_scores[i]:.3f}")

User ID: 7906744658, Harmonic Centrality: 76.167
User ID: 6135197490, Harmonic Centrality: 40.667
User ID: 7691618007, Harmonic Centrality: 29.333
User ID: 2610805335, Harmonic Centrality: 29.167
User ID: 6086618168, Harmonic Centrality: 27.833
User ID: 7677657409, Harmonic Centrality: 25.667
User ID: 7616635971, Harmonic Centrality: 23.750
User ID: 6388185471, Harmonic Centrality: 23.500
User ID: 7918794877, Harmonic Centrality: 22.000
User ID: 3820201922, Harmonic Centrality: 21.500
User ID: 7980491654, Harmonic Centrality: 21.500
User ID: 7745172076, Harmonic Centrality: 21.333
User ID: 7332914222, Harmonic Centrality: 21.183
User ID: 6967821129, Harmonic Centrality: 21.167
User ID: 6923780633, Harmonic Centrality: 20.500
User ID: 7783955057, Harmonic Centrality: 20.500
User ID: 6491875085, Harmonic Centrality: 19.833
User ID: 7958499755, Harmonic Centrality: 19.667
User ID: 7164127180, Harmonic Centrality: 19.483
User ID: 6574146080, Harmonic Centrality: 19.350
User ID: 7334252920,

## Reposts

In [9]:
# Read both ID columns as text. Letting pandas infer a numeric dtype and casting
# to str afterwards turns an ID into "123.0" whenever the column contains a NaN
# (float upcast), and such IDs no longer match the string IDs read from user.csv.
df = pd.read_csv("Data/reposts.csv", dtype={'博主id': str, '转发者id': str})

# Drop rows with missing IDs
df = df.dropna(subset=['博主id', '转发者id'])

# Strip stray whitespace so the same account always maps to a single vertex
df['博主id'] = df['博主id'].astype(str).str.strip()
df['转发者id'] = df['转发者id'].astype(str).str.strip()

# One row per (reposter, blogger) pair; weight = number of rows for that pair
edge_weights = df.groupby(['转发者id', '博主id']).size().reset_index(name='weight')

# Create a list of unique node IDs (reposters first, then bloggers not seen yet)
nodes = pd.Index(pd.concat([edge_weights['转发者id'], edge_weights['博主id']])).unique().tolist()

# Map node IDs to integer indices
id_to_index = {node_id: idx for idx, node_id in enumerate(nodes)}

# Vectorised ID -> index lookup instead of a row-by-row iterrows() loop;
# .tolist() yields plain Python ints for igraph.
edges = list(zip(
    edge_weights['转发者id'].map(id_to_index).tolist(),
    edge_weights['博主id'].map(id_to_index).tolist(),
))
weights = edge_weights['weight'].tolist()

# Directed graph: each edge points from the reposter to the blogger
reposts = Graph(directed=True)
reposts.add_vertices(len(nodes))
reposts.add_edges(edges)
reposts.es['weight'] = weights
reposts.vs['name'] = nodes

print(reposts.summary())

IGRAPH DNW- 35051 38242 -- 
+ attr: name (v), weight (e)


In [11]:
# Score every vertex of the reposts graph. compute_harmonic_centrality issues
# one shortest-path query per vertex and also stores the scores on the graph
# as the 'harmonic_centrality' vertex attribute.
reposts_hc_scores = compute_harmonic_centrality(reposts)

In [12]:
# Vertex indices ordered from highest to lowest score; sorted() is stable, so
# vertices with equal scores stay in ascending index order.
top_indices = sorted(
    range(len(reposts_hc_scores)),
    key=reposts_hc_scores.__getitem__,
    reverse=True,
)

# Print the 1000 best-ranked users of the reposts graph
for i in top_indices[:1000]:
    print(f"User ID: {reposts.vs[i]['name']}, Harmonic Centrality: {reposts_hc_scores[i]:.3f}")

User ID: 6341763027, Harmonic Centrality: 17.000
User ID: 7773384197, Harmonic Centrality: 17.000
User ID: 7539322109, Harmonic Centrality: 16.300
User ID: 7366554482, Harmonic Centrality: 16.000
User ID: 1494892985, Harmonic Centrality: 15.000
User ID: 6220718386, Harmonic Centrality: 15.000
User ID: 6623274254, Harmonic Centrality: 13.000
User ID: 7743914105, Harmonic Centrality: 13.000
User ID: 6334540138, Harmonic Centrality: 12.000
User ID: 6472525547, Harmonic Centrality: 11.500
User ID: 5344838449, Harmonic Centrality: 11.000
User ID: 5447157137, Harmonic Centrality: 11.000
User ID: 7795994811, Harmonic Centrality: 11.000
User ID: 2211039297, Harmonic Centrality: 10.000
User ID: 3877775687, Harmonic Centrality: 10.000
User ID: 5054608559, Harmonic Centrality: 10.000
User ID: 5851993210, Harmonic Centrality: 10.000
User ID: 6213456209, Harmonic Centrality: 10.000
User ID: 6266954016, Harmonic Centrality: 10.000
User ID: 6569600354, Harmonic Centrality: 10.000
User ID: 7524160974,

# Saving to CSV files

## Comments

In [33]:
# Prepare data with harmonic centrality scores for all users.
# Built column-wise in one step; comments_hc_scores is indexed by vertex id,
# the same order as comments.vs['name'].
all_users_df = pd.DataFrame({
    '博主id': comments.vs['name'],
    'score': comments_hc_scores,
})

# Load user metadata
user_columns = [
    '用户id', '用户昵称', '性别', '粉丝数', '关注数',
    '是否大V', '微博认证', '个人简介', 'svip等级'
]

user_df = pd.read_csv(
    "Data/user.csv",
    dtype={'用户id': str},
    usecols=lambda col: col in user_columns,
    low_memory=False
)

# Clean ID columns BEFORE de-duplicating: IDs that differ only by surrounding
# whitespace would otherwise both survive and duplicate rows in the merge.
all_users_df['博主id'] = all_users_df['博主id'].astype(str).str.strip()
user_df['用户id'] = user_df['用户id'].astype(str).str.strip()
user_df = user_df.drop_duplicates(subset='用户id', keep='first')

# Merge centrality scores with user metadata; validate guarantees that each
# scored user matches at most one metadata row.
merged_df = all_users_df.merge(
    user_df,
    left_on='博主id',
    right_on='用户id',
    how='left',
    validate='many_to_one',
)

# Check for missing users via the right-hand join key: it is null only when
# no row of user.csv matched. (Testing '性别' would also flag matched users
# whose gender field is simply empty.)
missing_users = merged_df[merged_df['用户id'].isna()]
print(f"Users not found in user.csv: {len(missing_users)}")
if not missing_users.empty:
    print(missing_users[['博主id', 'score']].head())

# Reorder columns for export
columns = ['博主id', '用户昵称', '性别', '粉丝数', '关注数', '是否大V', '微博认证', '个人简介', 'svip等级', 'score']
final_df = merged_df[columns]

# Save to CSV (create the output directory on a fresh checkout)
output_path = Path("Harmonic_Centrality_results/all_influential_commenters.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_path, index=False, encoding='utf-8-sig')
print("Saved: Harmonic_Centrality_results/all_influential_commenters.csv")

Users not found in user.csv: 34
            博主id  score
783   1700240933    1.0
4393  3029439173    1.0
6308  5150964762    1.0
7289  5413781135    1.0
7392  5458281063    1.0
Saved: Harmonic_Centrality_results/all_influential_commenters.csv


## Likes

In [34]:
# Prepare data with harmonic centrality scores for all users.
# Built column-wise in one step; likes_hc_scores is indexed by vertex id,
# the same order as likes.vs['name'].
all_users_df = pd.DataFrame({
    '博主id': likes.vs['name'],
    'score': likes_hc_scores,
})

# Load user metadata
user_columns = [
    '用户id', '用户昵称', '性别', '粉丝数', '关注数',
    '是否大V', '微博认证', '个人简介', 'svip等级'
]

user_df = pd.read_csv(
    "Data/user.csv",
    dtype={'用户id': str},
    usecols=lambda col: col in user_columns,
    low_memory=False
)

# Clean ID columns BEFORE de-duplicating: IDs that differ only by surrounding
# whitespace would otherwise both survive and duplicate rows in the merge.
all_users_df['博主id'] = all_users_df['博主id'].astype(str).str.strip()
user_df['用户id'] = user_df['用户id'].astype(str).str.strip()
user_df = user_df.drop_duplicates(subset='用户id', keep='first')

# Merge centrality scores with user metadata; validate guarantees that each
# scored user matches at most one metadata row.
merged_df = all_users_df.merge(
    user_df,
    left_on='博主id',
    right_on='用户id',
    how='left',
    validate='many_to_one',
)

# Check for missing users via the right-hand join key: it is null only when
# no row of user.csv matched. (Testing '性别' would also flag matched users
# whose gender field is simply empty.)
missing_users = merged_df[merged_df['用户id'].isna()]
print(f"Users not found in user.csv: {len(missing_users)}")
if not missing_users.empty:
    print(missing_users[['博主id', 'score']].head())

# Reorder columns for export
columns = ['博主id', '用户昵称', '性别', '粉丝数', '关注数', '是否大V', '微博认证', '个人简介', 'svip等级', 'score']
final_df = merged_df[columns]

# Save to CSV (create the output directory on a fresh checkout)
output_path = Path("Harmonic_Centrality_results/all_influential_likers.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_path, index=False, encoding='utf-8-sig')
print("Saved: Harmonic_Centrality_results/all_influential_likers.csv")

Users not found in user.csv: 0
Saved: Harmonic_Centrality_results/all_influential_likers.csv


## Reposts

In [35]:
# Prepare data with harmonic centrality scores for all users.
# Built column-wise in one step; reposts_hc_scores is indexed by vertex id,
# the same order as reposts.vs['name'].
all_users_df = pd.DataFrame({
    '博主id': reposts.vs['name'],
    'score': reposts_hc_scores,
})

# Load user metadata
user_columns = [
    '用户id', '用户昵称', '性别', '粉丝数', '关注数',
    '是否大V', '微博认证', '个人简介', 'svip等级'
]

user_df = pd.read_csv(
    "Data/user.csv",
    dtype={'用户id': str},
    usecols=lambda col: col in user_columns,
    low_memory=False
)

# Clean ID columns BEFORE de-duplicating: IDs that differ only by surrounding
# whitespace would otherwise both survive and duplicate rows in the merge.
all_users_df['博主id'] = all_users_df['博主id'].astype(str).str.strip()
user_df['用户id'] = user_df['用户id'].astype(str).str.strip()
user_df = user_df.drop_duplicates(subset='用户id', keep='first')

# Merge centrality scores with user metadata; validate guarantees that each
# scored user matches at most one metadata row.
merged_df = all_users_df.merge(
    user_df,
    left_on='博主id',
    right_on='用户id',
    how='left',
    validate='many_to_one',
)

# Check for missing users via the right-hand join key: it is null only when
# no row of user.csv matched. (Testing '性别' would also flag matched users
# whose gender field is simply empty.)
missing_users = merged_df[merged_df['用户id'].isna()]
print(f"Users not found in user.csv: {len(missing_users)}")
if not missing_users.empty:
    print(missing_users[['博主id', 'score']].head())

# Reorder columns for export
columns = ['博主id', '用户昵称', '性别', '粉丝数', '关注数', '是否大V', '微博认证', '个人简介', 'svip等级', 'score']
final_df = merged_df[columns]

# Save to CSV (create the output directory on a fresh checkout)
output_path = Path("Harmonic_Centrality_results/all_influential_reposters.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_path, index=False, encoding='utf-8-sig')
print("Saved: Harmonic_Centrality_results/all_influential_reposters.csv")

Users not found in user.csv: 0
Saved: Harmonic_Centrality_results/all_influential_reposters.csv


# Done!