# Harmonic Centrality

## Comments

In [16]:
import igraph as ig
from igraph import Graph
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)
import numpy as np

In [2]:
# Read both ID columns as text. Letting pandas infer a numeric dtype turns a
# column that contains a missing value into float64, and astype(str) would then
# yield ids such as "123.0" that no longer match user.csv.
df = pd.read_csv("Data/comments.csv", dtype={'博主id': str, '评论者id': str})

# Drop rows with missing IDs
df = df.dropna(subset=['博主id', '评论者id'])

df['博主id'] = df['博主id'].astype(str).str.strip()
df['评论者id'] = df['评论者id'].astype(str).str.strip()

# One weighted edge per (commenter, blogger) pair; weight = number of comments
edge_weights = df.groupby(['评论者id', '博主id']).size().reset_index(name='weight')

# Create a list of unique node IDs
nodes = pd.Index(pd.concat([edge_weights['评论者id'], edge_weights['博主id']])).unique().tolist()

# Map node IDs to integer indices
id_to_index = {node_id: idx for idx, node_id in enumerate(nodes)}

# Translate both endpoint columns in one vectorised pass instead of iterrows();
# .tolist() keeps the endpoints as plain Python ints.
edges = list(zip(
    edge_weights['评论者id'].map(id_to_index).tolist(),
    edge_weights['博主id'].map(id_to_index).tolist(),
))
weights = edge_weights['weight'].tolist()

# Directed graph: commenter -> blogger
comments = Graph(directed=True)
comments.add_vertices(len(nodes))
comments.add_edges(edges)
comments.es['weight'] = weights
comments.vs['name'] = nodes

print(comments.summary())

IGRAPH DNW- 20433 22346 -- 
+ attr: name (v), weight (e)


In [3]:
def compute_harmonic_centrality(g, mode="out"):
    """Compute the (unnormalised) harmonic centrality of every vertex of ``g``.

    The score of a vertex ``v`` is the sum of ``1 / d(v, u)`` over every other
    vertex ``u`` that has a finite, positive shortest-path distance from ``v``
    as reported by ``g.distances``.

    If the graph has a ``'weight'`` edge attribute, an edge of weight ``w`` is
    given length ``1 / w`` (a zero weight becomes an infinite length), so
    heavier edges count as shorter. If the attribute is absent, ``weights=None``
    is passed to ``g.distances`` instead of raising ``KeyError``.

    Distances are requested one source vertex at a time, so only a single row
    of the distance matrix is held in memory at once.

    Args:
        g: Graph object exposing ``edge_attributes()``, ``es``, ``vs``,
            ``vcount()`` and ``distances()`` (an igraph ``Graph``).
        mode: Direction passed through to ``g.distances``. The default
            ``"out"`` follows edges from source to target.

    Returns:
        list: One score per vertex, ordered by vertex index. The same list is
        also stored in the ``'harmonic_centrality'`` vertex attribute of ``g``.
    """
    inf = float('inf')

    if 'weight' in g.edge_attributes():
        # Interaction counts are strengths, not lengths: invert them.
        edge_lengths = [1.0 / w if w != 0 else inf for w in g.es['weight']]
    else:
        edge_lengths = None

    harmonic_centrality = []
    for v in range(g.vcount()):
        dists = g.distances(source=v, weights=edge_lengths, mode=mode)[0]
        # Skip the vertex itself, unreachable vertices and zero distances.
        hc = sum(1.0 / d for i, d in enumerate(dists) if i != v and 0 < d < inf)
        harmonic_centrality.append(hc)

    g.vs['harmonic_centrality'] = harmonic_centrality
    return harmonic_centrality

In [4]:
# Score every vertex of the comments graph. The returned list is ordered by
# vertex index and is also stored in comments.vs['harmonic_centrality'].
comments_hc_scores = compute_harmonic_centrality(comments)

In [26]:
# Rank (vertex index, score) pairs from the highest score down. list.sort is
# stable, so vertices with equal scores stay in ascending index order.
top_comments_indices = list(enumerate(comments_hc_scores))
top_comments_indices.sort(key=lambda pair: pair[1], reverse=True)

# Print at most the 1000 best-ranked entries. Note that `user_id` is the igraph
# vertex index here; the account id is kept in the vertex 'name' attribute.
for rank, (user_id, score) in zip(range(1, 1001), top_comments_indices):
    print(f"{rank:<6}{user_id:<10}{score:>10.8f}")

1     11889     47.00000000
2     18842     46.00000000
3     11978     40.00000000
4     3959      38.91666667
5     2565      32.00000000
6     17593     32.00000000
7     14545     27.00000000
8     14855     26.00000000
9     9497      25.00000000
10    11827     24.00000000
11    15302     24.00000000
12    17710     24.00000000
13    14976     23.00000000
14    2012      21.00000000
15    6415      21.00000000
16    6870      21.00000000
17    13315     21.00000000
18    17350     21.00000000
19    7026      20.00000000
20    11449     20.00000000
21    8652      19.00000000
22    10365     19.00000000
23    3578      18.00000000
24    7639      18.00000000
25    14827     18.00000000
26    12374     17.00000000
27    14894     17.00000000
28    2820      16.00000000
29    11847     16.00000000
30    13212     16.00000000
31    18122     16.00000000
32    1452      15.00000000
33    4865      15.00000000
34    6085      15.00000000
35    7300      15.00000000
36    9915      14.5

## Likes

In [6]:
# Read both ID columns as text. Letting pandas infer a numeric dtype turns a
# column that contains a missing value into float64, and astype(str) would then
# yield ids such as "123.0" that no longer match user.csv.
df = pd.read_csv("Data/likes.csv", dtype={'博主id': str, '点赞者id': str})

# Drop rows with missing IDs
df = df.dropna(subset=['博主id', '点赞者id'])

df['博主id'] = df['博主id'].astype(str).str.strip()
df['点赞者id'] = df['点赞者id'].astype(str).str.strip()

# One weighted edge per (liker, blogger) pair; weight = number of likes
edge_weights = df.groupby(['点赞者id', '博主id']).size().reset_index(name='weight')

# Create a list of unique node IDs
nodes = pd.Index(pd.concat([edge_weights['点赞者id'], edge_weights['博主id']])).unique().tolist()

# Map node IDs to integer indices
id_to_index = {node_id: idx for idx, node_id in enumerate(nodes)}

# Translate both endpoint columns in one vectorised pass instead of iterrows();
# .tolist() keeps the endpoints as plain Python ints.
edges = list(zip(
    edge_weights['点赞者id'].map(id_to_index).tolist(),
    edge_weights['博主id'].map(id_to_index).tolist(),
))
weights = edge_weights['weight'].tolist()

# Directed graph: liker -> blogger
likes = Graph(directed=True)
likes.add_vertices(len(nodes))
likes.add_edges(edges)
likes.es['weight'] = weights
likes.vs['name'] = nodes

print(likes.summary())

IGRAPH DNW- 59904 72199 -- 
+ attr: name (v), weight (e)


In [7]:
# Score every vertex of the likes graph. The returned list is ordered by
# vertex index and is also stored in likes.vs['harmonic_centrality'].
likes_hc_scores = compute_harmonic_centrality(likes)

In [27]:
# Rank (vertex index, score) pairs from the highest score down. list.sort is
# stable, so vertices with equal scores stay in ascending index order.
top_likes_indices = list(enumerate(likes_hc_scores))
top_likes_indices.sort(key=lambda pair: pair[1], reverse=True)

# Print at most the 1000 best-ranked entries. Note that `user_id` is the igraph
# vertex index here; the account id is kept in the vertex 'name' attribute.
for rank, (user_id, score) in zip(range(1, 1001), top_likes_indices):
    print(f"{rank:<6}{user_id:<10}{score:>10.8f}")

1     50674     76.16666667
2     21329     40.66666667
3     41492     29.33333333
4     5158      29.16666667
5     20706     27.83333333
6     41322     25.66666667
7     40340     23.75000000
8     24370     23.50000000
9     51357     22.00000000
10    8777      21.50000000
11    58804     21.50000000
12    43067     21.33333333
13    34203     21.18333333
14    30237     21.16666667
15    30050     20.50000000
16    44997     20.50000000
17    25873     19.83333333
18    57738     19.66666667
19    31689     19.48333333
20    26985     19.35000000
21    34231     18.73333333
22    1811      18.66666667
23    15962     17.83333333
24    7309      17.66666667
25    10723     17.65000000
26    47185     17.51666667
27    9549      17.50000000
28    20243     17.50000000
29    47666     17.50000000
30    49291     17.50000000
31    17246     17.16666667
32    19518     17.16666667
33    20947     17.16666667
34    42194     17.00000000
35    18142     16.91666667
36    11079     16.6

## Reposts

In [9]:
# Read both ID columns as text. Letting pandas infer a numeric dtype turns a
# column that contains a missing value into float64, and astype(str) would then
# yield ids such as "123.0" that no longer match user.csv.
df = pd.read_csv("Data/reposts.csv", dtype={'博主id': str, '转发者id': str})

# Drop rows with missing IDs
df = df.dropna(subset=['博主id', '转发者id'])

df['博主id'] = df['博主id'].astype(str).str.strip()
df['转发者id'] = df['转发者id'].astype(str).str.strip()

# One weighted edge per (reposter, blogger) pair; weight = number of reposts
edge_weights = df.groupby(['转发者id', '博主id']).size().reset_index(name='weight')

# Create a list of unique node IDs
nodes = pd.Index(pd.concat([edge_weights['转发者id'], edge_weights['博主id']])).unique().tolist()

# Map node IDs to integer indices
id_to_index = {node_id: idx for idx, node_id in enumerate(nodes)}

# Translate both endpoint columns in one vectorised pass instead of iterrows();
# .tolist() keeps the endpoints as plain Python ints.
edges = list(zip(
    edge_weights['转发者id'].map(id_to_index).tolist(),
    edge_weights['博主id'].map(id_to_index).tolist(),
))
weights = edge_weights['weight'].tolist()

# Directed graph: reposter -> blogger
reposts = Graph(directed=True)
reposts.add_vertices(len(nodes))
reposts.add_edges(edges)
reposts.es['weight'] = weights
reposts.vs['name'] = nodes

print(reposts.summary())

IGRAPH DNW- 35051 38242 -- 
+ attr: name (v), weight (e)


In [10]:
# Score every vertex of the reposts graph. The returned list is ordered by
# vertex index and is also stored in reposts.vs['harmonic_centrality'].
reposts_hc_scores = compute_harmonic_centrality(reposts)

In [28]:
# Rank (vertex index, score) pairs from the highest score down. list.sort is
# stable, so vertices with equal scores stay in ascending index order.
top_repost_indices = list(enumerate(reposts_hc_scores))
top_repost_indices.sort(key=lambda pair: pair[1], reverse=True)

# Print at most the 1000 best-ranked entries. Note that `user_id` is the igraph
# vertex index here; the account id is kept in the vertex 'name' attribute.
for rank, (user_id, score) in zip(range(1, 1001), top_repost_indices):
    print(f"{rank:<6}{user_id:<10}{score:>10.8f}")

1     20473     17.00000000
2     30961     17.00000000
3     28605     16.30000000
4     26593     16.00000000
5     727       15.00000000
6     19573     15.00000000
7     23143     13.00000000
8     30362     13.00000000
9     20397     12.00000000
10    21522     11.50000000
11    12511     11.00000000
12    13038     11.00000000
13    31367     11.00000000
14    4744      10.00000000
15    9962      10.00000000
16    10653     10.00000000
17    16444     10.00000000
18    19538     10.00000000
19    19821     10.00000000
20    22427     10.00000000
21    28463     10.00000000
22    32047     10.00000000
23    32651     10.00000000
24    12502     9.00000000
25    19134     9.00000000
26    22210     9.00000000
27    23313     9.00000000
28    30155     9.00000000
29    30649     9.00000000
30    34190     9.00000000
31    8738      8.50000000
32    865       8.00000000
33    7805      8.00000000
34    18984     8.00000000
35    22602     8.00000000
36    30789     8.00000000
37   

# Saving to CSV files

## Comments

In [29]:
# Attach the account id (vertex name) to the 1000 best-ranked vertices
all_users_data = [
    {'Rank': rank, '博主id': comments.vs[idx]['name'], 'score': score}
    for rank, (idx, score) in enumerate(top_comments_indices[:1000], start=1)
]
all_users_df = pd.DataFrame(all_users_data)

# Load user metadata
user_columns = [
    '用户id', '用户昵称', '性别', '粉丝数', '关注数',
    '是否大V', '微博认证', '个人简介', 'svip等级'
]

user_df = pd.read_csv(
    "Data/user.csv",
    dtype={'用户id': str},
    usecols=lambda col: col in user_columns,
    low_memory=False
)

# Clean ID columns. Strip BEFORE de-duplicating so that ids differing only by
# surrounding whitespace collapse to one row and cannot duplicate rows in the merge.
all_users_df['博主id'] = all_users_df['博主id'].astype(str).str.strip()
user_df['用户id'] = user_df['用户id'].astype(str).str.strip()
user_df = user_df.drop_duplicates(subset='用户id', keep='first')

# Merge centrality scores with user metadata
merged_df = all_users_df.merge(user_df, left_on='博主id', right_on='用户id', how='left')

# Check for missing users. '用户id' is the right-hand join key (never null after
# astype(str)), so it is null exactly for rows without a match. Testing '性别'
# would also count matched users whose gender is simply empty.
missing_users = merged_df[merged_df['用户id'].isnull()]
print(f"Users not found in user.csv: {len(missing_users)}")
if not missing_users.empty:
    print(missing_users[['博主id', 'score']].head())

# Reorder columns for export
columns = ['Rank', '博主id', '用户昵称', '性别', '粉丝数', '关注数', '是否大V', '微博认证', '个人简介', 'svip等级', 'score']
final_ordered_comments_df = merged_df[columns]

# Save to CSV (utf-8-sig writes a BOM so spreadsheet tools detect the encoding)
final_ordered_comments_df.to_csv("Harmonic_Centrality_results/all_influential_commenters.csv", index=False, encoding='utf-8-sig')
print("Saved: Harmonic_Centrality_results/all_influential_commenters.csv")

Users not found in user.csv: 0
Saved: Harmonic_Centrality_results/all_influential_commenters.csv


## Likes

In [31]:
# Attach the account id (vertex name) to the 1000 best-ranked vertices
all_users_data = [
    {'Rank': rank, '博主id': likes.vs[idx]['name'], 'score': score}
    for rank, (idx, score) in enumerate(top_likes_indices[:1000], start=1)
]
all_users_df = pd.DataFrame(all_users_data)

# Load user metadata
user_columns = [
    '用户id', '用户昵称', '性别', '粉丝数', '关注数',
    '是否大V', '微博认证', '个人简介', 'svip等级'
]

user_df = pd.read_csv(
    "Data/user.csv",
    dtype={'用户id': str},
    usecols=lambda col: col in user_columns,
    low_memory=False
)

# Clean ID columns. Strip BEFORE de-duplicating so that ids differing only by
# surrounding whitespace collapse to one row and cannot duplicate rows in the merge.
all_users_df['博主id'] = all_users_df['博主id'].astype(str).str.strip()
user_df['用户id'] = user_df['用户id'].astype(str).str.strip()
user_df = user_df.drop_duplicates(subset='用户id', keep='first')

# Merge centrality scores with user metadata
merged_df = all_users_df.merge(user_df, left_on='博主id', right_on='用户id', how='left')

# Check for missing users. '用户id' is the right-hand join key (never null after
# astype(str)), so it is null exactly for rows without a match. Testing '性别'
# would also count matched users whose gender is simply empty.
missing_users = merged_df[merged_df['用户id'].isnull()]
print(f"Users not found in user.csv: {len(missing_users)}")
if not missing_users.empty:
    print(missing_users[['博主id', 'score']].head())

# Reorder columns for export
columns = ['Rank', '博主id', '用户昵称', '性别', '粉丝数', '关注数', '是否大V', '微博认证', '个人简介', 'svip等级', 'score']
final_ordered_likes_df = merged_df[columns]

# Save to CSV (utf-8-sig writes a BOM so spreadsheet tools detect the encoding)
final_ordered_likes_df.to_csv("Harmonic_Centrality_results/all_influential_likers.csv", index=False, encoding='utf-8-sig')
print("Saved: Harmonic_Centrality_results/all_influential_likers.csv")

Users not found in user.csv: 0
Saved: Harmonic_Centrality_results/all_influential_likers.csv


## Reposts

In [32]:
# Attach the account id (vertex name) to the 1000 best-ranked vertices
all_users_data = [
    {'Rank': rank, '博主id': reposts.vs[idx]['name'], 'score': score}
    for rank, (idx, score) in enumerate(top_repost_indices[:1000], start=1)
]
all_users_df = pd.DataFrame(all_users_data)

# Load user metadata
user_columns = [
    '用户id', '用户昵称', '性别', '粉丝数', '关注数',
    '是否大V', '微博认证', '个人简介', 'svip等级'
]

user_df = pd.read_csv(
    "Data/user.csv",
    dtype={'用户id': str},
    usecols=lambda col: col in user_columns,
    low_memory=False
)

# Clean ID columns. Strip BEFORE de-duplicating so that ids differing only by
# surrounding whitespace collapse to one row and cannot duplicate rows in the merge.
all_users_df['博主id'] = all_users_df['博主id'].astype(str).str.strip()
user_df['用户id'] = user_df['用户id'].astype(str).str.strip()
user_df = user_df.drop_duplicates(subset='用户id', keep='first')

# Merge centrality scores with user metadata
merged_df = all_users_df.merge(user_df, left_on='博主id', right_on='用户id', how='left')

# Check for missing users. '用户id' is the right-hand join key (never null after
# astype(str)), so it is null exactly for rows without a match. Testing '性别'
# would also count matched users whose gender is simply empty.
missing_users = merged_df[merged_df['用户id'].isnull()]
print(f"Users not found in user.csv: {len(missing_users)}")
if not missing_users.empty:
    print(missing_users[['博主id', 'score']].head())

# Reorder columns for export
columns = ['Rank', '博主id', '用户昵称', '性别', '粉丝数', '关注数', '是否大V', '微博认证', '个人简介', 'svip等级', 'score']
final_ordered_reposts_df = merged_df[columns]

# Save to CSV (utf-8-sig writes a BOM so spreadsheet tools detect the encoding)
final_ordered_reposts_df.to_csv("Harmonic_Centrality_results/all_influential_reposters.csv", index=False, encoding='utf-8-sig')
print("Saved: Harmonic_Centrality_results/all_influential_reposters.csv")

Users not found in user.csv: 0
Saved: Harmonic_Centrality_results/all_influential_reposters.csv


# Stats

## Comments

In [59]:
# NOTE: despite the `indegree*` names, these hold the harmonic-centrality ranking.
indegree = top_comments_indices
final_ordered_df = final_ordered_comments_df

# One row per vertex of the whole comments graph (not only the exported top 1000)
indegree_data = [
    {'Rank': rank, '博主id': comments.vs[idx]['name'], 'score': score}
    for rank, (idx, score) in enumerate(indegree, start=1)
]
indegree_df = pd.DataFrame(indegree_data)

user_columns = [
    '用户id', '用户昵称', '性别', '粉丝数', '关注数',
    '是否大V', '微博认证', '个人简介', 'svip等级'
]

# Load only the metadata columns that are used below
user_df = pd.read_csv(
    "Data/user.csv",
    dtype={'用户id': str},
    usecols=lambda col: col in user_columns,
    low_memory=False
)

# Strip ids before de-duplicating so whitespace variants cannot duplicate merge rows
indegree_df['博主id'] = indegree_df['博主id'].astype(str).str.strip()
user_df['用户id'] = user_df['用户id'].astype(str).str.strip()
user_df = user_df.drop_duplicates(subset='用户id', keep='first')

merged_df = indegree_df.merge(user_df, left_on='博主id', right_on='用户id', how='left')

columns = ['Rank', '博主id', '用户昵称', '性别', '粉丝数', '关注数', '是否大V', '微博认证', '个人简介', 'svip等级',
           'score']
final_df = merged_df[columns]

# Build every group mask once per frame: "ranked" = exported top list, "all" = whole graph
masks = {}
for frame_key, frame in (("ranked", final_ordered_df), ("all", final_df)):
    # Anything that is not the text "true" (any casing), including missing values, counts as False
    is_big_v = (
        frame["是否大V"].str.lower()
        .map({"true": True, "false": False})
        .fillna(False)
        .astype(bool)
    )
    # A row counts as "official" when its verification text ends with '微博'
    is_official = frame["微博认证"].map(str).str[-2:] == '微博'
    masks[frame_key] = {
        "Females": frame["性别"] == "f",
        "Males": frame["性别"] == "m",
        "Influential": is_big_v,
        "Not influential": ~is_big_v,
        "Official": is_official,
        "Not Official": ~is_official,
    }

ranked_share = {label: int(mask.sum()) / len(final_ordered_df) for label, mask in masks["ranked"].items()}
all_share = {label: int(mask.sum()) / len(final_df) for label, mask in masks["all"].items()}

# Mean rank per group; an empty group reports 0 instead of raising ZeroDivisionError
avg_rank = {}
for label, mask in masks["ranked"].items():
    group_size = int(mask.sum())
    if group_size > 0:
        avg_rank[label] = int(final_ordered_df.loc[mask, "Rank"].sum()) / group_size
    else:
        avg_rank[label] = 0


print("Ranked percentage:")
print(f"Females: {ranked_share['Females']}")
print(f"Males: {ranked_share['Males']} \n")

print(f"Influential: {ranked_share['Influential']}")
print(f"Not influential: {ranked_share['Not influential']}\n")

print(f"Official: {ranked_share['Official']}")
print(f"Not Official: {ranked_share['Not Official']} \n")



print("Average rank:")
print(f"Females: {avg_rank['Females']}")
print(f"Males: {avg_rank['Males']}\n")

print(f"Influential: {avg_rank['Influential']}")
print(f"Not influential: {avg_rank['Not influential']}\n")

print(f"Official: {avg_rank['Official']}")
print(f"Not Official: {avg_rank['Not Official']}\n")



print("All:")
print(f"Females: {all_share['Females']}")
print(f"Males: {all_share['Males']} \n")

print(f"Influential: {all_share['Influential']}")
print(f"Not influential: {all_share['Not influential']} \n")

print(f"Official: {all_share['Official']}")
print(f"Not Official: {all_share['Not Official']} \n")


Ranked percentage:
Females: 0.723
Males: 0.277 

Influential: 0.288
Not influential: 0.712

Official: 0.0
Not Official: 1.0 

Average rank:
Females: 518.7994467496542
Males: 452.7364620938628

Influential: 584.8784722222222
Not influential: 466.3693820224719

Official: 0
Not Official: 500.5

All:
Females: 0.7670924484901874
Males: 0.23119463612783242 

Influential: 0.10375373170851074
Not influential: 0.8962462682914892 

Official: 0.0013213918660989576
Not Official: 0.9986786081339011 



In [60]:
# Parse SVIP levels as ints; a missing level (user absent from user.csv) is left
# as-is instead of crashing int(), and simply matches no level below.
var = final_ordered_df["svip等级"].apply(lambda x: int(x) if pd.notna(x) else x)
all_var = final_df["svip等级"].apply(lambda x: int(x) if pd.notna(x) else x)

print("Vip Degree:")
for level in range(9):
    in_ranked = var == level
    ranked_count = int(in_ranked.sum())
    # A level with no ranked users reports 0 rather than dividing by zero
    if ranked_count > 0:
        avg_rank = int(final_ordered_df.loc[in_ranked, "Rank"].sum()) / ranked_count
    else:
        avg_rank = 0

    print(f"{level}:")
    print(f"ranked percentage: {ranked_count / len(final_ordered_df)}")
    print(f"avg rank: {avg_rank}")
    print(f"All: {int((all_var == level).sum()) / len(final_df)} \n")

Vip Degree:
0:
ranked percentage: 0.578
avg rank: 452.47058823529414
All: 0.7176136641707043 

1:
ranked percentage: 0.411
avg rank: 561.7226277372263
All: 0.24719815983947535 

2:
ranked percentage: 0.005
avg rank: 800.0
All: 0.01101159888415798 

3:
ranked percentage: 0.002
avg rank: 613.0
All: 0.004502520432633485 

4:
ranked percentage: 0.001
avg rank: 730.0
All: 0.0035237116429305535 

5:
ranked percentage: 0.001
avg rank: 584.0
All: 0.0034258307639602604 

6:
ranked percentage: 0.002
avg rank: 782.0
All: 0.006655899769979935 

7:
ranked percentage: 0.0
avg rank: 0
All: 0.0037684138403562863 

8:
ranked percentage: 0.0
avg rank: 0
All: 0.0005872852738217589 



## Likes

In [50]:
# NOTE: despite the `indegree*` names, these hold the harmonic-centrality ranking.
indegree = top_likes_indices
final_ordered_df = final_ordered_likes_df

# One row per vertex of the whole likes graph (not only the exported top 1000)
indegree_data = [
    {'Rank': rank, '博主id': likes.vs[idx]['name'], 'score': score}
    for rank, (idx, score) in enumerate(indegree, start=1)
]
indegree_df = pd.DataFrame(indegree_data)

user_columns = [
    '用户id', '用户昵称', '性别', '粉丝数', '关注数',
    '是否大V', '微博认证', '个人简介', 'svip等级'
]

# Load only the metadata columns that are used below
user_df = pd.read_csv(
    "Data/user.csv",
    dtype={'用户id': str},
    usecols=lambda col: col in user_columns,
    low_memory=False
)

# Strip ids before de-duplicating so whitespace variants cannot duplicate merge rows
indegree_df['博主id'] = indegree_df['博主id'].astype(str).str.strip()
user_df['用户id'] = user_df['用户id'].astype(str).str.strip()
user_df = user_df.drop_duplicates(subset='用户id', keep='first')

merged_df = indegree_df.merge(user_df, left_on='博主id', right_on='用户id', how='left')

columns = ['Rank', '博主id', '用户昵称', '性别', '粉丝数', '关注数', '是否大V', '微博认证', '个人简介', 'svip等级',
           'score']
final_df = merged_df[columns]

# Build every group mask once per frame: "ranked" = exported top list, "all" = whole graph
masks = {}
for frame_key, frame in (("ranked", final_ordered_df), ("all", final_df)):
    # Anything that is not the text "true" (any casing), including missing values, counts as False
    is_big_v = (
        frame["是否大V"].str.lower()
        .map({"true": True, "false": False})
        .fillna(False)
        .astype(bool)
    )
    # A row counts as "official" when its verification text ends with '微博'
    is_official = frame["微博认证"].map(str).str[-2:] == '微博'
    masks[frame_key] = {
        "Females": frame["性别"] == "f",
        "Males": frame["性别"] == "m",
        "Influential": is_big_v,
        "Not influential": ~is_big_v,
        "Official": is_official,
        "Not Official": ~is_official,
    }

ranked_share = {label: int(mask.sum()) / len(final_ordered_df) for label, mask in masks["ranked"].items()}
all_share = {label: int(mask.sum()) / len(final_df) for label, mask in masks["all"].items()}

# Mean rank per group; an empty group reports 0 instead of raising ZeroDivisionError
avg_rank = {}
for label, mask in masks["ranked"].items():
    group_size = int(mask.sum())
    if group_size > 0:
        avg_rank[label] = int(final_ordered_df.loc[mask, "Rank"].sum()) / group_size
    else:
        avg_rank[label] = 0


print("Ranked percentage:")
print(f"Females: {ranked_share['Females']}")
print(f"Males: {ranked_share['Males']} \n")

print(f"Influential: {ranked_share['Influential']}")
print(f"Not influential: {ranked_share['Not influential']}\n")

print(f"Official: {ranked_share['Official']}")
print(f"Not Official: {ranked_share['Not Official']} \n")



print("Average rank:")
print(f"Females: {avg_rank['Females']}")
print(f"Males: {avg_rank['Males']}\n")

print(f"Influential: {avg_rank['Influential']}")
print(f"Not influential: {avg_rank['Not influential']}\n")

print(f"Official: {avg_rank['Official']}")
print(f"Not Official: {avg_rank['Not Official']}\n")



print("All:")
print(f"Females: {all_share['Females']}")
print(f"Males: {all_share['Males']} \n")

print(f"Influential: {all_share['Influential']}")
print(f"Not influential: {all_share['Not influential']} \n")

print(f"Official: {all_share['Official']}")
print(f"Not Official: {all_share['Not Official']} \n")


Ranked percentage:
Females: 0.806
Males: 0.18 

Influential: 0.03
Not influential: 0.97

Official: 0.0
Not Official: 1.0 

Average rank:
Females: 507.4652605459057
Males: 475.68333333333334

Influential: 614.0666666666667
Not influential: 496.9876288659794

Official: 0
Not Official: 500.5

All:
Females: 0.7370793269230769
Males: 0.19920205662393162 

Influential: 0.032268295940170943
Not influential: 0.9677317040598291 

Official: 0.000717815170940171
Not Official: 0.9992821848290598 



In [56]:
# Parse SVIP levels as ints; a missing level (user absent from user.csv) is left
# as-is instead of crashing int(), and simply matches no level below.
var = final_ordered_df["svip等级"].apply(lambda x: int(x) if pd.notna(x) else x)
all_var = final_df["svip等级"].apply(lambda x: int(x) if pd.notna(x) else x)

print("Vip Degree:")
for level in range(9):
    in_ranked = var == level
    ranked_count = int(in_ranked.sum())
    # A level with no ranked users reports 0 rather than dividing by zero
    if ranked_count > 0:
        avg_rank = int(final_ordered_df.loc[in_ranked, "Rank"].sum()) / ranked_count
    else:
        avg_rank = 0

    print(f"{level}:")
    print(f"ranked percentage: {ranked_count / len(final_ordered_df)}")
    print(f"avg rank: {avg_rank}")
    print(f"All: {int((all_var == level).sum()) / len(final_df)} \n")

Vip Degree:
0:
ranked percentage: 0.914
avg rank: 494.9617067833698
All: 0.9345118856837606 

1:
ranked percentage: 0.075
avg rank: 563.9466666666667
All: 0.05361912393162393 

2:
ranked percentage: 0.006
avg rank: 526.6666666666666
All: 0.0038060897435897435 

3:
ranked percentage: 0.002
avg rank: 858.5
All: 0.0014189369658119658 

4:
ranked percentage: 0.0
avg rank: 0
All: 0.0011184561965811965 

5:
ranked percentage: 0.0
avg rank: 0
All: 0.0013187767094017095 

6:
ranked percentage: 0.001
avg rank: 307.0
All: 0.0025206997863247865 

7:
ranked percentage: 0.002
avg rank: 312.5
All: 0.001452323717948718 

8:
ranked percentage: 0.0
avg rank: 0
All: 0.00021701388888888888 



0

## Reposts

In [57]:
# NOTE: despite the `indegree*` names, these hold the harmonic-centrality ranking.
indegree = top_repost_indices
final_ordered_df = final_ordered_reposts_df

# One row per vertex of the whole reposts graph (not only the exported top 1000)
indegree_data = [
    {'Rank': rank, '博主id': reposts.vs[idx]['name'], 'score': score}
    for rank, (idx, score) in enumerate(indegree, start=1)
]
indegree_df = pd.DataFrame(indegree_data)

user_columns = [
    '用户id', '用户昵称', '性别', '粉丝数', '关注数',
    '是否大V', '微博认证', '个人简介', 'svip等级'
]

# Load only the metadata columns that are used below
user_df = pd.read_csv(
    "Data/user.csv",
    dtype={'用户id': str},
    usecols=lambda col: col in user_columns,
    low_memory=False
)

# Strip ids before de-duplicating so whitespace variants cannot duplicate merge rows
indegree_df['博主id'] = indegree_df['博主id'].astype(str).str.strip()
user_df['用户id'] = user_df['用户id'].astype(str).str.strip()
user_df = user_df.drop_duplicates(subset='用户id', keep='first')

merged_df = indegree_df.merge(user_df, left_on='博主id', right_on='用户id', how='left')

columns = ['Rank', '博主id', '用户昵称', '性别', '粉丝数', '关注数', '是否大V', '微博认证', '个人简介', 'svip等级',
           'score']
final_df = merged_df[columns]

# Build every group mask once per frame: "ranked" = exported top list, "all" = whole graph
masks = {}
for frame_key, frame in (("ranked", final_ordered_df), ("all", final_df)):
    # Anything that is not the text "true" (any casing), including missing values, counts as False
    is_big_v = (
        frame["是否大V"].str.lower()
        .map({"true": True, "false": False})
        .fillna(False)
        .astype(bool)
    )
    # A row counts as "official" when its verification text ends with '微博'
    is_official = frame["微博认证"].map(str).str[-2:] == '微博'
    masks[frame_key] = {
        "Females": frame["性别"] == "f",
        "Males": frame["性别"] == "m",
        "Influential": is_big_v,
        "Not influential": ~is_big_v,
        "Official": is_official,
        "Not Official": ~is_official,
    }

ranked_share = {label: int(mask.sum()) / len(final_ordered_df) for label, mask in masks["ranked"].items()}
all_share = {label: int(mask.sum()) / len(final_df) for label, mask in masks["all"].items()}

# Mean rank per group; an empty group reports 0 instead of raising ZeroDivisionError
avg_rank = {}
for label, mask in masks["ranked"].items():
    group_size = int(mask.sum())
    if group_size > 0:
        avg_rank[label] = int(final_ordered_df.loc[mask, "Rank"].sum()) / group_size
    else:
        avg_rank[label] = 0


print("Ranked percentage:")
print(f"Females: {ranked_share['Females']}")
print(f"Males: {ranked_share['Males']} \n")

print(f"Influential: {ranked_share['Influential']}")
print(f"Not influential: {ranked_share['Not influential']}\n")

print(f"Official: {ranked_share['Official']}")
print(f"Not Official: {ranked_share['Not Official']} \n")



print("Average rank:")
print(f"Females: {avg_rank['Females']}")
print(f"Males: {avg_rank['Males']}\n")

print(f"Influential: {avg_rank['Influential']}")
print(f"Not influential: {avg_rank['Not influential']}\n")

print(f"Official: {avg_rank['Official']}")
print(f"Not Official: {avg_rank['Not Official']}\n")



print("All:")
print(f"Females: {all_share['Females']}")
print(f"Males: {all_share['Males']} \n")

print(f"Influential: {all_share['Influential']}")
print(f"Not influential: {all_share['Not influential']} \n")

print(f"Official: {all_share['Official']}")
print(f"Not Official: {all_share['Not Official']} \n")


Ranked percentage:
Females: 0.732
Males: 0.268 

Influential: 0.028
Not influential: 0.972

Official: 0.0
Not Official: 1.0 

Average rank:
Females: 502.38524590163934
Males: 495.35074626865674

Influential: 538.0
Not influential: 499.41975308641975

Official: 0
Not Official: 500.5

All:
Females: 0.768622863826995
Males: 0.23112036746455164 

Influential: 0.03997032894924538
Not influential: 0.9600296710507547 

Official: 0.0012553136857721605
Not Official: 0.9987446863142279 



In [58]:
# Parse SVIP levels as ints; a missing level (user absent from user.csv) is left
# as-is instead of crashing int(), and simply matches no level below.
var = final_ordered_df["svip等级"].apply(lambda x: int(x) if pd.notna(x) else x)
all_var = final_df["svip等级"].apply(lambda x: int(x) if pd.notna(x) else x)

print("Vip Degree:")
for level in range(9):
    in_ranked = var == level
    ranked_count = int(in_ranked.sum())
    # A level with no ranked users reports 0 rather than dividing by zero
    # (this cell previously had no such guard for any level)
    if ranked_count > 0:
        avg_rank = int(final_ordered_df.loc[in_ranked, "Rank"].sum()) / ranked_count
    else:
        avg_rank = 0

    print(f"{level}:")
    print(f"ranked percentage: {ranked_count / len(final_ordered_df)}")
    print(f"avg rank: {avg_rank}")
    print(f"All: {int((all_var == level).sum()) / len(final_df)} \n")

Vip Degree:
0:
ranked percentage: 0.56
avg rank: 501.5678571428571
All: 0.718980913526005 

1:
ranked percentage: 0.268
avg rank: 491.57835820895525
All: 0.17802630452768822 

2:
ranked percentage: 0.039
avg rank: 533.8974358974359
All: 0.024079198881629627 

3:
ranked percentage: 0.011
avg rank: 424.1818181818182
All: 0.012267838292773387 

4:
ranked percentage: 0.017
avg rank: 522.4117647058823
All: 0.010869875324527118 

5:
ranked percentage: 0.021
avg rank: 628.6190476190476
All: 0.012695786140195715 

6:
ranked percentage: 0.034
avg rank: 443.61764705882354
All: 0.024193318307608912 

7:
ranked percentage: 0.041
avg rank: 513.219512195122
All: 0.015320532937719324 

8:
ranked percentage: 0.009
avg rank: 464.8888888888889
All: 0.003195343927420045 



# Done!