# Pagerank

## Comments

In [1]:
import igraph as ig
from igraph import Graph
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("Data/comments.csv")

# Drop rows with missing IDs
df = df.dropna(subset=['博主id', '评论者id'])

df['博主id'] = df['博主id'].astype(str)
df['评论者id'] = df['评论者id'].astype(str)

edge_weights = df.groupby(['评论者id', '博主id']).size().reset_index(name='weight')

# Create a list of unique node IDs
nodes = pd.Index(pd.concat([edge_weights['评论者id'], edge_weights['博主id']])).unique().tolist()

# Map node IDs to integer indices
id_to_index = {node_id: idx for idx, node_id in enumerate(nodes)}

edges = [(id_to_index[row['评论者id']], id_to_index[row['博主id']]) for _, row in edge_weights.iterrows()]
weights = edge_weights['weight'].tolist()

comments = Graph(directed=True)
comments.add_vertices(len(nodes))
comments.add_edges(edges)
comments.es['weight'] = weights
comments.vs['name'] = nodes

print(comments.summary())

IGRAPH DNW- 20433 22346 -- 
+ attr: name (v), weight (e)


In [3]:
pagerank_scores = comments.pagerank(weights=comments.es["weight"])
comments_user_ranks = sorted(enumerate(pagerank_scores), key=lambda x: x[1], reverse=True)

print("Top 1000 Most Influential Commenters:")
print(f"{'Rank':<6}{'User ID':<10}{'Influence Score':>10}")
print("-" * 46)
for rank, (user_id, score) in enumerate(comments_user_ranks[:1000], start=1):
    print(f"{rank:<6}{user_id:<10}{score:>10.8f}")

Top 1000 Most Influential Commenters:
Rank  User ID   Influence Score
----------------------------------------------
1     10795     0.18565992
2     5714      0.06053434
3     419       0.05237322
4     13174     0.02775309
5     20168     0.02164879
6     20166     0.02092450
7     185       0.02070175
8     804       0.01722949
9     6784      0.01522040
10    14939     0.01420972
11    17468     0.01357796
12    2336      0.01223207
13    364       0.01061264
14    3942      0.01044082
15    20165     0.00898176
16    20179     0.00893416
17    20177     0.00836846
18    11544     0.00739041
19    17579     0.00700398
20    17438     0.00637559
21    17623     0.00602135
22    2166      0.00597601
23    1040      0.00580684
24    12049     0.00574750
25    20171     0.00565064
26    4082      0.00561759
27    8883      0.00522046
28    1372      0.00509392
29    856       0.00415035
30    882       0.00376392
31    18860     0.00363698
32    14515     0.00340479
33    4863      0.0

## Likes

In [4]:
df = pd.read_csv("Data/likes.csv")

# Drop rows with missing IDs
df = df.dropna(subset=['博主id', '点赞者id'])

df['博主id'] = df['博主id'].astype(str)
df['点赞者id'] = df['点赞者id'].astype(str)

edge_weights = df.groupby(['点赞者id', '博主id']).size().reset_index(name='weight')

# Create a list of unique node IDs
nodes = pd.Index(pd.concat([edge_weights['点赞者id'], edge_weights['博主id']])).unique().tolist()

# Map node IDs to integer indices
id_to_index = {node_id: idx for idx, node_id in enumerate(nodes)}

edges = [(id_to_index[row['点赞者id']], id_to_index[row['博主id']]) for _, row in edge_weights.iterrows()]
weights = edge_weights['weight'].tolist()

likes = Graph(directed=True)
likes.add_vertices(len(nodes))
likes.add_edges(edges)
likes.es['weight'] = weights
likes.vs['name'] = nodes

print(likes.summary())

IGRAPH DNW- 59904 72199 -- 
+ attr: name (v), weight (e)


In [5]:
pagerank_scores = likes.pagerank(weights=likes.es["weight"])
likes_user_ranks = sorted(enumerate(pagerank_scores), key=lambda x: x[1], reverse=True)

print("Top 1000 Most Influential Likers:")
print(f"{'Rank':<6}{'User ID':<10}{'Influence Score':>10}")
print("-" * 46)
for rank, (user_id, score) in enumerate(likes_user_ranks[:1000], start=1):
    print(f"{rank:<6}{user_id:<10}{score:>10.8f}")

Top 1000 Most Influential Likers:
Rank  User ID   Influence Score
----------------------------------------------
1     1040      0.03403381
2     13814     0.03326147
3     30700     0.02888170
4     47533     0.02707149
5     4688      0.02009003
6     37257     0.01870666
7     37644     0.01623289
8     7858      0.01595365
9     709       0.01430945
10    5955      0.01331080
11    32703     0.01210240
12    41025     0.01101425
13    40585     0.01054385
14    1312      0.01013568
15    29864     0.01009383
16    1327      0.01007236
17    4006      0.00985986
18    34170     0.00836640
19    2479      0.00815943
20    15772     0.00793060
21    2543      0.00762768
22    11372     0.00654608
23    2443      0.00587641
24    34675     0.00522050
25    59237     0.00488712
26    9895      0.00467474
27    18606     0.00445039
28    59231     0.00431518
29    46244     0.00415087
30    59221     0.00414333
31    19708     0.00404317
32    59189     0.00402106
33    59219     0.00399

## Reposts

In [6]:
df = pd.read_csv("Data/reposts.csv")

# Drop rows with missing IDs
df = df.dropna(subset=['博主id', '转发者id'])

df['博主id'] = df['博主id'].astype(str)
df['转发者id'] = df['转发者id'].astype(str)

edge_weights = df.groupby(['转发者id', '博主id']).size().reset_index(name='weight')

# Create a list of unique node IDs
nodes = pd.Index(pd.concat([edge_weights['转发者id'], edge_weights['博主id']])).unique().tolist()

# Map node IDs to integer indices
id_to_index = {node_id: idx for idx, node_id in enumerate(nodes)}

edges = [(id_to_index[row['转发者id']], id_to_index[row['博主id']]) for _, row in edge_weights.iterrows()]
weights = edge_weights['weight'].tolist()

reposts = Graph(directed=True)
reposts.add_vertices(len(nodes))
reposts.add_edges(edges)
reposts.es['weight'] = weights
reposts.vs['name'] = nodes

print(reposts.summary())

IGRAPH DNW- 35051 38242 -- 
+ attr: name (v), weight (e)


In [7]:
pagerank_scores = reposts.pagerank(weights=reposts.es["weight"])
reposts_user_ranks = sorted(enumerate(pagerank_scores), key=lambda x: x[1], reverse=True)

print("Top 1000 Most Influential Reposters:")
print(f"{'Rank':<6}{'User ID':<10}{'Influence Score':>10}")
print("-" * 46)
for rank, (user_id, score) in enumerate(reposts_user_ranks[:1000], start=1):
    print(f"{rank:<6}{user_id:<10}{score:>10.8f}")

Top 1000 Most Influential Reposters:
Rank  User ID   Influence Score
----------------------------------------------
1     34794     0.10059008
2     34793     0.08813686
3     337       0.06623447
4     34800     0.03601776
5     34798     0.02718894
6     34808     0.01891919
7     34795     0.01719039
8     34825     0.01478394
9     34802     0.00851173
10    4061      0.00744584
11    34822     0.00735232
12    34796     0.00661284
13    34811     0.00616502
14    34801     0.00527441
15    16419     0.00519801
16    34834     0.00483077
17    34806     0.00449070
18    34857     0.00412312
19    34797     0.00411723
20    3456      0.00365510
21    34813     0.00306486
22    34849     0.00268919
23    34983     0.00256511
24    34815     0.00240557
25    34839     0.00217613
26    34804     0.00214872
27    34874     0.00139754
28    4431      0.00131618
29    34853     0.00130910
30    34829     0.00129574
31    34845     0.00122954
32    34803     0.00108836
33    34861     0.00

# Saving to CSV files

## Comments

In [8]:
top_1000 = comments_user_ranks[:1000]
top_1000_data = []
for rank, (idx, score) in enumerate(top_1000, start=1):
    user_id = comments.vs[idx]['name']
    top_1000_data.append({'Rank': rank, '博主id': user_id, 'score': score})
top_1000_df = pd.DataFrame(top_1000_data)

user_columns = [
    '用户id', '用户昵称', '性别', '粉丝数', '关注数',
    '是否大V', '微博认证', '个人简介', 'svip等级'
]

user_df = pd.read_csv(
    "Data/user.csv",
    dtype={'用户id': str},
    low_memory=False
)

user_df = user_df.drop_duplicates(subset='用户id', keep='first')

top_1000_df['博主id'] = top_1000_df['博主id'].astype(str).str.strip()
user_df['用户id'] = user_df['用户id'].astype(str).str.strip()

merged_df = top_1000_df.merge(user_df, left_on='博主id', right_on='用户id', how='left')

missing_users = merged_df[merged_df['用户昵称'].isnull()]
print(f"Users not found in user.csv: {len(missing_users)}")
if not missing_users.empty:
    print(missing_users[['博主id', 'score']].head())

columns = ['Rank', '博主id', '用户昵称', '性别', '粉丝数', '关注数', '是否大V', '微博认证', '个人简介', 'svip等级',
           'score']
final_df = merged_df[columns]

final_df.to_csv("Pagerank_results/top_1000_influential_commenters.csv", index=False, encoding='utf-8-sig')
print("Saved: Pagerank_results/top_1000_influential_commenters.csv")

Users not found in user.csv: 0
Saved: Pagerank_results/top_1000_influential_commenters.csv


## Likes

In [9]:
top_1000 = likes_user_ranks[:1000]
top_1000_data = []
for rank, (idx, score) in enumerate(top_1000, start=1):
    user_id = likes.vs[idx]['name']
    top_1000_data.append({'Rank': rank, '博主id': user_id, 'score': score})
top_1000_df = pd.DataFrame(top_1000_data)

user_columns = [
    '用户id', '用户昵称', '性别', '粉丝数', '关注数',
    '是否大V', '微博认证', '个人简介', 'svip等级'
]

user_df = pd.read_csv(
    "Data/user.csv",
    dtype={'用户id': str},
    low_memory=False
)

user_df = user_df.drop_duplicates(subset='用户id', keep='first')

top_1000_df['博主id'] = top_1000_df['博主id'].astype(str).str.strip()
user_df['用户id'] = user_df['用户id'].astype(str).str.strip()

merged_df = top_1000_df.merge(user_df, left_on='博主id', right_on='用户id', how='left')

missing_users = merged_df[merged_df['用户昵称'].isnull()]
print(f"Users not found in user.csv: {len(missing_users)}")
if not missing_users.empty:
    print(missing_users[['博主id', 'score']].head())

columns = ['Rank', '博主id', '用户昵称', '性别', '粉丝数', '关注数', '是否大V', '微博认证', '个人简介', 'svip等级',
           'score']
final_df = merged_df[columns]

final_df.to_csv("Pagerank_results/top_1000_influential_likers.csv", index=False, encoding='utf-8-sig')
print("Saved: Pagerank_results/top_1000_influential_likers.csv")

Users not found in user.csv: 0
Saved: Pagerank_results/top_1000_influential_likers.csv


## Reposts

In [10]:
top_1000 = reposts_user_ranks[:1000]
top_1000_data = []
for rank, (idx, score) in enumerate(top_1000, start=1):
    user_id = reposts.vs[idx]['name']
    top_1000_data.append({'Rank': rank, '博主id': user_id, 'score': score})
top_1000_df = pd.DataFrame(top_1000_data)

user_columns = [
    '用户id', '用户昵称', '性别', '粉丝数', '关注数',
    '是否大V', '微博认证', '个人简介', 'svip等级'
]

user_df = pd.read_csv(
    "Data/user.csv",
    dtype={'用户id': str},
    low_memory=False
)

user_df = user_df.drop_duplicates(subset='用户id', keep='first')

top_1000_df['博主id'] = top_1000_df['博主id'].astype(str).str.strip()
user_df['用户id'] = user_df['用户id'].astype(str).str.strip()

merged_df = top_1000_df.merge(user_df, left_on='博主id', right_on='用户id', how='left')

missing_users = merged_df[merged_df['用户昵称'].isnull()]
print(f"Users not found in user.csv: {len(missing_users)}")
if not missing_users.empty:
    print(missing_users[['博主id', 'score']].head())

columns = ['Rank', '博主id', '用户昵称', '性别', '粉丝数', '关注数', '是否大V', '微博认证', '个人简介', 'svip等级',
           'score']
final_df = merged_df[columns]

final_df.to_csv("Pagerank_results/top_1000_influential_reposters.csv", index=False, encoding='utf-8-sig')
print("Saved: Pagerank_results/top_1000_influential_reposters.csv")

Users not found in user.csv: 0
Saved: Pagerank_results/top_1000_influential_reposters.csv


# Done!