# Indegree

## Comments

In [1]:
from pathlib import Path

import igraph as ig
from igraph import Graph
import numpy as np
import pandas as pd

In [2]:
# Build a directed, weighted interaction graph from Data/comments.csv:
# one vertex per account, one edge commenter -> blogger per distinct pair.
#
# Both ID columns are read as strings, the same way user.csv is read when the
# rankings are saved. With an inferred dtype, a single missing ID turns the
# column into float64 and a later astype(str) would yield names such as
# '6164052686.0' that can no longer be matched against the user table.
df = pd.read_csv("Data/comments.csv", dtype={'博主id': str, '评论者id': str})

# Drop rows with missing IDs
df = df.dropna(subset=['博主id', '评论者id'])

# Collapse repeated (commenter, blogger) rows into a single edge; the weight is
# the number of rows that pair had in the file.
edge_weights = df.groupby(['评论者id', '博主id']).size().reset_index(name='weight')

# Create a list of unique node IDs
nodes = pd.Index(pd.concat([edge_weights['评论者id'], edge_weights['博主id']])).unique().tolist()

# Map node IDs to integer indices
id_to_index = {node_id: idx for idx, node_id in enumerate(nodes)}

# Translate both endpoint columns to vertex indices column-wise rather than
# row by row with iterrows(); .tolist() hands plain Python ints to igraph.
source_indices = edge_weights['评论者id'].map(id_to_index).tolist()
target_indices = edge_weights['博主id'].map(id_to_index).tolist()
edges = list(zip(source_indices, target_indices))
weights = edge_weights['weight'].tolist()

comments = Graph(directed=True)
comments.add_vertices(len(nodes))
comments.add_edges(edges)
# Edge attributes are assigned in insertion order, which matches edge_weights' row order.
comments.es['weight'] = weights
comments.vs['name'] = nodes

print(comments.summary())

IGRAPH DNW- 20433 22346 -- 
+ attr: name (v), weight (e)


In [3]:
# Edges run commenter -> blogger with one edge per distinct pair, so a vertex's
# in-degree is the number of distinct accounts that commented on it. Edge
# weights (repeat comments) are not taken into account by indegree().
in_degrees = comments.indegree()

# (vertex index, in-degree) pairs, highest in-degree first.
comments_user_ranks_indegree = sorted(enumerate(in_degrees), key=lambda x: x[1], reverse=True)

# In-degree ranks the accounts that RECEIVE comments, not the ones writing
# them, so the table is titled accordingly.
print("Top 1000 Most Commented-On Bloggers (by In-Degree):")
print(f"{'Rank':<6}{'User ID':<10}{'In-Degree':>10}")
print("-" * 46)
# Fetch the name attribute once instead of indexing the vertex sequence per row.
vertex_names = comments.vs['name']
for rank, (idx, score) in enumerate(comments_user_ranks_indegree[:1000], start=1):
    user_id = vertex_names[idx]
    print(f"{rank:<6}{user_id:<10}{score:>10}")

Top 1000 Most Active Commenters (by In-Degree):
Rank  User ID    In-Degree
----------------------------------------------
1     6164052686      2464
2     6276227682      2101
3     6365990425      2000
4     1774057271       909
5     3910809787       908
6     1314608344       907
7     1655444627       906
8     1463503853       762
9     1699432410       544
10    6651577358       423
11    1223812162       321
12    6986794157       260
13    7227825708       259
14    1706596590       244
15    1700832492       214
16    5609184218       209
17    5284320379       200
18    1885454921       197
19    6038922387       190
20    7787554663       187
21    5851027596       186
22    2107493602       179
23    7331622139       171
24    2792519675       167
25    5032932021       167
26    7755867260       161
27    1403078180       156
28    3141296113       151
29    1411163204       135
30    6569466125       133
31    7670162388       132
32    7002084904       121
33    63372738

## Likes

In [4]:
# Build a directed, weighted interaction graph from Data/likes.csv:
# one vertex per account, one edge liker -> blogger per distinct pair.
#
# Both ID columns are read as strings, the same way user.csv is read when the
# rankings are saved. With an inferred dtype, a single missing ID turns the
# column into float64 and a later astype(str) would yield names such as
# '1655444627.0' that can no longer be matched against the user table.
df = pd.read_csv("Data/likes.csv", dtype={'博主id': str, '点赞者id': str})

# Drop rows with missing IDs
df = df.dropna(subset=['博主id', '点赞者id'])

# Collapse repeated (liker, blogger) rows into a single edge; the weight is
# the number of rows that pair had in the file.
edge_weights = df.groupby(['点赞者id', '博主id']).size().reset_index(name='weight')

# Create a list of unique node IDs
nodes = pd.Index(pd.concat([edge_weights['点赞者id'], edge_weights['博主id']])).unique().tolist()

# Map node IDs to integer indices
id_to_index = {node_id: idx for idx, node_id in enumerate(nodes)}

# Translate both endpoint columns to vertex indices column-wise rather than
# row by row with iterrows(); .tolist() hands plain Python ints to igraph.
source_indices = edge_weights['点赞者id'].map(id_to_index).tolist()
target_indices = edge_weights['博主id'].map(id_to_index).tolist()
edges = list(zip(source_indices, target_indices))
weights = edge_weights['weight'].tolist()

likes = Graph(directed=True)
likes.add_vertices(len(nodes))
likes.add_edges(edges)
# Edge attributes are assigned in insertion order, which matches edge_weights' row order.
likes.es['weight'] = weights
likes.vs['name'] = nodes

print(likes.summary())

IGRAPH DNW- 59904 72199 -- 
+ attr: name (v), weight (e)


In [5]:
# Edges run liker -> blogger with one edge per distinct pair, so a vertex's
# in-degree is the number of distinct accounts that liked it. Edge weights
# (repeat likes) are not taken into account by indegree().
in_degrees = likes.indegree()

# (vertex index, in-degree) pairs, highest in-degree first.
likes_user_ranks_indegree = sorted(enumerate(in_degrees), key=lambda x: x[1], reverse=True)

# In-degree ranks the accounts that RECEIVE likes, not the ones giving them,
# so the table is titled accordingly.
print("Top 1000 Most Liked Bloggers (by In-Degree):")
print(f"{'Rank':<6}{'User ID':<10}{'In-Degree':>10}")
print("-" * 46)
# Fetch the name attribute once instead of indexing the vertex sequence per row.
vertex_names = likes.vs['name']
for rank, (idx, score) in enumerate(likes_user_ranks_indegree[:1000], start=1):
    user_id = vertex_names[idx]
    print(f"{rank:<6}{user_id:<10}{score:>10}")

Top 1000 Most Active Likers (by In-Degree):
Rank  User ID    In-Degree
----------------------------------------------
1     1655444627      1330
2     7264493800      1283
3     5609184218      1000
4     1314608344       999
5     3306212485       998
6     1223812162       997
7     6276227682       995
8     1774057271       993
9     7567651111       930
10    2599051311       855
11    6444551595       798
12    1933903170       746
13    7548920421       744
14    2458743132       699
15    7396956580       624
16    1887344341       551
17    1463503853       500
18    2844644700       500
19    5862006754       500
20    6033073678       500
21    7055357333       500
22    7652377360       500
23    7773204980       500
24    7799806116       500
25    7809335092       500
26    7813929958       500
27    7843070496       500
28    6619742346       500
29    1867099473       500
30    5985928016       500
31    2172631523       500
32    6164052686       500
33    7872698952  

## Reposts

In [6]:
# Build a directed, weighted interaction graph from Data/reposts.csv:
# one vertex per account, one edge reposter -> blogger per distinct pair.
#
# Both ID columns are read as strings, the same way user.csv is read when the
# rankings are saved. With an inferred dtype, a single missing ID turns the
# column into float64 and a later astype(str) would yield names such as
# '6365990425.0' that can no longer be matched against the user table.
df = pd.read_csv("Data/reposts.csv", dtype={'博主id': str, '转发者id': str})

# Drop rows with missing IDs
df = df.dropna(subset=['博主id', '转发者id'])

# Collapse repeated (reposter, blogger) rows into a single edge; the weight is
# the number of rows that pair had in the file.
edge_weights = df.groupby(['转发者id', '博主id']).size().reset_index(name='weight')

# Create a list of unique node IDs
nodes = pd.Index(pd.concat([edge_weights['转发者id'], edge_weights['博主id']])).unique().tolist()

# Map node IDs to integer indices
id_to_index = {node_id: idx for idx, node_id in enumerate(nodes)}

# Translate both endpoint columns to vertex indices column-wise rather than
# row by row with iterrows(); .tolist() hands plain Python ints to igraph.
source_indices = edge_weights['转发者id'].map(id_to_index).tolist()
target_indices = edge_weights['博主id'].map(id_to_index).tolist()
edges = list(zip(source_indices, target_indices))
weights = edge_weights['weight'].tolist()

reposts = Graph(directed=True)
reposts.add_vertices(len(nodes))
reposts.add_edges(edges)
# Edge attributes are assigned in insertion order, which matches edge_weights' row order.
reposts.es['weight'] = weights
reposts.vs['name'] = nodes

print(reposts.summary())

IGRAPH DNW- 35051 38242 -- 
+ attr: name (v), weight (e)


In [7]:
# Edges run reposter -> blogger with one edge per distinct pair, so a vertex's
# in-degree is the number of distinct accounts that reposted it. Edge weights
# (repeat reposts) are not taken into account by indegree().
in_degrees = reposts.indegree()

# (vertex index, in-degree) pairs, highest in-degree first.
reposts_user_ranks_indegree = sorted(enumerate(in_degrees), key=lambda x: x[1], reverse=True)

# In-degree ranks the accounts that GET reposted, not the ones reposting,
# so the table is titled accordingly.
print("Top 1000 Most Reposted Bloggers (by In-Degree):")
print(f"{'Rank':<6}{'User ID':<10}{'In-Degree':>10}")
print("-" * 46)
# Fetch the name attribute once instead of indexing the vertex sequence per row.
vertex_names = reposts.vs['name']
for rank, (idx, score) in enumerate(reposts_user_ranks_indegree[:1000], start=1):
    user_id = vertex_names[idx]
    print(f"{rank:<6}{user_id:<10}{score:>10}")

Top 1000 Most Active Reposters (by In-Degree):
Rank  User ID    In-Degree
----------------------------------------------
1     6365990425      8463
2     3306212485      7573
3     6276227682      3121
4     6164052686      2518
5     7227825708      1681
6     1706596590      1473
7     1314608344      1242
8     1223812162       977
9     1774057271       821
10    1655444627       739
11    1699432410       601
12    1762923331       540
13    6038922387       489
14    6651577358       420
15    2599051311       406
16    1463503853       400
17    3141296113       365
18    2792519675       342
19    1700832492       231
20    5284320379       224
21    5609184218       223
22    3910809787       209
23    6168519077       209
24    1933903170       202
25    7558234208       138
26    2858270197       120
27    2048978227       118
28    1819746910       118
29    6569466125       116
30    1411163204       105
31    7002084904       100
32    1887312183        92
33    228609211

# Saving to CSV files

## Comments

In [14]:
# Join the 1000 highest in-degree vertices of the comments graph with their
# profile rows from Data/user.csv and write the result to a CSV file.
top_1000 = comments_user_ranks_indegree[:1000]
vertex_names = comments.vs['name']
# Columns are given explicitly so the frame keeps its schema even when the
# ranking is empty (otherwise the '博主id' lookup below raises KeyError).
top_1000_df = pd.DataFrame(
    [
        {'Rank': rank, '博主id': vertex_names[idx], 'score': score}
        for rank, (idx, score) in enumerate(top_1000, start=1)
    ],
    columns=['Rank', '博主id', 'score'],
)

user_columns = [
    '用户id', '用户昵称', '性别', '粉丝数', '关注数',
    '是否大V', '微博认证', '个人简介', 'svip等级'
]

# Load only the profile columns that end up in the output.
user_df = pd.read_csv(
    "Data/user.csv",
    usecols=user_columns,
    dtype={'用户id': str},
    low_memory=False
)

# Normalise the join keys BEFORE de-duplicating: IDs that differ only by
# surrounding whitespace would otherwise both survive drop_duplicates and
# produce duplicate rows in the merge.
top_1000_df['博主id'] = top_1000_df['博主id'].astype(str).str.strip()
user_df['用户id'] = user_df['用户id'].astype(str).str.strip()
user_df = user_df.drop_duplicates(subset='用户id', keep='first')

# indicator=True adds a '_merge' column telling whether a profile row matched.
merged_df = top_1000_df.merge(
    user_df, left_on='博主id', right_on='用户id', how='left', indicator=True
)

# Use the merge indicator rather than a null nickname, so users whose
# nickname is genuinely empty in user.csv are not reported as missing.
missing_users = merged_df[merged_df['_merge'] == 'left_only']
print(f"Users not found in user.csv: {len(missing_users)}")
if not missing_users.empty:
    print(missing_users[['博主id', 'score']].head())

columns = ['Rank', '博主id', '用户昵称', '性别', '粉丝数', '关注数', '是否大V', '微博认证', '个人简介', 'svip等级',
           'score']
final_df = merged_df[columns]

output_file = "Indegree_results/top_1000_influential_commenters.csv"
# Create the results folder on a fresh checkout instead of failing in to_csv.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_file, index=False, encoding='utf-8-sig')
print(f"Saved: {output_file}")

Users not found in user.csv: 0
Saved: Indegree_results/top_1000_influential_commenters.csv


## Likes

In [12]:
# Join the 1000 highest in-degree vertices of the likes graph with their
# profile rows from Data/user.csv and write the result to a CSV file.
top_1000 = likes_user_ranks_indegree[:1000]
vertex_names = likes.vs['name']
# Columns are given explicitly so the frame keeps its schema even when the
# ranking is empty (otherwise the '博主id' lookup below raises KeyError).
top_1000_df = pd.DataFrame(
    [
        {'Rank': rank, '博主id': vertex_names[idx], 'score': score}
        for rank, (idx, score) in enumerate(top_1000, start=1)
    ],
    columns=['Rank', '博主id', 'score'],
)

user_columns = [
    '用户id', '用户昵称', '性别', '粉丝数', '关注数',
    '是否大V', '微博认证', '个人简介', 'svip等级'
]

# Load only the profile columns that end up in the output.
user_df = pd.read_csv(
    "Data/user.csv",
    usecols=user_columns,
    dtype={'用户id': str},
    low_memory=False
)

# Normalise the join keys BEFORE de-duplicating: IDs that differ only by
# surrounding whitespace would otherwise both survive drop_duplicates and
# produce duplicate rows in the merge.
top_1000_df['博主id'] = top_1000_df['博主id'].astype(str).str.strip()
user_df['用户id'] = user_df['用户id'].astype(str).str.strip()
user_df = user_df.drop_duplicates(subset='用户id', keep='first')

# indicator=True adds a '_merge' column telling whether a profile row matched.
merged_df = top_1000_df.merge(
    user_df, left_on='博主id', right_on='用户id', how='left', indicator=True
)

# Use the merge indicator rather than a null nickname, so users whose
# nickname is genuinely empty in user.csv are not reported as missing.
missing_users = merged_df[merged_df['_merge'] == 'left_only']
print(f"Users not found in user.csv: {len(missing_users)}")
if not missing_users.empty:
    print(missing_users[['博主id', 'score']].head())

columns = ['Rank', '博主id', '用户昵称', '性别', '粉丝数', '关注数', '是否大V', '微博认证', '个人简介', 'svip等级',
           'score']
final_df = merged_df[columns]

output_file = "Indegree_results/top_1000_influential_likers.csv"
# Create the results folder on a fresh checkout instead of failing in to_csv.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_file, index=False, encoding='utf-8-sig')
print(f"Saved: {output_file}")

Users not found in user.csv: 0
Saved: Indegree_results/top_1000_influential_likers.csv


## Reposts

In [13]:
# Join the 1000 highest in-degree vertices of the reposts graph with their
# profile rows from Data/user.csv and write the result to a CSV file.
top_1000 = reposts_user_ranks_indegree[:1000]
vertex_names = reposts.vs['name']
# Columns are given explicitly so the frame keeps its schema even when the
# ranking is empty (otherwise the '博主id' lookup below raises KeyError).
top_1000_df = pd.DataFrame(
    [
        {'Rank': rank, '博主id': vertex_names[idx], 'score': score}
        for rank, (idx, score) in enumerate(top_1000, start=1)
    ],
    columns=['Rank', '博主id', 'score'],
)

user_columns = [
    '用户id', '用户昵称', '性别', '粉丝数', '关注数',
    '是否大V', '微博认证', '个人简介', 'svip等级'
]

# Load only the profile columns that end up in the output.
user_df = pd.read_csv(
    "Data/user.csv",
    usecols=user_columns,
    dtype={'用户id': str},
    low_memory=False
)

# Normalise the join keys BEFORE de-duplicating: IDs that differ only by
# surrounding whitespace would otherwise both survive drop_duplicates and
# produce duplicate rows in the merge.
top_1000_df['博主id'] = top_1000_df['博主id'].astype(str).str.strip()
user_df['用户id'] = user_df['用户id'].astype(str).str.strip()
user_df = user_df.drop_duplicates(subset='用户id', keep='first')

# indicator=True adds a '_merge' column telling whether a profile row matched.
merged_df = top_1000_df.merge(
    user_df, left_on='博主id', right_on='用户id', how='left', indicator=True
)

# Use the merge indicator rather than a null nickname, so users whose
# nickname is genuinely empty in user.csv are not reported as missing.
missing_users = merged_df[merged_df['_merge'] == 'left_only']
print(f"Users not found in user.csv: {len(missing_users)}")
if not missing_users.empty:
    print(missing_users[['博主id', 'score']].head())

columns = ['Rank', '博主id', '用户昵称', '性别', '粉丝数', '关注数', '是否大V', '微博认证', '个人简介', 'svip等级',
           'score']
final_df = merged_df[columns]

output_file = "Indegree_results/top_1000_influential_reposters.csv"
# Create the results folder on a fresh checkout instead of failing in to_csv.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_file, index=False, encoding='utf-8-sig')
print(f"Saved: {output_file}")

Users not found in user.csv: 0
Saved: Indegree_results/top_1000_influential_reposters.csv


# Done!