In [1]:

import json
import numpy as np
import networkx as nx
import pandas as pd
from community import community_louvain
import netwulf as nw

class DataProcessor:
    @staticmethod
    def sanitize_value(value):
        """确保所有数值符合JSON规范"""
        try:
            num = float(value)
            if np.isinf(num):
                return 1e308 if num > 0 else -1e308
            if np.isnan(num):
                return 0.0
            return round(num, 6)
        except:
            return value

    @staticmethod
    def load_network(filepath):
        """加载并清洗网络数据"""
        with open(filepath) as f:
            raw_data = json.load(f)
        
        # 清洗节点数据
        nodes = [{
            'id': str(node.get('id', hash(json.dumps(node, sort_keys=True)))),
            **{k: DataProcessor.sanitize_value(v) for k, v in node.items() if k != 'id'}
        } for node in raw_data.get('nodes', [])]
        
        # 清洗边数据
        edges = []
        for edge in raw_data.get('edges', raw_data.get('links', [])):
            source = str(edge.get('source', ''))
            target = str(edge.get('target', ''))
            if source and target:
                edges.append({
                    'source': source,
                    'target': target,
                    'weight': DataProcessor.sanitize_value(edge.get('weight', 1.0))
                })
        
        return nodes, edges

def build_network(nodes, edges):
    """构建网络图"""
    G = nx.Graph()
    
    # 添加节点
    node_ids = {node['id'] for node in nodes}
    for node in nodes:
        G.add_node(node['id'], **{k:v for k,v in node.items() if k != 'id'})
    
    # 添加边
    for edge in edges:
        if edge['source'] in node_ids and edge['target'] in node_ids:
            G.add_edge(edge['source'], edge['target'], weight=edge['weight'])
    
    return G

# 主程序
try:
    # 1. 加载数据
    nodes, edges = DataProcessor.load_network(
        r'C:\Users\昳澄\Desktop\course\DTU exchange\computational social science\讲座\Computational_Social_Scientists_Network.json'
    )
    
    # 2. 构建网络
    G = build_network(nodes, edges)
    print(f"Network contains {G.number_of_nodes()} authors and {G.number_of_edges()} collaborations")
    
    # 3. 社区检测
    partition = community_louvain.best_partition(G)
    modularity = community_louvain.modularity(partition, G)
    
    # 4. 计算每个作者的度数
    degrees = dict(G.degree())
    
    # 5. 结果分析
    community_df = pd.DataFrame({
        'Author': partition.keys(),
        'Community': partition.values(),
        'Degree': [degrees[author] for author in partition.keys()]
    })
    
    # 按度数降序排列
    community_df = community_df.sort_values('Degree', ascending=False)
    
    # 社区规模统计
    size_distribution = community_df['Community'].value_counts()
    
    print("\n=== Analysis Results ===")
    print(f"Number of communities: {len(size_distribution)}")
    print(f"Community size distribution:\n{size_distribution.describe()}")
    print(f"Modularity: {modularity:.3f} ({'Significant' if modularity > 0.3 else 'Insignificant'} community structure)")
    
    # 输出度数统计信息
    print("\n=== Degree Statistics ===")
    print(f"Average degree: {community_df['Degree'].mean():.2f}")
    print(f"Max degree: {community_df['Degree'].max()}")
    print(f"Top 5 authors by degree:")
    for idx, row in community_df.head(5).iterrows():
        print(f"  Author: {row['Author']}, Degree: {row['Degree']}, Community: {row['Community']}")
    
    # 6. 可视化 - 修正参数问题
    # 节点大小基于度数
    nw.visualize(G, config={
        'node': {
            'color': {
                'type': 'categorical',
                'data': list(partition.values()),
                'palette': 'tab20'
            },
            'size': {
                'type': 'numerical',
                'data': [min(20, 3 + degrees[n]/5) for n in G.nodes()],  # 将度数映射到合理的节点大小
                'range': [3, 20]
            }
        },
        'zoom': 0.85,
        'physics': {
            'barnesHut': {
                'gravitationalConstant': -2000,
                'springLength': 100,
                'springConstant': 0.04
            }
        }
    })
    
    # 7. 保存结果
    community_df.to_csv('author_communities_with_degree.csv', index=False)
    print("\nResults saved to author_communities_with_degree.csv")
    
    # 8. 打印最大社区的规模分布
    top_communities = size_distribution.sort_values(ascending=False).head(10)
    print("\nTop 10 community sizes:")
    for i, (comm_id, size) in enumerate(top_communities.items()):
        print(f"Community {comm_id}: {size} members")
        
    # 9. 度数分布统计
    degree_counts = community_df['Degree'].value_counts().sort_index()
    print("\nDegree distribution summary:")
    bins = [0, 1, 5, 10, 20, 50, 100, float('inf')]
    bin_labels = ['0', '1-4', '5-9', '10-19', '20-49', '50-99', '100+']
    degree_binned = pd.cut(community_df['Degree'], bins=bins, labels=bin_labels)
    print(degree_binned.value_counts().sort_index())

except Exception as e:
    print(f"Error: {str(e)}")
    print("Troubleshooting steps:")
    print("1. Verify JSON file contains both 'nodes' and 'edges'")
    print("2. Check sample data with:")
    print("import json; data=json.load(open('YOUR_FILE.json')); print('Keys:', data.keys())")


Network contains 14920 authors and 55466 collaborations

=== Analysis Results ===
Number of communities: 170
Community size distribution:
count     170.000000
mean       87.764706
std       139.160853
min         2.000000
25%         9.000000
50%        29.000000
75%       116.750000
max      1130.000000
Name: count, dtype: float64
Modularity: 0.952 (Significant community structure)

=== Degree Statistics ===
Average degree: 7.44
Max degree: 1147
Top 5 authors by degree:
  Author: https://openalex.org/A5059645286, Degree: 1147, Community: 60
  Author: https://openalex.org/A5100749553, Degree: 501, Community: 23
  Author: https://openalex.org/A5100320883, Degree: 431, Community: 50
  Author: https://openalex.org/A5100343550, Degree: 347, Community: 92
  Author: https://openalex.org/A5100435139, Degree: 309, Community: 90


The default value will be `edges="edges" in NetworkX 3.6.


  nx.node_link_data(G, edges="links") to preserve current behavior, or
  nx.node_link_data(G, edges="edges") for forward compatibility.



Results saved to author_communities_with_degree.csv

Top 10 community sizes:
Community 60: 1130 members
Community 1: 804 members
Community 23: 488 members
Community 14: 486 members
Community 50: 424 members
Community 0: 368 members
Community 92: 348 members
Community 90: 302 members
Community 104: 302 members
Community 7: 292 members

Degree distribution summary:
Degree
0         339
1-4      6603
5-9      6453
10-19    1088
20-49     327
50-99      60
100+       50
Name: count, dtype: int64
