In [None]:
import pandas as pd
import networkx as nx
from itertools import combinations
import random
from tqdm import tqdm

# Define the file path
file_path = 'audio_editors.csv'

# Initialize an empty graph
G1 = nx.Graph()

# Define a dictionary to map editor names to nodes in the graph
editor_to_node = {}

def get_or_create_node(editor):
    if editor not in editor_to_node:
        # Create a new node for each unique editor
        node_id = len(editor_to_node)  # Use the current dictionary length as the node ID
        editor_to_node[editor] = node_id
        G1.add_node(node_id)  # Add node to the graph
    return editor_to_node[editor]

# Define a function to process edges
def process_edges(df):
    edges_to_add = []
    for _, group in df.groupby('item'):
        editors = group['editor'].dropna().unique()
        if len(editors) > 1:
            editor_nodes = [get_or_create_node(editor) for editor in editors]
            editor_combinations = combinations(editor_nodes, 2)
            for edge in editor_combinations:
                if edge[0] is not None and edge[1] is not None:
                    edges_to_add.append(edge)
    return edges_to_add

# Set chunk size
chunksize = 10000

# Read data, process in chunks
for chunk in tqdm(pd.read_csv(file_path, dtype={'item': str, 'editor': str}, chunksize=chunksize), desc="Processing chunks"):
    chunk = chunk[chunk['editor'].notna()]  # Remove entries with no editor
    chunk = chunk[chunk['editor'].str.strip() != '']  # Remove entries where editor is an empty string
    edges = process_edges(chunk)
    if edges:
        G1.add_edges_from(edges)

# Calculate network statistics
number_of_nodes = G1.number_of_nodes()
number_of_edges = G1.number_of_edges()

# Sample nodes to estimate statistics
sample_size = int(0.1 * number_of_nodes)  # For example, sample 10% of nodes
sampled_nodes = random.sample(G1.nodes(), sample_size)

# Calculate average degree (sampled)
average_degree = sum(dict(G1.degree(sampled_nodes)).values()) / sample_size

# Calculate average clustering coefficient (sampled)
average_clustering_coefficient = nx.average_clustering(G1, nodes=sampled_nodes)

# Create a sampled subgraph to calculate diameter and average path length
sampled_subgraph = G1.subgraph(sampled_nodes)
if nx.is_connected(sampled_subgraph):
    average_diameter = nx.diameter(sampled_subgraph)
    average_path_length = nx.average_shortest_path_length(sampled_subgraph)
else:
    # Handle the case of a disconnected subgraph
    largest_cc = max(nx.connected_components(sampled_subgraph), key=len)
    largest_subgraph = G1.subgraph(largest_cc)
    average_diameter = nx.diameter(largest_subgraph)
    average_path_length = nx.average_shortest_path_length(largest_subgraph)

# Print overall results
print(f"Total number of nodes: {number_of_nodes}")
print(f"Total number of edges: {number_of_edges}")
print(f"Average degree (sampled): {average_degree}")
print(f"Average clustering coefficient (sampled): {average_clustering_coefficient}")
print(f"Average diameter (sampled): {average_diameter}")
print(f"Average path length (sampled): {average_path_length}")

Processing chunks: 100it [00:35,  2.80it/s]
since Python 3.9 and will be removed in a subsequent version.
  sampled_nodes = random.sample(G1.nodes(), sample_size)


Total number of nodes: 27883
Total number of edges: 4841085
Average degree (sampled): 341.47991391678624
Average clustering coefficient (sampled): 0.8739072123657186
Average diameter (sampled): 6
Average path length (sampled): 2.2182680633654766


In [None]:
import networkx as nx
import community.community_louvain as cl

# Use the Louvain algorithm to find the optimal community division of the network
partition = cl.best_partition(G1)

# Calculate the number of communities
number_of_communities = len(set(partition.values()))

# Calculate modularity
modularity = cl.modularity(partition, G1)

# Output basic information about the communities
print(f"Number of communities detected: {number_of_communities}")
print(f"Modularity of the partition: {modularity:.4f}")

Number of communities detected: 11
Modularity of the partition: 0.2366


In [None]:
import pandas as pd
import networkx as nx
from itertools import combinations
import random
from tqdm import tqdm

# Define the file path
file_path = 'video_editors.csv'

# Initialize an empty network graph
G2 = nx.Graph()

# Define a dictionary to map editor names to graph nodes
editor_to_node = {}

def get_or_create_node(editor):
    if editor not in editor_to_node:
        # Create a new node for each unique editor
        node_id = len(editor_to_node)  # Use the current dictionary length as the node ID
        editor_to_node[editor] = node_id
        G2.add_node(node_id)  # Add the node to the graph
    return editor_to_node[editor]

# Define a function to process edges
def process_edges(df):
    edges_to_add = []
    for _, group in df.groupby('item'):
        editors = group['editor'].dropna().unique()
        if len(editors) > 1:
            editor_nodes = [get_or_create_node(editor) for editor in editors]
            editor_combinations = combinations(editor_nodes, 2)
            for edge in editor_combinations:
                if edge[0] is not None and edge[1] is not None:
                    edges_to_add.append(edge)
    return edges_to_add

# Set the chunk size
chunksize = 10000

# Read and process data in chunks
for chunk in tqdm(pd.read_csv(file_path, dtype={'item': str, 'editor': str}, chunksize=chunksize), desc="Processing chunks"):
    chunk = chunk[chunk['editor'].notna()]  # Remove empty editors
    chunk = chunk[chunk['editor'].str.strip() != '']  # Remove blank string editors
    edges = process_edges(chunk)
    if edges:
        G2.add_edges_from(edges)

# Calculate network statistics
number_of_nodes = G2.number_of_nodes()
number_of_edges = G2.number_of_edges()

# Sample nodes to estimate statistics
sample_size = int(0.02 * number_of_nodes)  # For example, take 2% of nodes for sampling
sampled_nodes = random.sample(G2.nodes(), sample_size)

# Calculate average degree (sampled)
average_degree = sum(dict(G2.degree(sampled_nodes)).values()) / sample_size

# Calculate average clustering coefficient (sampled)
average_clustering_coefficient = nx.average_clustering(G2, nodes=sampled_nodes)

# Create a sampled subgraph for calculating diameter and average path length
sampled_subgraph = G2.subgraph(sampled_nodes)
if nx.is_connected(sampled_subgraph):
    average_diameter = nx.diameter(sampled_subgraph)
    average_path_length = nx.average_shortest_path_length(sampled_subgraph)
else:
    # Handle the case of a non-connected subgraph
    largest_cc = max(nx.connected_components(sampled_subgraph), key=len)
    largest_subgraph = G2.subgraph(largest_cc)
    average_diameter = nx.diameter(largest_subgraph)
    average_path_length = nx.average_shortest_path_length(largest_subgraph)

# Print overall results
print(f"Total number of nodes: {number_of_nodes}")
print(f"Total number of edges: {number_of_edges}")
print(f"Average degree : {average_degree}")
print(f"Average clustering coefficient : {average_clustering_coefficient}")
print(f"Average diameter : {average_diameter}")
print(f"Average path length : {average_path_length}")


Processing chunks: 197it [01:42,  1.92it/s]
since Python 3.9 and will be removed in a subsequent version.
  sampled_nodes = random.sample(G2.nodes(), sample_size)


Total number of nodes: 68741
Total number of edges: 14282127
Average degree : 427.2430858806405
Average clustering coefficient : 0.8909310711687697
Average diameter : 4
Average path length : 2.253940152365607


In [None]:
import networkx as nx
import community.community_louvain as cl

# Use the Louvain algorithm to find the optimal community partition of the network
partition = cl.best_partition(G2)

# Calculate the number of communities
number_of_communities = len(set(partition.values()))

# Calculate modularity
modularity = cl.modularity(partition, G2)

# Output basic information about the communities
print(f"Number of communities detected: {number_of_communities}")
print(f"Modularity of the partition: {modularity:.4f}")

Number of communities detected: 14
Modularity of the partition: 0.2041


In [None]:
import pandas as pd
import networkx as nx
from itertools import combinations
import random
from tqdm import tqdm

# Define the file path
file_path = 'image_editors.csv'

# Initialize an empty network graph
G3 = nx.Graph()

# Define a dictionary to map editor names to graph nodes
editor_to_node = {}

def get_or_create_node(editor):
    if editor not in editor_to_node:
        # Create a new node for each unique editor
        node_id = len(editor_to_node)  # Use the current dictionary length as the node ID
        editor_to_node[editor] = node_id
        G3.add_node(node_id)  # Add the node to the graph
    return editor_to_node[editor]

# Define a function to process edges
def process_edges(df):
    edges_to_add = []
    for _, group in df.groupby('item'):
        editors = group['editor'].dropna().unique()
        if len(editors) > 1:
            editor_nodes = [get_or_create_node(editor) for editor in editors]
            editor_combinations = combinations(editor_nodes, 2)
            for edge in editor_combinations:
                if edge[0] is not None and edge[1] is not None:
                    edges_to_add.append(edge)
    return edges_to_add

# Set the chunk size
chunksize = 10000

# Read and process data in chunks
for chunk in tqdm(pd.read_csv(file_path, dtype={'item': str, 'editor': str}, chunksize=chunksize), desc="Processing chunks"):
    chunk = chunk[chunk['editor'].notna()]  # Remove empty editors
    chunk = chunk[chunk['editor'].str.strip() != '']  # Remove blank string editors
    edges = process_edges(chunk)
    if edges:
        G3.add_edges_from(edges)

# Calculate network statistics
number_of_nodes = G3.number_of_nodes()
number_of_edges = G3.number_of_edges()

# Sample nodes to estimate statistics
sample_size = int(0.03 * number_of_nodes)  # For example, take 2% of nodes for sampling
sampled_nodes = random.sample(G3.nodes(), sample_size)

# Calculate average degree (sampled)
average_degree = sum(dict(G3.degree(sampled_nodes)).values()) / sample_size

# Calculate average clustering coefficient (sampled)
average_clustering_coefficient = nx.average_clustering(G3, nodes=sampled_nodes)

# Create a sampled subgraph for calculating diameter and average path length
sampled_subgraph = G3.subgraph(sampled_nodes)
if nx.is_connected(sampled_subgraph):
    average_diameter = nx.diameter(sampled_subgraph)
    average_path_length = nx.average_shortest_path_length(sampled_subgraph)
else:
    # Handle the case of a non-connected subgraph
    largest_cc = max(nx.connected_components(sampled_subgraph), key=len)
    largest_subgraph = G3.subgraph(largest_cc)
    average_diameter = nx.diameter(largest_subgraph)
    average_path_length = nx.average_shortest_path_length(largest_subgraph)

# Print overall results
print(f"Total number of nodes: {number_of_nodes}")
print(f"Total number of edges: {number_of_edges}")
print(f"Average degree: {average_degree}")
print(f"Average clustering coefficient: {average_clustering_coefficient}")
print(f"Average diameter: {average_diameter}")
print(f"Average path length: {average_path_length}")


Processing chunks: 744it [05:43,  2.17it/s]
since Python 3.9 and will be removed in a subsequent version.
  sampled_nodes = random.sample(G3.nodes(), sample_size)


Total number of nodes: 125705
Total number of edges: 26219143
Average degree: 408.2110845929462
Average clustering coefficient: 0.9158438432363202
Average diameter: 5
Average path length: 2.415933978817237


In [None]:
import networkx as nx
import community.community_louvain as cl

# Use the Louvain algorithm to find the optimal community partition of the network
partition = cl.best_partition(G3)

# Calculate the number of communities
number_of_communities = len(set(partition.values()))

# Calculate modularity
modularity = cl.modularity(partition, G3)

# Output basic information about the communities
print(f"Number of communities detected: {number_of_communities}")
print(f"Modularity of the partition: {modularity:.4f}")

Number of communities detected: 17
Modularity of the partition: 0.2041


In [2]:
import networkx as nx
import random

# Define the number of nodes and the connection probability
num_nodes = 27883
connection_prob = 0.01

# Generate a random network using NetworkX and rename it to R1
R1 = nx.erdos_renyi_graph(n=num_nodes, p=connection_prob)

# Calculate network statistics
number_of_nodes_r1 = R1.number_of_nodes()
number_of_edges_r1 = R1.number_of_edges()

# Sample nodes to estimate statistics
sample_size_r1 = int(0.1 * number_of_nodes_r1)  # For example, sample 10% of nodes
sampled_nodes_r1 = random.sample(R1.nodes(), sample_size_r1)

# Calculate average degree (sampled)
average_degree_r1 = sum(dict(R1.degree(sampled_nodes_r1)).values()) / sample_size_r1

# Calculate average clustering coefficient (sampled)
average_clustering_coefficient_r1 = nx.average_clustering(R1, nodes=sampled_nodes_r1)

# Create a sampled subgraph to calculate diameter and average path length
sampled_subgraph_r1 = R1.subgraph(sampled_nodes_r1)
if nx.is_connected(sampled_subgraph_r1):
    average_diameter_r1 = nx.diameter(sampled_subgraph_r1)
    average_path_length_r1 = nx.average_shortest_path_length(sampled_subgraph_r1)
else:
    # Handle the case of a disconnected subgraph
    largest_cc_r1 = max(nx.connected_components(sampled_subgraph_r1), key=len)
    largest_subgraph_r1 = R1.subgraph(largest_cc_r1)
    average_diameter_r1 = nx.diameter(largest_subgraph_r1)
    average_path_length_r1 = nx.average_shortest_path_length(largest_subgraph_r1)

# Print overall results
print(f"Total number of nodes (R1): {number_of_nodes_r1}")
print(f"Total number of edges (R1): {number_of_edges_r1}")
print(f"Average degree : {average_degree_r1}")
print(f"Average clustering coefficient : {average_clustering_coefficient_r1}")
print(f"Average diameter : {average_diameter_r1}")
print(f"Average path length : {average_path_length_r1}")

since Python 3.9 and will be removed in a subsequent version.
  sampled_nodes_r1 = random.sample(R1.nodes(), sample_size_r1)


Total number of nodes (R1): 27883
Total number of edges (R1): 3885287
Average degree : 278.8629842180775
Average clustering coefficient : 0.009994550133255039
Average diameter : 4
Average path length : 2.7417619414590906


In [3]:
import networkx as nx
import community.community_louvain as cl

# Use the Louvain algorithm to find the optimal community partition of the network
partition = cl.best_partition(R1)

# Calculate the number of communities
number_of_communities = len(set(partition.values()))

# Calculate modularity
modularity = cl.modularity(partition, R1)

# Output basic information about the communities
print(f"Number of communities detected: {number_of_communities}")
print(f"Modularity of the partition: {modularity:.4f}")

Number of communities detected: 7
Modularity of the partition: 0.0527


In [4]:
import networkx as nx
import random

# Define the number of nodes and the connection probability
num_nodes = 68741
connection_prob = 0.01

# Generate a random network using NetworkX and rename it to R2
R2 = nx.erdos_renyi_graph(n=num_nodes, p=connection_prob)

# Calculate network statistics
number_of_nodes_r2 = R2.number_of_nodes()
number_of_edges_r2 = R2.number_of_edges()

# Sample nodes to estimate statistics
sample_size_r2 = int(0.03 * number_of_nodes_r2)  # For example, sample 3% of nodes
sampled_nodes_r2 = random.sample(R2.nodes(), sample_size_r2)

# Calculate average degree (sampled)
average_degree_r2 = sum(dict(R2.degree(sampled_nodes_r2)).values()) / sample_size_r2

# Calculate average clustering coefficient (sampled)
average_clustering_coefficient_r2 = nx.average_clustering(R2, nodes=sampled_nodes_r2)

# Create a sampled subgraph to calculate diameter and average path length
sampled_subgraph_r2 = R2.subgraph(sampled_nodes_r2)
if nx.is_connected(sampled_subgraph_r2):
    average_diameter_r2 = nx.diameter(sampled_subgraph_r2)
    average_path_length_r2 = nx.average_shortest_path_length(sampled_subgraph_r2)
else:
    # Handle the case of a disconnected subgraph
    largest_cc_r2 = max(nx.connected_components(sampled_subgraph_r2), key=len)
    largest_subgraph_r2 = R2.subgraph(largest_cc_r2)
    average_diameter_r2 = nx.diameter(largest_subgraph_r2)
    average_path_length_r2 = nx.average_shortest_path_length(largest_subgraph_r2)

# Print overall results
print(f"Total number of nodes (R2): {number_of_nodes_r2}")
print(f"Total number of edges (R2): {number_of_edges_r2}")
print(f"Average degree: {average_degree_r2}")
print(f"Average clustering coefficient: {average_clustering_coefficient_r2}")
print(f"Average diameter: {average_diameter_r2}")
print(f"Average path length: {average_path_length_r2}")

since Python 3.9 and will be removed in a subsequent version.
  sampled_nodes_r2 = random.sample(R2.nodes(), sample_size_r2)


Total number of nodes (R2): 68741
Total number of edges (R2): 23623394
Average degree : 686.8850630455868
Average clustering coefficient : 0.010001502686752659
Average diameter : 4
Average path length : 2.8185892829326304


In [6]:
import networkx as nx
import community.community_louvain as cl

# Use the Louvain algorithm to find the optimal community partition of the network
partition = cl.best_partition(R2)

# Calculate the number of communities
number_of_communities = len(set(partition.values()))

# Calculate modularity
modularity = cl.modularity(partition, R2)

# Output basic information about the communities
print(f"Number of communities detected: {number_of_communities}")
print(f"Modularity of the partition: {modularity:.4f}")

Number of communities detected: 8
Modularity of the partition: 0.0326


In [1]:
import networkx as nx
import random

# Define the number of nodes and the connection probability
num_nodes = 125705
connection_prob = 0.01

# Generate a random network using NetworkX and rename it to R3
R3 = nx.erdos_renyi_graph(n=num_nodes, p=connection_prob)

# Calculate network statistics
number_of_nodes_r3 = R3.number_of_nodes()
number_of_edges_r3 = R3.number_of_edges()

# Sample nodes to estimate statistics
sample_size_r3 = int(0.02 * number_of_nodes_r3)  # For example, sample 2% of nodes
sampled_nodes_r3 = random.sample(list(R3.nodes()), sample_size_r3)

# Calculate average degree (sampled)
average_degree_r3 = sum(dict(R3.degree(sampled_nodes_r3)).values()) / sample_size_r3

# Calculate average clustering coefficient (sampled)
average_clustering_coefficient_r3 = nx.average_clustering(R3, nodes=sampled_nodes_r3)

# Create a sampled subgraph to calculate diameter and average path length
sampled_subgraph_r3 = R3.subgraph(sampled_nodes_r3)
if nx.is_connected(sampled_subgraph_r3):
    average_diameter_r3 = nx.diameter(sampled_subgraph_r3)
    average_path_length_r3 = nx.average_shortest_path_length(sampled_subgraph_r3)
else:
    # Handle the case of a disconnected subgraph
    largest_cc_r3 = max(nx.connected_components(sampled_subgraph_r3), key=len)
    largest_subgraph_r3 = R3.subgraph(largest_cc_r3)
    average_diameter_r3 = nx.diameter(largest_subgraph_r3)
    average_path_length_r3 = nx.average_shortest_path_length(largest_subgraph_r3)

# Print overall results
print(f"Total number of nodes (R3): {number_of_nodes_r3}")
print(f"Total number of edges (R3): {number_of_edges_r3}")
print(f"Average degree: {average_degree_r3}")
print(f"Average clustering coefficient: {average_clustering_coefficient_r3}")
print(f"Average diameter: {average_diameter_r3}")
print(f"Average path length: {average_path_length_r3}")

Total number of nodes (R3): 125705
Total number of edges (R3): 78989330
Average degree : 1256.0373906125697
Average clustering coefficient : 0.009998207635059845
Average diameter : 4
Average path length : 2.7698902857092205


In [2]:
import networkx as nx
import community.community_louvain as cl

# Use the Louvain algorithm to find the optimal community partition of the network
partition = cl.best_partition(R3)

# Calculate the number of communities
number_of_communities = len(set(partition.values()))

# Calculate modularity
modularity = cl.modularity(partition, R3)

# Output basic information about the communities
print(f"Number of communities detected: {number_of_communities}")
print(f"Modularity of the partition: {modularity:.4f}")

Number of communities detected: 7
Modularity of the partition: 0.0245
