# Projections of bipartite graphs (3 versions)

In [1]:
import csv
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from networkx.algorithms import bipartite

In [2]:
import numpy as np
from collections import defaultdict

In [3]:
# Read the two input tables: the editor node list and the editor-page edge list
nodelist_df, edgelist_df = map(
    pd.read_csv,
    ('wiki_editors_nodelist.csv', 'wiki_editors_edgelist.csv'),
)

In [4]:
# Column names of the node list (nodelist_df).
# Editor identifier; used as the node key in the graphs.
EDITOR_ID_COL = 'ID'
# Editor display name; stored on nodes as the 'label' attribute.
EDITOR_NAME_COL = 'Label'
# NOTE(review): not referenced in the visible cells — presumably a per-editor edit count; confirm.
EDITS_MADE_COL = 'Edits'
# Column names of the edge list (edgelist_df).
# Editor end of each editor-page edge.
SOURCE_COL = 'Source'
# Page end of each editor-page edge.
TARGET_COL = 'Target'

In [5]:
# Create the (initially empty) undirected graph that will hold both editor
# nodes and page nodes; nodes and edges are added in the cells below
B = nx.Graph()

In [8]:
# Add editor nodes (use 'bipartite=0' to mark them as one set).
# The ID and name columns are zipped directly rather than going through
# DataFrame.iterrows(): iterrows() builds a Series for every row (slow) and
# does not preserve column dtypes, which is risky for node identifiers.
editor_nodes = [
    (editor_id, {'bipartite': 0, 'node_type': 'editor', 'label': editor_label})
    for editor_id, editor_label in zip(nodelist_df[EDITOR_ID_COL],
                                       nodelist_df[EDITOR_NAME_COL])
]
B.add_nodes_from(editor_nodes)

In [9]:
# Register every distinct edge target as a page node, tagged 'bipartite=1'
# so it belongs to the opposite set from the editors
unique_pages = edgelist_df[TARGET_COL].unique()
page_nodes = [
    (page_id, dict(bipartite=1, node_type='page'))
    for page_id in unique_pages
]
B.add_nodes_from(page_nodes)

In [10]:
# Add weighted edges between editors and pages.
# The three columns are zipped directly instead of using DataFrame.iterrows():
# iterrows() converts each row to a single-dtype Series, so integer IDs would
# be silently upcast to float whenever the edge list holds a float column, and
# it is considerably slower than iterating the columns.
edges_with_weights = [
    (source_id, target_id, {'weight': edge_weight})
    for source_id, target_id, edge_weight in zip(edgelist_df[SOURCE_COL],
                                                 edgelist_df[TARGET_COL],
                                                 edgelist_df['weight'])
]
B.add_edges_from(edges_with_weights)

In [12]:
# Report the size of the bipartite graph and check that it really is bipartite
print(
    "Bipartite network created:",
    f"  Total nodes: {B.number_of_nodes()}",
    f"  Editor nodes: {len(nodelist_df)}",
    f"  Page nodes: {len(unique_pages)}",
    f"  Edges: {B.number_of_edges()}",
    f"  Is bipartite: {bipartite.is_bipartite(B)}",
    sep="\n",
)

Bipartite network created:
  Total nodes: 1066
  Editor nodes: 979
  Page nodes: 87
  Edges: 1315
  Is bipartite: True


In [13]:
# Split the nodes of B into the two sides using their 'bipartite' attribute
# (0 = editors, 1 = pages)
editor_nodes_set = {node_id for node_id in B if B.nodes[node_id]['bipartite'] == 0}
page_nodes_set = {node_id for node_id in B if B.nodes[node_id]['bipartite'] == 1}

In [14]:
# Show how many nodes landed on each side of the bipartition
print(
    f"  Editor nodes in bipartite set: {len(editor_nodes_set)}",
    f"  Page nodes in bipartite set: {len(page_nodes_set)}",
    sep="\n",
)

  Editor nodes in bipartite set: 979
  Page nodes in bipartite set: 87


#### Create simple editor projection (unweighted)

In [16]:
# Project onto editor nodes (bipartite=0): the result contains only editors,
# and two editors are linked when they are connected to a common page in B
editor_projection_simple = bipartite.projected_graph(B, editor_nodes_set)

In [17]:
# Report the size of the unweighted editor projection
print(
    "Simple editor projection:",
    f"  Nodes: {editor_projection_simple.number_of_nodes()}",
    f"  Edges: {editor_projection_simple.number_of_edges()}",
    sep="\n",
)

Simple editor projection:
  Nodes: 979
  Edges: 11680


In [19]:
# Add editor names as node attributes.
# The ID -> name lookup is built once instead of filtering the whole nodelist
# DataFrame for every node. drop_duplicates() keeps the first row per ID,
# which matches the previous `.iloc[0]` behaviour for repeated IDs.
editor_name_by_id = (
    nodelist_df
    .drop_duplicates(subset=EDITOR_ID_COL)
    .set_index(EDITOR_ID_COL)[EDITOR_NAME_COL]
    .to_dict()
)
for node in editor_projection_simple.nodes():
    editor_projection_simple.nodes[node]['label'] = editor_name_by_id[node]

In [20]:
# Summarise the unweighted projection; skipped entirely when it has no edges
if editor_projection_simple.number_of_edges() > 0:
    degrees = [deg for _node, deg in editor_projection_simple.degree()]
    print(
        f"  Density: {nx.density(editor_projection_simple):.4f}",
        f"  Connected components: {nx.number_connected_components(editor_projection_simple)}",
        f"  Degree range: {min(degrees)} - {max(degrees)}",
        f"  Average degree: {sum(degrees)/len(degrees):.2f}",
        sep="\n",
    )
    # List the first few edges by editor name
    print("\nSample connections (editors who edited same pages):")
    sample_edges = list(editor_projection_simple.edges())[:5]
    for left, right in sample_edges:
        left_name = editor_projection_simple.nodes[left]['label']
        right_name = editor_projection_simple.nodes[right]['label']
        print(f"  {left_name} ↔ {right_name}")

  Density: 0.0244
  Connected components: 11
  Degree range: 0 - 365
  Average degree: 23.86

Sample connections (editors who edited same pages):
  Rjensen ↔ Student7
  Rjensen ↔ ClueBot NG
  Rjensen ↔ 98.239.93.41
  Rjensen ↔ J. Finkelstein
  Rjensen ↔ Fenice


#### Create weighted editor projection using Jaccard similarity

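NetworkX's built-in *overlap_weighted_projected_graph* algorithm (with the parameter jaccard=True) can also compute the Jaccard similarity, but it stores only the weight on each edge. Here the projection is computed manually so that each edge also records the number of shared pages and the total number of unique pages.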
https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.bipartite.projection.overlap_weighted_projected_graph.html#networkx.algorithms.bipartite.projection.overlap_weighted_projected_graph

In [21]:
# Create the weighted projection manually so Jaccard similarity can be used:
# start from an empty undirected graph; editor nodes and Jaccard-weighted
# edges are added in the following cells
editor_projection_weighted = nx.Graph()

In [22]:
# Add all editor nodes, each carrying its display name as 'label'.
# The ID -> name lookup is built once instead of filtering the whole nodelist
# DataFrame for every node. drop_duplicates() keeps the first row per ID,
# which matches the previous `.iloc[0]` behaviour for repeated IDs.
editor_name_by_id = (
    nodelist_df
    .drop_duplicates(subset=EDITOR_ID_COL)
    .set_index(EDITOR_ID_COL)[EDITOR_NAME_COL]
    .to_dict()
)
for node in editor_nodes_set:
    editor_projection_weighted.add_node(node, label=editor_name_by_id[node])

In [23]:
# Map every editor to the set of pages it is linked to in the bipartite graph
editor_pages = {
    editor_id: set(B.neighbors(editor_id))
    for editor_id in editor_nodes_set
}

In [24]:
# Preparation for the pairwise Jaccard computation:
# - jaccard_edges collects one (editor1, editor2, jaccard) tuple per edge
# - editor_list fixes an ordering of the editors so pairs can be enumerated by index
jaccard_edges = list()
editor_list = [*editor_nodes_set]


In [25]:
# Compute the Jaccard similarity |A ∩ B| / |A ∪ B| of the page sets of every
# pair of editors and add an edge for each pair sharing at least one page.
# The accumulator is reset here so that re-running this cell on its own does
# not append every edge to jaccard_edges a second time.
jaccard_edges = []

for i, editor1 in enumerate(editor_list):
    if i % 50 == 0:  # Progress indicator
        print(f"  Processing editor {i}/{len(editor_list)}")

    # Loop-invariant: fetch editor1's pages once rather than once per pair
    pages1 = editor_pages[editor1]
    if not pages1:
        continue  # an editor without pages cannot share a page with anyone

    for editor2 in editor_list[i+1:]:  # Only calculate upper triangle
        pages2 = editor_pages[editor2]

        intersection = len(pages1 & pages2)
        if intersection == 0:
            continue  # Only add edges for editors with shared pages

        # |A ∪ B| = |A| + |B| - |A ∩ B|; it is non-zero whenever the
        # intersection is non-empty, so the division below is safe
        union = len(pages1) + len(pages2) - intersection
        jaccard_similarity = intersection / union

        # Add edge with Jaccard weight and additional info
        editor_projection_weighted.add_edge(editor1, editor2,
                                            jaccard_weight=jaccard_similarity,
                                            shared_pages=intersection,
                                            total_unique_pages=union)
        jaccard_edges.append((editor1, editor2, jaccard_similarity))

print(f"\nWeighted editor projection (Jaccard similarity):")
print(f"  Nodes: {editor_projection_weighted.number_of_nodes()}")
print(f"  Edges: {editor_projection_weighted.number_of_edges()}")

  Processing editor 0/979
  Processing editor 50/979
  Processing editor 100/979
  Processing editor 150/979
  Processing editor 200/979
  Processing editor 250/979
  Processing editor 300/979
  Processing editor 350/979
  Processing editor 400/979
  Processing editor 450/979
  Processing editor 500/979
  Processing editor 550/979
  Processing editor 600/979
  Processing editor 650/979
  Processing editor 700/979
  Processing editor 750/979
  Processing editor 800/979
  Processing editor 850/979
  Processing editor 900/979
  Processing editor 950/979

Weighted editor projection (Jaccard similarity):
  Nodes: 979
  Edges: 11680


In [26]:
# Summarise the Jaccard weights and list the five most similar editor pairs
if jaccard_edges:
    jaccard_weights = [edge[2] for edge in jaccard_edges]
    print(
        f"  Jaccard weight range: {min(jaccard_weights):.4f} - {max(jaccard_weights):.4f}",
        f"  Average Jaccard weight: {sum(jaccard_weights)/len(jaccard_weights):.4f}",
        sep="\n",
    )

    # Order the pairs in place from most to least similar
    jaccard_edges.sort(key=lambda edge: edge[2], reverse=True)
    print("\nTop 5 most similar editor pairs:")
    for rank, (left, right, similarity) in enumerate(jaccard_edges[:5], start=1):
        edge_attrs = editor_projection_weighted[left][right]
        left_name = editor_projection_weighted.nodes[left]['label']
        right_name = editor_projection_weighted.nodes[right]['label']
        print(f"  {rank}. {left_name} ↔ {right_name}")
        print(f"     Jaccard: {similarity:.4f} ({edge_attrs['shared_pages']} shared / {edge_attrs['total_unique_pages']} total pages)")

  Jaccard weight range: 0.0312 - 1.0000
  Average Jaccard weight: 0.5936

Top 5 most similar editor pairs:
  1. 98.239.93.41 ↔ J. Finkelstein
     Jaccard: 1.0000 (1 shared / 1 total pages)
  2. 98.239.93.41 ↔ Fenice
     Jaccard: 1.0000 (1 shared / 1 total pages)
  3. 98.239.93.41 ↔ Ericdn
     Jaccard: 1.0000 (1 shared / 1 total pages)
  4. 98.239.93.41 ↔ Michael Snow
     Jaccard: 1.0000 (1 shared / 1 total pages)
  5. 98.239.93.41 ↔ Stevertigo
     Jaccard: 1.0000 (1 shared / 1 total pages)


This approach overemphasizes pairs of editors who each edited only one page and share it: their overlap is then 100%, so they receive the maximum similarity of 1.0 even though they have just a single page in common.

#### Create alternative weighted projection using edit counts (activity based)

In [27]:
# Empty undirected graph for the activity-based projection: in the cells below
# its edges are weighted by the sum of both editors' edit counts on the pages
# they share
editor_projection_edits = nx.Graph()

In [29]:
# Add all editor nodes, each carrying its display name as 'label'.
# The ID -> name lookup is built once instead of filtering the whole nodelist
# DataFrame for every node. drop_duplicates() keeps the first row per ID,
# which matches the previous `.iloc[0]` behaviour for repeated IDs.
editor_name_by_id = (
    nodelist_df
    .drop_duplicates(subset=EDITOR_ID_COL)
    .set_index(EDITOR_ID_COL)[EDITOR_NAME_COL]
    .to_dict()
)
for node in editor_nodes_set:
    editor_projection_edits.add_node(node, label=editor_name_by_id[node])

In [30]:
# Get edit counts for each editor-page pair as a nested mapping
# {editor: {page: edit count}}. If the edge list repeats an editor-page pair,
# the last row wins.
# The columns are zipped directly instead of using DataFrame.iterrows(), which
# is slow and does not preserve column dtypes (integer IDs and counts can be
# upcast to float when the frame contains a float column).
editor_page_edits = {}
for editor, page, edits in zip(edgelist_df[SOURCE_COL],
                               edgelist_df[TARGET_COL],
                               edgelist_df['weight']):
    editor_page_edits.setdefault(editor, {})[page] = edits

In [31]:
# Calculate edit-based weights between editors: for every pair of editors with
# at least one common page, the edge weight is the sum of both editors' edit
# counts over the pages they share.
for i, editor1 in enumerate(editor_list):
    if i % 50 == 0:
        print(f"  Processing editor {i}/{len(editor_list)}")

    # Loop-invariant: fetch editor1's {page: edits} mapping once per outer
    # iteration instead of rebuilding a set of its pages for every pair
    edits_by_page1 = editor_page_edits.get(editor1, {})
    if not edits_by_page1:
        continue  # an editor without edits cannot share a page with anyone

    for editor2 in editor_list[i+1:]:
        edits_by_page2 = editor_page_edits.get(editor2, {})

        # Dict key views support set intersection directly
        shared_pages = edits_by_page1.keys() & edits_by_page2.keys()
        if not shared_pages:
            continue

        # Calculate total edits on shared pages
        total_shared_edits = sum(
            edits_by_page1[page] + edits_by_page2[page] for page in shared_pages
        )

        # Add edge with edit-based weight
        editor_projection_edits.add_edge(editor1, editor2,
                                         edit_weight=total_shared_edits,
                                         shared_pages=len(shared_pages))

  Processing editor 0/979
  Processing editor 50/979
  Processing editor 100/979
  Processing editor 150/979
  Processing editor 200/979
  Processing editor 250/979
  Processing editor 300/979
  Processing editor 350/979
  Processing editor 400/979
  Processing editor 450/979
  Processing editor 500/979
  Processing editor 550/979
  Processing editor 600/979
  Processing editor 650/979
  Processing editor 700/979
  Processing editor 750/979
  Processing editor 800/979
  Processing editor 850/979
  Processing editor 900/979
  Processing editor 950/979


In [32]:
# Report the size of the edit-weighted editor projection
print(
    "\nEdit-weighted editor projection:",
    f"  Nodes: {editor_projection_edits.number_of_nodes()}",
    f"  Edges: {editor_projection_edits.number_of_edges()}",
    sep="\n",
)


Edit-weighted editor projection:
  Nodes: 979
  Edges: 11680


In [33]:
# Summarise the edit weights and list the five heaviest editor pairs
if editor_projection_edits.number_of_edges() > 0:
    edit_weights = [attrs['edit_weight'] for _, _, attrs in editor_projection_edits.edges(data=True)]
    print(
        f"  Edit weight range: {min(edit_weights)} - {max(edit_weights)}",
        f"  Average edit weight: {sum(edit_weights)/len(edit_weights):.2f}",
        sep="\n",
    )

    # Rank the pairs from highest to lowest combined edit count
    edit_edges = sorted(
        ((left, right, attrs['edit_weight'])
         for left, right, attrs in editor_projection_edits.edges(data=True)),
        key=lambda edge: edge[2],
        reverse=True,
    )

    print("\nTop 5 editor pairs by total edits on shared pages:")
    for rank, (left, right, total_edits) in enumerate(edit_edges[:5], start=1):
        left_name = editor_projection_edits.nodes[left]['label']
        right_name = editor_projection_edits.nodes[right]['label']
        n_shared = editor_projection_edits[left][right]['shared_pages']
        print(f"  {rank}. {left_name} ↔ {right_name}")
        print(f"     Total edits on shared pages: {total_edits} ({n_shared} shared pages)")

  Edit weight range: 2 - 1041
  Average edit weight: 27.62

Top 5 editor pairs by total edits on shared pages:
  1. Balloonman ↔ 98.245.148.9
     Total edits on shared pages: 1041 (1 shared pages)
  2. Deisenbe ↔ Malik Shabazz
     Total edits on shared pages: 961 (3 shared pages)
  3. Parkwells ↔ Deisenbe
     Total edits on shared pages: 889 (5 shared pages)
  4. Deisenbe ↔ North Shoreman
     Total edits on shared pages: 853 (3 shared pages)
  5. Balloonman ↔ Telemachus.forward
     Total edits on shared pages: 828 (1 shared pages)


#### Save new network data

In [34]:
# Export the unweighted projection as an edge list CSV, one row per edge with
# both endpoint IDs and names. Nothing is written when the graph has no edges.
if editor_projection_simple.number_of_edges() > 0:
    simple_edges = [
        {
            'source': src,
            'target': dst,
            'source_name': editor_projection_simple.nodes[src]['label'],
            'target_name': editor_projection_simple.nodes[dst]['label'],
        }
        for src, dst in editor_projection_simple.edges()
    ]

    pd.DataFrame(simple_edges).to_csv('editor_projection_simple.csv', index=False)

In [35]:
# Export the Jaccard-weighted projection as an edge list CSV, one row per edge
# with endpoint IDs, names and the three edge attributes. Nothing is written
# when the graph has no edges.
if editor_projection_weighted.number_of_edges() > 0:
    jaccard_edges_data = [
        {
            'source': src,
            'target': dst,
            'source_name': editor_projection_weighted.nodes[src]['label'],
            'target_name': editor_projection_weighted.nodes[dst]['label'],
            'jaccard_weight': attrs['jaccard_weight'],
            'shared_pages': attrs['shared_pages'],
            'total_unique_pages': attrs['total_unique_pages'],
        }
        for src, dst, attrs in editor_projection_weighted.edges(data=True)
    ]

    pd.DataFrame(jaccard_edges_data).to_csv('editor_projection_jaccard.csv', index=False)

In [36]:
# Export the edit-weighted projection as an edge list CSV, one row per edge
# with endpoint IDs, names, the edit weight and the shared-page count. Nothing
# is written when the graph has no edges.
if editor_projection_edits.number_of_edges() > 0:
    edit_edges_data = [
        {
            'source': src,
            'target': dst,
            'source_name': editor_projection_edits.nodes[src]['label'],
            'target_name': editor_projection_edits.nodes[dst]['label'],
            'edit_weight': attrs['edit_weight'],
            'shared_pages': attrs['shared_pages'],
        }
        for src, dst, attrs in editor_projection_edits.edges(data=True)
    ]

    pd.DataFrame(edit_edges_data).to_csv('editor_projection_edits.csv', index=False)