In [None]:
import networkx as nx
import random
import powerlaw
import numpy as np
import matplotlib.pyplot as plt


# Random Graph Generation
Nodes:
- Author
- Commit
- PR
Parameters:
- num_commits
- num_authors
- pr_exponent (adjusting pr generation)
- base_probability (adjusting pr generation)
- advantage_factor (adjusts the author-PR connection). This is not a parameter of `create_random_github_graph`, but you can change it through `advantage_factor` in the function `preferential_attachment`.

In [None]:
def generate_powerlaw_probability(exponent=2.5, xmin=1):
    """Draw a single random value from a power-law distribution.

    A ``powerlaw.Power_Law`` object is built with lower bound ``xmin`` and
    ``exponent`` as its only parameter; one random sample is requested from it
    and returned.

    NOTE(review): despite the function name, the sample is presumably a value
    on the distribution's support rather than a probability in [0, 1] --
    confirm before using it as one.
    """
    distribution = powerlaw.Power_Law(xmin=xmin, parameters=[exponent])
    samples = distribution.generate_random(1)
    return samples[0]


def preferential_attachment(authors, G, advantage_factor=2):
    """Pick an author with a probability that grows with their current PR count.

    PR-author links are stored as edges pointing from the PR to the author
    (``edge_type='pr_to_author'``), so an author's PR count is the author's
    in-degree -- authors have no outgoing edges. Each author is weighted by
    ``(pr_count + 1) ** advantage_factor``; the ``+ 1`` acts as a virtual link
    so that authors without any PR can still be selected.

    Parameters
    ----------
    authors : sequence
        Candidate author nodes; each must already be a node of ``G``.
    G : networkx.DiGraph
        Graph holding the PR -> author edges.
    advantage_factor : float, default 2
        Exponent applied to the (shifted) PR counts. ``0`` gives a uniform
        choice; larger values favour authors that already have many PRs.

    Returns
    -------
    The selected element of ``authors`` (the original object, not a NumPy copy).

    Raises
    ------
    IndexError
        If ``authors`` is empty.
    """
    if len(authors) == 0:
        raise IndexError("Cannot choose from an empty sequence of authors")

    # In-degree == number of PRs pointing at the author (plus one virtual link).
    weights = np.array(
        [(G.in_degree(author) + 1) ** advantage_factor for author in authors],
        dtype=float,
    )
    probs = weights / weights.sum()

    # Sample an index so the caller gets back the original author object.
    chosen = np.random.choice(len(authors), p=probs)
    return authors[chosen]

def create_random_github_graph(num_commits, num_authors, pr_exponent=2.5, base_probability=0.5):
    """Build a random directed graph of commits, pull requests and authors.

    Nodes carry a ``node_type`` attribute (``'commit'``, ``'pr'`` or
    ``'author'``) and edges carry an ``edge_type`` attribute:

    * ``commit_to_commit`` -- each commit points to the next one in sequence;
    * ``commit_to_pr``     -- every commit points to exactly one PR;
    * ``pr_to_author``     -- PRs point to the authors they are assigned to.

    Commits are walked in order. The first commit always opens a PR; each later
    commit opens a new PR with probability
    ``base_probability / (k + 1) ** (1 / pr_exponent)``, where ``k`` is the
    number of commits already in the current PR, and otherwise joins the
    current PR. Afterwards every author receives an edge from one uniformly
    chosen PR, and every PR gets an edge to an author selected by
    ``preferential_attachment``.

    Returns the populated ``nx.DiGraph``.
    """
    graph = nx.DiGraph()

    # --- commits -----------------------------------------------------------
    commit_ids = [f'commit_{idx}' for idx in range(num_commits)]
    graph.add_nodes_from(commit_ids, node_type='commit')

    # --- PRs: group consecutive commits ------------------------------------
    pr_ids = []
    open_pr = None
    commits_in_open_pr = 0
    previous_commit = None

    for commit in commit_ids:
        # Chain the commit history.
        if previous_commit is not None:
            graph.add_edge(previous_commit, commit, edge_type='commit_to_commit')
        previous_commit = commit

        if open_pr is None:
            # Very first commit: there is no PR to join yet.
            start_new_pr = True
        else:
            draw = random.random()
            threshold = base_probability * (1 / (commits_in_open_pr + 1) ** (1 / pr_exponent))
            start_new_pr = draw < threshold

        if start_new_pr:
            open_pr = f'pr_{len(pr_ids)}'
            pr_ids.append(open_pr)
            graph.add_node(open_pr, node_type='pr')
            commits_in_open_pr = 1
        else:
            commits_in_open_pr += 1

        graph.add_edge(commit, open_pr, edge_type='commit_to_pr')

    # --- authors -----------------------------------------------------------
    author_ids = [f'author_{idx}' for idx in range(num_authors)]
    graph.add_nodes_from(author_ids, node_type='author')

    # Every author gets an edge from one PR picked uniformly at random.
    for author in author_ids:
        graph.add_edge(random.choice(pr_ids), author, edge_type='pr_to_author')

    # Every PR gets an edge to one author picked by preferential attachment.
    for pr in pr_ids:
        graph.add_edge(pr, preferential_attachment(author_ids, graph), edge_type='pr_to_author')

    return graph

In [None]:

# Example usage
# Seed both RNGs used by the generator (`random` and `np.random`) so that
# Restart & Run All reproduces the same graph; change SEED for a different one.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

num_commits = 2000
num_authors = 296
pr_exponent = 2.5
base_probability = 0.7
G = create_random_github_graph(num_commits, num_authors, pr_exponent, base_probability)

# Layout: three horizontal rows -- commits (y=-1), PRs (y=0), authors (y=1).
pos = {}

# Authors are spread evenly over the x-range spanned by the commits.
author_x = 0
for author in [n for n in G.nodes if G.nodes[n]['node_type'] == 'author']:
    pos[author] = (author_x, 1)
    author_x += num_commits / num_authors

# Commits sit at consecutive integer x positions.
for i, commit in enumerate([n for n in G.nodes if G.nodes[n]['node_type'] == 'commit']):
    pos[commit] = (i, -1)

# Each PR is centred above the commits that point to it.
for pr in [n for n in G.nodes if G.nodes[n]['node_type'] == 'pr']:
    connected_commits = [commit for commit in G.predecessors(pr) if G.nodes[commit]['node_type'] == 'commit']
    if connected_commits:
        avg_x = sum(pos[commit][0] for commit in connected_commits) / len(connected_commits)
        pos[pr] = (avg_x, 0)
    else:
        # Defensive fallback for a PR without commits.
        pos[pr] = (random.uniform(0, num_commits), 0)

# One colour per node type, listed in the same order as G.nodes.
node_color_map = {'commit': 'red', 'pr': 'blue', 'author': 'green'}
colors = [node_color_map[G.nodes[node]['node_type']] for node in G.nodes]

# Draw the graph (labels are off, so no font options are needed).
plt.figure(figsize=(12, 8))
nx.draw(G, pos, with_labels=False, node_color=colors, node_size=200)

# Create legend
handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10) for color in node_color_map.values()]
labels = list(node_color_map.keys())
plt.legend(handles, labels, loc='upper left')
plt.title("Randomly Generated GitGraph")
plt.show()

In [None]:
def _degree_histogram(degrees):
    """Return ``(values, counts)`` for every degree that occurs at least once."""
    counts = np.bincount(np.asarray(degrees, dtype=np.int64))
    values = np.flatnonzero(counts)
    return values, counts[values]


def _plot_degree_distribution(ax, degrees, title, scale, connect=False):
    """Scatter a degree histogram on ``ax``; ``scale`` is applied to both axes."""
    values, counts = _degree_histogram(degrees)
    ax.scatter(values, counts, color='b', alpha=0.6, edgecolors='k')
    if connect:
        # Join the points to make the trend easier to follow.
        ax.plot(values, counts, color='r', alpha=0.6)
    ax.set_xscale(scale)
    ax.set_yscale(scale)
    ax.set_title(title)
    ax.set_xlabel('Degree')
    ax.set_ylabel('Frequency')
    ax.grid(True, which="both", ls="--")


def print_graph_statistics(G):
    """Print summary statistics of ``G`` and plot its degree distributions.

    ``G`` is expected to be a directed graph whose nodes all carry a
    ``node_type`` attribute equal to ``'commit'``, ``'pr'`` or ``'author'``.

    Printed: node/edge counts, node counts per type, mean/max/min total degree,
    and mean/max/min number of PRs per author. PR -> author edges point *into*
    the author, so the PR count of an author is the author's in-degree.

    Plotted: the degree distribution of all nodes (log-log), then side by side
    the degree distribution of author nodes (linear axes) and of PR nodes
    (log-log). Returns ``None``.
    """
    type_of = {node: attrs['node_type'] for node, attrs in G.nodes(data=True)}
    degree_of = dict(G.degree())

    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    num_commits = sum(1 for kind in type_of.values() if kind == 'commit')
    num_prs = sum(1 for kind in type_of.values() if kind == 'pr')
    num_authors = sum(1 for kind in type_of.values() if kind == 'author')
    degrees = list(degree_of.values())
    avg_degree = np.mean(degrees)
    max_degree = np.max(degrees)
    min_degree = np.min(degrees)

    print(f"Number of nodes: {num_nodes}")
    print(f"Number of edges: {num_edges}")
    print(f"Number of commit nodes: {num_commits}")
    print(f"Number of PR nodes: {num_prs}")
    print(f"Number of author nodes: {num_authors}")
    print(f"Average degree: {avg_degree:.2f}")
    print(f"Maximum degree: {max_degree}")
    print(f"Minimum degree: {min_degree}")

    # Author statistics: count incoming PR edges (authors have no successors).
    author_pr_counts = [G.in_degree(node) for node, kind in type_of.items() if kind == 'author']
    avg_prs_per_author = np.mean(author_pr_counts)
    max_prs_per_author = np.max(author_pr_counts)
    min_prs_per_author = np.min(author_pr_counts)

    print(f"Average number of PRs per author: {avg_prs_per_author:.2f}")
    print(f"Maximum number of PRs for an author: {max_prs_per_author}")
    print(f"Minimum number of PRs for an author: {min_prs_per_author}")

    # Degree distribution of all nodes, one point per observed degree, log-log axes.
    _, ax = plt.subplots(figsize=(10, 6))
    _plot_degree_distribution(ax, degrees, 'Degree Distribution (Log-Log Scale)', 'log')
    plt.show()

    # Per-type degree distributions: authors (linear axes) next to PRs (log-log).
    author_degrees = [degree_of[node] for node, kind in type_of.items() if kind == 'author']
    pr_degrees = [degree_of[node] for node, kind in type_of.items() if kind == 'pr']

    _, axs = plt.subplots(1, 2, figsize=(20, 10))
    _plot_degree_distribution(axs[0], author_degrees, 'Author Distribution (Linear Scale)', 'linear', connect=True)
    _plot_degree_distribution(axs[1], pr_degrees, 'PR Distribution (Log-Log Scale)', 'log', connect=True)

    # Adjust the layout
    plt.tight_layout()

    # Show the figure
    plt.show()

# Print the summary statistics and draw the degree-distribution plots for the graph G built above.
print_graph_statistics(G)