In [1]:
import os
from itertools import combinations

import networkx as nx
import pandas as pd

In [2]:
# Load the suggestion and comment exports. The generic "author" column is
# renamed per table so the two kinds of author stay distinguishable once the
# tables are merged or concatenated.
sug = pd.read_csv("top100_suggestions.csv")
sug = sug.rename(columns={"author": "author_sug"})
com = pd.read_csv("top100_comments.csv")
com = com.rename(columns={"author": "author_com"})

In [3]:
# strptime pattern used to parse the "timestamp" columns:
# month/day/4-digit year, 12-hour clock with minutes, then AM/PM.
date_format = "%m/%d/%Y %I:%M %p"

In [4]:

# Convert the "timestamp" column of both tables to datetimes using the
# explicit format. Entries that do not match the format become NaT
# (errors="coerce") instead of raising.
for frame in (sug, com):
    frame["timestamp"] = pd.to_datetime(
        frame["timestamp"], format=date_format, errors="coerce"
    )

In [5]:

# Keep only rows that have a suggestionId, an author and a parseable timestamp.
# Reassigning (rather than inplace=True) keeps the step an explicit
# "new frame from old frame" transformation, which is easier to follow and
# safe to re-run.
sug = sug.dropna(subset=["suggestionId", "author_sug", "timestamp"])
com = com.dropna(subset=["suggestionId", "author_com", "timestamp"])

print("After cleaning:")
# For the top-100 extract the recorded run printed (100, 7) and (3933, 5).
print("  Suggestions:", sug.shape)
print("  Comments:   ", com.shape)

After cleaning:
  Suggestions: (100, 7)
  Comments:    (3933, 5)


In [6]:
# --- 2. Node & Edge Construction ---

# A. Directed Comment Flow (comment-author -> suggestion-author)
# Attach each comment to the author of the suggestion it belongs to, then
# count how many comments each (commenter, suggestion author) pair produced.
comment_authors = com[["suggestionId", "author_com"]]
suggestion_authors = sug[["suggestionId", "author_sug"]]
merged = comment_authors.merge(suggestion_authors, on="suggestionId")

pair_counts = merged.groupby(["author_com", "author_sug"]).size()
edge_flow = pair_counts.reset_index(name="weight").rename(
    columns={"author_com": "src", "author_sug": "dst"}
)

In [7]:
# B. Undirected Co-commenter (users who commented the same suggestion)
# Every pair is emitted in a canonical (sorted) order. Iterating a raw set
# yields an arbitrary order, so the same two users could otherwise show up as
# (a, b) for one suggestion and (b, a) for another; the groupby would count
# those separately and the undirected graph would keep only one of the two
# partial weights.
co_pairs = []
for _, grp in com.groupby("suggestionId")["author_com"]:
    # key=str keeps the ordering well-defined even if the column holds
    # values of mixed types.
    users = sorted(set(grp), key=str)
    co_pairs.extend(combinations(users, 2))
df_co = pd.DataFrame(co_pairs, columns=["u", "v"])

# weight = number of suggestions on which both users commented
edge_co = (
    df_co
    .groupby(["u", "v"])
    .size()
    .reset_index(name="weight")
)

In [8]:
# C. Suggestion Projection (suggestions linked by shared users)
# Build a user–suggestion mapping from both authorship and comments;
# duplicates are dropped so each (suggestion, user) pair appears once.
sug_edges = pd.concat([
    sug[["suggestionId", "author_sug"]].rename(columns={"author_sug": "user"}),
    com[["suggestionId", "author_com"]].rename(columns={"author_com": "user"}),
]).drop_duplicates()

# For every user, link all suggestions that user touched. Pairs are emitted in
# sorted order so (s1, s2) and (s2, s1) cannot both occur: set iteration order
# is arbitrary, and reversed duplicates would split the count across two rows
# that later collapse onto a single undirected edge.
proj_pairs = []
for _, group in sug_edges.groupby("user")["suggestionId"]:
    suggestion_ids = sorted(set(group))
    proj_pairs.extend(combinations(suggestion_ids, 2))
df_proj = pd.DataFrame(proj_pairs, columns=["s1", "s2"])

# weight = number of users shared by the two suggestions
edge_proj = (
    df_proj
    .groupby(["s1", "s2"])
    .size()
    .reset_index(name="weight")
)


In [9]:
# D. Bipartite User–Suggestion
# weight = number of interactions a user has with a suggestion (authoring it
# counts once, each comment counts once). The rows are rebuilt from the raw
# tables because `sug_edges` is already de-duplicated, which would make every
# group size — and therefore every weight — equal to 1.
user_suggestion_rows = pd.concat([
    sug[["suggestionId", "author_sug"]].rename(columns={"author_sug": "user"}),
    com[["suggestionId", "author_com"]].rename(columns={"author_com": "user"}),
])
bip_edges = (
    user_suggestion_rows
    .groupby(["user", "suggestionId"])
    .size()
    .reset_index(name="weight")
)


In [10]:
# --- 3. Graph Assembly & Export to GEXF ---

# 3A. Directed Comment Flow Graph
# Create the output folder first: the GEXF file is written into it and the
# write fails if the folder is missing (e.g. on a fresh checkout).
os.makedirs("./gephi_100", exist_ok=True)

G_flow = nx.DiGraph()
for src, dst, weight in zip(edge_flow["src"], edge_flow["dst"], edge_flow["weight"]):
    # One directed edge per (commenter -> suggestion author) pair; the weight
    # is stored as a plain Python int.
    G_flow.add_edge(src, dst, weight=int(weight))
nx.write_gexf(G_flow, "./gephi_100/comment_flow.gexf")


In [11]:
# 3B. Undirected Co-commenter Graph
# Create the output folder first so the export cannot fail on a missing path.
os.makedirs("./gephi_100", exist_ok=True)

G_co = nx.Graph()
for u, v, weight in zip(edge_co["u"], edge_co["v"], edge_co["weight"]):
    # In an undirected graph (u, v) and (v, u) are the same edge. Accumulate
    # instead of overwriting so a pair that appears in both orientations in
    # the edge table keeps its full weight.
    if G_co.has_edge(u, v):
        G_co[u][v]["weight"] += int(weight)
    else:
        G_co.add_edge(u, v, weight=int(weight))
nx.write_gexf(G_co, "./gephi_100/co_commenters.gexf")


In [12]:
# 3C. Suggestion Projection Graph (undirected)
# Create the output folder first so the export cannot fail on a missing path.
os.makedirs("./gephi_100", exist_ok=True)

G_proj = nx.Graph()
for s1, s2, weight in zip(edge_proj["s1"], edge_proj["s2"], edge_proj["weight"]):
    # prefix 'sug_' to avoid name collisions with user names
    node_a, node_b = f"sug_{s1}", f"sug_{s2}"
    # Accumulate rather than overwrite: the same undirected pair may be listed
    # in both orientations in the edge table.
    if G_proj.has_edge(node_a, node_b):
        G_proj[node_a][node_b]["weight"] += int(weight)
    else:
        G_proj.add_edge(node_a, node_b, weight=int(weight))
nx.write_gexf(G_proj, "./gephi_100/suggestion_projection.gexf")


In [13]:
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities
import pandas as pd

# --- 1. Load the co-commenter graph ---
# Read back the graph exported earlier in this notebook.
G = nx.read_gexf("gephi_100/co_commenters.gexf")

# --- 2. Community Detection (Greedy Modularity) ---
# `communities` is a list of node collections; the position in the list is
# used as the community id.
communities = list(greedy_modularity_communities(G, weight='weight'))
node_to_comm = dict(
    (member, cid)
    for cid, members in enumerate(communities)
    for member in members
)


In [14]:
# --- 3. Community Sizes ---
# One row per community: its id (list position) and its number of members.
comm_sizes = pd.DataFrame(
    [(cid, len(members)) for cid, members in enumerate(communities)],
    columns=['community_id', 'size'],
)
comm_sizes.to_csv("community_sizes.csv", index=False)


In [15]:
# --- 4. Inter-Community Edge Weights ---
# Total edge weight between each unordered pair of distinct communities,
# keyed by (smaller_id, larger_id). Edges without a weight count as 1.
inter = {}
for u, v, data in G.edges(data=True):
    cu = node_to_comm[u]
    cv = node_to_comm[v]
    if cu == cv:
        # edge stays inside one community
        continue
    key = (cu, cv) if cu < cv else (cv, cu)
    inter[key] = inter.get(key, 0) + data.get('weight', 1)

In [16]:

# Tabulate the inter-community totals, heaviest pair first, and save them.
inter_rows = [
    {'community_1': pair[0], 'community_2': pair[1], 'weight': total}
    for pair, total in inter.items()
]
# Explicit columns keep the frame well-formed when no edge crosses a community
# boundary (e.g. a single community): without them the empty frame has no
# 'weight' column and sort_values raises KeyError.
inter_df = pd.DataFrame(
    inter_rows, columns=['community_1', 'community_2', 'weight']
).sort_values('weight', ascending=False)
inter_df.to_csv("inter_community_weights.csv", index=False)

In [17]:
# --- 5. Identify Bridge-Nodes (Inter-Community Connectors) ---
# Betweenness centrality
# networkx interprets the `weight` argument as a path *distance*, whereas the
# co-comment weight is a tie strength (more shared suggestions = closer).
# Passing the raw weight would make strongly tied users look far apart, so the
# reciprocal is stored as a separate 'distance' attribute and used instead.
# The original 'weight' attribute is left untouched.
edge_distance = {}
for u, v, data in G.edges(data=True):
    strength = data.get('weight', 1)
    # Non-positive strengths cannot be inverted; treat them as unreachable.
    edge_distance[(u, v)] = 1.0 / strength if strength > 0 else float('inf')
nx.set_edge_attributes(G, edge_distance, name='distance')

betweenness = nx.betweenness_centrality(G, weight='distance')

In [18]:
# Sum of weights on edges that cross communities
# Every node starts at 0; each boundary-crossing edge adds its weight
# (1 when the edge has none) to both of its endpoints.
inter_edge_weight = dict.fromkeys(G.nodes(), 0)
for a, b, attrs in G.edges(data=True):
    if node_to_comm[a] == node_to_comm[b]:
        continue
    crossing = attrs.get('weight', 1)
    for endpoint in (a, b):
        inter_edge_weight[endpoint] += crossing

In [19]:
# Per-node summary: community id, betweenness and total weight of the node's
# boundary-crossing edges, sorted so the strongest connectors come first.
bridge_records = []
for node in G.nodes():
    bridge_records.append({
        'node': node,
        'community': node_to_comm[node],
        'betweenness': betweenness[node],
        'inter_edge_weight': inter_edge_weight[node],
    })
bridge_df = pd.DataFrame(bridge_records)
bridge_df = bridge_df.sort_values('inter_edge_weight', ascending=False)
bridge_df.to_csv("bridge_nodes.csv", index=False)

# List the files written by this and the preceding cells.
print("Results saved to CSV:")
print(" - community_sizes.csv")
print(" - inter_community_weights.csv")
print(" - bridge_nodes.csv")

Results saved to CSV:
 - community_sizes.csv
 - inter_community_weights.csv
 - bridge_nodes.csv


In [22]:
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities

# 1. Reload the co-commenter graph from the earlier export
G = nx.read_gexf("gephi_100/co_commenters.gexf")

# 2. Detect communities (greedy modularity, using edge weights)
communities = list(greedy_modularity_communities(G, weight='weight'))

# 3. Map every node to the index of the community that contains it
node_to_comm = dict(
    (member, idx)
    for idx, group in enumerate(communities)
    for member in group
)

# 4. Store the community id on each node
nx.set_node_attributes(G, node_to_comm, name="community")

# 5. Record the number of communities on the graph, and the size of each
#    node's community on the node itself
G.graph['num_communities'] = len(communities)
comm_sizes = {idx: len(group) for idx, group in enumerate(communities)}
nx.set_node_attributes(
    G,
    {member: comm_sizes[idx] for member, idx in node_to_comm.items()},
    name="community_size",
)

# 6. Write the annotated graph for Gephi
nx.write_gexf(G, "gephi_100/co_commenters_top100_communities.gexf")

print("Exported co_commenters_top100_communities.gexf with community attributes.")


Exported co_commenters_top100_communities.gexf with community attributes.
