In [2]:
import pandas as pd
import networkx as nx
from itertools import combinations

In [3]:
# Load the raw suggestion and comment tables. Each file has its own "author"
# column; rename them so the two remain distinguishable side by side.
sug = (
    pd.read_csv("sbf_suggestion.csv")
    .rename(columns={"author": "author_sug"})
)
com = (
    pd.read_csv("sbf_comment.csv")
    .rename(columns={"author": "author_com"})
)

In [4]:
# Explicit strptime-style pattern handed to pd.to_datetime for the "timestamp"
# columns: month/day/4-digit year, then 12-hour clock hour:minute and an AM/PM
# marker (e.g. "03/14/2021 09:05 PM").
date_format = "%m/%d/%Y %I:%M %p"

In [5]:

# Convert the "timestamp" column of both tables to datetimes using the explicit
# format. errors="coerce" turns values that do not match the format into NaT
# instead of raising.
for _frame in (sug, com):
    _frame["timestamp"] = pd.to_datetime(
        _frame["timestamp"], format=date_format, errors="coerce"
    )

In [6]:

# Remove, in place, every row that lacks a suggestionId, an author, or a
# timestamp (NaT produced by the coercing parse counts as missing).
for _frame, _author_col in ((sug, "author_sug"), (com, "author_com")):
    _frame.dropna(subset=["suggestionId", _author_col, "timestamp"], inplace=True)

# Report the remaining table sizes.
# Expected: suggestions ~116673 × 8, comments ~237925 × 6
print("After cleaning:")
for _label, _frame in (("  Suggestions:", sug), ("  Comments:   ", com)):
    print(_label, _frame.shape)

After cleaning:
  Suggestions: (116673, 8)
  Comments:    (237925, 6)


In [7]:
# --- 2. Node & Edge Construction ---

# A. Directed Comment Flow (comment-author -> suggestion-author)
# Attach to every comment the author of the suggestion it was posted on.
# The merge uses pandas' default inner join on "suggestionId".
comment_side = com[["suggestionId", "author_com"]]
suggestion_side = sug[["suggestionId", "author_sug"]]
merged = pd.merge(comment_side, suggestion_side, on="suggestionId")

# Edge weight = number of merged rows per (commenter, suggestion author) pair.
flow_counts = merged.groupby(["author_com", "author_sug"]).size()
edge_flow = flow_counts.reset_index(name="weight").rename(
    columns={"author_com": "src", "author_sug": "dst"}
)

In [8]:
# B. Undirected Co-commenter (users who commented the same suggestion)
# Every unordered pair of distinct commenters on a suggestion contributes one
# co-occurrence; "weight" is the number of suggestions the pair has in common.
co_pairs = []
for _, grp in com.groupby("suggestionId")["author_com"]:
    # Sort the distinct commenters so each pair is always emitted in a single
    # canonical order (u < v). Iterating a bare set yields an arbitrary order,
    # so the same two users could appear as (a, b) for one suggestion and
    # (b, a) for another; the groupby below would then count them as two
    # different rows and split the pair's weight between them.
    # Assumes author values are mutually comparable (e.g. all strings).
    users = sorted(set(grp))
    co_pairs.extend(combinations(users, 2))
df_co = pd.DataFrame(co_pairs, columns=["u","v"])
edge_co = (
    df_co
    .groupby(["u","v"])
    .size()
    .reset_index(name="weight")
)

In [9]:
# C. Suggestion Projection (suggestions linked by shared users)
# Build a user–suggestion mapping from both authorship and comments.
# drop_duplicates leaves one row per distinct (suggestionId, user) pair.
sug_edges = pd.concat([
    sug[["suggestionId","author_sug"]].rename(columns={"author_sug":"user"}),
    com[["suggestionId","author_com"]].rename(columns={"author_com":"user"})
]).drop_duplicates()

# For each user, link every unordered pair of suggestions they touched;
# "weight" is the number of users a pair of suggestions shares.
proj_pairs = []
for _, group in sug_edges.groupby("user")["suggestionId"]:
    # Sort the distinct suggestion ids so each pair is always emitted in one
    # canonical order (s1 < s2). Set iteration order is arbitrary, so without
    # sorting the same pair could be recorded as (a, b) for one user and
    # (b, a) for another, splitting its weight across two rows.
    # Assumes suggestionId values are mutually comparable.
    suggestions = sorted(set(group))
    proj_pairs.extend(combinations(suggestions, 2))
df_proj = pd.DataFrame(proj_pairs, columns=["s1","s2"])
edge_proj = (
    df_proj
    .groupby(["s1","s2"])
    .size()
    .reset_index(name="weight")
)


In [10]:
# D. Bipartite User–Suggestion
# One row per (user, suggestion) pair, with the group size as "weight".
# NOTE(review): `sug_edges` was de-duplicated on exactly these two columns when
# it was built, so every group holds a single row and "weight" is always 1.
# If the weight is meant to reflect interaction counts, confirm and rebuild
# from the un-deduplicated mapping.
pair_sizes = sug_edges.groupby(["user", "suggestionId"]).size()
bip_edges = pair_sizes.reset_index(name="weight")


In [11]:
# --- 3. Graph Assembly & Export to GEXF ---

# 3A. Directed Comment Flow Graph
# One directed edge per row of `edge_flow` (src -> dst), carrying the row's
# count as an integer "weight" attribute, then exported for Gephi.
G_flow = nx.DiGraph()
for src, dst, weight in zip(edge_flow["src"], edge_flow["dst"], edge_flow["weight"]):
    G_flow.add_edge(src, dst, weight=int(weight))
nx.write_gexf(G_flow, "./gephi/comment_flow.gexf")


In [12]:
# 3B. Undirected Co-commenter Graph
# Each row of `edge_co` becomes an undirected edge with an integer "weight".
G_co = nx.Graph()
for u, v, weight in zip(edge_co["u"], edge_co["v"], edge_co["weight"]):
    weight = int(weight)
    if G_co.has_edge(u, v):
        # nx.Graph treats (u, v) and (v, u) as the same edge, and add_edge on
        # an existing edge replaces its attributes. If the edge table lists a
        # pair in both orders, add the counts together instead of letting the
        # later row overwrite the earlier one.
        G_co[u][v]["weight"] += weight
    else:
        G_co.add_edge(u, v, weight=weight)
nx.write_gexf(G_co, "./gephi/co_commenters.gexf")


In [None]:
# 3C. Suggestion Projection Graph (undirected)
# Each row of `edge_proj` becomes an undirected edge between two suggestion
# nodes with an integer "weight".
G_proj = nx.Graph()
for s1, s2, weight in zip(edge_proj["s1"], edge_proj["s2"], edge_proj["weight"]):
    # prefix 'sug_' to avoid name collisions with user names
    node_a, node_b = f"sug_{s1}", f"sug_{s2}"
    weight = int(weight)
    if G_proj.has_edge(node_a, node_b):
        # nx.Graph treats (a, b) and (b, a) as the same edge, and add_edge on
        # an existing edge replaces its attributes. If the edge table lists a
        # pair in both orders, sum the counts rather than keeping only the
        # last one written.
        G_proj[node_a][node_b]["weight"] += weight
    else:
        G_proj.add_edge(node_a, node_b, weight=weight)
nx.write_gexf(G_proj, "./gephi/suggestion_projection.gexf")
