In [24]:
import pandas as pd
import numpy  as np
import networkx as nx
from pathlib import Path
from itertools import combinations
import community              as community_louvain     # python‑louvain
from networkx.algorithms.community import quality      # modularity
import statsmodels.formula.api as smf

In [25]:
SUG_FILE   = "top100_suggestions.csv"
COM_FILE   =  "top100_comments.csv"

In [26]:
dtype_id = {"suggestionId": int, "commentId": int}

In [27]:
print("Loading CSVs...")
sug = pd.read_csv(SUG_FILE)
com = pd.read_csv(COM_FILE)

Loading CSVs...


In [28]:
# ===== 2. Build edge lists =================================================
print("Building edge lists...")

# 2a. Comment‑flow  (commenter -> suggestion author)
edge_flow = (
    com.merge(
        sug[["suggestionId", "author"]],
        on="suggestionId",
        suffixes=("_com", "_sug"),
    )
    .groupby(["author_com", "author_sug"])
    .size()
    .reset_index(name="weight")
    .query("author_com != author_sug")
)


Building edge lists...


In [29]:
# 2b. Co‑commenter (users who commented on the same suggestion)
pairs = (
    com[["suggestionId", "author"]]
    .drop_duplicates()
    .merge(com[["suggestionId", "author"]], on="suggestionId")
    .query("author_x < author_y")                        # remove self & symmetric dupes
    .value_counts(["author_x", "author_y"])
    .reset_index(name="weight")
    .rename(columns={"author_x": "u", "author_y": "v"})
)

In [30]:
# 2c. User‑Idea bipartite (author OR commenter edges)
ui_edges = pd.concat(
    [
        sug[["author", "suggestionId"]].rename(
            columns={"author": "user", "suggestionId": "idea"}
        ),
        com[["author", "suggestionId"]].rename(
            columns={"author": "user", "suggestionId": "idea"}
        ),
    ],
    ignore_index=True,
).drop_duplicates()

In [31]:
# 2d. Suggestion projection (ideas linked via shared users)
sugg_pairs = (
    ui_edges.merge(ui_edges, on="user")
    .query("idea_x < idea_y")
    .value_counts(["idea_x", "idea_y"])
    .reset_index(name="weight")
    .rename(columns={"idea_x": "s1", "idea_y": "s2"})
)

In [32]:
# ──────────────────────────────────────────────────────────────────────────────
# 3. Build NetworkX graphs
# ──────────────────────────────────────────────────────────────────────────────
print("Constructing NetworkX objects…")

G_flow = nx.DiGraph()
G_flow.add_weighted_edges_from(
    edge_flow[["author_com", "author_sug", "weight"]].values
)

G_co = nx.Graph()
G_co.add_weighted_edges_from(pairs[["u", "v", "weight"]].values)

G_bip = nx.Graph()
G_bip.add_nodes_from(ui_edges["user"].unique(), bipartite="user")
G_bip.add_nodes_from(ui_edges["idea"].unique(), bipartite="idea")
G_bip.add_edges_from(ui_edges[["user", "idea"]].values)

G_proj = nx.Graph()
G_proj.add_weighted_edges_from(sugg_pairs[["s1", "s2", "weight"]].values)

# ──────────────────────────────────────────────────────────────────────────────
# 4b.  Role tagging  (pure‑heuristic, optional override file)
# ──────────────────────────────────────────────────────────────────────────────
print("Tagging roles…")

# -- 1. Build per‑user activity metrics ---------------------------------------
comment_cnt = com["author"].value_counts()
suggest_cnt = sug["author"].value_counts()
vote_totals = sug.groupby("author")["votes"].sum()

user_metrics = (
    pd.DataFrame({"comments": comment_cnt,
                  "suggestions": suggest_cnt,
                  "votes": vote_totals})
      .fillna(0)
)

# -- 2. Thresholds for “contributor” status -----------------------------------
activity_cut = user_metrics["comments"].quantile(0.90)   # top 10 % by comments
vote_cut     = user_metrics["votes"].quantile(0.95)      # or top 5 % by total votes

def infer_role(user: str, row) -> str:
    """Heuristic mapping -> expert / contributor / client."""
    uname = str(user).lower()
    if uname.startswith(("sbx", "starbucks_")):           # employee / expert flag
        return "expert"
    if (row["comments"] >= activity_cut) or (row["votes"] >= vote_cut):
        return "contributor"
    return "client"

role_dict = {
    user: infer_role(user, row) for user, row in user_metrics.iterrows()
}

# -- 3. Optional override via external CSV ------------------------------------
# If someday you receive an authoritative mapping file:
# if ROLE_MAP.exists():
#     print("Override: applying roles from", ROLE_MAP.name)
#     overrides = pd.read_csv(ROLE_MAP)        # columns: user, role
#     role_dict.update(dict(zip(overrides["user"], overrides["role"])))

# -- 4. Attach roles to graph nodes -------------------------------------------
# Any node missing from user_metrics (edge‑case) defaults to 'client'
for u in G_co.nodes():
    G_co.nodes[u]["role"] = role_dict.get(u, "client")


Constructing NetworkX objects…
Tagging roles…


In [34]:
# ──────────────────────────────────────────────────────────────────────────────
# 4. Community detection & quality
# ──────────────────────────────────────────────────────────────────────────────
print("Running Louvain…")
part_co   = community_louvain.best_partition(G_co,   weight="weight")
part_proj = community_louvain.best_partition(G_proj, weight="weight")

nx.set_node_attributes(G_co,   part_co,   "community")
nx.set_node_attributes(G_proj, part_proj, "community")

print("Calculating modularity…")
Q_co   = quality.modularity(G_co,   [ {n for n,c in part_co.items()   if c==k} for k in set(part_co.values()) ],   weight="weight")
Q_proj = quality.modularity(G_proj, [ {n for n,c in part_proj.items() if c==k} for k in set(part_proj.values()) ], weight="weight")


Running Louvain…
Calculating modularity…


In [35]:
# --------------------------------------------------------------------------- #
# 6.  Analytics tables
# --------------------------------------------------------------------------- #
print("Generating analytics…")

# 6a. Community sizes
comm_sizes = (
    pd.Series(part_co)
      .value_counts()
      .rename_axis("community")
      .reset_index(name="size")
      .sort_values("community")
)
comm_sizes["modularity_G_co"] = Q_co
comm_sizes.to_csv("gephi_100_updated/community_sizes_co.csv", index=False)

# 6b. Inter‑community edge weights
edges_inter = [
    {"c1": min(part_co[u], part_co[v]),
     "c2": max(part_co[u], part_co[v]),
     "weight": d["weight"]}
    for u, v, d in G_co.edges(data=True) if part_co[u] != part_co[v]
]
pd.DataFrame(edges_inter)\
  .groupby(["c1", "c2"])["weight"].sum()\
  .reset_index()\
  .to_csv("gephi_100_updated/inter_edges_co.csv", index=False)

# 6c. Role mixing matrix & assortativity
roles      = nx.get_node_attributes(G_co, "role")
role_set   = sorted(set(roles.values()))
mix_mtx    = pd.DataFrame(0, index=role_set, columns=role_set, dtype=int)
for u, v in G_co.edges():
    mix_mtx.loc[roles[u], roles[v]] += 1
mix_mtx.to_csv("gephi_100_updated/role_mixing_co.csv")

assort = nx.attribute_assortativity_coefficient(G_co, "role")

# 6d. Centrality by role
print("Computing centralities…")
btw  = nx.betweenness_centrality(G_co, weight="weight")
deg  = dict(G_co.degree(weight="weight"))

cent_rows = []
for u in G_co.nodes():
    cent_rows.append({
        "user": u,
        "role": roles[u],
        "degree_w": deg[u],
        "betweenness": btw[u],
        "community": part_co[u]
    })
centrality_df = pd.DataFrame(cent_rows)
centrality_df.sort_values(["role", "betweenness"], ascending=[True, False])\
             .to_csv("gephi_100_updated/centrality_by_role.csv", index=False)


Generating analytics…
Computing centralities…


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 7.  Idea‑level feature table & modelling  (robust version)
# ──────────────────────────────────────────────────────────────────────────────
print("Building idea feature set…")

# ── 7.1  Assemble features ───────────────────────────────────────────────────
n_comments = com.groupby("suggestionId").size().rename("n_comments")

author_btw  = {u: btw.get(u, 0) for u in sug["author"]}
author_deg  = {u: deg.get(u, 0) for u in sug["author"]}
author_comm = {u: part_co.get(u, -1) for u in sug["author"]}
author_role = {u: roles.get(u, "unknown") for u in sug["author"]}

idea_df = sug.copy()
idea_df = idea_df.merge(n_comments, left_on="suggestionId", right_index=True, how="left")
idea_df["n_comments"].fillna(0, inplace=True)

idea_df["author_betweenness"] = idea_df["author"].map(author_btw)
idea_df["author_degree_w"]    = idea_df["author"].map(author_deg)
idea_df["author_community"]   = idea_df["author"].map(author_comm)
idea_df["author_role"]        = idea_df["author"].map(author_role)

# Success label
if "implemented" in idea_df.columns:
    idea_df["success"] = idea_df["implemented"].astype(int)
else:
    top_decile = idea_df["votes"].quantile(0.90)
    idea_df["success"] = (idea_df["votes"] >= top_decile).astype(int)

idea_df.to_csv("gephi_100_updated/idea_features.csv", index=False)

# ── 7.2  Prepare design matrix w/ patsy ───────────────────────────────────────
import patsy
formula = "success ~ votes + n_comments + author_betweenness + C(category)"

y, X = patsy.dmatrices(formula, data=idea_df, return_type="dataframe")

# Drop constant columns (zero variance) to avoid singularity
constant_cols = [col for col in X.columns if X[col].nunique() == 1]
if constant_cols:
    print("Dropping constant predictors:", constant_cols)
    X = X.drop(columns=constant_cols)

# ── 7.3  Check rank & condition number ───────────────────────────────────────
rank = np.linalg.matrix_rank(X.values)
if rank < X.shape[1]:
    print(f"Warning: design matrix not full rank ({rank}/{X.shape[1]}). "
          "Attempting to drop collinear columns…")
    # Simple VIF‑based filter
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    keep = []
    for i, col in enumerate(X.columns):
        vif = variance_inflation_factor(X.values, i)
        if np.isfinite(vif) and vif < 50:      # threshold; adjust as needed
            keep.append(col)
        else:
            print(f"  dropping {col} (VIF ≈ {vif:.1f})")
    X = X[keep]

# ── 7.4  Fit logistic model with graceful back‑off ───────────────────────────
print("Fitting logistic regression…")
import statsmodels.api as sm

try:
    logit_model = sm.Logit(y, X).fit(disp=False)
except np.linalg.LinAlgError:
    print("  Singular matrix persists. Switching to L2‑regularised (lbfgs) fit…")
    logit_model = sm.Logit(y, X).fit(
        disp=False,
        method="lbfgs",
        maxiter=100,
        penalization="l2",
        alpha=1.0
    )

with open("gephi_100_updated/logit_summary.txt", "w") as f:
    f.write(logit_model.summary().as_text())

print("Logit pseudo‑R²:", logit_model.prsquared)

# # ── 7.5  (Optional) Cox PH if timestamps present & lifelines available ───────
# if HAS_LIFELINES and {"created_ts", "implemented_ts"}.issubset(idea_df.columns):
#     print("Fitting Cox model…")
#     idea_df["duration"] = (
#         pd.to_datetime(idea_df["implemented_ts"])
#       - pd.to_datetime(idea_df["created_ts"])
#     ).dt.days
#     idea_df = idea_df.dropna(subset=["duration"])
#     if len(idea_df) > 30:                                 # need enough rows
#         cph = CoxPHFitter()
#         cph.fit(
#             idea_df[
#                 ["duration", "success", "votes", "n_comments", "author_betweenness"]
#             ],
#             duration_col="duration",
#             event_col="success",
#         )
#         with open("gephi_100_updated/cox_summary.txt", "w") as f:
#             f.write(cph.summary.to_string())
#     else:
#         print("  Not enough complete duration records for Cox model.")
# else:
#     print("Cox model skipped (lifelines not installed or timestamp cols missing).")


Building idea feature set…
Dropping constant predictors: ['Intercept']
Fitting logistic regression…
  Singular matrix persists. Switching to L2‑regularised (lbfgs) fit…
Logit pseudo‑R²: -1.1322161947352445


In [38]:
# --------------------------------------------------------------------------- #
# 8.  Export graphs
# --------------------------------------------------------------------------- #
print("Writing GEXF layers…")
nx.write_gexf(G_flow, "gephi_100_updated/comment_flow.gexf")
nx.write_gexf(G_co,   "gephi_100_updated/co_commenter.gexf")
nx.write_gexf(G_bip,  "gephi_100_updated/user_idea_bipartite.gexf")
nx.write_gexf(G_proj, "gephi_100_updated/suggestion_projection.gexf")

# --------------------------------------------------------------------------- #
# 9.  Final report
# --------------------------------------------------------------------------- #
print("=" * 60)
print("Pipeline complete.  Key stats")
print("- Number of users           :", G_co.number_of_nodes())
print("- Number of comments edges  :", G_flow.number_of_edges())
print(f"- Modularity (G_co)         : {Q_co:.3f}")
print(f"- Role assortativity (G_co) : {assort:.3f}")
print("- Logistic LL / Pseudo‑R²   :", logit_model.llf, "/", logit_model.prsquared)
print("Outputs saved to:" +"/gephi_100_updated")
print("=" * 60)


Writing GEXF layers…
Pipeline complete.  Key stats
- Number of users           : 3075
- Number of comments edges  : 3432
- Modularity (G_co)         : 0.842
- Role assortativity (G_co) : -0.001
- Logistic LL / Pseudo‑R²   : -69.31471805599453 / -1.1322161947352445
Outputs saved to:/gephi_100_updated
