In [1]:
import pandas as pd
import numpy  as np
import networkx as nx
from pathlib import Path
from itertools import combinations
import community              as community_louvain     # python‑louvain
from networkx.algorithms.community import quality      # modularity
import statsmodels.formula.api as smf

In [2]:
SUG_FILE   = "combined_500_suggestions.csv"
COM_FILE   =  "combined_500_comments.csv"

In [3]:
dtype_id = {"suggestionId": int, "commentId": int}

In [4]:
print("Loading CSVs...")
sug = pd.read_csv(SUG_FILE)
com = pd.read_csv(COM_FILE)

Loading CSVs...


In [5]:
# ===== 2. Build edge lists =================================================
print("Building edge lists...")

# 2a. Comment‑flow  (commenter -> suggestion author)
edge_flow = (
    com.merge(
        sug[["suggestionId", "author"]],
        on="suggestionId",
        suffixes=("_com", "_sug"),
    )
    .groupby(["author_com", "author_sug"])
    .size()
    .reset_index(name="weight")
    .query("author_com != author_sug")
)


Building edge lists...


In [6]:
# 2b. Co‑commenter (users who commented on the same suggestion)
pairs = (
    com[["suggestionId", "author"]]
    .drop_duplicates()
    .merge(com[["suggestionId", "author"]], on="suggestionId")
    .query("author_x < author_y")                        # remove self & symmetric dupes
    .value_counts(["author_x", "author_y"])
    .reset_index(name="weight")
    .rename(columns={"author_x": "u", "author_y": "v"})
)

In [7]:
# 2c. User‑Idea bipartite (author OR commenter edges)
ui_edges = pd.concat(
    [
        sug[["author", "suggestionId"]].rename(
            columns={"author": "user", "suggestionId": "idea"}
        ),
        com[["author", "suggestionId"]].rename(
            columns={"author": "user", "suggestionId": "idea"}
        ),
    ],
    ignore_index=True,
).drop_duplicates()

In [8]:
# 2d. Suggestion projection (ideas linked via shared users)
sugg_pairs = (
    ui_edges.merge(ui_edges, on="user")
    .query("idea_x < idea_y")
    .value_counts(["idea_x", "idea_y"])
    .reset_index(name="weight")
    .rename(columns={"idea_x": "s1", "idea_y": "s2"})
)

In [9]:
# ──────────────────────────────────────────────────────────────────────────────
# 3. Build NetworkX graphs
# ──────────────────────────────────────────────────────────────────────────────
print("Constructing NetworkX objects…")

G_flow = nx.DiGraph()
G_flow.add_weighted_edges_from(
    edge_flow[["author_com", "author_sug", "weight"]].values
)

G_co = nx.Graph()
G_co.add_weighted_edges_from(pairs[["u", "v", "weight"]].values)

G_bip = nx.Graph()
G_bip.add_nodes_from(ui_edges["user"].unique(), bipartite="user")
G_bip.add_nodes_from(ui_edges["idea"].unique(), bipartite="idea")
G_bip.add_edges_from(ui_edges[["user", "idea"]].values)

G_proj = nx.Graph()
G_proj.add_weighted_edges_from(sugg_pairs[["s1", "s2", "weight"]].values)

# ──────────────────────────────────────────────────────────────────────────────
# 4b.  Role tagging  (pure‑heuristic, optional override file)
# ──────────────────────────────────────────────────────────────────────────────
print("Tagging roles…")

# -- 1. Build per‑user activity metrics ---------------------------------------
comment_cnt = com["author"].value_counts()
suggest_cnt = sug["author"].value_counts()
vote_totals = sug.groupby("author")["votes"].sum()

user_metrics = (
    pd.DataFrame({"comments": comment_cnt,
                  "suggestions": suggest_cnt,
                  "votes": vote_totals})
      .fillna(0)
)

# -- 2. Thresholds for “contributor” status -----------------------------------
activity_cut = user_metrics["comments"].quantile(0.90)   # top 10 % by comments
vote_cut     = user_metrics["votes"].quantile(0.95)      # or top 5 % by total votes

def infer_role(user: str, row) -> str:
    """Heuristic mapping -> expert / contributor / client."""
    uname = str(user).lower()
    if uname.startswith(("sbx", "starbucks_")):           # employee / expert flag
        return "expert"
    if (row["comments"] >= activity_cut) or (row["votes"] >= vote_cut):
        return "contributor"
    return "client"

role_dict = {
    user: infer_role(user, row) for user, row in user_metrics.iterrows()
}

# -- 3. Optional override via external CSV ------------------------------------
# If someday you receive an authoritative mapping file:
# if ROLE_MAP.exists():
#     print("Override: applying roles from", ROLE_MAP.name)
#     overrides = pd.read_csv(ROLE_MAP)        # columns: user, role
#     role_dict.update(dict(zip(overrides["user"], overrides["role"])))

# -- 4. Attach roles to graph nodes -------------------------------------------
# Any node missing from user_metrics (edge‑case) defaults to 'client'
for u in G_co.nodes():
    G_co.nodes[u]["role"] = role_dict.get(u, "client")


Constructing NetworkX objects…
Tagging roles…


In [10]:
# ──────────────────────────────────────────────────────────────────────────────
# 4. Community detection & quality
# ──────────────────────────────────────────────────────────────────────────────
print("Running Louvain…")
part_co   = community_louvain.best_partition(G_co,   weight="weight")
part_proj = community_louvain.best_partition(G_proj, weight="weight")

nx.set_node_attributes(G_co,   part_co,   "community")
nx.set_node_attributes(G_proj, part_proj, "community")

print("Calculating modularity…")
Q_co   = quality.modularity(G_co,   [ {n for n,c in part_co.items()   if c==k} for k in set(part_co.values()) ],   weight="weight")
Q_proj = quality.modularity(G_proj, [ {n for n,c in part_proj.items() if c==k} for k in set(part_proj.values()) ], weight="weight")


Running Louvain…
Calculating modularity…


In [11]:
# --------------------------------------------------------------------------- #
# 6.  Analytics tables
# --------------------------------------------------------------------------- #
print("Generating analytics…")

# 6a. Community sizes
comm_sizes = (
    pd.Series(part_co)
      .value_counts()
      .rename_axis("community")
      .reset_index(name="size")
      .sort_values("community")
)
comm_sizes["modularity_G_co"] = Q_co
comm_sizes.to_csv("gephi_1500/community_sizes_co.csv", index=False)

# 6b. Inter‑community edge weights
edges_inter = [
    {"c1": min(part_co[u], part_co[v]),
     "c2": max(part_co[u], part_co[v]),
     "weight": d["weight"]}
    for u, v, d in G_co.edges(data=True) if part_co[u] != part_co[v]
]
pd.DataFrame(edges_inter)\
  .groupby(["c1", "c2"])["weight"].sum()\
  .reset_index()\
  .to_csv("gephi_1500/inter_edges_co.csv", index=False)

# 6c. Role mixing matrix & assortativity
roles      = nx.get_node_attributes(G_co, "role")
role_set   = sorted(set(roles.values()))
mix_mtx    = pd.DataFrame(0, index=role_set, columns=role_set, dtype=int)
for u, v in G_co.edges():
    mix_mtx.loc[roles[u], roles[v]] += 1
mix_mtx.to_csv("gephi_1500/role_mixing_co.csv")

assort = nx.attribute_assortativity_coefficient(G_co, "role")

# 6d. Centrality by role
print("Computing centralities…")
btw  = nx.betweenness_centrality(G_co, weight="weight")
deg  = dict(G_co.degree(weight="weight"))

cent_rows = []
for u in G_co.nodes():
    cent_rows.append({
        "user": u,
        "role": roles[u],
        "degree_w": deg[u],
        "betweenness": btw[u],
        "community": part_co[u]
    })
centrality_df = pd.DataFrame(cent_rows)
centrality_df.sort_values(["role", "betweenness"], ascending=[True, False])\
             .to_csv("gephi_1500/centrality_by_role.csv", index=False)


Generating analytics…
Computing centralities…


In [40]:
# ──────────────────────────────────────────────────────────────────────────────
# 7.  Idea‑level feature table & regularised modelling  (train/test + separation guard)
# ──────────────────────────────────────────────────────────────────────────────
print("Building idea feature set…")

# 7.1  Feature assembly --------------------------------------------------------
n_comments = com.groupby("suggestionId").size().rename("n_comments")
btw = nx.betweenness_centrality(G_co, weight="weight", normalized=True)
deg = dict(G_co.degree(weight="weight"))

idea_df = (
    sug.merge(n_comments, left_on="suggestionId", right_index=True, how="left")
       .fillna({"n_comments": 0})
       .assign(
           author_betweenness=lambda d: d["author"].map(lambda u: btw.get(u, 0)),
           author_community  =lambda d: d["author"].map(lambda u: part_co.get(u, -1)),
           author_role       =lambda d: d["author"].map(lambda u: roles.get(u, "unknown")),
       )
)

idea_df["success"] = (idea_df["votes"] >= idea_df["votes"].quantile(0.90)).astype(int)

# ----------------------------------------------------------------------------
# 7.  Ridge‑regularised logistic model (scikit‑learn)
# ----------------------------------------------------------------------------
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing   import StandardScaler, OneHotEncoder
from sklearn.compose         import ColumnTransformer
from sklearn.pipeline        import Pipeline
from sklearn.linear_model    import LogisticRegression
from sklearn.metrics         import roc_auc_score

# numeric pre‑clean
for col in ["votes", "n_comments"]:
    idea_df[col] = pd.to_numeric(idea_df[col], errors="coerce").clip(lower=0).fillna(0)

idea_df["log_votes"]      = np.log1p(idea_df["votes"])
idea_df["log_n_comments"] = np.log1p(idea_df["n_comments"])
idea_df["author_betweenness_z"] = StandardScaler().fit_transform(
    idea_df[["author_betweenness"]]
)

target = idea_df["success"]
numeric = ["log_votes", "log_n_comments", "author_betweenness_z"]
categorical = ["category"]

X_train, X_test, y_train, y_test = train_test_split(
    idea_df[numeric + categorical], target,
    test_size=0.30, stratify=target, random_state=42
)

pre = ColumnTransformer(
    [("num", StandardScaler(), numeric),
     ("cat", OneHotEncoder(handle_unknown="ignore"), categorical)]
)

logreg = LogisticRegression(
    penalty="l2", solver="liblinear", max_iter=500, class_weight="balanced"
)

pipe = Pipeline([("prep", pre), ("clf", logreg)])

param_grid = {"clf__C": [0.01, 0.1, 1, 10]}
cv = GridSearchCV(pipe, param_grid, cv=5, scoring="roc_auc", n_jobs=-1)
cv.fit(X_train, y_train)

best_model = cv.best_estimator_
train_auc  = roc_auc_score(y_train, best_model.predict_proba(X_train)[:, 1])
test_auc   = roc_auc_score(y_test,  best_model.predict_proba(X_test)[:, 1])

print(f"- Ridge Logit AUC (train) : {train_auc:.2f}")
print(f"- Ridge Logit AUC (test)  : {test_auc:.2f}")
print(f"- Best C                  : {cv.best_params_['clf__C']}")

with open("gephi_1500/logit_summary.txt", "w") as f:
    f.write("===== Ridge‑regularised Logit (sklearn) =====\n")
    f.write(f"Best C    : {cv.best_params_['clf__C']}\n")
    f.write(f"Train AUC : {train_auc:.3f}\n")
    f.write(f"Test  AUC : {test_auc:.3f}\n")


Building idea feature set…
- Ridge Logit AUC (train) : 1.00
- Ridge Logit AUC (test)  : 1.00
- Best C                  : 10


In [41]:
# --------------------------------------------------------------------------- #
# 8.  Export graphs
# --------------------------------------------------------------------------- #
print("Writing GEXF layers…")
nx.write_gexf(G_flow, "gephi_1500/comment_flow.gexf")
nx.write_gexf(G_co,   "gephi_1500/co_commenter.gexf")
nx.write_gexf(G_bip,  "gephi_1500/user_idea_bipartite.gexf")
nx.write_gexf(G_proj, "gephi_1500/suggestion_projection.gexf")

# --------------------------------------------------------------------------- #
# 9.  Final report
# --------------------------------------------------------------------------- #
print("=" * 60)
print("Pipeline complete.  Key stats")
print("- Number of users           :", G_co.number_of_nodes())
print("- Number of comments edges  :", G_flow.number_of_edges())
print(f"- Modularity (G_co)         : {Q_co:.3f}")
print(f"- Role assortativity (G_co) : {assort:.3f}")
print(f"- Ridge Logit AUC  (train)    : {train_auc:.3f}")
print(f"- Ridge Logit AUC  (test)     : {test_auc:.3f}")
print("Outputs saved to:" +"/gephi_1500")
print("=" * 60)


Writing GEXF layers…
Pipeline complete.  Key stats
- Number of users           : 9100
- Number of comments edges  : 12742
- Modularity (G_co)         : 0.750
- Role assortativity (G_co) : 0.004
- Ridge Logit AUC  (train)    : 1.000
- Ridge Logit AUC  (test)     : 0.996
Outputs saved to:/gephi_1500
