In [None]:
import networkx as nx

# Create the graph: one node per subreddit, one undirected edge per linked pair.
# nx.Graph() is a simple graph, so repeated rows for the same pair and the
# SOURCE -> TARGET direction are collapsed into a single edge.
G = nx.from_pandas_edgelist(df, "SOURCE_SUBREDDIT", "TARGET_SUBREDDIT", create_using=nx.Graph())

print(f"🔗 Total nodes: {G.number_of_nodes()}")
print(f"🔗 Total edges: {G.number_of_edges()}")

# Most connected nodes: rank by degree centrality and keep the ten highest.
degree = nx.degree_centrality(G)
top_nodes = sorted(degree.items(), key=lambda x: x[1], reverse=True)[:10]
print("\n🌍 Most central subreddits (by connectivity):")
for name, val in top_nodes:
    print(f"{name}: {val:.3f}")

# Visualize the graph (small subset): only the subgraph induced by the top nodes.
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 7))
subG = G.subgraph([name for name, _ in top_nodes])
# nx.draw falls back to a randomly initialised spring layout when no positions
# are given; computing it with a fixed seed keeps the figure identical across
# Restart & Run All.
pos = nx.spring_layout(subG, seed=42)
nx.draw(subG, pos=pos, with_labels=True, node_size=800, node_color="skyblue", font_size=9)
plt.show()

In [None]:
# Ten most frequent geographic subreddits on each side of a link.
# Rows are selected with the IS_TARGET_GEO / IS_SOURCE_GEO flag columns, then
# the subreddit names on that side are counted.
top_geo_targets = df.loc[df["IS_TARGET_GEO"], "TARGET_SUBREDDIT"].value_counts().head(10)
top_geo_sources = df.loc[df["IS_SOURCE_GEO"], "SOURCE_SUBREDDIT"].value_counts().head(10)

print("\n🌍 Top 10 geographic subreddits (as TARGET):")
print(top_geo_targets)

print("\n🌍 Top 10 geographic subreddits (as SOURCE):")
print(top_geo_sources)

In [None]:
import matplotlib.pyplot as plt

# Fifteen geographic subreddits that appear most often as the target of a link.
# NOTE(review): every link with a geographic target is counted, whatever its
# source; if the chart should cover economic sources only, the rows also need
# to be filtered on IS_SOURCE_GEO — confirm intent.
geo_counts = df.loc[df["IS_TARGET_GEO"], "TARGET_SUBREDDIT"].value_counts().head(15)

fig, ax = plt.subplots(figsize=(10, 6))
geo_counts.plot(kind="barh", color="cornflowerblue", ax=ax)
ax.set_title("🌍 Top 15 geographic subreddits most connected to economic topics")
ax.set_xlabel("Number of connections")
ax.set_ylabel("Subreddit")
# value_counts() is sorted descending; flip the axis so the largest bar is on top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Calculate the types of relationships.
# Each link is labelled from the geographic flags of its two endpoints. The
# labels are assigned with vectorised boolean masks instead of a row-wise
# apply; astype(bool) keeps the same truthiness the per-row `and` / `not`
# checks would use.
src_is_geo = df["IS_SOURCE_GEO"].astype(bool)
tgt_is_geo = df["IS_TARGET_GEO"].astype(bool)

# Start from the fall-through category, then overwrite the other three cases.
link_type = pd.Series("Econ ↔ Econ", index=df.index, dtype="object")
link_type.loc[src_is_geo & ~tgt_is_geo] = "Geo → Econ"
link_type.loc[~src_is_geo & tgt_is_geo] = "Econ → Geo"
link_type.loc[src_is_geo & tgt_is_geo] = "Geo ↔ Geo"
df["link_type"] = link_type

# Count and plot
fig, ax = plt.subplots(figsize=(8, 5))
df["link_type"].value_counts().plot(kind="bar", color="teal", alpha=0.7, ax=ax)
ax.set_title("Distribution of economic–geographic link types")
ax.set_xlabel("Type of link")
ax.set_ylabel("Number of edges")
ax.grid(axis="y", linestyle="--", alpha=0.5)
fig.tight_layout()
plt.show()

In [None]:
# Fifteen most frequent geographic subreddits on each side of a link.
top_geo_targets = df.loc[df["IS_TARGET_GEO"], "TARGET_SUBREDDIT"].value_counts().head(15)
top_geo_sources = df.loc[df["IS_SOURCE_GEO"], "SOURCE_SUBREDDIT"].value_counts().head(15)

# The two panels list different subreddits, so each keeps its own y axis.
# Sharing a categorical y axis between different category sets would merge or
# relabel the categories and break the per-panel ranking.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Geo as TARGET (reversed so the most frequent subreddit is drawn at the top)
axes[0].barh(top_geo_targets.index[::-1], top_geo_targets.values[::-1], color="cornflowerblue")
axes[0].set_title("🌎 Geographic subreddits (as TARGET)")
axes[0].set_xlabel("Number of connections")

# Geo as SOURCE (same ordering convention)
axes[1].barh(top_geo_sources.index[::-1], top_geo_sources.values[::-1], color="lightseagreen")
axes[1].set_title("🌍 Geographic subreddits (as SOURCE)")
axes[1].set_xlabel("Number of connections")

fig.suptitle("Geographic subreddits most connected to economic topics")
fig.tight_layout()
plt.show()

In [None]:
# Economic subreddits ranked by how many links they share with a geographic one.
# For each link with exactly one geographic endpoint, count the *other*
# endpoint. Links where both ends are geographic are excluded; otherwise
# geographic subreddits would be counted and plotted as economic ones.
src_is_geo = df["IS_SOURCE_GEO"].astype(bool)
tgt_is_geo = df["IS_TARGET_GEO"].astype(bool)

econ_counts = (
    pd.concat([
        df.loc[src_is_geo & ~tgt_is_geo, "TARGET_SUBREDDIT"],  # Geo -> Econ: count the target
        df.loc[tgt_is_geo & ~src_is_geo, "SOURCE_SUBREDDIT"],  # Econ -> Geo: count the source
    ])
    .value_counts()
    .head(15)
)

fig, ax = plt.subplots(figsize=(10, 6))
econ_counts.plot(kind="barh", color="darkorange", alpha=0.8, ax=ax)
ax.set_title("💰 Economic topics most connected to geographic subreddits")
ax.set_xlabel("Number of connections")
ax.set_ylabel("Economic subreddit")
# value_counts() is sorted descending; flip the axis so the largest bar is on top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
import networkx as nx
import random

# Build the graph (undirected, one edge per linked subreddit pair)
G = nx.from_pandas_edgelist(df, "SOURCE_SUBREDDIT", "TARGET_SUBREDDIT", create_using=nx.Graph())

# Take a subsample for readability: at most 50 nodes, drawn with a dedicated
# seeded generator so the same nodes are picked on every run and the global
# `random` state is left untouched.
rng = random.Random(42)
sample_nodes = rng.sample(list(G.nodes), min(50, len(G.nodes)))
subG = G.subgraph(sample_nodes)

plt.figure(figsize=(10, 8))
nx.draw_networkx(
    subG,
    # Explicit seeded layout; the default spring layout starts from random
    # positions and would change the picture on every run.
    pos=nx.spring_layout(subG, seed=42),
    node_size=400,
    font_size=8,
    node_color="skyblue",
    edge_color="gray",
    with_labels=True
)
plt.title("🌐 Economic–geographic subnetwork (random sample)")
plt.axis("off")
plt.show()

In [None]:
import seaborn as sns

# Select the most frequent ones: the ten most common geographic targets and the
# ten most common non-geographic (economic) sources.
top_geo = df.loc[df["IS_TARGET_GEO"], "TARGET_SUBREDDIT"].value_counts().head(10).index
top_econ = df.loc[~df["IS_SOURCE_GEO"].astype(bool), "SOURCE_SUBREDDIT"].value_counts().head(10).index

# Filter the dataset down to links between those two groups
matrix = df[df["SOURCE_SUBREDDIT"].isin(top_econ) & df["TARGET_SUBREDDIT"].isin(top_geo)]
# Rows = economic source, columns = geographic target, cells = number of links
# (pairs with no link are filled with 0).
heat = matrix.groupby(["SOURCE_SUBREDDIT", "TARGET_SUBREDDIT"]).size().unstack(fill_value=0)

# Plot
plt.figure(figsize=(10, 6))
sns.heatmap(heat, cmap="Blues", annot=True, fmt="d")
plt.title("🔥 Connections between top economic and geographic subreddits")
plt.xlabel("Geographic (TARGET)")
plt.ylabel("Economic (SOURCE)")
plt.tight_layout()
plt.show()

In [None]:
# Top 10 geographic and economic subreddits most connected in the opposite direction:
# the ten most common geographic sources and the ten most common
# non-geographic (economic) targets.
top_geo_src = df.loc[df["IS_SOURCE_GEO"], "SOURCE_SUBREDDIT"].value_counts().head(10).index
top_econ_tgt = df.loc[~df["IS_TARGET_GEO"].astype(bool), "TARGET_SUBREDDIT"].value_counts().head(10).index

# Filter geographic → economic links
matrix_g2e = df[
    (df["SOURCE_SUBREDDIT"].isin(top_geo_src)) &
    (df["TARGET_SUBREDDIT"].isin(top_econ_tgt))
]

# Rows = geographic source, columns = economic target, cells = number of links
# (pairs with no link are filled with 0).
heat_g2e = matrix_g2e.groupby(["SOURCE_SUBREDDIT", "TARGET_SUBREDDIT"]).size().unstack(fill_value=0)

plt.figure(figsize=(10, 6))
sns.heatmap(heat_g2e, cmap="Greens", annot=True, fmt="d")
plt.title("🌍→💰 Connections from geographic subreddits to economic ones")
plt.xlabel("Economic (TARGET)")
plt.ylabel("Geographic (SOURCE)")
plt.tight_layout()
plt.show()