In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np

"""
COMPLETE PYTHON VISUALIZATION SUITE
No Gephi needed - all visualizations in Python!
"""

# Console banner for the whole run
banner = "=" * 70
print(banner)
print("GENERATING ALL VISUALIZATIONS FOR REDDIT MEME SPREAD ANALYSIS")
print(banner)

# Plot styling shared by every figure produced below
sns.set_style("whitegrid")
plt.rcParams['figure.facecolor'] = 'white'


def _drop_missing_users(frame):
    """Return *frame* without rows whose 'user' is NaN or the text 'None'.

    The 'user' column of the returned frame is cast to ``str``; the input
    frame is not modified.
    """
    kept = frame.dropna(subset=['user'])
    kept = kept.assign(user=kept['user'].astype(str))
    return kept[kept['user'] != 'None']


# Read the three input tables
print("\n📊 Loading data...")
df_edges = pd.read_csv('reddit_meme_network_edges.csv')
df_nodes = pd.read_csv('reddit_meme_network_nodes.csv')
df_detailed = pd.read_csv('reddit_interactions_detailed.csv')

# Apply the same user clean-up to both frames that carry a 'user' column
df_edges = _drop_missing_users(df_edges)
df_detailed = _drop_missing_users(df_detailed)

print("✅ Loaded and cleaned data")

# =============================================================================
# VISUALIZATION 1: SUBREDDIT-TO-SUBREDDIT NETWORK
# =============================================================================
print("\n" + "=" * 70)
print("1️⃣  SUBREDDIT-TO-SUBREDDIT NETWORK")
print("=" * 70)

# Load the subreddit pair table; the columns read below are
# 'source_subreddit', 'target_subreddit' and 'shared_users'.
df_sub_network = pd.read_csv('subreddit_network.csv')

# Build an undirected graph whose edge 'weight' is the shared-user count.
# Bulk insertion replaces the row-by-row iterrows() loop; as before, a
# repeated pair keeps the weight of its last row.
G_subs = nx.Graph()
G_subs.add_weighted_edges_from(
    zip(df_sub_network['source_subreddit'],
        df_sub_network['target_subreddit'],
        df_sub_network['shared_users'])
)

fig, ax = plt.subplots(figsize=(14, 10))

# Position nodes in a circle
pos = nx.circular_layout(G_subs)

# Freeze the edge order once so widths and edges stay aligned
edges = list(G_subs.edges())
weights = [G_subs[u][v]['weight'] for u, v in edges]

# Nodes only enter this graph through edges, so an empty edge list means
# there is nothing to draw. Skipping the draw calls avoids max() failing on
# an empty sequence; the (empty) figure is still saved below.
if edges:
    # `or 1` prevents a ZeroDivisionError when every weight is 0
    max_weight = max(weights) or 1

    # Scale widths into the 5..20 range relative to the heaviest edge
    edge_widths = [5 + (w / max_weight) * 15 for w in weights]

    nx.draw_networkx_edges(G_subs, pos, edgelist=edges, width=edge_widths,
                           alpha=0.6, edge_color='gray', ax=ax)

    # Every subreddit node gets the same size
    nx.draw_networkx_nodes(G_subs, pos, node_size=3000,
                           node_color='#FF6B6B', alpha=0.9,
                           edgecolors='black', linewidths=2, ax=ax)

    nx.draw_networkx_labels(G_subs, pos, font_size=12, font_weight='bold',
                            ax=ax)

    # Annotate each edge with its shared-user count
    edge_labels = {
        pair: f"{count} users"
        for pair, count in nx.get_edge_attributes(G_subs, 'weight').items()
    }
    nx.draw_networkx_edge_labels(G_subs, pos, edge_labels=edge_labels,
                                 font_size=9, ax=ax)

ax.set_title('Subreddit-to-Subreddit Network\n(Connected by Shared Users)',
             fontsize=16, fontweight='bold', pad=20)
ax.axis('off')
fig.tight_layout()
fig.savefig('viz_subreddit_network.png', dpi=300, bbox_inches='tight')
print("✅ Saved: viz_subreddit_network.png")
plt.close(fig)

# =============================================================================
# VISUALIZATION 2: TOP SUPER-SPREADERS BAR CHART
# =============================================================================
print("\n" + "=" * 70)
print("2️⃣  TOP SUPER-SPREADERS BAR CHART")
print("=" * 70)

# Read the super-spreader table and keep its first ten rows
df_super = pd.read_csv('super_spreaders.csv')
top_10 = df_super.head(10)

fig, ax = plt.subplots(figsize=(12, 6))

# One horizontal bar per user, in file order
y_positions = range(len(top_10))
subreddit_counts = top_10['subreddit_count']
bars = ax.barh(y_positions, subreddit_counts, color='#4ECDC4')

# Write the count just past the end of each bar
for y, count in zip(y_positions, subreddit_counts):
    ax.text(count + 0.05, y, f"{int(count)} subreddits",
            va='center', fontsize=10, fontweight='bold')

# Label each bar with the user name, cut to 20 characters
ax.set_yticks(y_positions)
ax.set_yticklabels(top_10['user'].str[:20], fontsize=10)

# Titles and axis captions
ax.set_title('Top 10 Super-Spreaders\n(Users Bridging Multiple Communities)',
             fontsize=14, fontweight='bold', pad=15)
ax.set_xlabel('Number of Subreddits Engaged', fontsize=12, fontweight='bold')
ax.set_ylabel('User', fontsize=12, fontweight='bold')

# Light vertical grid, drawn behind the bars
ax.grid(axis='x', alpha=0.3, linestyle='--')
ax.set_axisbelow(True)

# Flip the axis so the first row is at the top
ax.invert_yaxis()

plt.tight_layout()
plt.savefig('viz_super_spreaders.png', dpi=300, bbox_inches='tight')
print("✅ Saved: viz_super_spreaders.png")
plt.close()

# =============================================================================
# VISUALIZATION 3: ENGAGEMENT QUALITY BY SUBREDDIT
# =============================================================================
print("\n" + "=" * 70)
print("3️⃣  ENGAGEMENT QUALITY BY SUBREDDIT")
print("=" * 70)

# Per-subreddit summary of 'comment_score', highest average first
engagement_stats = (
    df_detailed.groupby('target_subreddit')['comment_score']
    .agg(avg_score='mean',
         median_score='median',
         max_score='max',
         total_interactions='count')
    .reset_index()
    .sort_values('avg_score', ascending=False)
)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']
x_slots = range(len(engagement_stats))
tick_text = [f"r/{sub}" for sub in engagement_stats['target_subreddit']]

# Both panels share one layout; each entry is
# (axes, column, y-axis caption, title, label lift in data units, formatter).
panels = (
    (ax1, 'avg_score', 'Average Comment Score',
     'Average Engagement Quality by Subreddit', 10,
     lambda value: f"{value:.0f}"),
    (ax2, 'total_interactions', 'Total Interactions',
     'Activity Level by Subreddit', 5,
     lambda value: f"{int(value)}"),
)

for axis, column, y_caption, heading, lift, fmt in panels:
    heights = engagement_stats[column]

    axis.bar(x_slots, heights, color=colors, edgecolor='black', linewidth=1.5)

    axis.set_xticks(x_slots)
    axis.set_xticklabels(tick_text, rotation=45, ha='right', fontsize=10)
    axis.set_ylabel(y_caption, fontsize=11, fontweight='bold')
    axis.set_title(heading, fontsize=13, fontweight='bold')
    axis.grid(axis='y', alpha=0.3, linestyle='--')

    # Value label placed a fixed lift above the top of each bar
    for x, height in zip(x_slots, heights):
        axis.text(x, height + lift, fmt(height),
                  ha='center', va='bottom', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.savefig('viz_engagement_quality.png', dpi=300, bbox_inches='tight')
print("✅ Saved: viz_engagement_quality.png")
plt.close()

# =============================================================================
# VISUALIZATION 4: USER-SUBREDDIT NETWORK (FULL NETWORK)
# =============================================================================
print("\n" + "=" * 70)
print("4️⃣  USER-SUBREDDIT BIPARTITE NETWORK")
print("=" * 70)

# Build the user <-> subreddit graph from the edge table. Bulk insertion
# replaces the iterrows() loop; a repeated (user, subreddit) pair keeps the
# weight of its last row, as before.
G = nx.Graph()
G.add_weighted_edges_from(
    zip(df_edges['user'], df_edges['target_subreddit'], df_edges['weight'])
)

# Classify nodes using the same table that built the graph. Taking the
# subreddit names from a different frame could list a subreddit that is not
# in G (no layout position, so drawing it fails) or miss one that is (it
# would be drawn as a user). A set makes each membership test O(1).
subreddit_lookup = set(df_edges['target_subreddit'])
subreddit_nodes = [node for node in G.nodes() if node in subreddit_lookup]
user_nodes = [node for node in G.nodes() if node not in subreddit_lookup]

fig, ax = plt.subplots(figsize=(16, 14))

# Force-directed layout; the fixed seed keeps the picture reproducible
pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)

# Draw user nodes (small)
nx.draw_networkx_nodes(G, pos, nodelist=user_nodes,
                       node_size=20, node_color='#4ECDC4',
                       alpha=0.6, ax=ax)

# Draw subreddit nodes (large, outlined)
nx.draw_networkx_nodes(G, pos, nodelist=subreddit_nodes,
                       node_size=3000, node_color='#FF6B6B',
                       alpha=0.9, edgecolors='black', linewidths=2, ax=ax)

# Draw edges faintly so dense areas remain readable
nx.draw_networkx_edges(G, pos, alpha=0.1, width=0.5, ax=ax)

# Label only the subreddit nodes
subreddit_labels = {node: node for node in subreddit_nodes}
nx.draw_networkx_labels(G, pos, labels=subreddit_labels,
                        font_size=12, font_weight='bold', ax=ax)

ax.set_title('User-Subreddit Network\n(Red = Subreddits, Blue = Users)',
             fontsize=16, fontweight='bold', pad=20)
ax.axis('off')
fig.tight_layout()
fig.savefig('viz_full_network.png', dpi=300, bbox_inches='tight')
print("✅ Saved: viz_full_network.png")
plt.close(fig)

# =============================================================================
# VISUALIZATION 5: NETWORK METRICS DASHBOARD
# =============================================================================
print("\n" + "=" * 70)
print("5️⃣  NETWORK METRICS DASHBOARD")
print("=" * 70)


def _draw_metric_tile(figure, cell, value_text, caption, color):
    """Draw one headline tile: a large coloured value above a grey caption.

    figure     -- the matplotlib figure that owns the grid
    cell       -- the gridspec slot to draw into
    value_text -- already-formatted text of the metric
    caption    -- short description shown under the value
    color      -- colour of the value text
    Returns the axes created for the tile.
    """
    tile = figure.add_subplot(cell)
    tile.text(0.5, 0.6, value_text, ha='center', va='center',
              fontsize=48, fontweight='bold', color=color)
    tile.text(0.5, 0.25, caption, ha='center', va='center',
              fontsize=14, color='gray')
    tile.set_xlim(0, 1)
    tile.set_ylim(0, 1)
    tile.axis('off')
    return tile


# Calculate metrics
total_nodes = G.number_of_nodes()
total_edges = G.number_of_edges()
density = nx.density(G)
degrees = [degree for _, degree in G.degree()]
# Guard the division so an empty graph reports 0 instead of raising
avg_degree = sum(degrees) / total_nodes if total_nodes else 0.0
num_super_spreaders = len(df_super)

# Hour of day with the most rows in df_detailed.
# NOTE(review): the tile is captioned "UTC" but the timestamps are parsed
# without any timezone handling -- confirm the CSV stores UTC times.
hour_counts = df_detailed.groupby(
    pd.to_datetime(df_detailed['timestamp']).dt.hour
).size()
peak_hour = hour_counts.idxmax() if not hour_counts.empty else None
# int() keeps the label as "14:00" even if the hour key is a float
peak_hour_text = f'{int(peak_hour)}:00' if peak_hour is not None else 'N/A'

# Kept for parity with the original cell; it is not shown on the dashboard
# in this section. Guarded because averaging over zero nodes raises.
avg_clustering = nx.average_clustering(G) if total_nodes else 0.0

# Create dashboard: two rows of headline tiles, one row of charts
fig = plt.figure(figsize=(16, 10))
gs = fig.add_gridspec(3, 3, hspace=0.4, wspace=0.3)

metric_tiles = [
    (gs[0, 0], str(total_nodes), 'Total Nodes', '#FF6B6B'),
    (gs[0, 1], str(total_edges), 'Total Edges', '#4ECDC4'),
    (gs[0, 2], f'{density:.4f}', 'Network Density', '#45B7D1'),
    (gs[1, 0], str(num_super_spreaders), 'Super-Spreaders', '#FFA07A'),
    (gs[1, 1], peak_hour_text, 'Peak Activity (UTC)', '#98D8C8'),
    (gs[1, 2], f'{avg_degree:.2f}', 'Average Degree', '#B19CD9'),
]
for cell, value_text, caption, color in metric_tiles:
    _draw_metric_tile(fig, cell, value_text, caption, color)

# Chart 1: Degree Distribution
ax7 = fig.add_subplot(gs[2, :2])
ax7.hist(degrees, bins=50, color='#4ECDC4', edgecolor='black', alpha=0.7)
ax7.set_xlabel('Node Degree', fontsize=11, fontweight='bold')
ax7.set_ylabel('Frequency', fontsize=11, fontweight='bold')
ax7.set_title('Degree Distribution (Power Law)', fontsize=12, fontweight='bold')
ax7.grid(axis='y', alpha=0.3, linestyle='--')

# Chart 2: Top Nodes (the five highest-degree nodes)
ax8 = fig.add_subplot(gs[2, 2])
top_nodes = sorted(G.degree(), key=lambda pair: pair[1], reverse=True)[:5]
# Set lookup works whether subreddit_nodes is a list or an array
subreddit_lookup = set(subreddit_nodes)
# Subreddit names are shown in full; other node names are cut to 10 characters
names = [node if node in subreddit_lookup else node[:10]
         for node, _ in top_nodes]
values = [degree for _, degree in top_nodes]
ax8.barh(range(len(names)), values, color='#FF6B6B', edgecolor='black')
ax8.set_yticks(range(len(names)))
ax8.set_yticklabels(names, fontsize=9)
ax8.set_xlabel('Connections', fontsize=10, fontweight='bold')
ax8.set_title('Top 5 Most Connected', fontsize=11, fontweight='bold')
ax8.invert_yaxis()

fig.suptitle('Network Metrics Dashboard', fontsize=18, fontweight='bold', y=0.98)
fig.savefig('viz_network_metrics.png', dpi=300, bbox_inches='tight')
print("✅ Saved: viz_network_metrics.png")
plt.close(fig)

# =============================================================================
# SUMMARY
# =============================================================================
# Closing report: every entry below becomes one print()-style line on stdout.
rule = "=" * 70
summary_lines = (
    "\n" + rule,
    "✅ ALL VISUALIZATIONS GENERATED SUCCESSFULLY!",
    rule,
    "\nFiles created:",
    "  1. viz_subreddit_network.png - Subreddit-to-subreddit connections",
    "  2. viz_super_spreaders.png - Top 10 bridge users",
    "  3. viz_engagement_quality.png - Engagement quality by community",
    "  4. viz_full_network.png - Complete user-subreddit network",
    "  5. viz_network_metrics.png - Network statistics dashboard",
    "\n📊 You also have from previous run:",
    "  6. meme_velocity_by_hour.png - Temporal spread patterns",
    "  7. subreddit_overlap_heatmap.png - Community overlap matrix",
    "\n🎉 All visualizations ready for your report!",
    rule,
)
print(*summary_lines, sep="\n")

GENERATING ALL VISUALIZATIONS FOR REDDIT MEME SPREAD ANALYSIS

📊 Loading data...
✅ Loaded and cleaned data

1️⃣  SUBREDDIT-TO-SUBREDDIT NETWORK
✅ Saved: viz_subreddit_network.png

2️⃣  TOP SUPER-SPREADERS BAR CHART
✅ Saved: viz_super_spreaders.png

3️⃣  ENGAGEMENT QUALITY BY SUBREDDIT
✅ Saved: viz_engagement_quality.png

4️⃣  USER-SUBREDDIT BIPARTITE NETWORK
✅ Saved: viz_full_network.png

5️⃣  NETWORK METRICS DASHBOARD
✅ Saved: viz_network_metrics.png

✅ ALL VISUALIZATIONS GENERATED SUCCESSFULLY!

Files created:
  1. viz_subreddit_network.png - Subreddit-to-subreddit connections
  2. viz_super_spreaders.png - Top 10 bridge users
  3. viz_engagement_quality.png - Engagement quality by community
  4. viz_full_network.png - Complete user-subreddit network
  5. viz_network_metrics.png - Network statistics dashboard

📊 You also have from previous run:
  6. meme_velocity_by_hour.png - Temporal spread patterns
  7. subreddit_overlap_heatmap.png - Community overlap matrix

🎉 All visualizations