In [None]:
# GPU-Accelerated Information Spread on Twitter Ego Network

This notebook demonstrates GPU-accelerated network analysis and information spread simulation using RAPIDS cuGraph and Graphistry. It is designed for reproducibility on any compatible environment (e.g., RAPIDS Docker, Colab, or local GPU setup).

## Steps:
- Environment setup (install dependencies)
- Load and preprocess the Twitter Ego Network dataset
- Analyze network metrics (nodes, edges, degree, components)
- Visualize the network with Graphistry
- Simulate information spread using different seeding strategies
- Compare and plot results

### Requirements
- NVIDIA GPU with CUDA support
- Python 3.8+
- RAPIDS cuGraph, cuDF, Graphistry, pandas, seaborn, matplotlib

----

# Show the GPU, driver and CUDA version visible to this runtime.
!nvidia-smi

Sat May 10 16:14:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Install required packages (run once per environment)
!pip install --upgrade pip
!pip install cudf-cu11 cugraph-cu11 cupy-cuda11x graphistry pandas matplotlib seaborn --extra-index-url=https://pypi.nvidia.com

fatal: destination path 'rapidsai-csp-utils' already exists and is not an empty directory.
Installing RAPIDS remaining 25.04 libraries
Using Python 3.11.12 environment at: /usr
Audited 11 packages in 190ms

        ***********************************************************************
        The pip install of RAPIDS is complete.

        Please do not run any further installation from the conda based installation methods, as they may cause issues!

        Please ensure that you're pulling from the git repo to remain updated with the latest working install scripts.

        Troubleshooting:
            - If there is an installation failure, please check back on RAPIDSAI owned templates/notebooks to see how to update your personal files.
            - If an installation failure persists when using the latest script, please make an issue on https://github.com/rapidsai-community/rapidsai-csp-utils
        ***********************************************************************
        


In [None]:
# Upgrade PyGraphistry to the newest release (it is also listed in the dependency-install cell above).
!pip install graphistry --upgrade



In [None]:
# Graphistry setup (credentials are read from environment variables)
import graphistry
import os

# Set your Graphistry credentials as environment variables (do not commit secrets to GitHub).
# They are looked up in this order:
#   1. GRAPHISTRY_PERSONAL_KEY_ID + GRAPHISTRY_PERSONAL_KEY_SECRET
#   2. GRAPHISTRY_USERNAME + GRAPHISTRY_PASSWORD
#   3. GRAPHISTRY_API_KEY (legacy key, kept so existing setups behave as before)
# NOTE(review): with api=3 PyGraphistry logs in via username/password or a personal
# key pair; confirm whether the legacy `key=` argument is still honoured by your server.
GRAPHISTRY_CONNECTION = dict(api=3, protocol="https", server="hub.graphistry.com")
GRAPHISTRY_API_KEY = os.getenv('GRAPHISTRY_API_KEY', 'YOUR_API_KEY_HERE')

_personal_key_id = os.getenv('GRAPHISTRY_PERSONAL_KEY_ID')
_personal_key_secret = os.getenv('GRAPHISTRY_PERSONAL_KEY_SECRET')
_username = os.getenv('GRAPHISTRY_USERNAME')
_password = os.getenv('GRAPHISTRY_PASSWORD')

if _personal_key_id and _personal_key_secret:
    graphistry.register(
        personal_key_id=_personal_key_id,
        personal_key_secret=_personal_key_secret,
        **GRAPHISTRY_CONNECTION,
    )
elif _username and _password:
    graphistry.register(username=_username, password=_password, **GRAPHISTRY_CONNECTION)
else:
    # Fallback: identical to the original registration call.
    graphistry.register(key=GRAPHISTRY_API_KEY, **GRAPHISTRY_CONNECTION)
print(f"Graphistry version: {graphistry.__version__}")

0.36.1




In [None]:
# NOTE(review): the unsuffixed `cugraph` package on PyPI fails to build a wheel
# (see the error output recorded for this cell). A CUDA-suffixed cuGraph wheel is
# already requested by the dependency-install cell above, so this cell looks
# redundant -- confirm and remove.
!pip install cugraph

Collecting cugraph
  Using cached cugraph-0.6.1.post1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: cugraph
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for cugraph (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for cugraph[0m[31m
[0m[?25h  Running setup.py clean for cugraph
Failed to build cugraph
[31mERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (cugraph)[0m[31m
[0m

[31mERROR: Could not find a version that satisfies the requirement cupy.complex (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for cupy.complex[0m[31m
[0m

MessageError: Error: credential propagation was unsuccessful

In [None]:
# Refresh the APT package index (this cell does not install anything itself).
!sudo apt-get update


0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,720 kB]
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,926 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,901 kB]
Get:13 http://archive.ubuntu.com/ubuntu jam

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [None]:
# --- Imports and Utility Functions ---
import cudf
import cugraph
import graphistry
import random
import time
import pandas as pd
from IPython.display import display

# 1. Load Twitter Ego Network using cuDF and cuGraph
def load_twitter_ego_network_gpu(file_path):
    """Read a space-separated edge list into a cuDF DataFrame and clean it.

    Parameters:
        file_path: Path of a headerless text file with two integer columns
            (read as ``source`` and ``target``, dtype ``int32``).

    Returns:
        The edge DataFrame with self-loops and duplicate rows removed, or
        ``None`` when anything goes wrong (the error is printed, not raised).
    """
    csv_options = {
        'sep': ' ',
        'names': ['source', 'target'],
        'dtype': 'int32',
        'header': None,
    }
    try:
        raw_edges = cudf.read_csv(file_path, **csv_options)
        # Keep only edges whose endpoints differ, then collapse repeated rows.
        is_real_edge = raw_edges['source'] != raw_edges['target']
        gdf = raw_edges[is_real_edge].drop_duplicates()
        print(f"Loaded: {len(gdf)} edges")
    except Exception as e:
        print(f"GPU load error: {e}")
        return None
    return gdf

# 2. Analyze network metrics with cuGraph
def analyze_network_metrics_cugraph(gdf):
    """Summarise the edge list ``gdf`` with a handful of cuGraph metrics.

    Parameters:
        gdf: cuDF edge list with ``source`` and ``target`` columns.

    Returns:
        dict with the keys ``num_nodes``, ``num_edges``, ``avg_degree``,
        ``max_degree``, ``top_degree_nodes`` (the five rows of
        ``Graph.degree()`` with the largest ``degree``), ``num_components``
        and ``largest_component_size``.
    """
    directed_graph = cugraph.Graph(directed=True)
    directed_graph.from_cudf_edgelist(gdf, source='source', destination='target')
    degrees = directed_graph.degree()
    degree_column = degrees['degree']

    # Components are computed on an undirected graph built from the same edges.
    undirected_graph = cugraph.Graph(directed=False)
    undirected_graph.from_cudf_edgelist(gdf, source='source', destination='target')
    component_labels = cugraph.weakly_connected_components(undirected_graph)['labels']

    return {
        'num_nodes': directed_graph.number_of_vertices(),
        'num_edges': directed_graph.number_of_edges(),
        'avg_degree': float(degree_column.mean()),
        'max_degree': int(degree_column.max()),
        'top_degree_nodes': degrees.sort_values('degree', ascending=False).head(5),
        'num_components': int(component_labels.nunique()),
        'largest_component_size': int(component_labels.value_counts().max()),
    }

# 3. Visualize with Graphistry
def visualize_with_graphistry(gdf, degree_df=None, min_size=10, max_size=100):
    """Plot the edge list with Graphistry, optionally sizing nodes by degree.

    Parameters:
        gdf: Edge list with ``source`` and ``target`` columns.
        degree_df: Optional node table with ``vertex`` and ``degree`` columns.
            When given, a ``size`` column is derived from ``degree`` and bound
            to the point size. The caller's frame is not modified (a copy is used).
        min_size: Point size given to the lowest-degree node (default 10).
        max_size: Point size given to the highest-degree node (default 100).

    Returns:
        Whatever ``Plottable.plot()`` returns for the assembled graph.
    """
    graphistry.register(api=3)
    g = graphistry.edges(gdf, 'source', 'target')
    if degree_df is not None:
        degree_df = degree_df.copy()
        degree_df['degree'] = degree_df['degree'].astype(float)
        deg_min = degree_df['degree'].min()
        deg_max = degree_df['degree'].max()
        if deg_max > deg_min:
            # Min-max scale the whole column at once; plain column arithmetic
            # avoids a per-row Python callback and works on pandas and cuDF alike.
            normalized = (degree_df['degree'] - deg_min) / (deg_max - deg_min)
            degree_df['size'] = min_size + (max_size - min_size) * normalized
        else:
            # All degrees equal: scaling is undefined, so use the midpoint size.
            degree_df['size'] = (min_size + max_size) / 2
        g = g.nodes(degree_df, 'vertex').bind(point_size='size')
    return g.plot()

# 4. Information Spread Simulation
def simulate_information_spread_gpu(gdf, seed_nodes=None, beta=0.1, max_iterations=100, seed=None):
    """Simulate information spread using a simple contagion model.

    In every step each already-informed node tries to inform each of its
    out-neighbours (``source -> target``) that is not yet informed, succeeding
    independently with probability ``beta``. The loop ends after
    ``max_iterations`` steps or at the first step that informs nobody new.

    Parameters:
        gdf: cuDF edge list with ``source`` and ``target`` columns.
        seed_nodes: Initial informed nodes: an iterable of node ids, a single
            integer id, or ``None`` to start from the highest-degree vertex.
        beta: Per-attempt transmission probability.
        max_iterations: Upper bound on the number of simulated steps.
        seed: Optional RNG seed. When given, a private ``random.Random(seed)``
            is used so runs are repeatable; when ``None`` the module-level
            ``random`` generator is used, as before.

    Returns:
        tuple ``(infected, events_df, runtime_sec)``: the set of informed node
        ids, a cuDF DataFrame of transmission events with integer columns
        ``step``, ``informer``, ``informed`` (present even when empty), and the
        elapsed time in seconds measured with ``time.perf_counter``.
    """
    import numbers  # local import: needed only for the integer-like seed check

    start_time = time.perf_counter()
    if seed_nodes is None:
        # The graph is needed only to pick the default seed, so build it lazily.
        G = cugraph.Graph(directed=True)
        G.from_cudf_edgelist(gdf, source='source', destination='target')
        degree_df = G.degree()
        seed_nodes = [int(degree_df.sort_values('degree', ascending=False).head(1)['vertex'].iloc[0])]
    elif isinstance(seed_nodes, numbers.Integral):
        # Accept NumPy integer scalars as well as built-in ints.
        seed_nodes = [seed_nodes]
    rng = random.Random(seed) if seed is not None else random
    infected = set(int(n) for n in seed_nodes)
    events = []
    # Adjacency lists on the host: source id -> list of target ids.
    edge_dict = gdf.to_pandas().groupby('source')['target'].apply(list).to_dict()
    for t in range(1, max_iterations + 1):
        new_infected = set()
        for node in infected:
            for neighbor in edge_dict.get(node, ()):
                neighbor = int(neighbor)
                if neighbor not in infected and rng.random() < beta:
                    new_infected.add(neighbor)
                    events.append({'step': t, 'informer': node, 'informed': neighbor})
        if not new_infected:
            break
        infected.update(new_infected)
    end_time = time.perf_counter()
    runtime_sec = end_time - start_time
    # Fix the schema up front so an empty run still has the three integer columns.
    events_pdf = pd.DataFrame(events, columns=['step', 'informer', 'informed']).astype('int64')
    events_df = cudf.from_pandas(events_pdf)
    return infected, events_df, runtime_sec

# 5. Seeding Strategies
def get_top_degree_nodes(gdf, k=5):
    """Return, as a Python list, the ``k`` vertex ids with the largest ``Graph.degree()`` value."""
    graph = cugraph.Graph(directed=True)
    graph.from_cudf_edgelist(gdf, source='source', destination='target')
    by_degree = graph.degree().sort_values('degree', ascending=False)
    top_vertices = by_degree['vertex'].head(k)
    return top_vertices.to_pandas().tolist()

def get_top_pagerank_nodes(gdf, k=5):
    """Return, as a Python list, the ``k`` vertex ids with the highest PageRank score."""
    graph = cugraph.Graph(directed=True)
    graph.from_cudf_edgelist(
        gdf,
        source='source',
        destination='target',
        store_transposed=True,
    )
    by_pagerank = cugraph.pagerank(graph).sort_values('pagerank', ascending=False)
    top_vertices = by_pagerank['vertex'].head(k)
    return top_vertices.to_pandas().tolist()

def get_community_seed_nodes_gpu(gdf, k=5):
    """Pick up to ``k`` seed nodes spread across connected components.

    Components are visited from largest to smallest and the highest-PageRank
    node of each is chosen. If there are fewer than ``k`` components, the
    remaining slots are filled with the next-best PageRank nodes overall.
    Only nodes with at least one outgoing edge are considered.

    Parameters:
        gdf: cuDF edge list with ``source`` and ``target`` columns.
        k: Number of seeds requested; ``k <= 0`` returns an empty list.

    Returns:
        list[int] of distinct vertex ids in selection order (may be shorter
        than ``k`` if not enough eligible nodes exist).
    """
    if k <= 0:
        return []

    G_undirected = cugraph.Graph(directed=False)
    G_undirected.from_cudf_edgelist(gdf, source='source', destination='target')
    components = cugraph.weakly_connected_components(G_undirected)

    G_directed = cugraph.Graph(directed=True)
    G_directed.from_cudf_edgelist(gdf, source='source', destination='target')
    pagerank_df = cugraph.pagerank(G_directed)
    # Out-degree, as the variable name says: total degree is positive for every
    # vertex in an edge list, which would make the filter below a no-op.
    out_deg_df = G_directed.out_degree()

    merged = components.merge(pagerank_df, on='vertex').merge(out_deg_df, on='vertex')
    merged = merged[merged['degree'] > 0]

    # One host-side table sorted by PageRank replaces a GPU filter per component.
    ranked = merged.sort_values('pagerank', ascending=False).to_pandas()
    component_sizes = ranked['labels'].value_counts().sort_values(ascending=False)
    # After the sort, the first row seen for a label is that component's best node.
    best_per_component = ranked.drop_duplicates('labels').set_index('labels')['vertex']

    seeds = []
    for label in component_sizes.index:
        seeds.append(int(best_per_component[label]))
        if len(seeds) == k:
            break

    if len(seeds) < k:
        chosen = set(seeds)
        for node in ranked['vertex']:
            node = int(node)
            if node not in chosen:
                chosen.add(node)
                seeds.append(node)
                if len(seeds) == k:
                    break
    return seeds

def compare_seeding_strategies_gpu(gdf, k=5, beta=0.1, max_iterations=100):
    """Run the spread simulation once per seeding strategy and collect the outcomes.

    Seeds are chosen for all four strategies first (random, degree, PageRank,
    community), then the simulations run in that same order.

    Returns:
        dict keyed by ``random_strategy``, ``degree_strategy``,
        ``pagerank_strategy`` and ``community_strategy``; each value holds
        ``seeds``, ``infected_count``, ``iterations`` (largest recorded step,
        0 when no event happened) and ``runtime_sec`` (rounded to 4 decimals).
    """
    node_ids = cudf.concat([gdf['source'], gdf['target']]).unique().to_pandas().tolist()

    # Dict literals evaluate in order, so the seeds are drawn in the sequence shown.
    seeds_by_strategy = {
        'random_strategy': random.sample(node_ids, k),
        'degree_strategy': get_top_degree_nodes(gdf, k),
        'pagerank_strategy': get_top_pagerank_nodes(gdf, k),
        'community_strategy': get_community_seed_nodes_gpu(gdf, k),
    }

    comparison = {}
    for strategy_name, strategy_seeds in seeds_by_strategy.items():
        informed, event_log, elapsed = simulate_information_spread_gpu(
            gdf, strategy_seeds, beta, max_iterations
        )
        if len(event_log) > 0:
            steps_used = int(event_log['step'].max())
        else:
            steps_used = 0
        comparison[strategy_name] = {
            'seeds': strategy_seeds,
            'infected_count': len(informed),
            'iterations': steps_used,
            'runtime_sec': round(float(elapsed), 4),
        }
    return comparison


Loaded: 1768135 edges

 Network Metrics:
num_nodes: 81306
num_edges: 1768135
avg_degree: 43.49334612451725
max_degree: 3758
top_degree_nodes:        degree     vertex
2        3758     813286
67940    3384  115485051
2104     3335   40981798
1        3063    3359851
2693     2840   43003845
num_components: 1
largest_component_size: 81306


→ Getting Random-k Seeds (Baseline)...
→ Getting Top-k Degree seeds (Main Strategy)...
→ Getting Top-k PageRank seeds (Bonus)...
→ Getting Community-Based PageRank seeds (Bonus)...




→ Simulating spread from Random seeds...
[Benchmark] Simulation runtime: 22.1623 seconds
→ Simulating spread from Degree seeds...
[Benchmark] Simulation runtime: 21.3033 seconds
→ Simulating spread from PageRank seeds...
[Benchmark] Simulation runtime: 26.4952 seconds
→ Simulating spread from Community PageRank seeds...
[Benchmark] Simulation runtime: 28.3642 seconds

Seeding Strategy Comparison Results
Random Seeds: [15438005, 552110505, 166231634]
→ Nodes Infected:                81293
→ Iterations Until Convergence: 72
→ Simulation Runtime (sec):     22.1623
-----------------------------------------------------------------
Degree Centrality Seeds: [813286, 115485051, 40981798]
→ Nodes Infected:                81277
→ Iterations Until Convergence: 65
→ Simulation Runtime (sec):     21.3033
-----------------------------------------------------------------
PageRank Seeds: [115485051, 116485573, 813286]
→ Nodes Infected:                81292
→ Iterations Until Convergence: 78
→ Simulati

In [None]:
# --- Experiment: Compare Seeding Strategies for Different k ---
import seaborn as sns
import matplotlib.pyplot as plt

# Seed counts to evaluate; each one triggers a full four-strategy comparison.
k_values = [1, 2, 3, 5, 10, 20, 50, 100]
results = []

for k in k_values:
    metrics = compare_seeding_strategies_gpu(gdf, k, beta=0.1)
    for strategy, data in metrics.items():
        # One flat record per (strategy, k) pair.
        record = dict(strategy=strategy, k=k)
        record['infected_nodes'] = data['infected_count']
        record['iterations'] = data['iterations']
        record['time'] = data['runtime_sec']
        results.append(record)

df_results = pd.DataFrame(results)

# One line chart per metric, each plotted against the seed count k.
for _metric_column, _plot_title in (
    ('infected_nodes', "Infected Nodes vs Seed Count (k)"),
    ('iterations', "Iterations vs Seed Count (k)"),
    ('time', "Simulation Time vs Seed Count (k)"),
):
    sns.lineplot(data=df_results, x='k', y=_metric_column, hue='strategy', marker='o')
    plt.title(_plot_title)
    plt.show()


NameError: name 'graph' is not defined

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def _plot_metric_vs_k(df_results, y, title, ylabel):
    """Draw one line chart of column ``y`` against seed count ``k``, one line per strategy."""
    plt.figure(figsize=(8, 5))
    sns.lineplot(data=df_results, x='k', y=y, hue='strategy', marker='o')
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel("Seed Count (k)")
    plt.tight_layout()
    plt.show()


def generate_spread_analysis_plots(df_results, graph):
    """
    Generate multiple plots to analyze and compare seeding strategies.

    Parameters:
        df_results (pd.DataFrame): Contains columns - strategy, k, infected_nodes, iterations, time
        graph: Graph used for the simulations; only ``graph.number_of_nodes()``
            is called, to express infections as a percentage of the graph.

    Returns:
        None (displays plots)

    Notes:
        Rows whose ``iterations`` or ``time`` is 0 get NaN in the derived
        per-iteration / per-second columns (instead of ``inf``), so they show
        up as gaps rather than distorting the axes. The caller's DataFrame is
        not modified.
    """
    sns.set(style="whitegrid")

    # Derived metrics (computed on a copy)
    df_results = df_results.copy()
    # Mask zero denominators: ``where`` keeps non-zero values and yields NaN elsewhere.
    iterations = df_results['iterations'].where(df_results['iterations'] != 0)
    runtime = df_results['time'].where(df_results['time'] != 0)
    df_results['infected_per_iter'] = df_results['infected_nodes'] / iterations
    df_results['time_per_iter'] = df_results['time'] / iterations
    # Scaled by 100 so the values match the "Percentage" title and the column name.
    df_results['infected_pct'] = 100 * df_results['infected_nodes'] / graph.number_of_nodes()
    df_results['infected_per_sec'] = df_results['infected_nodes'] / runtime

    # 1. Infected Nodes vs k
    _plot_metric_vs_k(df_results, 'infected_nodes',
                      "Infected Nodes vs k", "Total Infected Nodes")

    # 2. Infection Efficiency (Infected per Iteration)
    _plot_metric_vs_k(df_results, 'infected_per_iter',
                      "Infection Efficiency (Infected per Iteration)", "Infected / Iteration")

    # 3. Time per Iteration
    _plot_metric_vs_k(df_results, 'time_per_iter',
                      "Time per Iteration", "Time / Iteration (s)")

    # 4. Spread Effectiveness vs Runtime (scatter: x is runtime, marker style encodes k)
    plt.figure(figsize=(8, 5))
    sns.scatterplot(data=df_results, x='time', y='infected_nodes', hue='strategy', style='k')
    plt.title("Spread Effectiveness vs Runtime")
    plt.xlabel("Simulation Time (s)")
    plt.ylabel("Infected Nodes")
    plt.tight_layout()
    plt.show()

    # 5. Percentage of Graph Infected
    _plot_metric_vs_k(df_results, 'infected_pct',
                      "Percentage of Graph Infected", "Infected Nodes (% of total)")

    # 6. Infected per Second
    _plot_metric_vs_k(df_results, 'infected_per_sec',
                      "Infected Nodes per Second", "Infected / Second")


## Instructions for Replication and Citation

**To replicate this notebook:**
1. Ensure you have an NVIDIA GPU and the required drivers/CUDA installed.
2. Install dependencies using the provided pip cell.
3. Download the dataset (`twitter_combined.txt`) and place it in the same directory as this notebook.
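4. Set your Graphistry API key through the `GRAPHISTRY_API_KEY` environment variable (preferred). If you paste it into the notebook instead, remove it again before sharing or committing the file.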
5. Run all cells in order.

**Citation:**
If you use this notebook or code in your work, please cite the original dataset and acknowledge the use of RAPIDS cuGraph and Graphistry.