# SIR Spreading on Similarity Network

Measure each node's influence as the final outbreak size of SIR epidemics seeded at that node in a weighted user-similarity network.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Global plot appearance for every figure in this notebook.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic (not plain Python): draw figures inline in the notebook output.
%matplotlib inline

## 1. Load Network

In [None]:
# Load the raw interaction log (path is relative to the notebook's directory).
interactions = pd.read_csv('../data/raw/small_matrix.csv')

# User x video matrix of watch ratios. pivot_table aggregates duplicate
# (user, video) rows with its default aggregation (mean); pairs that never
# occur are filled with 0.
user_video_matrix = interactions.pivot_table(
    index='user_id',
    columns='video_id',
    values='watch_ratio',
    fill_value=0
)

# Pairwise cosine similarity between user rows, plus a labelled view of it.
similarity_matrix = cosine_similarity(user_video_matrix.values)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=user_video_matrix.index,
    columns=user_video_matrix.index
)

G = nx.Graph()
users = user_video_matrix.index.tolist()
G.add_nodes_from(users)

# Connect every unordered user pair, weighted by their similarity.
# Weights are read positionally from the NumPy array: its rows/columns are in
# the same order as `users`, and this avoids one label-based `.loc` lookup per
# pair. Pairs are visited in the same row-major (i < j) order as a nested loop.
# NOTE: zero-similarity pairs are kept as zero-weight edges, so the graph is
# complete and those edges count toward the statistics printed below.
pair_rows, pair_cols = np.triu_indices(len(users), k=1)
G.add_weighted_edges_from(
    (users[i], users[j], similarity_matrix[i, j])
    for i, j in zip(pair_rows.tolist(), pair_cols.tolist())
)

print(f"Network: {G.number_of_nodes():,} nodes, {G.number_of_edges():,} edges")
print(f"Mean similarity: {np.mean([d['weight'] for u, v, d in G.edges(data=True)]):.3f}")

## 2. SIR Model

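- Transmission: at each time step, an infected node infects each susceptible neighbor with probability β × edge weight (similarity)
- Recovery: at each time step, an infected node recovers with fixed probability γ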

In [None]:
def run_sir(G, seed, beta, gamma, max_steps=100, rng=None):
    """Run one discrete-time SIR simulation started from a single seed node.

    Every step has two phases, in this order:

    1. Recovery: each currently infected node recovers with probability
       ``gamma``.
    2. Infection: each node that is still infected tries to infect every
       susceptible neighbor independently, succeeding with probability
       ``beta * weight`` of the connecting edge.

    Nodes infected during a step neither transmit nor recover until the
    following step. The run stops early as soon as nobody is infected.

    Args:
        G: Graph exposing ``number_of_nodes()``, ``neighbors(node)`` and
            ``G[u][v]['weight']`` (for example a weighted ``networkx.Graph``).
        seed: Node infected at time 0. It is assumed to be a node of ``G``;
            the initial susceptible count is ``number_of_nodes() - 1``.
        beta: Transmission scale multiplied by the edge weight.
        gamma: Per-step recovery probability.
        max_steps: Maximum number of steps to simulate.
        rng: Optional random source with a ``random()`` method returning a
            float in ``[0, 1)`` (e.g. ``numpy.random.default_rng(42)``).
            When ``None`` the global ``np.random`` state is used.

    Returns:
        Tuple ``(S_counts, I_counts, R_counts)`` of equally long lists.
        Index 0 holds the initial state and each simulated step appends one
        entry, so a run of ``k`` steps yields lists of length ``k + 1``.
    """
    # One uniform draw per Bernoulli trial; injectable for reproducible runs.
    draw = np.random.random if rng is None else rng.random
    total_nodes = G.number_of_nodes()

    infected = {seed}
    recovered = set()

    S_counts = [total_nodes - 1]
    I_counts = [1]
    R_counts = [0]

    for _ in range(max_steps):
        # Recovery phase: decided before any transmission in this step.
        newly_recovered = {node for node in infected if draw() < gamma}
        infected -= newly_recovered
        recovered |= newly_recovered

        # Infection phase: collect new cases separately so they cannot
        # transmit during the step in which they were infected.
        new_infected = set()
        for node in infected:
            adjacency = G[node]
            for neighbor in G.neighbors(node):
                if neighbor in infected or neighbor in recovered:
                    continue
                if draw() < beta * adjacency[neighbor]['weight']:
                    new_infected.add(neighbor)
        infected |= new_infected

        # Record the compartment sizes after this step.
        S_counts.append(total_nodes - len(infected) - len(recovered))
        I_counts.append(len(infected))
        R_counts.append(len(recovered))

        # The epidemic is over once nobody is infected.
        if not infected:
            break

    return S_counts, I_counts, R_counts


def measure_influence(G, seed, beta, gamma, num_runs=30):
    """Estimate the spreading power of ``seed`` from repeated SIR runs.

    Calls ``run_sir(G, seed, beta, gamma)`` ``num_runs`` times and averages
    the per-run outcomes.

    Args:
        G: Graph passed through to ``run_sir``.
        seed: Node at which every simulated outbreak starts.
        beta: Transmission scale passed through to ``run_sir``.
        gamma: Recovery probability passed through to ``run_sir``.
        num_runs: Number of independent simulations to average; must be >= 1.

    Returns:
        Dict with the keys:
            'final_outbreak': mean number of nodes ever infected (seed
                included).
            'std_outbreak': standard deviation of that number across runs
                (``np.std`` defaults).
            'peak_infection': mean of the largest simultaneous infected count.
            'duration': mean number of simulated time steps per run.
            'attack_rate': 'final_outbreak' divided by the number of nodes.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1 (the averages would
            otherwise be NaN).
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    final_outbreak_sizes = []
    peak_infections = []
    epidemic_durations = []

    for _ in range(num_runs):
        _, I, R = run_sir(G, seed, beta, gamma)

        # Ever-infected nodes = recovered + still infected. The second term
        # is 0 when the epidemic died out, but not when the run was cut off
        # at the simulation's step limit.
        final_outbreak_sizes.append(I[-1] + R[-1])

        # Largest number of simultaneously infected nodes.
        peak_infections.append(max(I))

        # The series holds the t=0 state plus one entry per step, so the
        # number of elapsed steps is one less than its length.
        epidemic_durations.append(len(I) - 1)

    mean_outbreak = np.mean(final_outbreak_sizes)
    return {
        'final_outbreak': mean_outbreak,
        'std_outbreak': np.std(final_outbreak_sizes),
        'peak_infection': np.mean(peak_infections),
        'duration': np.mean(epidemic_durations),
        'attack_rate': mean_outbreak / G.number_of_nodes(),
    }

## 3. Test Seeds

In [None]:
# Weighted degree ("strength") and weighted PageRank for every node
strength = dict(G.degree(weight='weight'))
pagerank = nx.pagerank(G, weight='weight')

# (node, strength) pairs ranked from strongest to weakest, and the reverse
top_strength = sorted(strength.items(), key=lambda x: x[1], reverse=True)
bottom_strength = sorted(strength.items(), key=lambda x: x[1])

# One seed per strength regime plus a uniformly random user
middle_rank = len(top_strength) // 2
strongest_node = top_strength[0][0]
weakest_node = bottom_strength[0][0]
median_node = top_strength[middle_rank][0]

test_seeds = {
    'High Strength': strongest_node,
    'Low Strength': weakest_node,
    'Medium Strength': median_node,
    'Random': np.random.choice(users),
}

print("Test seeds:")
for label in test_seeds:
    node = test_seeds[label]
    print(f"  {label:15s}: User {node:5d} (strength={strength[node]:.1f})")

## 4. Run Test Simulations

In [None]:
# Simulation parameters: transmission scale, recovery probability, repetitions
BETA, GAMMA, NUM_RUNS = 0.3, 0.1, 50

print(f"Running SIR simulations (β={BETA}, γ={GAMMA}, {NUM_RUNS} runs per seed)...\n")

# Averaged outbreak metrics per seed label, reported as each seed finishes
test_results = {}
for label, seed in test_seeds.items():
    test_results[label] = metrics = measure_influence(
        G, seed, BETA, GAMMA, num_runs=NUM_RUNS
    )
    print(f"{label:15s}: outbreak={metrics['final_outbreak']:6.1f} ({metrics['attack_rate']*100:4.1f}%), peak={metrics['peak_infection']:5.1f}")

print("\nObservation: Does high strength → larger outbreaks?")

## 5. Visualize SIR Curves

In [None]:
# Mean S, I, R trajectories for the high- and low-strength seeds, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, label in zip(axes, ('High Strength', 'Low Strength')):
    seed = test_seeds[label]

    # 30 independent simulations from this seed
    all_S, all_I, all_R = [], [], []
    for _ in range(30):
        S, I, R = run_sir(G, seed, BETA, GAMMA)
        all_S.append(S)
        all_I.append(I)
        all_R.append(R)

    # Runs stop at different times; hold each run's final value so that all
    # series share the length of the longest one before averaging.
    max_len = max(map(len, all_S))
    times = np.arange(max_len)

    compartments = (
        (all_S, 'S (Susceptible)'),
        (all_I, 'I (Infected)'),
        (all_R, 'R (Recovered)'),
    )
    for series_list, curve_label in compartments:
        padded = [
            series + [series[-1]] * (max_len - len(series))
            for series in series_list
        ]
        ax.plot(times, np.mean(padded, axis=0), label=curve_label, linewidth=2)

    ax.set_xlabel('Time Steps')
    ax.set_ylabel('Number of Nodes')
    ax.set_title(f'SIR Dynamics: {label}')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Measure All Nodes

In [None]:
NUM_RUNS_PER_NODE = 20

print(f"Measuring influence for all {len(users)} nodes...")
print(f"β={BETA}, γ={GAMMA}, {NUM_RUNS_PER_NODE} runs per node (5-10 min)\n")

# Simulated outbreak metrics per node, joined with its structural scores
influence = {}
for node in tqdm(users):
    metrics = measure_influence(G, node, BETA, GAMMA, NUM_RUNS_PER_NODE)
    influence[node] = dict(
        metrics,
        strength=strength[node],
        pagerank=pagerank[node],
    )

# One row per user, largest mean outbreak first
df = (
    pd.DataFrame.from_dict(influence, orient='index')
    .rename_axis('user_id')
    .sort_values('final_outbreak', ascending=False)
)

report_columns = ['final_outbreak', 'attack_rate', 'peak_infection', 'strength']

print("\nTop 10 most influential (largest outbreaks):")
print(df.head(10)[report_columns].to_string())

print("\nBottom 10 least influential:")
print(df.tail(10)[report_columns].to_string())

## 7. Correlation Analysis

In [None]:
# Correlations (pandas default method) between simulated spread and the
# structural scores of each seed node
outbreak_sizes = df['final_outbreak']
node_strength = df['strength']

corr_strength = outbreak_sizes.corr(node_strength)
corr_pagerank = outbreak_sizes.corr(df['pagerank'])
corr_peak = df['peak_infection'].corr(node_strength)

print("Correlation with outbreak size:")
print("="*60)
print(f"Final outbreak vs Strength:  {corr_strength:7.4f}")
print(f"Final outbreak vs PageRank:  {corr_pagerank:7.4f}")
print(f"Peak infection vs Strength:  {corr_peak:7.4f}")
print()
print("Interpretation:")

# Verbal verdict from fixed thresholds on the strength correlation
if corr_strength > 0.7:
    verdict = "  ✓ Strength STRONGLY predicts outbreak size"
elif corr_strength > 0.3:
    verdict = "  → Strength moderately predicts outbreak size"
else:
    verdict = "  ✗ Strength weakly predicts outbreak size"
print(verdict)

In [None]:
# Strength on the x-axis against two outbreak measures, one panel each
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = (
    ('final_outbreak', 'Final Outbreak Size',
     f'Outbreak Size vs Strength (r={corr_strength:.3f})'),
    ('attack_rate', 'Attack Rate (fraction infected)',
     'Attack Rate vs Strength'),
)

for ax, (column, y_label, title) in zip(axes, panels):
    ax.scatter(df['strength'], df[column], alpha=0.5, s=30)
    ax.set_xlabel('Strength')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Save Results

In [None]:
import os

# Make sure the output directory exists before writing anything
os.makedirs('../results', exist_ok=True)

# Per-node influence table (index column: user_id)
df.to_csv('../results/similarity_network_sir_influence.csv')
print("Saved: results/similarity_network_sir_influence.csv")

# Single-row summary describing this experiment
summary_row = {
    'network_type': 'similarity_weighted',
    'model': 'SIR',
    'num_nodes': G.number_of_nodes(),
    'density': nx.density(G),
    'beta': BETA,
    'gamma': GAMMA,
    'metric': 'final_outbreak',
    'corr_strength': corr_strength,
    'corr_pagerank': corr_pagerank,
    'mean_attack_rate': df['attack_rate'].mean(),
    # df is sorted by final_outbreak descending: first row is the top node
    'top_node': df.index[0],
    'bottom_node': df.index[-1],
}
summary = pd.DataFrame([summary_row])
summary.to_csv('../results/similarity_network_sir_summary.csv', index=False)
print("Saved: results/similarity_network_sir_summary.csv")

print("\nReady for comparison!")