# Data Exploration: Amazon Product Co-Purchasing Network

This notebook performs initial exploration and preprocessing of the Amazon Product Co-Purchasing Network dataset from SNAP.

## Objectives:
1. Load the dataset
2. Clean and preprocess the graph
3. Explore basic network properties
4. Visualize degree distributions
5. Prepare data for link prediction tasks


## 1. Import Libraries

Import all necessary libraries for data loading, preprocessing, analysis, and visualization.


In [None]:
# Standard library imports
import os
import sys
from pathlib import Path

# Locate the project root so that modules under `src/` can be imported and
# relative data paths resolve. The kernel may be started either from the
# `notebooks/` directory or from the project root itself, so handle both.
project_root = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()

# Put `src/` at the front of the import search path
sys.path.insert(0, str(project_root / 'src'))

# Run from the project root so relative paths such as "data/raw" resolve there
os.chdir(project_root)

print(f"Project root: {project_root}")
print(f"Working directory: {os.getcwd()}")

# Data manipulation
import pandas as pd
import numpy as np

# Network analysis
import networkx as nx

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Project modules
from data_loader import (
    download_dataset, 
    load_graph, 
    load_communities,
    save_graph
)
from preprocessing import (
    remove_self_loops,
    get_largest_component,
    basic_statistics,
    create_train_test_split,
    save_splits
)
from exploratory_analysis import (
    degree_distribution,
    plot_degree_distribution,
    compute_network_stats,
    generate_statistics_report
)

# Configure pandas display: lift the limits on column count, overall width,
# and per-cell width so DataFrames are shown in full
for option_name in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(option_name, None)

# Plotting style: prefer 'seaborn-v0_8-darkgrid', then 'seaborn-darkgrid',
# and fall back to 'ggplot' when neither is available
style_candidates = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
chosen_style = next(
    (name for name in style_candidates if name in plt.style.available),
    'ggplot',
)
plt.style.use(chosen_style)
sns.set_palette("husl")

print("Libraries imported successfully!")


## 2. Load Dataset

Download and load the Amazon co-purchasing network dataset. The dataset will be automatically downloaded if it doesn't exist in the `data/raw/` directory.


In [None]:
# Fetch the raw files (if not already present). The returned mapping is
# indexed below by the 'graph' and 'communities' keys.
print("Downloading dataset (if needed)...")
filepaths = download_dataset(data_dir="data/raw")
graph_file = filepaths['graph']
communities_file = filepaths['communities']
print(f"Graph file: {graph_file}")
print(f"Communities file: {communities_file}")

# Load the graph from the gzipped graph file
print("\nLoading graph...")
G = load_graph(graph_file, is_gzipped=True)
initial_nodes = G.number_of_nodes()
initial_edges = G.number_of_edges()
print(f"Initial graph loaded: {initial_nodes:,} nodes, {initial_edges:,} edges")

# Load the ground-truth communities from the gzipped communities file
print("\nLoading communities...")
communities = load_communities(communities_file, is_gzipped=True)
print(f"Loaded {len(communities):,} communities")


## 3. Preprocessing

Clean the graph by removing self-loops and extracting the largest connected component. This ensures we work with a clean, connected network.


In [None]:
# Step 1: strip self-loops from the loaded graph
print("Removing self-loops...")
G_cleaned = remove_self_loops(G)
cleaned_nodes = G_cleaned.number_of_nodes()
cleaned_edges = G_cleaned.number_of_edges()
print(f"After removing self-loops: {cleaned_nodes:,} nodes, {cleaned_edges:,} edges")

# Step 2: keep only the largest connected component
print("\nExtracting largest connected component...")
G_largest = get_largest_component(G_cleaned)
largest_nodes = G_largest.number_of_nodes()
largest_edges = G_largest.number_of_edges()
print(f"Largest component: {largest_nodes:,} nodes, {largest_edges:,} edges")

# Share of the originally loaded nodes that survive preprocessing
node_retention = 100 * (largest_nodes / G.number_of_nodes())
print(f"\nNode retention: {node_retention:.2f}% of original graph")


## 4. Basic Statistics

Compute and display basic network statistics in a formatted table.


In [None]:
# Gather the statistics for the preprocessed graph
print("Computing basic statistics...")
stats = basic_statistics(G_largest)

# Lay the result out as a one-column table: one row per metric.
# NOTE(review): assumes `basic_statistics` returns a flat mapping of
# metric name -> value; confirm against the preprocessing module.
stats_df = pd.DataFrame([stats]).T.rename_axis('Metric')
stats_df.columns = ['Value']

# Print the table framed by separator lines
banner = "=" * 60
print("\n" + banner)
print("BASIC NETWORK STATISTICS")
print(banner)
print(stats_df.to_string())
print(banner)


## 5. Visualizations

Create visualizations to understand the network structure, including degree distributions and network statistics.


In [None]:
# Make sure the figure output directory exists before anything is saved into it
figures_dir = Path("results/figures")
figures_dir.mkdir(parents=True, exist_ok=True)

# 5.1 Degree distribution (linear and log-log), written to disk by the helper
print("Creating degree distribution plots...")
plot_degree_distribution(
    G_largest,
    save_path="results/figures/degree_distribution.png",
)
print("Saved: results/figures/degree_distribution.png")


In [None]:
# 5.2 Degree histogram annotated with summary statistics
print("Creating degree histogram...")
degree_series = degree_distribution(G_largest)

# Text for the annotation box, one statistic per line
summary_lines = [
    f'Mean: {degree_series.mean():.2f}',
    f'Median: {degree_series.median():.2f}',
    f'Std: {degree_series.std():.2f}',
    f'Min: {degree_series.min()}',
    f'Max: {degree_series.max()}',
]
stats_text = '\n'.join(summary_lines)

fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(
    degree_series.values,
    bins=50,
    edgecolor='black',
    alpha=0.7,
    color='steelblue',
)
ax.set_xlabel('Degree', fontsize=12, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=12, fontweight='bold')
ax.set_title('Degree Distribution Histogram', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)

# Place the box in axes coordinates (upper right area of the plot)
box_style = {'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.7}
ax.text(
    0.7, 0.95, stats_text,
    transform=ax.transAxes,
    fontsize=11,
    verticalalignment='top',
    bbox=box_style,
)

plt.tight_layout()
plt.savefig("results/figures/degree_histogram.png", dpi=300, bbox_inches='tight')
plt.show()
print("Saved: results/figures/degree_histogram.png")


In [None]:
# 5.3 Network Statistics Bar Chart
print("Creating network statistics bar chart...")

# Compute comprehensive statistics.
# NOTE(review): `sample_size` is forwarded as-is; presumably it bounds the
# sampling used for the expensive metrics — confirm in exploratory_analysis.
network_stats = compute_network_stats(G_largest, sample_size=10000)

# Select key statistics for visualization
viz_stats = {
    'Number of Nodes': network_stats['num_nodes'],
    'Number of Edges': network_stats['num_edges'],
    'Density (×10^6)': network_stats['density'] * 1e6,  # Scale for visibility
    'Avg Degree': network_stats['avg_degree'],
    'Avg Clustering': network_stats['avg_clustering'],
    'Num Triangles (×10^3)': network_stats['num_triangles'] / 1000,  # Scale for visibility
}

bar_colors = ['steelblue', 'coral', 'lightgreen', 'gold', 'plum', 'skyblue']

fig, ax = plt.subplots(figsize=(12, 6))
# Pass concrete lists rather than dict views so the inputs are plain sequences
bars = ax.bar(list(viz_stats.keys()), list(viz_stats.values()),
              color=bar_colors,
              edgecolor='black', alpha=0.7)

# The metrics range from a clustering coefficient (at most 1) up to raw
# node/edge counts. On a linear axis the small bars collapse to zero height,
# so use a logarithmic y-axis to keep every bar readable.
ax.set_yscale('log')

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    if height <= 0:
        # Non-positive values have no position on a log axis; skip the label
        continue
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.2f}',
            ha='center', va='bottom', fontsize=10, fontweight='bold')

ax.set_ylabel('Value (log scale)', fontsize=12, fontweight='bold')
ax.set_title('Network Statistics Summary', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig("results/figures/network_statistics.png", dpi=300, bbox_inches='tight')
plt.show()
print("Saved: results/figures/network_statistics.png")


## 6. Save Cleaned Graph

Save the preprocessed graph for future use to avoid reprocessing.


In [None]:
# Persist the preprocessed graph so later notebooks can skip the cleaning steps
print("Saving cleaned graph...")
cleaned_graph_path = "data/processed/amazon_graph_cleaned.pkl"
save_graph(G_largest, cleaned_graph_path)
print(f"Cleaned graph saved to: {cleaned_graph_path}")

# Echo the size of what was written
saved_nodes = G_largest.number_of_nodes()
saved_edges = G_largest.number_of_edges()
print(f"Graph summary: {saved_nodes:,} nodes, {saved_edges:,} edges")


## 7. Train/Test Split for Link Prediction

Split the graph edges into training and test sets for link prediction tasks. This creates positive and negative examples for evaluation.


In [None]:
# Hold out 20% of the edges for testing (test_ratio=0.2) with a fixed seed
print("Creating train/test split for link prediction...")
G_train, positive_test_edges, negative_test_edges = create_train_test_split(
    G_largest, test_ratio=0.2, seed=42
)

# Sizes used in the summary below
train_node_count = G_train.number_of_nodes()
train_edge_count = G_train.number_of_edges()
positive_count = len(positive_test_edges)
negative_count = len(negative_test_edges)

print("\nSplit Summary:")
print(f"  Training graph: {train_node_count:,} nodes, {train_edge_count:,} edges")
print(f"  Positive test edges: {positive_count:,}")
print(f"  Negative test edges: {negative_count:,}")
# Realised ratio: held-out positive edges over (held-out + training) edges
realised_ratio = positive_count / (positive_count + G_train.number_of_edges())
print(f"  Test ratio: {realised_ratio:.2%}")

# Write the training graph and both test edge sets to disk
print("\nSaving train/test splits...")
save_splits(
    G_train,
    positive_test_edges,
    negative_test_edges,
    output_dir="data/processed/splits",
)
print("Splits saved to: data/processed/splits/")


## 8. Ground-Truth Communities Summary

Analyze and display summary statistics about the ground-truth communities in the dataset.


In [None]:
# Analyze communities
print("Analyzing ground-truth communities...")
print("="*60)

# Basic community statistics: one size per community.
# `communities` is used as a mapping whose values are node collections.
community_sizes = [len(nodes) for nodes in communities.values()]
community_sizes_series = pd.Series(community_sizes)

print(f"Total number of communities: {len(communities):,}")
if community_sizes:
    print(f"\nCommunity Size Statistics:")
    print(f"  Minimum size: {min(community_sizes):,}")
    print(f"  Maximum size: {max(community_sizes):,}")
    print(f"  Mean size: {np.mean(community_sizes):.2f}")
    print(f"  Median size: {np.median(community_sizes):.2f}")
    print(f"  Standard deviation: {np.std(community_sizes):.2f}")
else:
    # min()/max() raise ValueError on an empty sequence
    print("\nNo communities loaded; skipping size statistics.")

# Count nodes in communities. The total counts a node once per community it
# belongs to; the unique count collapses overlapping memberships.
total_nodes_in_communities = sum(community_sizes)
community_nodes = {node for nodes in communities.values() for node in nodes}
unique_nodes_in_communities = len(community_nodes)

# The communities were loaded from the raw file, while G_largest is the
# preprocessed graph. Only community nodes that are still in G_largest may
# count towards its coverage, otherwise nodes dropped during preprocessing
# inflate the percentage (it could even exceed 100%).
# NOTE(review): relies on G_largest supporting `node in G` (networkx graphs do)
# and on community node IDs having the same type as graph node IDs — confirm.
community_nodes_in_graph = sum(1 for node in community_nodes if node in G_largest)
graph_node_count = G_largest.number_of_nodes()
coverage_pct = (community_nodes_in_graph / graph_node_count * 100) if graph_node_count else 0.0

print(f"\nNode Coverage:")
print(f"  Total nodes in communities: {total_nodes_in_communities:,}")
print(f"  Unique nodes in communities: {unique_nodes_in_communities:,}")
print(f"  Community nodes present in graph: {community_nodes_in_graph:,}")
print(f"  Coverage of graph nodes: {coverage_pct:.2f}%")

# Display distribution of community sizes (most frequent sizes first)
print(f"\nCommunity Size Distribution (top 10):")
print(community_sizes_series.value_counts().head(10).to_string())

print("="*60)


In [None]:
# Two side-by-side views of the community size distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: plain histogram of community sizes
ax1.hist(community_sizes, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
ax1.grid(True, alpha=0.3)

# Right panel: how many communities have each size, drawn on log-log axes
community_size_counts = (
    pd.Series(community_sizes)
    .value_counts()
    .sort_index()
    .loc[lambda counts: counts > 0]
)
ax2.scatter(
    community_size_counts.index,
    community_size_counts.values,
    alpha=0.6,
    s=50,
    color='coral',
    edgecolors='black',
    linewidth=0.5,
)
ax2.set_xscale('log')
ax2.set_yscale('log')
ax2.grid(True, alpha=0.3, which='both')

# Axis labels and titles share the same font settings on both panels
panel_text = (
    (ax1, 'Community Size', 'Frequency', 'Community Size Distribution'),
    (ax2, 'Community Size (log scale)', 'Frequency (log scale)',
     'Community Size Distribution (Log-Log)'),
)
for axis, x_label, y_label, title in panel_text:
    axis.set_xlabel(x_label, fontsize=12, fontweight='bold')
    axis.set_ylabel(y_label, fontsize=12, fontweight='bold')
    axis.set_title(title, fontsize=13, fontweight='bold')

plt.tight_layout()
plt.savefig("results/figures/community_size_distribution.png", dpi=300, bbox_inches='tight')
plt.show()
print("Saved: results/figures/community_size_distribution.png")


## Summary

This notebook has completed the initial data exploration:

✅ **Dataset Loaded**: Amazon co-purchasing network with ground-truth communities  
✅ **Preprocessed**: Removed self-loops and extracted largest component  
✅ **Statistics Computed**: Basic and comprehensive network metrics  
✅ **Visualizations Created**: Degree distributions, network statistics, and community size distribution  
✅ **Graph Saved**: Cleaned graph saved for future use  
✅ **Splits Created**: Train/test split prepared for link prediction  
✅ **Communities Analyzed**: Ground-truth community statistics computed  

### Next Steps:
- Use the cleaned graph for community detection algorithms
- Train link prediction models using the train/test splits
- Perform deeper network analysis and feature engineering
