# Stage 0: EDA and Data Loading

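This notebook covers the initial environment setup, loading of the sample data, and basic exploratory data analysis for the project. It ends with a single forward pass through a tiny heterogeneous GNN as a smoke test that the loaded graph is usable.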

In [None]:
# Colab: GPU runtime recommended. Run these cells to prepare environment.
# 1) Basic python libs
!pip install -q numpy pandas scipy scikit-learn networkx matplotlib tqdm pyarrow pytest

# 2) Install PyTorch (choose appropriate CUDA) - Colab usually has CUDA 11.8 or similar
# CPU fallback:
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

# 3) Install PyG (this is the simple approach — adjust if CUDA available)
!pip install -q torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric

# If the above PyG install fails, use PyG official install instructions and pick matching CUDA wheel.

## Load Sample Data

In [None]:
import sys
import os

# Put the parent directory on the import path so the `src` package can be
# imported from this notebook's location.
sys.path.append('../')

from src.data_utils import load_csv_nodes_edges, build_hetero_data

# Locations of the sample node and edge tables, relative to this notebook.
nodes_csv = '../data/sample/nodes.csv'
edges_csv = '../data/sample/edges.csv'

# Step 1: read both CSV files. Step 2: assemble the graph object from them.
nodes, edges = load_csv_nodes_edges(nodes_csv, edges_csv)
data = build_hetero_data(nodes, edges)

# Print each element returned by `data.metadata()` on its own line.
print("HeteroData created:")
for entry in data.metadata():
    print(entry)

## Basic EDA

In [None]:
import matplotlib.pyplot as plt

# Number of nodes for each node type in the graph.
# NOTE: these are node *counts* per type, not node degrees.
node_counts = {node_type: data[node_type].num_nodes for node_type in data.node_types}
# Backward-compatible alias: this dict was previously exposed under the
# (misleading) name `node_degrees`; keep it so any other cell using it still works.
node_degrees = node_counts

# Bar chart: one bar per node type.
fig, ax = plt.subplots()
ax.bar(list(node_counts.keys()), list(node_counts.values()))
ax.set_title("Node Counts per Type")
ax.set_xlabel("Node type")
ax.set_ylabel("Number of nodes")
plt.show()
plt.close(fig)

# Histogram of the per-edge `time` attribute, one figure per edge type.
# Assumes every edge type stores a CPU tensor named `time` — TODO confirm
# against `build_hetero_data`.
for edge_type in data.edge_types:
    fig, ax = plt.subplots()
    ax.hist(data[edge_type].time.numpy(), bins=50)
    ax.set_title(f"Time Distribution for {edge_type}")
    ax.set_xlabel("Edge time")
    ax.set_ylabel("Number of edges")
    plt.show()
    # Release the figure explicitly so figures do not accumulate across iterations.
    plt.close(fig)

## Tiny GCN Forward Pass (Smoke Test)

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, HeteroConv, SAGEConv

class ToyGCN(torch.nn.Module):
    """Single-layer heterogeneous GNN used only as a forward-pass smoke test.

    The model wraps one `HeteroConv` layer that handles a single relation,
    ``('user', 'user->merchant', 'merchant')``, and combines per-relation
    results with ``aggr='sum'``.

    The relation connects two different node types, so the inner operator must
    support bipartite message passing. `GCNConv` does not, which is why
    `SAGEConv` with lazily inferred ``(-1, -1)`` source/destination input sizes
    is used here. The class name is kept for backward compatibility.

    Args:
        hidden_channels: Output feature size of the convolution.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = HeteroConv({
            # (-1, -1): input sizes for source and destination node features
            # are inferred on the first forward pass.
            ('user', 'user->merchant', 'merchant'): SAGEConv((-1, -1), hidden_channels),
        }, aggr='sum')

    def forward(self, x_dict, edge_index_dict):
        """Apply the heterogeneous convolution once.

        Args:
            x_dict: Node features keyed by node type.
            edge_index_dict: Edge indices keyed by edge type.

        Returns:
            The dictionary produced by `HeteroConv` (presumably keyed by the
            destination node types of the configured relations — verify
            against the installed PyG version).
        """
        return self.conv1(x_dict, edge_index_dict)

# Build the toy model and run the loaded graph through it once.
model = ToyGCN(hidden_channels=16)
output = model(data.x_dict, data.edge_index_dict)

# Report the shape of every tensor in the returned dictionary.
print(
    "Toy GCN output shape:",
    dict((name, feats.shape) for name, feats in output.items()),
)