### 1. Import e-mail data

In [None]:
# In a notebook cell (run once per kernel)
import os, sys
PROJECT_ROOT = os.path.abspath("..")  # if the notebook is in notebooks/, go up one level
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.load_graph_data import load_hetero_pt, load_imdb
from torch_geometric.transforms import ToUndirected

data = load_hetero_pt()

data = ToUndirected()(data)

### 1.1 Only keep certain nodes for training purposes

In [6]:
# Which node types do we want to KEEP in the graph?
keep = ['email', 'sender', 'url', 'domain', 'stem', 'body_cluster']   # <- adjust as you like

keep_set = set(keep)

# 1) Delete node types that are NOT in keep
for ntype in list(data.node_types):   # list(...) so we can modify while iterating
    if ntype not in keep_set:
        del data[ntype]

# 2) Delete edge types whose src or dst is NOT in keep
for et in list(data.edge_types):      # et is (src, rel, dst)
    src, rel, dst = et
    if src not in keep_set or dst not in keep_set:
        del data[et]





In [3]:
from src.graph_diagnostics import print_connectivity_report
PROJECT_ROOT = os.path.abspath("..")  # if notebook is in notebooks/
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print_connectivity_report(data)

=== Per-relation connectivity ===

Relation: ('email', 'has_sender', 'sender')  (|E|=13884)
  src[email]: 28305 nodes (this relation goes OUT from these nodes)
    - no outgoing edges in this relation (deg0): 14421
    - exactly 1 outgoing edge (deg1): 13884 (49.1%)
    - 2+ outgoing edges (deg>=2): 0 (0.0%)
  dst[sender]: 11932 nodes (this relation comes IN to these nodes)
    - never used as destination (deg0): 0
    - used by exactly 1 source (deg1): 11454 (96.0%)
    - used by 2+ sources (deg>=2): 478 (4.0%)

Relation: ('email', 'has_url', 'url')  (|E|=10900)
  src[email]: 28305 nodes (this relation goes OUT from these nodes)
    - no outgoing edges in this relation (deg0): 21074
    - exactly 1 outgoing edge (deg1): 5107 (18.0%)
    - 2+ outgoing edges (deg>=2): 2124 (7.5%)
  dst[url]: 2613 nodes (this relation comes IN to these nodes)
    - never used as destination (deg0): 0
    - used by exactly 1 source (deg1): 0 (0.0%)
    - used by 2+ sources (deg>=2): 2613 (100.0%)

Relatio

### 1.2 Normalization of node-features

In [3]:
# Normalization is now handled in the graph builder (core.graph.normalizer.normalize_graph)
pass

### 1.3 Quick sanity check of the data

In [7]:
# ---------- Print a quick summary ----------
print("=== Data loaded ===")
print("Metadata (node_types, edge_types):")
print(data.metadata())  # (['movie','director','actor'], [('movie','to','director'), ...])

# Basic counts per node type
print("\nNode counts:")
for ntype in data.node_types:
    print(f"  {ntype:>12}: {data[ntype].num_nodes}")

# Basic counts per edge type
print("\nEdge counts:")
for et in data.edge_types:
    E = data[et].edge_index.size(1)
    print(f"  {et}: {E}")

# Peek at feature availability
print("\nFeature tensors present?")
for ntype in data.node_types:
    has_x = 'x' in data[ntype]
    shape = tuple(data[ntype].x.shape) if has_x else None
    print(f"  {ntype:>12}: x present? {has_x}, shape={shape}")

=== Data loaded ===
Metadata (node_types, edge_types):
(['email', 'sender', 'url', 'domain', 'stem', 'body_cluster'], [('email', 'has_sender', 'sender'), ('email', 'has_url', 'url'), ('email', 'has_domain', 'domain'), ('email', 'has_stem', 'stem'), ('email', 'has_body_cluster', 'body_cluster'), ('sender', 'rev_has_sender', 'email'), ('url', 'rev_has_url', 'email'), ('domain', 'rev_has_domain', 'email'), ('stem', 'rev_has_stem', 'email'), ('body_cluster', 'rev_has_body_cluster', 'email')])

Node counts:
         email: 28305
        sender: 11932
           url: 2613
        domain: 2769
          stem: 1225
  body_cluster: 300

Edge counts:
  ('email', 'has_sender', 'sender'): 13884
  ('email', 'has_url', 'url'): 10900
  ('email', 'has_domain', 'domain'): 18028
  ('email', 'has_stem', 'stem'): 15638
  ('email', 'has_body_cluster', 'body_cluster'): 28305
  ('sender', 'rev_has_sender', 'email'): 13884
  ('url', 'rev_has_url', 'email'): 10900
  ('domain', 'rev_has_domain', 'email'): 18028

### 1.5 Setting device and torch seed (so training and eval can be replicated)

In [6]:
import torch

# Prefer CUDA; otherwise use CPU to avoid MPS CSR limitations in neighbor sampling
DEVICE = "mps"
print(DEVICE)
# Pick any integer seed
TORCH_SEED = 42


mps


In [7]:
# Training hyperparameters
PRIMARY_NTYPE = 'email'
HIDDEN_DIM = 128
OUT_DIM = 128
LAYERS = 2
DROPOUT = 0.3
NEG_RATIO = 1.0
BATCH_SIZE = 256
FANOUT = FANOUT = [2, 1, 3]  # 2-hop
VAL_RATIO = 0.1
TEST_RATIO = 0.1
EPOCHS = 30
LEARNING_RATE = 5e-4
WEIGHT_DECAY = 5e-4
SCORE_HEAD = 'dot'
MODEL_SAVE_NAME = 'best_model.pt'
EARLY_STOPPING_PATIENCE = 5


In [8]:
# Train again with stronger negatives; keep your fanout as before
import os, sys
PROJECT_ROOT = os.path.abspath("..")  # if notebook is in notebooks/
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.train import run_training


model, predictor, loaders, splits = run_training(
    DEVICE,
    TORCH_SEED,
    data,
    primary_ntype=PRIMARY_NTYPE,
    hidden=HIDDEN_DIM, out_dim=OUT_DIM, layers=LAYERS, dropout=DROPOUT,
    neg_ratio=NEG_RATIO,
    batch_size=BATCH_SIZE, fanout=FANOUT,     # 3-hop
    val_ratio=VAL_RATIO, test_ratio=TEST_RATIO, epochs=EPOCHS, lr=LEARNING_RATE, wd=WEIGHT_DECAY,
    score_head=SCORE_HEAD,
    model_save_name=MODEL_SAVE_NAME,
    early_stopping_patience=EARLY_STOPPING_PATIENCE,
)



Metadata: (['email', 'sender', 'url', 'domain', 'stem', 'body_cluster'], [('email', 'has_sender', 'sender'), ('email', 'has_url', 'url'), ('email', 'has_domain', 'domain'), ('email', 'has_stem', 'stem'), ('email', 'has_body_cluster', 'body_cluster'), ('body_cluster', 'body_cluster_has_email', 'email'), ('sender', 'rev_has_sender', 'email'), ('url', 'rev_has_url', 'email'), ('domain', 'rev_has_domain', 'email'), ('stem', 'rev_has_stem', 'email'), ('body_cluster', 'rev_has_body_cluster', 'email'), ('email', 'rev_body_cluster_has_email', 'body_cluster')])
Supervised edge types: [('email', 'has_sender', 'sender'), ('email', 'has_url', 'url'), ('email', 'has_domain', 'domain'), ('email', 'has_stem', 'stem'), ('email', 'has_body_cluster', 'body_cluster'), ('body_cluster', 'body_cluster_has_email', 'email'), ('sender', 'rev_has_sender', 'email'), ('url', 'rev_has_url', 'email'), ('domain', 'rev_has_domain', 'email'), ('stem', 'rev_has_stem', 'email'), ('body_cluster', 'rev_has_body_cluster', 

train epoch:   6%|▌         | 44/726 [00:26<09:51,  1.15it/s]

[train] ('email', 'has_sender', 'sender'): 44 batches in 26.8s (1.64 batches/s)


train epoch:  11%|█         | 79/726 [01:04<15:57,  1.48s/it]

[train] ('email', 'has_url', 'url'): 35 batches in 37.7s (0.93 batches/s)


train epoch:  19%|█▊        | 136/726 [02:23<15:49,  1.61s/it]

[train] ('email', 'has_domain', 'domain'): 57 batches in 79.1s (0.72 batches/s)


train epoch:  25%|██▌       | 185/726 [04:06<21:48,  2.42s/it]

[train] ('email', 'has_stem', 'stem'): 49 batches in 102.5s (0.48 batches/s)


train epoch:  38%|███▊      | 274/726 [08:02<22:26,  2.98s/it]

[train] ('email', 'has_body_cluster', 'body_cluster'): 89 batches in 236.6s (0.38 batches/s)


train epoch:  50%|█████     | 363/726 [12:18<25:25,  4.20s/it]

[train] ('body_cluster', 'body_cluster_has_email', 'email'): 89 batches in 255.7s (0.35 batches/s)


train epoch:  56%|█████▌    | 407/726 [15:51<28:52,  5.43s/it]

[train] ('sender', 'rev_has_sender', 'email'): 44 batches in 213.3s (0.21 batches/s)


train epoch:  61%|██████    | 442/726 [18:48<27:33,  5.82s/it]

[train] ('url', 'rev_has_url', 'email'): 35 batches in 176.4s (0.20 batches/s)


train epoch:  69%|██████▊   | 499/726 [23:35<23:01,  6.09s/it]

[train] ('domain', 'rev_has_domain', 'email'): 57 batches in 287.8s (0.20 batches/s)


train epoch:  75%|███████▌  | 548/726 [28:08<20:46,  7.00s/it]

[train] ('stem', 'rev_has_stem', 'email'): 49 batches in 272.1s (0.18 batches/s)


train epoch:  88%|████████▊ | 637/726 [37:56<12:38,  8.52s/it]

[train] ('body_cluster', 'rev_has_body_cluster', 'email'): 89 batches in 588.7s (0.15 batches/s)


                                                              

[train] ('email', 'rev_body_cluster_has_email', 'body_cluster'): 89 batches in 914.5s (0.10 batches/s)


eval epoch:   6%|▌         | 6/100 [00:44<11:56,  7.62s/it]

[eval] ('email', 'has_sender', 'sender'): 6 batches in 44.3s (0.14 batches/s)


eval epoch:  11%|█         | 11/100 [02:10<26:29, 17.86s/it]

[eval] ('email', 'has_url', 'url'): 5 batches in 86.4s (0.06 batches/s)


eval epoch:  19%|█▉        | 19/100 [03:11<09:58,  7.39s/it]

[eval] ('email', 'has_domain', 'domain'): 8 batches in 60.4s (0.13 batches/s)


eval epoch:  26%|██▌       | 26/100 [03:55<08:51,  7.18s/it]

[eval] ('email', 'has_stem', 'stem'): 7 batches in 44.3s (0.16 batches/s)


eval epoch:  38%|███▊      | 38/100 [05:29<09:01,  8.74s/it]

[eval] ('email', 'has_body_cluster', 'body_cluster'): 12 batches in 93.8s (0.13 batches/s)


eval epoch:  50%|█████     | 50/100 [06:21<04:12,  5.04s/it]

[eval] ('body_cluster', 'body_cluster_has_email', 'email'): 12 batches in 52.4s (0.23 batches/s)


eval epoch:  56%|█████▌    | 56/100 [06:56<04:35,  6.27s/it]

[eval] ('sender', 'rev_has_sender', 'email'): 6 batches in 35.3s (0.17 batches/s)


eval epoch:  61%|██████    | 61/100 [07:34<05:12,  8.00s/it]

[eval] ('url', 'rev_has_url', 'email'): 5 batches in 37.3s (0.13 batches/s)


eval epoch:  69%|██████▉   | 69/100 [11:49<24:20, 47.11s/it]

[eval] ('domain', 'rev_has_domain', 'email'): 8 batches in 255.6s (0.03 batches/s)


eval epoch:  76%|███████▌  | 76/100 [16:08<15:25, 38.55s/it]

[eval] ('stem', 'rev_has_stem', 'email'): 7 batches in 258.4s (0.03 batches/s)


eval epoch:  88%|████████▊ | 88/100 [20:19<03:20, 16.68s/it]

[eval] ('body_cluster', 'rev_has_body_cluster', 'email'): 12 batches in 251.7s (0.05 batches/s)


                                                             

[eval] ('email', 'rev_body_cluster_has_email', 'body_cluster'): 12 batches in 174.6s (0.07 batches/s)
[Epoch 01] train loss 0.5326 acc 0.809 | val loss 0.5329 acc 0.806
✓ Model saved to /Users/mcandersyo/ITU/Research Project/GNN-Campaign-Detection/core/GNN/models/best_model.pt


train epoch:   3%|▎         | 20/726 [07:41<6:51:19, 34.96s/it]

KeyboardInterrupt: 

In [8]:
# Resume training from a saved checkpoint

import os, sys
PROJECT_ROOT = os.path.abspath("..")  # if notebook is in notebooks/
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.train import run_training
from src.model_io import load_model_checkpoint

# Optional: check what's saved
model_ckpt, predictor_ckpt, checkpoint = load_model_checkpoint(
    DEVICE, metadata=data.metadata(), filename=MODEL_SAVE_NAME
)
print(f"Checkpoint ready from epoch {checkpoint['epoch']} with val loss {checkpoint['val_loss']:.4f}")

# Continue training; set epochs to total target (not just extra)
model, predictor, loaders, splits = run_training(
    DEVICE,
    TORCH_SEED,
    data,
    primary_ntype=PRIMARY_NTYPE,
    hidden=HIDDEN_DIM, out_dim=OUT_DIM, layers=LAYERS, dropout=DROPOUT,
    neg_ratio=NEG_RATIO,
    batch_size=BATCH_SIZE, fanout=FANOUT,
    val_ratio=VAL_RATIO, test_ratio=TEST_RATIO, epochs=20, lr=LEARNING_RATE, wd=WEIGHT_DECAY,
    score_head=SCORE_HEAD,
    model_save_name=MODEL_SAVE_NAME,
    early_stopping_patience=EARLY_STOPPING_PATIENCE,
    resume_from=MODEL_SAVE_NAME,
)


  checkpoint = torch.load(load_path, map_location=device)
  checkpoint = torch.load(get_models_dir() / filename, map_location=device)


Checkpoint ready from epoch 4 with val loss 0.5040
Metadata: (['email', 'sender', 'receiver', 'url'], [('email', 'has_sender', 'sender'), ('email', 'has_receiver', 'receiver'), ('email', 'has_url', 'url'), ('sender', 'rev_has_sender', 'email'), ('receiver', 'rev_has_receiver', 'email'), ('url', 'rev_has_url', 'email')])


NotImplementedError: The operator 'aten::_convert_indices_from_coo_to_csr.out' is not currently implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.