### 1. Import e-mail data

In [52]:
# Run once per kernel: expose the project root on sys.path, then load the graph.
import os, sys

# The notebook is assumed to sit one level below the project root (e.g. notebooks/).
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.load_graph_data import load_hetero_pt, load_imdb
from torch_geometric.transforms import ToUndirected

# Load the heterogeneous e-mail graph via the project loader.
data = load_hetero_pt()

# Apply ToUndirected so each relation also gets a reverse counterpart
# (these appear as the "rev_*" edge types in the connectivity report).
make_undirected = ToUndirected()
data = make_undirected(data)

### 1.1 Keep only selected node types for training

In [53]:
# Make the project root importable (the notebook is assumed to live in notebooks/).
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.graph_diagnostics import print_connectivity_report

# Node types to KEEP in the graph -- edit this list to change the training subgraph.
keep = ['email', 'url', 'domain', 'stem']
keep_set = set(keep)

# 1) Drop every node type that is not whitelisted. The candidates are collected
#    up front so the graph is never mutated while it is being iterated.
unwanted_node_types = [
    node_type for node_type in data.node_types if node_type not in keep_set
]
for node_type in unwanted_node_types:
    del data[node_type]

# 2) Drop every edge type (src, rel, dst) that has at least one endpoint
#    outside the whitelist.
dangling_edge_types = [
    (src, rel, dst)
    for src, rel, dst in data.edge_types
    if not (src in keep_set and dst in keep_set)
]
for edge_type in dangling_edge_types:
    del data[edge_type]

# Report per-relation degree statistics for the pruned graph.
print_connectivity_report(data)



=== Per-relation connectivity ===

Relation: ('email', 'has_url', 'url')  (|E|=21790)
  src[email]  deg0=16524, deg1=8814 ( 30.0% ), deg>=2=4061 ( 13.8% ), total=29399
  dst[url]  deg0=0, deg1=11065 ( 80.6% ), deg>=2=2669 ( 19.4% ), total=13734

Relation: ('url', 'has_domain', 'domain')  (|E|=13730)
  src[url]  deg0=4, deg1=13730 ( 100.0% ), deg>=2=0 ( 0.0% ), total=13734
  dst[domain]  deg0=0, deg1=5248 ( 80.3% ), deg>=2=1285 ( 19.7% ), total=6533

Relation: ('url', 'has_stem', 'stem')  (|E|=13730)
  src[url]  deg0=4, deg1=13730 ( 100.0% ), deg>=2=0 ( 0.0% ), total=13734
  dst[stem]  deg0=0, deg1=7248 ( 97.7% ), deg>=2=174 ( 2.3% ), total=7422

Relation: ('url', 'rev_has_url', 'email')  (|E|=21790)
  src[url]  deg0=0, deg1=11065 ( 80.6% ), deg>=2=2669 ( 19.4% ), total=13734
  dst[email]  deg0=16524, deg1=8814 ( 30.0% ), deg>=2=4061 ( 13.8% ), total=29399

Relation: ('domain', 'rev_has_domain', 'url')  (|E|=13730)
  src[domain]  deg0=0, deg1=5248 ( 80.3% ), deg>=2=1285 ( 19.7% ), total

### 1.2 Normalization of node features

In [54]:
# Intentional no-op: normalization is now handled in the graph builder
# (core.graph.normalizer.normalize_graph), so this cell has nothing to run.
# It is kept only so the section numbering of the notebook stays intact.
pass

### 1.3 Quick sanity check of the data

In [55]:
# ---------- Quick structural summary of the loaded graph ----------
print("=== Data loaded ===")
print("Metadata (node_types, edge_types):")
# metadata() yields (node_types, edge_types),
# e.g. (['email', 'url', ...], [('email', 'has_url', 'url'), ...])
print(data.metadata())

# Number of nodes per node type
print("\nNode counts:")
for node_type in data.node_types:
    node_count = data[node_type].num_nodes
    print(f"  {node_type:>12}: {node_count}")

# Number of edges per edge type (second dimension of edge_index)
print("\nEdge counts:")
for edge_type in data.edge_types:
    edge_count = data[edge_type].edge_index.size(1)
    print(f"  {edge_type}: {edge_count}")

# Which node types carry a feature tensor `x`, and what shape does it have?
print("\nFeature tensors present?")
for node_type in data.node_types:
    store = data[node_type]
    if 'x' in store:
        has_features, feature_shape = True, tuple(store.x.shape)
    else:
        has_features, feature_shape = False, None
    print(f"  {node_type:>12}: x present? {has_features}, shape={feature_shape}")

=== Data loaded ===
Metadata (node_types, edge_types):
(['email', 'url', 'domain', 'stem'], [('email', 'has_url', 'url'), ('url', 'has_domain', 'domain'), ('url', 'has_stem', 'stem'), ('url', 'rev_has_url', 'email'), ('domain', 'rev_has_domain', 'url'), ('stem', 'rev_has_stem', 'url')])

Node counts:
         email: 29399
           url: 13734
        domain: 6533
          stem: 7422

Edge counts:
  ('email', 'has_url', 'url'): 21790
  ('url', 'has_domain', 'domain'): 13730
  ('url', 'has_stem', 'stem'): 13730
  ('url', 'rev_has_url', 'email'): 21790
  ('domain', 'rev_has_domain', 'url'): 13730
  ('stem', 'rev_has_stem', 'url'): 13730

Feature tensors present?
         email: x present? True, shape=(29399, 388)
           url: x present? True, shape=(13734, 2)
        domain: x present? True, shape=(6533, 10)
          stem: x present? True, shape=(7422, 10)


### 1.5 Set the device and torch seed (so training and evaluation can be reproduced)

In [58]:
import torch

# Pick the best available backend instead of hard-coding "mps", so the notebook
# also runs on machines without Apple-silicon MPS: CUDA first, then MPS, then CPU.
# NOTE(review): the previous comment warned about MPS CSR limitations in neighbor
# sampling -- if that still applies, drop the MPS branch so it falls back to CPU.
_mps_backend = getattr(torch.backends, "mps", None)  # absent in older torch builds
if torch.cuda.is_available():
    DEVICE = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
print(DEVICE)

# Integer seed (any value works); kept fixed so runs can be replicated
TORCH_SEED = 42


mps


In [59]:
# ---- Training hyperparameters ----

# Target node type and scoring head
PRIMARY_NTYPE = 'email'
SCORE_HEAD = 'dot'

# Model size
HIDDEN_DIM = OUT_DIM = 128
LAYERS = 2
DROPOUT = 0.1

# Sampling / batching
FANOUT = [2, 1]  # 2-hop neighbourhood
BATCH_SIZE = 512
NEG_RATIO = 1.0

# Data split
VAL_RATIO = TEST_RATIO = 0.1

# Optimisation schedule
EPOCHS = 30
LEARNING_RATE = WEIGHT_DECAY = 5e-4
EARLY_STOPPING_PATIENCE = 5

# Checkpoint file name
MODEL_SAVE_NAME = 'best_model.pt'


In [60]:
# Train from scratch using the device, seed and hyperparameter constants of this notebook.
import os, sys

# Make the project root importable (the notebook is assumed to live in notebooks/).
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.train import run_training
from src.eval_link import collect_scores, topk_eval_with_splits

# Keyword arguments for run_training, gathered in one place for readability.
train_config = dict(
    primary_ntype=PRIMARY_NTYPE,
    hidden=HIDDEN_DIM,
    out_dim=OUT_DIM,
    layers=LAYERS,
    dropout=DROPOUT,
    neg_ratio=NEG_RATIO,
    batch_size=BATCH_SIZE,
    fanout=FANOUT,  # 2-hop
    val_ratio=VAL_RATIO,
    test_ratio=TEST_RATIO,
    epochs=EPOCHS,
    lr=LEARNING_RATE,
    wd=WEIGHT_DECAY,
    score_head=SCORE_HEAD,
    model_save_name=MODEL_SAVE_NAME,
    early_stopping_patience=EARLY_STOPPING_PATIENCE,
)

model, predictor, loaders, splits = run_training(
    DEVICE, TORCH_SEED, data, **train_config
)



Metadata: (['email', 'url', 'domain', 'stem'], [('email', 'has_url', 'url'), ('url', 'has_domain', 'domain'), ('url', 'has_stem', 'stem'), ('url', 'rev_has_url', 'email'), ('domain', 'rev_has_domain', 'url'), ('stem', 'rev_has_stem', 'url')])
Supervised edge types: [('email', 'has_url', 'url'), ('url', 'rev_has_url', 'email')]
Build train graph!
Build link loaders!
Starting training!
[Epoch 01] train loss 22.1326 acc 0.833 | val loss 6.7401 acc 0.794
✓ Model saved to /Users/mcandersyo/ITU/Research Project/GNN-Campaign-Detection/core/GNN/models/best_model.pt
[Epoch 02] train loss 9.3399 acc 0.911 | val loss 3.6949 acc 0.850
✓ Model saved to /Users/mcandersyo/ITU/Research Project/GNN-Campaign-Detection/core/GNN/models/best_model.pt
[Epoch 03] train loss 3.5897 acc 0.927 | val loss 3.7557 acc 0.856
[Epoch 04] train loss 4.9075 acc 0.934 | val loss 6.2889 acc 0.796
[Epoch 05] train loss 3.0666 acc 0.938 | val loss 2.7411 acc 0.867
✓ Model saved to /Users/mcandersyo/ITU/Research Project/GNN

In [16]:
# Continue training from a previously saved checkpoint.

import os, sys

# Make the project root importable (the notebook is assumed to live in notebooks/).
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.train import run_training
from src.model_io import load_model_checkpoint

# Optional inspection step: load the checkpoint once to see what it contains.
model_ckpt, predictor_ckpt, checkpoint = load_model_checkpoint(
    DEVICE,
    metadata=data.metadata(),
    filename=MODEL_SAVE_NAME,
)
print(f"Checkpoint ready from epoch {checkpoint['epoch']} with val loss {checkpoint['val_loss']:.4f}")

# Keyword arguments for the resumed run.
resume_config = dict(
    primary_ntype=PRIMARY_NTYPE,
    hidden=HIDDEN_DIM,
    out_dim=OUT_DIM,
    layers=LAYERS,
    dropout=DROPOUT,
    neg_ratio=NEG_RATIO,
    batch_size=BATCH_SIZE,
    fanout=FANOUT,
    val_ratio=VAL_RATIO,
    test_ratio=TEST_RATIO,
    # `epochs` is the TOTAL target epoch count, not the number of extra epochs.
    # NOTE(review): 20 differs from EPOCHS (30) in the hyperparameter cell -- confirm intentional.
    epochs=20,
    lr=LEARNING_RATE,
    wd=WEIGHT_DECAY,
    score_head=SCORE_HEAD,
    model_save_name=MODEL_SAVE_NAME,
    early_stopping_patience=EARLY_STOPPING_PATIENCE,
    resume_from=MODEL_SAVE_NAME,
)

model, predictor, loaders, splits = run_training(
    DEVICE, TORCH_SEED, data, **resume_config
)


  checkpoint = torch.load(load_path, map_location=device)
  checkpoint = torch.load(get_models_dir() / filename, map_location=device)


Checkpoint ready from epoch 5 with val loss 0.5770
Metadata: (['email', 'sender', 'url'], [('email', 'has_sender', 'sender'), ('email', 'has_url', 'url'), ('sender', 'rev_has_sender', 'email'), ('url', 'rev_has_url', 'email')])
Resuming from epoch 5 (best val 0.5770) using checkpoint 'best_model.pt'
Starting training!
[Epoch 06] train loss 0.5849 acc 0.962 | val loss 0.5195 acc 0.882
✓ Model saved to /Users/mcandersyo/ITU/Research Project/GNN-Campaign-Detection/core/GNN/models/best_model.pt
[Epoch 07] train loss 0.7713 acc 0.965 | val loss 0.8037 acc 0.880
[Epoch 08] train loss 0.2400 acc 0.966 | val loss 0.4832 acc 0.896
✓ Model saved to /Users/mcandersyo/ITU/Research Project/GNN-Campaign-Detection/core/GNN/models/best_model.pt
[Epoch 09] train loss 0.4104 acc 0.970 | val loss 0.5190 acc 0.909
[Epoch 10] train loss 0.2990 acc 0.970 | val loss 0.3417 acc 0.916
✓ Model saved to /Users/mcandersyo/ITU/Research Project/GNN-Campaign-Detection/core/GNN/models/best_model.pt
[Epoch 11] train l