# Stock Market Social Network - Temporal Link Prediction Pipeline

Build quarterly bipartite graphs of fund-stock holdings and perform temporal link prediction using sliding windows (2021-2024).

**Key Features:**
- Load holdings data 2021-2024 only
- Build separate bipartite graph per quarter
- Sliding window: train on a configurable number of quarters (`TRAIN_WINDOW`, set to 3 in this notebook), predict the next quarter
- Strict temporal causality (no future leakage)
- Per-quarter evaluation metrics (AUC, Precision, Recall)

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import os
import re
import warnings
import glob

# Graph libraries
import networkx as nx
from networkx.algorithms import bipartite
from networkx.algorithms.centrality import degree_centrality, closeness_centrality
from networkx.algorithms.link_analysis.pagerank_alg import pagerank
from networkx.algorithms.link_analysis.hits_alg import hits
import igraph as ig
import leidenalg as la

# ML libraries
from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity
import lightgbm as lgb
import joblib

# Deep learning
import torch
from torch_geometric.nn import SAGEConv

warnings.filterwarnings('ignore')

# GPU Setup
def check_cuda_compatibility():
    """
    Probe whether CUDA can actually be used, not just whether it is reported.

    Returns:
        (ok, message): ok is True only when a small tensor could be moved to
        the GPU and used in an addition; message describes the outcome.
    """
    if torch.cuda.is_available():
        try:
            # Allocate on the GPU and run one op; this is where a driver /
            # architecture mismatch would raise.
            probe = torch.zeros(1).cuda()
            probe = probe + 1
        except Exception as err:
            return False, f"CUDA compatibility issue: {str(err)}"
        return True, "CUDA compatible"
    return False, "CUDA not available"

# Run the probe once and pick the device every later cell uses.
cuda_compatible, cuda_message = check_cuda_compatibility()
print(f"CUDA Status: {cuda_message}")

device = torch.device('cuda' if cuda_compatible else 'cpu')
if not cuda_compatible:
    print('Using CPU (GPU not available)')
else:
    print(f'GPU: {torch.cuda.get_device_name(0)} | CUDA: {torch.version.cuda}')
    # NOTE(review): set_default_tensor_type is reported as deprecated in recent
    # PyTorch releases - confirm against the installed version before upgrading.
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

## 1. Data Setup and Loading
Load quarterly holdings data from processed parquet files (2021-2024 only).

In [None]:
# Setup paths
# All inputs live under the user's home directory; output_dir is the folder the
# quarterly 'holdings_processed_*.parquet' files are read from in the next cell.
# NOTE(review): 'parquuet_files' looks like a misspelling - confirm it matches
# the actual directory name on disk before changing it.
personal_dir = os.path.expanduser('~')
root = os.path.join(personal_dir, 'Social-Network-Stock-Market/Social Network/parquuet_files')
output_dir = os.path.join(root, 'generated_combined_parquet')

print(f"Data directory: {root}")
print(f"Output directory: {output_dir}")

# Load reference data
# ticker_map: read from ticker_to_cusip.parquet; prices: read from ticker_prices.parquet.
ticker_map = pd.read_parquet(f"{root}/ticker_to_cusip.parquet")
prices = pd.read_parquet(f"{root}/ticker_prices.parquet")
# Normalise dtypes once here: CUSIP as string, period_start as datetime.
ticker_map["cusip"] = ticker_map["cusip"].astype(str)
prices["period_start"] = pd.to_datetime(prices["period_start"])

print(f"✓ Ticker map: {ticker_map.shape}")
print(f"✓ Prices: {prices.shape}")

In [None]:
# Load all processed quarterly holdings files, concatenate them, and keep
# only rows whose YEAR falls in 2021-2024.
print("=" * 80)
print("Loading quarterly holdings data (2021-2024)...")
print("=" * 80)

combined_files = sorted(
    f for f in os.listdir(output_dir)
    if f.startswith('holdings_processed_') and f.endswith('.parquet')
)

all_dfs = []
for file in combined_files:
    df_temp = pd.read_parquet(os.path.join(output_dir, file))
    if df_temp.empty:
        # An empty file has no first row to label it with; skip it rather
        # than fail on .iloc[0].
        print(f"  ⚠ Skipped {file}: 0 records")
        continue
    quarter_str = df_temp['QUARTER'].iloc[0]
    print(f"  ✓ Loaded {file}: {len(df_temp):,} records ({quarter_str})")
    all_dfs.append(df_temp)

if not all_dfs:
    # Covers both "no matching files" and "every matching file was empty".
    # `data` is intentionally left undefined so later cells fail loudly.
    print("ERROR: No processed files found. Check output_dir path.")
else:
    data = pd.concat(all_dfs, ignore_index=True)
    data['PERIOD_DATE'] = pd.to_datetime(data['PERIOD_DATE'])

    # Filter to 2021-2024 ONLY
    data = data[(data['YEAR'] >= 2021) & (data['YEAR'] <= 2024)].copy()

    print(f"\n{'─' * 80}")
    print(f"Total records (2021-2024): {len(data):,}")
    print(f"Date range: {data['PERIOD_DATE'].min()} to {data['PERIOD_DATE'].max()}")
    print(f"Years: {sorted(data['YEAR'].unique())}")
    print(f"Unique funds (CIK): {data['CIK'].nunique():,}")
    print(f"Unique stocks (CUSIP): {data['CUSIP'].nunique():,}")

## 2. Quarterly Graph Construction (2021-2024 only)
Build separate bipartite graphs for each quarter from 2021-2024.

In [None]:
def build_quarterly_graphs(data):
    """
    Create one fund-stock bipartite graph per (YEAR, QUARTER) group.

    The caller is expected to have restricted `data` to the desired period.

    Args:
        data: DataFrame with columns [CIK, CUSIP, VALUE, SSHPRNAMT, PERIOD_DATE, YEAR, QUARTER]

    Returns:
        Dictionary: {(year, quarter): bipartite_graph}
    """
    graphs_by_quarter = {}

    for (year, quarter_label), frame in data.groupby(['YEAR', 'QUARTER']):
        # Label looks like "Q1_2020": take the part before '_' and read the digit.
        quarter_token = quarter_label.split('_')[0]
        quarter = int(quarter_token[1])

        fund_ids = frame['CIK'].unique()
        stock_ids = frame['CUSIP'].unique()

        # Funds are side 0, stocks are side 1.
        graph = nx.Graph()
        graph.add_nodes_from(fund_ids, bipartite=0, node_type='fund')
        graph.add_nodes_from(stock_ids, bipartite=1, node_type='stock')

        # One edge per holding row, carrying position value and share amount.
        holdings = zip(frame['CIK'], frame['CUSIP'], frame['VALUE'], frame['SSHPRNAMT'])
        graph.add_edges_from(
            (cik, cusip, {'value': value, 'amount': amount})
            for cik, cusip, value, amount in holdings
        )

        graphs_by_quarter[(year, quarter)] = graph
        print(f"  {year} Q{quarter}: {len(fund_ids):,} funds, {len(stock_ids):,} stocks, {graph.number_of_edges():,} edges")

    return graphs_by_quarter

# Build the per-quarter graphs and report the covered range.
print("Building quarterly bipartite graphs (2021-2024 only)...")
quarterly_graphs = build_quarterly_graphs(data)
print(f"\nTotal quarters: {len(quarterly_graphs)}")
if quarterly_graphs:
    # Keys are (year, quarter) tuples, so sorting them is chronological.
    ordered_keys = sorted(quarterly_graphs)
    min_q = ordered_keys[0]
    max_q = ordered_keys[-1]
    print(f"Date range: {min_q} to {max_q}")

## 3. Sliding Window Utilities
Implement temporal train/test splits with a configurable training window (`train_window`, default 3 quarters).

In [None]:
def get_chronological_quarters(quarterly_graphs):
    """Return the (year, quarter) keys of `quarterly_graphs`, earliest first."""
    ordered = list(quarterly_graphs)
    ordered.sort()
    return ordered

def build_combined_training_graph(quarterly_graphs, quarters_list):
    """
    Merge several quarterly graphs into a single training graph.

    The result is the union of nodes and edges over the requested quarters;
    quarters that have no graph in `quarterly_graphs` are ignored.

    Args:
        quarterly_graphs: Dict {(year, quarter): graph}
        quarters_list: List of (year, quarter) tuples to combine

    Returns:
        Single bipartite graph with all edges from training quarters

    Raises:
        ValueError: if `quarters_list` is empty
    """
    if not quarters_list:
        raise ValueError("quarters_list cannot be empty")

    union_graph = nx.Graph()

    available = (quarterly_graphs[key] for key in quarters_list if key in quarterly_graphs)
    for quarter_graph in available:
        # Nodes first so their attributes are carried over, then edges.
        union_graph.add_nodes_from(quarter_graph.nodes(data=True))
        union_graph.add_edges_from(quarter_graph.edges(data=True))

    return union_graph

def get_sliding_window_splits(chronological_quarters, train_window=3, test_offset=1):
    """
    Generate temporal train/test splits using a sliding window.

    Each split trains on `train_window` consecutive quarters and tests on the
    quarter `test_offset` positions after the last training quarter.

    Args:
        chronological_quarters: Sorted list of (year, quarter) tuples
        train_window: Number of quarters for training (must be >= 1)
        test_offset: Quarters ahead to test (default: 1 = immediate next quarter;
            must be >= 1, otherwise the test quarter would fall inside the
            training window and leak future information)

    Yields:
        (train_quarters_list, test_quarter)

    Raises:
        ValueError: if `train_window` or `test_offset` is smaller than 1.
            Because this is a generator, the error surfaces on first iteration.
    """
    if train_window < 1:
        raise ValueError(f"train_window must be >= 1, got {train_window}")
    if test_offset < 1:
        raise ValueError(f"test_offset must be >= 1, got {test_offset}")

    n = len(chronological_quarters)
    span = train_window + test_offset

    if n < span:
        print(f"WARNING: Only {n} quarters available, need {span}")
        return

    for start in range(n - span + 1):
        train_end = start + train_window
        train_quarters = chronological_quarters[start:train_end]
        test_quarter = chronological_quarters[train_end + test_offset - 1]
        yield train_quarters, test_quarter

# Print the first few splits so the windowing can be checked by eye.
chrono_quarters = get_chronological_quarters(quarterly_graphs)
print(f"Total quarters available: {len(chrono_quarters)}")
print(f"All quarters: {chrono_quarters}")

print("\nSliding window examples (train_window=3, test_offset=1):")
example_splits = list(get_sliding_window_splits(chrono_quarters, train_window=3))[:3]
for i, (train_q, test_q) in enumerate(example_splits):
    train_labels = [f'{y}Q{q}' for y, q in train_q]
    print(f"  Window {i+1}:")
    print(f"    Train: {train_labels}")
    print(f"    Test:  {test_q[0]}Q{test_q[1]}")

## 4. Graph Features: Centrality & Community Detection
Compute topological features from training graph only (no future leakage).

In [None]:
def compute_fund_features(G_bip, funds):
    """
    Compute topological features for funds from a bipartite holdings graph.

    The bipartite graph is projected onto the fund side (two funds are linked
    when they share at least one stock) and all features are computed on that
    projection, so only information contained in `G_bip` is used.

    Each feature is best-effort: if one computation fails, that feature falls
    back to a constant instead of aborting the whole window.

    Args:
        G_bip: Bipartite graph (fund-stock holdings)
        funds: List of fund CIKs

    Returns:
        DataFrame indexed by fund with columns: degree, pagerank, hub,
        authority, closeness, community. An empty DataFrame when `funds`
        is empty.
    """
    if len(funds) == 0:
        return pd.DataFrame()

    # Project to fund-fund graph (shared stock holdings)
    try:
        G_fund = bipartite.weighted_projected_graph(G_bip, funds)
    except Exception:
        # Fall back to an edgeless graph so every fund still gets a row.
        G_fund = nx.Graph()
        G_fund.add_nodes_from(funds)

    has_nodes = G_fund.number_of_nodes() > 0

    # Centrality metrics
    degree_cent = degree_centrality(G_fund) if has_nodes else {}

    try:
        pagerank_cent = nx.pagerank(G_fund) if has_nodes else {}
    except nx.PowerIterationConvergenceFailed:
        # Same fallback policy as HITS below: zeros rather than a lost window.
        pagerank_cent = {}

    try:
        hubs, authorities = hits(G_fund)
    except Exception:
        hubs = {f: 0 for f in funds}
        authorities = {f: 0 for f in funds}

    # Closeness on largest component only; funds outside it get 0 below.
    closeness_cent = {}
    if has_nodes:
        try:
            comps = list(nx.connected_components(G_fund))
            if comps:
                largest_cc = max(comps, key=len)
                closeness_cent = closeness_centrality(G_fund.subgraph(largest_cc))
        except Exception:
            closeness_cent = {}

    # Community detection (Leiden algorithm). Funds not covered keep -1.
    communities = {}
    if G_fund.number_of_nodes() > 1:
        try:
            vertex_names = list(G_fund.nodes())
            vertex_to_idx = {v: i for i, v in enumerate(vertex_names)}
            edge_list = [(vertex_to_idx[u], vertex_to_idx[v]) for u, v in G_fund.edges()]

            if edge_list:
                ig_G = ig.Graph(n=len(vertex_names), edges=edge_list)
                partition = la.find_partition(ig_G, la.ModularityVertexPartition)
                # igraph vertex i corresponds to vertex_names[i] by construction.
                communities = {
                    vertex_names[i]: label
                    for label, members in enumerate(partition)
                    for i in members
                }
        except Exception:
            communities = {f: 0 for f in funds}

    # Build feature dataframe
    fund_features = pd.DataFrame({
        'fund': funds,
        'degree': [degree_cent.get(f, 0) for f in funds],
        'pagerank': [pagerank_cent.get(f, 0) for f in funds],
        'hub': [hubs.get(f, 0) for f in funds],
        'authority': [authorities.get(f, 0) for f in funds],
        'closeness': [closeness_cent.get(f, 0) for f in funds],
        'community': [communities.get(f, -1) for f in funds]
    }).set_index('fund')

    return fund_features

print("Feature computation function ready.")

## 5. GraphSAGE Embeddings
Train GraphSAGE on training window to generate node embeddings.

In [None]:
class GraphSAGE(torch.nn.Module):
    """Two stacked SAGEConv layers with a ReLU between them."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        # First layer + non-linearity, second layer left linear.
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

def train_graphsage_window(G_bip, num_epochs=30, embedding_dim=8, pretrained_model=None, fine_tune_lr=0.001):
    """
    Train GraphSAGE on a bipartite graph with optional transfer learning.

    Embedding rows are ordered as: all funds (in graph order), then all stocks
    (in graph order), then any node carrying neither bipartite tag. This makes
    `embeddings[:len(funds)]` the fund block and the following `len(stocks)`
    rows the stock block, even when the graph is a union of several quarters
    whose node insertion order interleaves funds and stocks.

    Args:
        G_bip: Bipartite graph (nodes tagged bipartite=0 for funds, 1 for stocks)
        num_epochs: Maximum number of training epochs
        embedding_dim: Output embedding dimension (used when training from scratch)
        pretrained_model: Optional pre-trained GraphSAGE model (for transfer
            learning); it is fine-tuned in place
        fine_tune_lr: Learning rate for fine-tuning (lower than training from scratch)

    Returns:
        (model, embeddings_numpy, funds, stocks); model is None and the
        embeddings are all zeros when the graph has no edges.
    """
    # Separate funds and stocks in a single pass over the nodes
    funds, stocks, untagged = [], [], []
    for node, side in G_bip.nodes(data='bipartite'):
        if side == 0:
            funds.append(node)
        elif side == 1:
            stocks.append(node)
        else:
            untagged.append(node)

    # Row order of the embedding matrix: funds first, then stocks.
    nodes = funds + stocks + untagged
    node_to_idx = {n: i for i, n in enumerate(nodes)}
    num_nodes = len(nodes)

    # Random input features, created directly on the target device.
    # NOTE(review): features are re-drawn (unseeded) on every call, so a
    # pretrained model sees different inputs in each window - confirm intended.
    x = torch.randn(num_nodes, 16, device=device)

    # Build edge index
    edge_list = [(node_to_idx[u], node_to_idx[v]) for u, v in G_bip.edges()]
    if not edge_list:
        print("    WARNING: Graph has no edges")
        return None, np.zeros((num_nodes, embedding_dim)), funds, stocks

    edge_index = torch.tensor(edge_list, dtype=torch.long, device=device).t().contiguous()

    # Add reverse edges for undirected graph
    edge_index = torch.cat([edge_index, edge_index[[1, 0]]], dim=1)

    # Model: use pretrained if available, otherwise create new
    if pretrained_model is not None:
        print("     → Using pretrained model (transfer learning)")
        model = pretrained_model.to(device)
        learning_rate = fine_tune_lr
    else:
        print("     → Training from scratch")
        model = GraphSAGE(16, 32, embedding_dim).to(device)
        learning_rate = 0.01

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training with a simple plateau-based early stop
    model.train()
    prev_loss = float('inf')
    patience, no_improve = 5, 0

    for epoch in range(num_epochs):
        optimizer.zero_grad()
        out = model(x, edge_index)

        # Link prediction loss on observed edges.
        # NOTE(review): only positive pairs are scored; without negative
        # samples the objective can be driven down by making all embeddings
        # alike. Consider adding sampled negatives - confirm with the author.
        pos_score = (out[edge_index[0]] * out[edge_index[1]]).sum(dim=1).sigmoid()
        loss = -torch.log(pos_score + 1e-15).mean()
        loss.backward()
        optimizer.step()

        current_loss = loss.item()
        if abs(prev_loss - current_loss) < 1e-6:
            no_improve += 1
            if no_improve >= patience:
                break
        else:
            no_improve = 0
        prev_loss = current_loss

    # Extract embeddings
    model.eval()
    with torch.no_grad():
        emb = model(x, edge_index).cpu().numpy()

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return model, emb, funds, stocks

print("GraphSAGE model with transfer learning defined.")

## 6. Link Prediction Features & Negative Sampling
Build training/test features for link prediction with proper negative sampling.

In [None]:
def _edge_index_pairs(graph, fund_to_idx, stock_to_idx):
    """Map graph edges to (fund_idx, stock_idx) pairs regardless of endpoint order."""
    pairs = []
    for u, v in graph.edges():
        if u in fund_to_idx and v in stock_to_idx:
            pairs.append((fund_to_idx[u], stock_to_idx[v]))
        elif v in fund_to_idx and u in stock_to_idx:
            # Undirected graphs may report an edge as (stock, fund).
            pairs.append((fund_to_idx[v], stock_to_idx[u]))
    return pairs


def _sample_negative_pairs(n_funds, n_stocks, excluded, n_samples, rng):
    """Draw up to n_samples distinct (fund_idx, stock_idx) pairs not in `excluded`."""
    total = n_funds * n_stocks
    n_samples = min(n_samples, total - len(excluded))
    if n_samples <= 0:
        return []

    if total <= 4 * (n_samples + len(excluded)):
        # Dense case: the candidate space is small relative to the data,
        # so enumerating it is cheap and sampling is exact.
        candidates = [
            (i, j)
            for i in range(n_funds)
            for j in range(n_stocks)
            if (i, j) not in excluded
        ]
        picks = rng.choice(len(candidates), size=n_samples, replace=False)
        return [candidates[int(p)] for p in picks]

    # Sparse case: rejection sampling; most draws are accepted.
    chosen, seen = [], set()
    while len(chosen) < n_samples:
        draws = rng.integers(0, total, size=2 * (n_samples - len(chosen)))
        for flat in draws.tolist():
            pair = divmod(flat, n_stocks)
            if pair in excluded or pair in seen:
                continue
            seen.add(pair)
            chosen.append(pair)
            if len(chosen) == n_samples:
                break
    return chosen


def create_link_prediction_features(G_bip_train, G_bip_test, embeddings_train, 
                                    fund_features, funds, stocks, fund_to_idx, stock_to_idx,
                                    max_hard_negatives_per_fund=20, random_state=42):
    """
    Create feature matrices for link prediction.

    Training edges: from G_bip_train (label=1) + hard negatives (label=0)
    Test edges: edges of G_bip_test not present in G_bip_train (label=1)
                + sampled pairs absent from both graphs (label=0)

    Each sample is the concatenation of the fund embedding, the stock
    embedding and the fund's topological features.

    Args:
        G_bip_train: Training bipartite graph
        G_bip_test: Test bipartite graph (for positive labels only)
        embeddings_train: Node embeddings; rows must be ordered as all `funds`
            followed by all `stocks`
        fund_features: DataFrame with fund topological features, indexed by fund
        funds, stocks: Lists of nodes
        fund_to_idx, stock_to_idx: Node to index mappings into `funds` / `stocks`
        max_hard_negatives_per_fund: Cap on hard negatives taken per fund
        random_state: Seed for sampling test negatives (reproducible runs)

    Returns:
        X_train, y_train, X_test, y_test

    Raises:
        ValueError: if `embeddings_train` has fewer rows than funds + stocks
    """
    n_funds, n_stocks = len(funds), len(stocks)
    embeddings_train = np.asarray(embeddings_train)
    if embeddings_train.shape[0] < n_funds + n_stocks:
        raise ValueError(
            f"embeddings_train has {embeddings_train.shape[0]} rows, "
            f"expected at least {n_funds + n_stocks} (funds + stocks)"
        )
    fund_emb = embeddings_train[:n_funds]
    stock_emb = embeddings_train[n_funds:n_funds + n_stocks]

    # ── TRAINING DATA ──
    # Positive edges from training graph
    pos_edges_train = _edge_index_pairs(G_bip_train, fund_to_idx, stock_to_idx)
    train_edge_set = set(pos_edges_train)
    n_pos_train = len(pos_edges_train)

    # Hard negative sampling (use stock similarity).
    # Mean cosine similarity to a fund's holdings equals the dot product with
    # the mean of the unit-normalised holding vectors, so the full
    # stock x stock similarity matrix is never materialised.
    norms = np.linalg.norm(stock_emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # zero vectors stay zero instead of dividing by 0
    stock_unit = stock_emb / norms

    neg_edges_train = []
    for f_idx, fund_id in enumerate(funds):
        if len(neg_edges_train) >= n_pos_train:
            break  # classes already balanced; nothing more would be kept

        connected = {
            stock_to_idx[s] for s in G_bip_train.neighbors(fund_id)
            if s in stock_to_idx
        }
        if not connected:
            continue

        connected_idx = np.array(sorted(connected), dtype=np.intp)
        avg_sim = stock_unit @ stock_unit[connected_idx].mean(axis=0)

        # Hard negatives: high similarity but not connected
        ranked = np.argsort(-avg_sim)
        hard_negs = ranked[~np.isin(ranked, connected_idx)][:max_hard_negatives_per_fund]
        neg_edges_train.extend((f_idx, int(s_idx)) for s_idx in hard_negs)

    neg_edges_train = neg_edges_train[:n_pos_train]  # Balance classes

    # ── TEST DATA ──
    # Positive edges from test graph (only edges we didn't see in training)
    pos_edges_test = [
        pair
        for pair in _edge_index_pairs(G_bip_test, fund_to_idx, stock_to_idx)
        if pair not in train_edge_set
    ]

    # Test negatives: pairs in neither the training nor the test graph
    rng = np.random.default_rng(random_state)
    excluded = train_edge_set | set(pos_edges_test)
    neg_edges_test = _sample_negative_pairs(
        n_funds, n_stocks, excluded, max(len(pos_edges_test), 1), rng
    )

    # Topological features as a dense per-fund matrix (zeros for unknown funds),
    # looked up once per fund instead of once per edge.
    n_topo = fund_features.shape[1] if fund_features.shape[1] else 6
    fund_topo = np.zeros((n_funds, n_topo))
    known_funds = fund_features.index
    for i, fund_id in enumerate(funds):
        if fund_id in known_funds:
            fund_topo[i] = fund_features.loc[fund_id].values

    feature_dim = fund_emb.shape[1] + stock_emb.shape[1] + n_topo

    # Build feature vectors
    def build_features(edge_list):
        if not edge_list:
            return np.zeros((0, feature_dim))
        f_idx, s_idx = (np.asarray(col, dtype=np.intp) for col in zip(*edge_list))
        return np.hstack([fund_emb[f_idx], stock_emb[s_idx], fund_topo[f_idx]])

    X_train = np.vstack([
        build_features(pos_edges_train),
        build_features(neg_edges_train)
    ])
    y_train = np.hstack([np.ones(len(pos_edges_train)), np.zeros(len(neg_edges_train))])

    X_test = np.vstack([
        build_features(pos_edges_test),
        build_features(neg_edges_test)
    ])
    y_test = np.hstack([np.ones(len(pos_edges_test)), np.zeros(len(neg_edges_test))])

    return X_train, y_train, X_test, y_test

print("Link prediction feature builder ready.")

## 7. Temporal Link Prediction: Sliding Window Evaluation (2021-2024)
Evaluate model per quarter with configurable training window and strict temporal causality.

In [None]:
import os
import pickle

# Sliding-window settings used by the evaluation loop
TRAIN_WINDOW = 3  # Number of quarters to train on
TEST_OFFSET = 1   # How many quarters ahead to test

# Per-window model bundles are written into this directory
models_dir = 'temporal_models'
os.makedirs(models_dir, exist_ok=True)

print(f"Sliding window configuration:")
print(f"  Training window: {TRAIN_WINDOW} quarters")
print(f"  Test offset: {TEST_OFFSET} quarter(s) ahead")
print(f"  Models will be saved to: {models_dir}/")

In [None]:
# Sliding-window evaluation: for every (train quarters, test quarter) split,
# build the training graph, derive features and embeddings from it only,
# fit LightGBM, score the test quarter, and persist the window's artefacts.
print("=" * 100)
print(f"TEMPORAL LINK PREDICTION: SLIDING WINDOW EVALUATION (2021-2024)")
print(f"Train window: {TRAIN_WINDOW} quarters | Test offset: {TEST_OFFSET} quarter(s)")
print("=" * 100)

results_per_quarter = []
chrono_quarters = get_chronological_quarters(quarterly_graphs)
pretrained_graphsage = None  # GraphSAGE model from the last successful window
pretrained_window = None     # 1-based number of the window that produced it

# Run sliding window evaluation with transfer learning
for window_idx, (train_quarters, test_quarter) in enumerate(
    get_sliding_window_splits(chrono_quarters, train_window=TRAIN_WINDOW, test_offset=TEST_OFFSET)
):
    test_year, test_quarter_num = test_quarter
    train_label = ' → '.join([f"{y}Q{q}" for y, q in train_quarters])

    print(f"\n{'─' * 100}")
    print(f"WINDOW {window_idx + 1} | TEST: {test_year}Q{test_quarter_num}")
    print(f"TRAIN: {train_label}")
    print(f"{'─' * 100}")

    try:
        # 1. Build COMBINED training graph from all training quarters
        print("  1. Building combined training graph...")
        G_bip_train = build_combined_training_graph(quarterly_graphs, train_quarters)
        funds_train = [n for n in G_bip_train.nodes() if G_bip_train.nodes[n].get('bipartite') == 0]
        stocks_train = [n for n in G_bip_train.nodes() if G_bip_train.nodes[n].get('bipartite') == 1]

        print(f"     Funds: {len(funds_train):,} | Stocks: {len(stocks_train):,} | Edges: {G_bip_train.number_of_edges():,}")

        if G_bip_train.number_of_nodes() == 0:
            print("     WARNING: Training graph is empty, skipping...")
            continue

        # 2. Compute topological features
        print("  2. Computing topological features...")
        fund_features = compute_fund_features(G_bip_train, funds_train)
        print(f"     Feature shape: {fund_features.shape}")

        # 3. Train GraphSAGE (with transfer learning from previous window)
        print("  3. Training GraphSAGE embeddings...")
        # Record what actually happens: an earlier window may have been
        # skipped or failed, in which case there is nothing to transfer from.
        used_transfer = pretrained_graphsage is not None
        if used_transfer:
            print(f"     → Using pretrained model (transfer learning from window {pretrained_window})")
        else:
            print("     → Training from scratch")
        graphsage_model, embeddings, funds_sage, stocks_sage = train_graphsage_window(
            G_bip_train, num_epochs=30, embedding_dim=8,
            pretrained_model=pretrained_graphsage, fine_tune_lr=0.001
        )

        print(f"     Embeddings shape: {embeddings.shape}")

        # 4. Get TEST quarter graph (individual quarter, not aggregated)
        print("  4. Loading test quarter graph...")
        G_bip_test = quarterly_graphs.get(test_quarter)
        if G_bip_test is None:
            print(f"     WARNING: Test quarter {test_quarter} not found, skipping...")
            continue

        funds_test = [n for n in G_bip_test.nodes() if G_bip_test.nodes[n].get('bipartite') == 0]
        stocks_test = [n for n in G_bip_test.nodes() if G_bip_test.nodes[n].get('bipartite') == 1]
        print(f"     Test funds: {len(funds_test):,} | Test stocks: {len(stocks_test):,} | Edges: {G_bip_test.number_of_edges():,}")

        # 5. Create mappings and link prediction features
        print("  5. Creating link prediction features...")
        fund_to_idx = {f: i for i, f in enumerate(funds_train)}
        stock_to_idx = {s: i for i, s in enumerate(stocks_train)}

        X_train, y_train, X_test, y_test = create_link_prediction_features(
            G_bip_train, G_bip_test, embeddings, fund_features,
            funds_train, stocks_train, fund_to_idx, stock_to_idx
        )
        print(f"     Train: {X_train.shape[0]:,} samples (pos: {y_train.sum():.0f}, neg: {(1-y_train).sum():.0f})")
        print(f"     Test:  {X_test.shape[0]:,} samples (pos: {y_test.sum():.0f}, neg: {(1-y_test).sum():.0f})")

        if X_train.shape[0] == 0 or X_test.shape[0] == 0:
            print("     WARNING: No training or test samples, skipping...")
            continue

        # 6. Train LightGBM
        print("  6. Training LightGBM...")
        train_data_lgb = lgb.Dataset(X_train, label=y_train)

        params = {
            'objective': 'binary',
            'metric': 'auc',
            'learning_rate': 0.05,
            'num_leaves': 31,
            'verbose': -1
        }

        bst = lgb.train(params, train_data_lgb, num_boost_round=100, valid_sets=[train_data_lgb])

        # 7. Evaluate (AUC on scores; precision/recall at a 0.5 threshold)
        print("  7. Evaluating...")
        y_pred = bst.predict(X_test)
        y_pred_label = (y_pred > 0.5).astype(int)

        auc = roc_auc_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred_label, zero_division=0)
        recall = recall_score(y_test, y_pred_label, zero_division=0)

        print(f"\n     ✓ AUC:       {auc:.4f}")
        print(f"     ✓ Precision: {precision:.4f}")
        print(f"     ✓ Recall:    {recall:.4f}")

        # 8. Save models to pickle
        print("  8. Saving models to pickle...")
        model_filename = f"window_{window_idx+1}_{train_label.replace(' → ', '_')}_test_{test_year}Q{test_quarter_num}.pkl"
        model_path = os.path.join(models_dir, model_filename)

        model_data = {
            'window': window_idx + 1,
            'graphsage_model': graphsage_model,
            'lgb_model': bst,
            'embeddings': embeddings,
            'fund_features': fund_features,
            'fund_to_idx': fund_to_idx,
            'stock_to_idx': stock_to_idx,
            'funds_train': funds_train,
            'stocks_train': stocks_train,
            'train_quarters': train_quarters,
            'test_quarter': test_quarter,
            'metrics': {
                'auc': auc,
                'precision': precision,
                'recall': recall
            }
        }

        with open(model_path, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"     ✓ Saved to: {model_path}")

        # Save GraphSAGE model for next window (transfer learning).
        # The model is None when the training graph had no edges; the next
        # window then trains from scratch.
        pretrained_graphsage = graphsage_model
        pretrained_window = window_idx + 1 if graphsage_model is not None else None

        results_per_quarter.append({
            'window': window_idx + 1,
            'test_year': test_year,
            'test_quarter': test_quarter_num,
            'train_quarters': train_label,
            'n_train_funds': len(funds_train),
            'n_train_stocks': len(stocks_train),
            'n_test_funds': len(funds_test),
            'n_test_stocks': len(stocks_test),
            'auc': auc,
            'precision': precision,
            'recall': recall,
            'n_test_samples': X_test.shape[0],
            'model_path': model_path,
            'transfer_learned': used_transfer
        })

    except Exception as e:
        # One bad window must not abort the whole evaluation: report and move on.
        print(f"  ERROR: {e}")
        import traceback
        traceback.print_exc()
        continue

# Summary: tabulate per-window metrics, print aggregates, and export to CSV.
print(f"\n\n{'=' * 100}")
print(f"EVALUATION SUMMARY (2021-2024, {TRAIN_WINDOW}-Quarter Training Window)")
print(f"{'=' * 100}")

if not results_per_quarter:
    print("No results generated.")
else:
    results_df = pd.DataFrame(results_per_quarter)
    metric_cols = ['auc', 'precision', 'recall']

    print(f"\nResults across {len(results_df)} windows:\n")
    display_cols = ['window', 'train_quarters', 'test_year', 'test_quarter', 'transfer_learned'] + metric_cols
    print(results_df[display_cols].to_string(index=False))

    print(f"\n\nAggregate Statistics:")
    # Headings are left-padded to a common width so the numbers line up.
    for heading, col in zip(('AUC:', 'Precision:', 'Recall:'), metric_cols):
        print(f"  Average {heading:<11}{results_df[col].mean():.4f} (±{results_df[col].std():.4f})")

    print(f"\n\nBy Year:")
    by_year = results_df.groupby('test_year')[metric_cols].mean()
    print(by_year)

    results_df.to_csv('temporal_link_prediction_results.csv', index=False)
    print(f"\n✓ Results saved to temporal_link_prediction_results.csv")
    print(f"✓ Models saved to {models_dir}/ directory ({len(results_df)} model files)")
    print(f"✓ Transfer learning: Windows 2+ use previous window's GraphSAGE model")