# Stock Market Social Network - Temporal Link Prediction Pipeline

Quarterly Incremental (Online) Link Prediction on fund-stock holdings (2021-2024).

**Key Features:**
- Load holdings data 2021-2024 only
- Build separate bipartite graph per quarter
- Incremental training: train on current quarter, test on next quarter
- Model weights never reset (quarterly online learning)
- Strict temporal causality (no future information leakage)
- Per-quarter evaluation metrics (AUC, Precision, Recall)

## 1. Data Setup and Loading
Load quarterly holdings data from processed parquet files (2021-2024 only).

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import os
import re
import warnings
import glob
import pickle

# Graph libraries
import networkx as nx
from networkx.algorithms import bipartite
from networkx.algorithms.centrality import degree_centrality, closeness_centrality
from networkx.algorithms.link_analysis.pagerank_alg import pagerank
from networkx.algorithms.link_analysis.hits_alg import hits
import igraph as ig
import leidenalg as la

# ML libraries
from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import SGDClassifier
import lightgbm as lgb
import joblib

# Deep learning
import torch
from torch_geometric.nn import SAGEConv

warnings.filterwarnings('ignore')

# GPU Setup
def check_cuda_compatibility():
    """Probe whether CUDA can actually be used, not merely detected.

    Returns:
        (ok, message): ``ok`` is True only when a tiny tensor could be moved
        to the GPU and used in an arithmetic op; ``message`` describes the
        outcome.
    """
    if torch.cuda.is_available():
        try:
            # Allocate on the GPU and run one op to surface driver/arch mismatches.
            probe = torch.zeros(1).cuda()
            probe = probe + 1
        except Exception as err:
            return False, f"CUDA compatibility issue: {str(err)}"
        return True, "CUDA compatible"
    return False, "CUDA not available"

# Resolve the compute device once; later cells read the module-level `device`.
cuda_compatible, cuda_message = check_cuda_compatibility()
print(f"CUDA Status: {cuda_message}")

device = torch.device('cuda' if cuda_compatible else 'cpu')
if not cuda_compatible:
    print('Using CPU (GPU not available)')
else:
    print(f'GPU: {torch.cuda.get_device_name(0)} | CUDA: {torch.version.cuda}')
    # Makes newly created float tensors default to the GPU.
    # NOTE(review): set_default_tensor_type is deprecated in recent PyTorch
    # releases - confirm against the installed version before upgrading.
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

CUDA Status: CUDA compatible
GPU: NVIDIA GeForce GTX 1080 Ti | CUDA: 11.8


In [5]:
# Resolve the data locations under the user's home directory.
personal_dir = os.path.expanduser('~')
root = os.path.join(personal_dir, 'Social-Network-Stock-Market/Social Network/parquuet_files')
output_dir = os.path.join(root, 'generated_combined_parquet')

print(f"Data directory: {root}")
print(f"Output directory: {output_dir}")

# Reference tables: ticker<->CUSIP mapping (CUSIP kept as text) and prices
# (period_start parsed to datetime).
ticker_map = pd.read_parquet(f"{root}/ticker_to_cusip.parquet")
prices = pd.read_parquet(f"{root}/ticker_prices.parquet")
ticker_map["cusip"] = ticker_map["cusip"].astype(str)
prices["period_start"] = pd.to_datetime(prices["period_start"])

print(f"✓ Ticker map: {ticker_map.shape}")
print(f"✓ Prices: {prices.shape}")

Data directory: /home/zenoua/Social-Network-Stock-Market/Social Network/parquuet_files
Output directory: /home/zenoua/Social-Network-Stock-Market/Social Network/parquuet_files/generated_combined_parquet
✓ Ticker map: (4571, 5)
✓ Prices: (158011, 4)


In [6]:
# Load the processed quarterly holdings files and keep only 2021-2024 rows.
# Every file in output_dir is read (and reported), but rows outside the
# target years are dropped per file *before* concatenation, so the peak
# memory footprint is bounded by the retained quarters rather than by the
# full multi-year history.
print("=" * 80)
print("Loading quarterly holdings data (2021-2024)...")
print("=" * 80)

combined_files = sorted(
    f for f in os.listdir(output_dir)
    if f.startswith('holdings_processed_') and f.endswith('.parquet')
)

if not combined_files:
    print("ERROR: No processed files found. Check output_dir path.")
else:
    all_dfs = []
    for file in combined_files:
        df_temp = pd.read_parquet(os.path.join(output_dir, file))
        quarter_str = df_temp['QUARTER'].iloc[0]
        print(f"  ✓ Loaded {file}: {len(df_temp):,} records ({quarter_str})")

        # Filter to 2021-2024 ONLY (row-wise on YEAR, same predicate as before).
        in_range = df_temp[(df_temp['YEAR'] >= 2021) & (df_temp['YEAR'] <= 2024)]
        if not in_range.empty:
            all_dfs.append(in_range)

    if all_dfs:
        data = pd.concat(all_dfs, ignore_index=True)
    else:
        # Nothing in range: keep an empty frame with the files' schema so the
        # summary below (and downstream cells) still see the expected columns.
        data = df_temp.iloc[0:0].copy()
    data['PERIOD_DATE'] = pd.to_datetime(data['PERIOD_DATE'])

    print(f"\n{'─' * 80}")
    print(f"Total records (2021-2024): {len(data):,}")
    print(f"Date range: {data['PERIOD_DATE'].min()} to {data['PERIOD_DATE'].max()}")
    print(f"Years: {sorted(data['YEAR'].unique())}")
    print(f"Unique funds (CIK): {data['CIK'].nunique():,}")
    print(f"Unique stocks (CUSIP): {data['CUSIP'].nunique():,}")

Loading quarterly holdings data (2021-2024)...
  ✓ Loaded holdings_processed_Q1_2014.parquet: 1,017,686 records (Q1_2014)
  ✓ Loaded holdings_processed_Q1_2015.parquet: 1,119,642 records (Q1_2015)
  ✓ Loaded holdings_processed_Q1_2016.parquet: 1,131,465 records (Q1_2016)
  ✓ Loaded holdings_processed_Q1_2017.parquet: 1,190,709 records (Q1_2017)
  ✓ Loaded holdings_processed_Q1_2018.parquet: 1,294,867 records (Q1_2018)
  ✓ Loaded holdings_processed_Q1_2019.parquet: 1,337,369 records (Q1_2019)
  ✓ Loaded holdings_processed_Q1_2020.parquet: 1,304,800 records (Q1_2020)
  ✓ Loaded holdings_processed_Q1_2021.parquet: 1,425,771 records (Q1_2021)
  ✓ Loaded holdings_processed_Q1_2022.parquet: 1,557,935 records (Q1_2022)
  ✓ Loaded holdings_processed_Q1_2023.parquet: 1,615,015 records (Q1_2023)
  ✓ Loaded holdings_processed_Q1_2024.parquet: 1,736,830 records (Q1_2024)
  ✓ Loaded holdings_processed_Q1_2025.parquet: 1,878,074 records (Q1_2025)
  ✓ Loaded holdings_processed_Q2_2013.parquet: 19,659

## 2. Quarterly Graph Construction
Build separate bipartite graphs for each quarter from 2021-2024.

In [7]:
def build_quarterly_graphs(data):
    """
    Build one fund-stock bipartite graph per (year, quarter).

    The frame is grouped on YEAR and QUARTER; no time filtering happens here,
    so pass data already restricted to the desired range.

    Args:
        data: DataFrame with columns [CIK, CUSIP, VALUE, SSHPRNAMT, PERIOD_DATE, YEAR, QUARTER]

    Returns:
        Dictionary: {(year, quarter): bipartite_graph}
    """
    graphs_by_quarter = {}

    for group_key, quarter_df in data.groupby(['YEAR', 'QUARTER']):
        year, quarter_label = group_key
        # "Q1_2020" -> "Q1" -> 1
        quarter_token = quarter_label.split('_')[0]
        quarter = int(quarter_token[1])

        fund_ids = quarter_df['CIK'].unique()
        stock_ids = quarter_df['CUSIP'].unique()

        # Funds are inserted before stocks, so node order is funds then stocks.
        graph = nx.Graph()
        graph.add_nodes_from(fund_ids, bipartite=0, node_type='fund')
        graph.add_nodes_from(stock_ids, bipartite=1, node_type='stock')

        # One edge per holding row; a repeated (CIK, CUSIP) pair keeps the
        # attributes of the last row seen.
        graph.add_edges_from(
            (rec.CIK, rec.CUSIP, {'value': rec.VALUE, 'amount': rec.SSHPRNAMT})
            for rec in quarter_df.itertuples(index=False)
        )

        graphs_by_quarter[(year, quarter)] = graph
        n_edges = graph.number_of_edges()
        print(f"  {year} Q{quarter}: {len(fund_ids):,} funds, {len(stock_ids):,} stocks, {n_edges:,} edges")

    return graphs_by_quarter

# Build the per-quarter graphs from the filtered holdings and report the span.
print("Building quarterly bipartite graphs (2021-2024 only)...")
quarterly_graphs = build_quarterly_graphs(data)
print(f"\nTotal quarters: {len(quarterly_graphs)}")
if len(quarterly_graphs) > 0:
    # Keys are (year, quarter) tuples, so min/max give the chronological ends.
    min_q = min(quarterly_graphs)
    max_q = max(quarterly_graphs)
    print(f"Date range: {min_q} to {max_q}")

Building quarterly bipartite graphs (2021-2024 only)...
  2021 Q1: 5,773 funds, 3,544 stocks, 943,803 edges
  2021 Q2: 5,803 funds, 3,606 stocks, 961,813 edges
  2021 Q3: 5,786 funds, 3,542 stocks, 967,133 edges
  2021 Q4: 6,598 funds, 3,341 stocks, 1,032,260 edges
  2022 Q1: 6,586 funds, 3,341 stocks, 1,028,454 edges
  2022 Q2: 6,546 funds, 3,308 stocks, 998,793 edges
  2022 Q3: 6,529 funds, 3,285 stocks, 988,140 edges
  2022 Q4: 6,762 funds, 3,255 stocks, 1,024,286 edges
  2023 Q1: 6,673 funds, 3,227 stocks, 1,016,406 edges
  2023 Q2: 6,704 funds, 3,225 stocks, 1,020,601 edges
  2023 Q3: 6,704 funds, 3,202 stocks, 1,020,311 edges
  2023 Q4: 7,082 funds, 3,179 stocks, 1,074,087 edges
  2024 Q1: 7,072 funds, 3,183 stocks, 1,094,025 edges
  2024 Q2: 7,056 funds, 3,164 stocks, 1,100,224 edges
  2024 Q3: 7,023 funds, 3,123 stocks, 1,118,543 edges

Total quarters: 15
Date range: (np.int64(2021), 1) to (np.int64(2024), 3)


## 3. Graph Features: Centrality & Community Detection
Compute topological features from current quarter graph only (no future leakage).

In [8]:
def compute_fund_features(G_bip, funds):
    """
    Compute topological features for funds from bipartite graph.
    Features computed only from G_bip (no future information).

    The bipartite graph is projected onto the fund side (two funds are linked
    when they share a stock) and every metric is computed on that projection.
    Each optional step falls back to a neutral default when it fails; the
    failure is reported instead of being silently swallowed, and only
    ``Exception`` is caught so Ctrl-C still interrupts long computations.

    NOTE(review): the projection of a dense holdings graph can be very large,
    and closeness centrality on it may dominate the runtime - confirm on real
    data before relying on this in a loop.

    Args:
        G_bip: Bipartite graph (fund-stock holdings)
        funds: List of fund CIKs

    Returns:
        DataFrame indexed by fund with columns: degree, pagerank, hub,
        authority, closeness, community. An empty DataFrame (no columns)
        when ``funds`` is empty.
    """
    if len(funds) == 0:
        return pd.DataFrame()

    # Project to fund-fund graph (shared stock holdings)
    try:
        G_fund = bipartite.weighted_projected_graph(G_bip, funds)
    except Exception as exc:
        print(f"     WARNING: fund projection failed ({exc}); using an edgeless fund graph")
        G_fund = nx.Graph()
        G_fund.add_nodes_from(funds)

    has_nodes = G_fund.number_of_nodes() > 0

    # Centrality metrics
    degree_cent = degree_centrality(G_fund) if has_nodes else {}
    pagerank_cent = nx.pagerank(G_fund) if has_nodes else {}

    try:
        hubs, authorities = hits(G_fund)
    except Exception as exc:
        print(f"     WARNING: HITS failed ({exc}); hub/authority set to 0")
        hubs = dict.fromkeys(funds, 0)
        authorities = dict.fromkeys(funds, 0)

    # Closeness on largest component; funds outside it get 0 via .get below.
    closeness_cent = {}
    if has_nodes:
        try:
            comps = list(nx.connected_components(G_fund))
            if comps:
                largest_cc = max(comps, key=len)
                closeness_cent = closeness_centrality(G_fund.subgraph(largest_cc))
        except Exception as exc:
            print(f"     WARNING: closeness centrality failed ({exc}); closeness set to 0")

    # Community detection (Leiden algorithm). Funds keep -1 when no partition
    # was computed (fewer than two nodes, or no edges).
    communities = {}
    if G_fund.number_of_nodes() > 1:
        try:
            vertex_names = list(G_fund.nodes())
            vertex_to_idx = {v: i for i, v in enumerate(vertex_names)}
            edge_list = [(vertex_to_idx[u], vertex_to_idx[v]) for u, v in G_fund.edges()]

            if edge_list:
                ig_G = ig.Graph(n=len(vertex_names), edges=edge_list)
                ig_G.vs['_nx_name'] = vertex_names
                partition = la.find_partition(ig_G, la.ModularityVertexPartition)
                communities = {ig_G.vs[i]['_nx_name']: p for p, cl in enumerate(partition) for i in cl}
        except Exception as exc:
            print(f"     WARNING: Leiden community detection failed ({exc}); community set to 0")
            communities = dict.fromkeys(funds, 0)

    # Build feature dataframe
    fund_features = pd.DataFrame({
        'fund': funds,
        'degree': [degree_cent.get(f, 0) for f in funds],
        'pagerank': [pagerank_cent.get(f, 0) for f in funds],
        'hub': [hubs.get(f, 0) for f in funds],
        'authority': [authorities.get(f, 0) for f in funds],
        'closeness': [closeness_cent.get(f, 0) for f in funds],
        'community': [communities.get(f, -1) for f in funds]
    }).set_index('fund')

    return fund_features

print("Feature computation function ready.")

Feature computation function ready.


In [None]:
class GraphSAGE(torch.nn.Module):
    """Two stacked SAGEConv layers with a ReLU between them."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # in -> hidden -> out
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the second layer's output for node features ``x``."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

print("GraphSAGE model defined.")

GraphSAGE model with transfer learning defined.


In [None]:
# Width of the zero block used when no topological features are available
# (degree, pagerank, hub, authority, closeness, community).
_N_TOPO_FEATURES = 6
# Up to this many fund x stock pairs, test negatives are drawn from a full
# shuffled enumeration; above it, bounded rejection sampling is used.
_EXHAUSTIVE_PAIR_LIMIT = 1_000_000
_MAX_SAMPLING_ROUNDS = 64


def _edge_feature_matrix(edge_list, fund_emb, stock_emb, fund_topo):
    """Stack [fund embedding | stock embedding | fund topology] rows for (fund_idx, stock_idx) pairs."""
    width = fund_emb.shape[1] + stock_emb.shape[1] + fund_topo.shape[1]
    if not edge_list:
        return np.zeros((0, width))
    idx = np.asarray(edge_list, dtype=np.intp)
    f_idx, s_idx = idx[:, 0], idx[:, 1]
    return np.hstack([fund_emb[f_idx], stock_emb[s_idx], fund_topo[f_idx]])


def _sample_unlinked_pairs(n_wanted, funds_all, stocks_all, seen_edges, future_edges, rng):
    """Draw up to n_wanted distinct (fund_idx, stock_idx) pairs that are in neither edge set."""
    n_stocks = len(stocks_all)
    total = len(funds_all) * n_stocks
    if n_wanted <= 0 or total == 0:
        return []

    picked = []

    def _is_unlinked(i, j):
        pair = (funds_all[i], stocks_all[j])
        return pair not in seen_edges and pair not in future_edges

    if total <= _EXHAUSTIVE_PAIR_LIMIT:
        # Small universe: walk a random permutation so we never loop forever
        # when fewer than n_wanted unlinked pairs exist.
        for flat in rng.permutation(total):
            i, j = divmod(int(flat), n_stocks)
            if _is_unlinked(i, j):
                picked.append((i, j))
                if len(picked) == n_wanted:
                    break
        return picked

    # Large universe: rejection sampling on flat indices, bounded in rounds.
    tried = set()
    for _ in range(_MAX_SAMPLING_ROUNDS):
        missing = n_wanted - len(picked)
        if missing <= 0:
            break
        for flat in rng.integers(0, total, size=2 * missing + 16):
            flat = int(flat)
            if flat in tried:
                continue
            tried.add(flat)
            i, j = divmod(flat, n_stocks)
            if _is_unlinked(i, j):
                picked.append((i, j))
                if len(picked) == n_wanted:
                    break
    return picked


def create_incremental_link_features(G_bip_current, G_bip_future, embeddings_current,
                                     fund_features, funds_all, stocks_all,
                                     fund_to_idx, stock_to_idx, cumulative_edges,
                                     negatives_per_fund=20, random_state=42):
    """
    Create feature matrix for QUARTERLY INCREMENTAL link prediction.

    Training: current quarter's edges as positives + hard negatives, i.e. the
    stocks most similar (cosine, in embedding space) to a fund's holdings that
    the fund has never held.
    Testing: next quarter's *new* edges as positives + randomly sampled pairs
    that are linked neither in the history nor in the next quarter.

    "Never held" / "new" are judged against the history INCLUDING the current
    quarter: the current quarter's edges are unioned into ``cumulative_edges``
    before any filtering, so it does not matter whether the caller has already
    added them. Without that union a fund's own current holdings would rank
    first as "hard negatives" and persisting edges would count as new links.

    Args:
        G_bip_current: Current quarter's bipartite graph (training edges only)
        G_bip_future: Next quarter's bipartite graph (test labels only)
        embeddings_current: Node embeddings, fund rows first then stock rows
            (assumes the row order matches funds_all followed by stocks_all -
            verify against the embedding producer)
        fund_features: Topological features indexed by fund; funds missing
            from it get zeros
        funds_all, stocks_all: Complete node lists
        fund_to_idx, stock_to_idx: Node to index mappings
        cumulative_edges: Set of (fund, stock) edges seen so far; not mutated
        negatives_per_fund: Max hard negatives contributed by a single fund
        random_state: Seed for the test-negative sampler (reproducible runs)

    Returns:
        X_train, y_train, X_test, y_test, updated_cumulative_edges
    """
    n_funds = len(funds_all)
    fund_emb = embeddings_current[:n_funds]
    stock_emb = embeddings_current[n_funds:]

    # ── History up to and including the current quarter ──
    current_pairs = [
        (u, v) for u, v in G_bip_current.edges()
        if u in fund_to_idx and v in stock_to_idx
    ]
    seen_edges = set(cumulative_edges)
    seen_edges.update(current_pairs)

    # ── TRAINING DATA (from CURRENT quarter only) ──
    pos_edges_train = [(fund_to_idx[u], stock_to_idx[v]) for u, v in current_pairs]

    # Unit-normalised stock embeddings: the mean cosine similarity between a
    # fund's holdings and every stock equals (mean of held unit rows) @ unit.T,
    # so the full stock x stock similarity matrix is never materialised.
    norms = np.linalg.norm(stock_emb, axis=1, keepdims=True)
    stock_unit = stock_emb / np.where(norms == 0, 1.0, norms)

    n_pos_train = len(pos_edges_train)
    neg_edges_train = []
    for f_idx, fund_id in enumerate(funds_all):
        remaining = n_pos_train - len(neg_edges_train)
        if remaining <= 0:
            break  # never more negatives than positives

        held = [stock_to_idx[s] for s in G_bip_current.neighbors(fund_id)
                if s in stock_to_idx]
        if not held:
            continue

        avg_sim = stock_unit @ stock_unit[held].mean(axis=0)
        quota = min(negatives_per_fund, remaining)
        taken = 0
        for s_idx in np.argsort(-avg_sim):
            if taken >= quota:
                break
            if (fund_id, stocks_all[s_idx]) not in seen_edges:
                neg_edges_train.append((f_idx, int(s_idx)))
                taken += 1

    # ── TEST DATA (evaluate on next quarter) ──
    future_pairs = [
        (u, v) for u, v in G_bip_future.edges()
        if u in fund_to_idx and v in stock_to_idx
    ]
    pos_edges_test = [
        (fund_to_idx[u], stock_to_idx[v])
        for u, v in future_pairs if (u, v) not in seen_edges
    ]
    # Negatives must be unlinked in the history AND in the next quarter,
    # otherwise the same pair could carry both labels.
    neg_edges_test = _sample_unlinked_pairs(
        max(len(pos_edges_test), 1), funds_all, stocks_all,
        seen_edges, set(future_pairs), np.random.default_rng(random_state)
    )

    # ── Feature matrices ──
    if fund_features.shape[1] == 0:
        fund_topo = np.zeros((n_funds, _N_TOPO_FEATURES))
    else:
        # Align rows with funds_all; funds without features get zeros.
        fund_topo = fund_features.reindex(funds_all, fill_value=0).to_numpy(dtype=float)

    X_train = np.vstack([
        _edge_feature_matrix(pos_edges_train, fund_emb, stock_emb, fund_topo),
        _edge_feature_matrix(neg_edges_train, fund_emb, stock_emb, fund_topo)
    ])
    y_train = np.hstack([np.ones(len(pos_edges_train)), np.zeros(len(neg_edges_train))])

    X_test = np.vstack([
        _edge_feature_matrix(pos_edges_test, fund_emb, stock_emb, fund_topo),
        _edge_feature_matrix(neg_edges_test, fund_emb, stock_emb, fund_topo)
    ])
    y_test = np.hstack([np.ones(len(pos_edges_test)), np.zeros(len(neg_edges_test))])

    # seen_edges is already cumulative history + current quarter.
    return X_train, y_train, X_test, y_test, seen_edges

print("Incremental link prediction feature builder ready.")

Link prediction feature builder ready.


In [11]:
def fine_tune_graphsage_quarterly(G_bip_current, graphsage_model, num_epochs=10, learning_rate=0.001):
    """
    Fine-tune GraphSAGE incrementally on current quarter.
    Preserves weights from previous quarters (no resets).

    The objective is a link-reconstruction loss with negative sampling: for
    every (directed) edge the dot-product score of its endpoints is pushed
    up, and the score of the source with a uniformly random node is pushed
    down. A positive-only loss would be minimised by collapsing every node
    onto the same embedding, which carries no link information.

    Node input features are freshly drawn random vectors on every call, so
    only the model weights carry over between quarters.
    # NOTE(review): embeddings therefore differ run to run unless the torch
    # RNG is seeded by the caller.

    Args:
        G_bip_current: Current quarter's bipartite graph (ONLY this quarter)
        graphsage_model: Existing GraphSAGE model (None for Q1)
        num_epochs: Fine-tuning iterations (lower for incremental updates)
        learning_rate: Learning rate for fine-tuning

    Returns:
        (updated_model, embeddings_numpy, funds, stocks). Embedding rows
        follow the graph's node iteration order; ``funds``/``stocks`` are the
        nodes whose ``bipartite`` attribute is 0/1. When the graph has no
        edges the model is returned untouched with all-zero embeddings.
    """
    nodes = list(G_bip_current.nodes())
    node_to_idx = {n: i for i, n in enumerate(nodes)}

    funds = [n for n in nodes if G_bip_current.nodes[n].get('bipartite') == 0]
    stocks = [n for n in nodes if G_bip_current.nodes[n].get('bipartite') == 1]

    num_nodes = len(nodes)

    edge_list = [(node_to_idx[u], node_to_idx[v]) for u, v in G_bip_current.edges()]
    if not edge_list:
        print("    WARNING: Graph has no edges")
        return graphsage_model, np.zeros((num_nodes, 8)), funds, stocks

    # Random 16-d input features (allocated only once we know we will train).
    x = torch.randn(num_nodes, 16, device=device)

    # Shape (2, E), then mirrored so messages flow in both directions.
    edge_index = torch.tensor(edge_list, dtype=torch.long, device=device).t().contiguous()
    edge_index = torch.cat([edge_index, edge_index[[1, 0]]], dim=1)

    if graphsage_model is not None:
        print("     → Fine-tuning existing model (weights preserved)")
        model = graphsage_model.to(device)
    else:
        print("     → Training new model (first quarter)")
        model = GraphSAGE(16, 32, 8).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    log_sigmoid = torch.nn.functional.logsigmoid
    src, dst = edge_index[0], edge_index[1]
    model.train()

    for _ in range(num_epochs):
        optimizer.zero_grad()
        out = model(x, edge_index)

        pos_logit = (out[src] * out[dst]).sum(dim=1)
        # One random corrupted destination per edge, re-drawn every epoch.
        # A few of these may be true edges; that noise is accepted.
        neg_dst = torch.randint(0, num_nodes, (src.numel(),), device=device)
        neg_logit = (out[src] * out[neg_dst]).sum(dim=1)

        # logsigmoid is the numerically stable form of log(sigmoid(.)).
        loss = -(log_sigmoid(pos_logit).mean() + log_sigmoid(-neg_logit).mean())
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        emb = model(x, edge_index).cpu().numpy()

    torch.cuda.empty_cache()

    return model, emb, funds, stocks

print("Fine-tuning GraphSAGE function (incremental) ready.")

Fine-tuning GraphSAGE function (incremental) ready.


In [12]:
class IncrementalLinkPredictor:
    """
    Incremental link predictor using SGDClassifier.
    Supports partial_fit for quarterly online learning without weight resets.

    The first batch fits a logistic-regression SGD model; every later batch
    is applied with ``SGDClassifier.partial_fit``, which continues from the
    current coefficients.
    """
    def __init__(self, random_state=42):
        self.model = None                    # created on the first partial_fit call
        self.random_state = random_state
        self.n_features_ = None              # feature count seen at initialisation
        self.classes_ = np.array([0, 1])     # passed to partial_fit on updates
        self.training_iterations = 0         # number of batches consumed

    def partial_fit(self, X, y):
        """
        Update model on new quarter's data (incremental learning).
        First call: initialize. Subsequent calls: update weights without reset.
        """
        if self.model is None:
            # Initialize on first quarter.
            # 'log_loss' is the logistic-regression loss; the older alias
            # 'log' was removed in scikit-learn 1.3 and is rejected there.
            self.model = SGDClassifier(
                loss='log_loss',
                n_jobs=-1,
                random_state=self.random_state,
                warm_start=False,
                max_iter=100,
                tol=1e-3
            )
            self.model.fit(X, y)
            self.n_features_ = X.shape[1]
            print(f"     → Initialized model: {X.shape[0]} samples, {X.shape[1]} features")
        else:
            # Incremental update: partial_fit performs one pass over the batch
            # starting from the existing weights. warm_start / max_iter only
            # influence fit(), so they are not touched here.
            self.model.partial_fit(X, y, classes=self.classes_)
            print(f"     → Updated model: {X.shape[0]} new samples")

        self.training_iterations += 1

    def predict_proba(self, X):
        """Predict probability of positive class.

        Raises:
            ValueError: if called before any partial_fit.
        """
        if self.model is None:
            raise ValueError("Model not trained yet")
        return self.model.predict_proba(X)[:, 1]

    def get_params(self):
        """Return the underlying SGDClassifier (None before training) for serialization."""
        return self.model

print("IncrementalLinkPredictor class ready.")

IncrementalLinkPredictor class ready.


In [None]:
# Settings for the quarterly online-learning run.
INCREMENTAL_TRAINING = True

print(
    "Incremental Training Configuration:",
    "  Mode: Quarterly Online/Incremental Learning",
    "  Model: SGDClassifier (partial_fit) + fine-tuned GraphSAGE",
    "  Strategy: Update model each quarter, never reset weights",
    "  Graph construction: Current quarter ONLY (no cumulative union)",
    sep="\n",
)

# Per-quarter checkpoints are written under <models_dir>/checkpoints.
models_dir = 'incremental_models'
checkpoints_dir = os.path.join(models_dir, 'checkpoints')
os.makedirs(checkpoints_dir, exist_ok=True)
print(f"  Checkpoints saved to: {checkpoints_dir}/")

def get_chronological_quarters(quarterly_graphs):
    """Return the mapping's keys as a sorted list (oldest quarter first for (year, quarter) keys)."""
    ordered = list(quarterly_graphs.keys())
    ordered.sort()
    return ordered

print("\nIncremental learning utilities initialized.")

In [None]:
print("=" * 100, "QUARTERLY INCREMENTAL LINK PREDICTION TRAINING (2021-2024)", "=" * 100, sep="\n")

results_per_quarter = []
chrono_quarters = get_chronological_quarters(quarterly_graphs)

# ════════════════════════════════════════════════════════════════════════════════
# STATE SHARED ACROSS ALL QUARTERS (CREATED ONCE, BEFORE THE LOOP)
# These objects are updated quarter by quarter and never re-created
# ════════════════════════════════════════════════════════════════════════════════

graphsage_model = None  # built on the first quarter, fine-tuned afterwards
link_predictor = IncrementalLinkPredictor(random_state=42)  # single instance for the whole run
cumulative_edges = set()  # (fund, stock) edges seen so far

print(
    f"Total quarters: {len(chrono_quarters)}",
    f"Quarters: {chrono_quarters}\n",
    "✓ Incremental model initialized (will be updated each quarter, never reset)",
    "✓ GraphSAGE will be fine-tuned incrementally",
    "✓ SGDClassifier will be updated with partial_fit\n",
    sep="\n",
)

# ════════════════════════════════════════════════════════════════════════════════
# MAIN INCREMENTAL LOOP: Train on current quarter, test on next quarter
# ════════════════════════════════════════════════════════════════════════════════

# Walk the quarters in chronological order. Each iteration trains on quarter
# t and evaluates on quarter t+1; the last quarter has no successor and only
# terminates the loop. graphsage_model, link_predictor and cumulative_edges
# are carried over between iterations.
for quarter_idx, current_quarter in enumerate(chrono_quarters):
    current_year, current_quarter_num = current_quarter
    
    # Check if next quarter exists for testing
    if quarter_idx + 1 >= len(chrono_quarters):
        print(f"\n{'─' * 100}")
        print(f"QUARTER {quarter_idx + 1}/{len(chrono_quarters)} | {current_year}Q{current_quarter_num}")
        print(f"(No next quarter for testing, skipping)")
        break
    
    next_quarter = chrono_quarters[quarter_idx + 1]
    next_year, next_quarter_num = next_quarter
    
    print(f"\n{'─' * 100}")
    print(f"QUARTER {quarter_idx + 1}/{len(chrono_quarters)}")
    print(f"  Train: {current_year}Q{current_quarter_num} | Test: {next_year}Q{next_quarter_num}")
    print(f"  Cumulative edges seen: {len(cumulative_edges):,}")
    print(f"  Model training iterations so far: {link_predictor.training_iterations}")
    print(f"{'─' * 100}")
    
    # Any failure inside a quarter is reported with a traceback and the loop
    # moves on to the next quarter (see the except clause at the bottom).
    try:
        # Step 1. Load current quarter graph (ONLY this quarter, no union)
        print("  1. Loading current quarter graph...")
        G_bip_current = quarterly_graphs[current_quarter]
        funds_current = [n for n in G_bip_current.nodes() if G_bip_current.nodes[n].get('bipartite') == 0]
        stocks_current = [n for n in G_bip_current.nodes() if G_bip_current.nodes[n].get('bipartite') == 1]
        
        print(f"     Funds: {len(funds_current):,} | Stocks: {len(stocks_current):,} | Edges: {G_bip_current.number_of_edges():,}")
        
        if G_bip_current.number_of_nodes() == 0:
            print("     WARNING: Graph is empty, skipping...")
            continue
        
        # Step 2. Fine-tune GraphSAGE on current quarter (incremental update).
        # graphsage_model is rebound to the returned model so the next quarter
        # starts from these weights.
        print("  2. Fine-tuning GraphSAGE...")
        graphsage_model, embeddings, funds_sage, stocks_sage = fine_tune_graphsage_quarterly(
            G_bip_current, graphsage_model, num_epochs=10, learning_rate=0.001
        )
        print(f"     Embeddings: {embeddings.shape}")
        
        # Step 3. Compute topological features (current quarter only)
        print("  3. Computing topological features...")
        fund_features = compute_fund_features(G_bip_current, funds_sage)
        print(f"     Features: {fund_features.shape}")
        
        # Node index mappings: position of each fund / stock in the lists
        # returned by the fine-tuning step (no printed step number).
        fund_to_idx = {f: i for i, f in enumerate(funds_sage)}
        stock_to_idx = {s: i for i, s in enumerate(stocks_sage)}
        
        # Step 4. Create incremental features. cumulative_edges is rebound to
        # the updated set returned by the feature builder.
        print("  4. Creating incremental features...")
        G_bip_next = quarterly_graphs.get(next_quarter)
        if G_bip_next is None:
            print(f"     WARNING: Next quarter not found, skipping...")
            continue
        
        X_train, y_train, X_test, y_test, cumulative_edges = create_incremental_link_features(
            G_bip_current, G_bip_next, embeddings, fund_features,
            funds_sage, stocks_sage, fund_to_idx, stock_to_idx, cumulative_edges
        )
        print(f"     Train: {X_train.shape[0]:,} (pos: {y_train.sum():.0f})")
        print(f"     Test:  {X_test.shape[0]:,} (pos: {y_test.sum():.0f})")
        
        if X_train.shape[0] == 0 or X_test.shape[0] == 0:
            print("     WARNING: No samples, skipping...")
            continue
        
        # Step 5. INCREMENTAL model update (partial_fit) - NO RESET, NO RE-INITIALIZATION
        print("  5. Incremental model update (partial_fit)...")
        link_predictor.partial_fit(X_train, y_train)
        print(f"     Model training iterations now: {link_predictor.training_iterations}")
        
        # Step 6. Evaluate on next quarter. Precision/recall threshold the
        # predicted probability at 0.5; AUC uses the raw probabilities.
        print("  6. Evaluating on next quarter...")
        y_pred = link_predictor.predict_proba(X_test)
        
        auc = roc_auc_score(y_test, y_pred)
        precision = precision_score(y_test, (y_pred > 0.5).astype(int), zero_division=0)
        recall = recall_score(y_test, (y_pred > 0.5).astype(int), zero_division=0)
        
        print(f"\n     ✓ AUC:       {auc:.4f}")
        print(f"     ✓ Precision: {precision:.4f}")
        print(f"     ✓ Recall:    {recall:.4f}")
        
        # Step 7. Save checkpoint: one pickle per trained quarter holding the
        # models, the edge history, the index mappings and the metrics.
        print("  7. Saving checkpoint...")
        checkpoint_filename = f"checkpoint_q{quarter_idx+1}_{current_year}Q{current_quarter_num}.pkl"
        checkpoint_path = os.path.join(checkpoints_dir, checkpoint_filename)
        
        checkpoint_data = {
            'quarter_index': quarter_idx + 1,
            'current_quarter': current_quarter,
            'next_quarter': next_quarter,
            'graphsage_model': graphsage_model,
            'link_predictor_model': link_predictor.get_params(),
            'cumulative_edges': cumulative_edges,
            'fund_to_idx': fund_to_idx,
            'stock_to_idx': stock_to_idx,
            'funds_sage': funds_sage,
            'stocks_sage': stocks_sage,
            'training_iterations': link_predictor.training_iterations,
            'metrics': {
                'auc': auc,
                'precision': precision,
                'recall': recall
            }
        }
        
        with open(checkpoint_path, 'wb') as f:
            pickle.dump(checkpoint_data, f)
        print(f"     ✓ Saved to: {checkpoint_path}")
        
        # One summary row per evaluated (train quarter, test quarter) window.
        results_per_quarter.append({
            'quarter_idx': quarter_idx + 1,
            'train_year': current_year,
            'train_quarter': current_quarter_num,
            'test_year': next_year,
            'test_quarter': next_quarter_num,
            'n_train_funds': len(funds_current),
            'n_train_stocks': len(stocks_current),
            'train_edges': G_bip_current.number_of_edges(),
            'test_edges': G_bip_next.number_of_edges(),
            'cumulative_edges': len(cumulative_edges),
            'auc': auc,
            'precision': precision,
            'recall': recall,
            'n_test_samples': X_test.shape[0],
            'checkpoint_path': checkpoint_path,
            'training_iterations': link_predictor.training_iterations
        })
        
    except Exception as e:
        # Report and continue; state already rebound earlier in this
        # iteration (e.g. graphsage_model, cumulative_edges) is kept as is.
        print(f"  ERROR: {e}")
        import traceback
        traceback.print_exc()
        continue

# ════════════════════════════════════════════════════════════════════════════════
# SAVE FINAL TRAINED MODEL TO DISK
# ════════════════════════════════════════════════════════════════════════════════

print(f"\n\n{'=' * 100}", "SAVING FINAL INCREMENTAL MODEL", f"{'=' * 100}", sep="\n")

if link_predictor.training_iterations <= 0:
    print("\n⚠ No training completed, final model not saved")
else:
    # Persist the classifier, the GraphSAGE model and the edge history in a
    # single pickle in the working directory.
    final_model_path = 'final_incremental_model.pkl'

    final_model_data = {
        'model': link_predictor.get_params(),
        'total_training_iterations': link_predictor.training_iterations,
        'cumulative_edges_final': cumulative_edges,
        'graphsage_model': graphsage_model,
        'training_completed': True,
        'description': 'Final incremental link predictor trained on all quarters (2021-2024)',
        'timestamp': pd.Timestamp.now().isoformat()
    }

    with open(final_model_path, 'wb') as f:
        pickle.dump(final_model_data, f)

    print(
        f"\n✓ Final model saved to: {final_model_path}",
        "  - Model type: SGDClassifier (warm_start enabled)",
        f"  - Total training iterations: {link_predictor.training_iterations}",
        f"  - Cumulative edges learned: {len(cumulative_edges):,}",
        f"  - Model weights: ACCUMULATED over all {len(chrono_quarters)} quarters",
        "  - Status: READY FOR INFERENCE",
        sep="\n",
    )

# ════════════════════════════════════════════════════════════════════════════════
# SUMMARY REPORT
# ════════════════════════════════════════════════════════════════════════════════

print(f"\n\n{'=' * 100}", "INCREMENTAL LEARNING SUMMARY (2021-2024)", f"{'=' * 100}", sep="\n")

if not results_per_quarter:
    print("No results generated.")
else:
    results_df = pd.DataFrame(results_per_quarter)
    print(f"\nResults across {len(results_df)} evaluation windows:\n")
    display_cols = ['quarter_idx', 'train_year', 'train_quarter', 'test_year', 'test_quarter',
                   'training_iterations', 'auc', 'precision', 'recall']
    print(results_df[display_cols].to_string(index=False))

    # Mean and standard deviation of each metric over all evaluation windows.
    print("\n\nAggregate Statistics:")
    print(
        *(
            f"  Average {label:<10} {results_df[col].mean():.4f} (±{results_df[col].std():.4f})"
            for label, col in (("AUC:", 'auc'), ("Precision:", 'precision'), ("Recall:", 'recall'))
        ),
        sep="\n",
    )

    print("\n\nBy Test Year:")
    by_year = results_df.groupby('test_year')[['auc', 'precision', 'recall']].mean()
    print(by_year)

    print("\n\nTraining Progression:")
    progression_cols = ['quarter_idx', 'train_year', 'train_quarter', 'training_iterations']
    print(results_df[progression_cols].to_string(index=False))

    # Persist the per-window results; the analysis cell reads this file back.
    results_df.to_csv('incremental_link_prediction_results.csv', index=False)
    print(
        "\n✓ Results saved to incremental_link_prediction_results.csv",
        f"✓ {len(results_df)} checkpoints saved to {checkpoints_dir}/",
        "✓ Final model saved to: final_incremental_model.pkl",
        f"✓ Model weights accumulated over {len(chrono_quarters)} quarters (NEVER RESET)",
        sep="\n",
    )

QUARTERLY INCREMENTAL LINK PREDICTION TRAINING (2021-2024)
Total quarters: 15
Quarters: [(np.int64(2021), 1), (np.int64(2021), 2), (np.int64(2021), 3), (np.int64(2021), 4), (np.int64(2022), 1), (np.int64(2022), 2), (np.int64(2022), 3), (np.int64(2022), 4), (np.int64(2023), 1), (np.int64(2023), 2), (np.int64(2023), 3), (np.int64(2023), 4), (np.int64(2024), 1), (np.int64(2024), 2), (np.int64(2024), 3)]


────────────────────────────────────────────────────────────────────────────────────────────────────
QUARTER 1/15
  Train: 2021Q1 | Test: 2021Q2
  Cumulative edges seen: 0
────────────────────────────────────────────────────────────────────────────────────────────────────
  1. Loading current quarter graph...
     Funds: 5,773 | Stocks: 3,544 | Edges: 943,803
  2. Fine-tuning GraphSAGE...
     → Training new model (first quarter)
     Embeddings: (9317, 8)
  3. Computing topological features...


: 

## 6. Results Analysis
Analyze incremental learning performance across quarters.

In [None]:
# Read the saved per-window results back from disk and summarise them.
import os

print("=" * 100, "INCREMENTAL LEARNING RESULTS ANALYSIS", "=" * 100, sep="\n")

# Stays None when the results file is missing.
incremental_results = None

if not os.path.exists('incremental_link_prediction_results.csv'):
    print("⚠ Results file not found: incremental_link_prediction_results.csv")
else:
    incremental_results = pd.read_csv('incremental_link_prediction_results.csv')
    print(f"\n✓ Loaded results: {len(incremental_results)} quarters evaluated\n")

    print("Performance by Quarter:")
    quarter_cols = ['quarter_idx', 'train_year', 'train_quarter',
                    'test_year', 'test_quarter', 'auc', 'precision', 'recall']
    print(incremental_results[quarter_cols].to_string(index=False))

    # Mean ± standard deviation of each metric across evaluated quarters.
    print("\n\nOverall Statistics:")
    print(
        *(
            f"  {label:<10} {incremental_results[col].mean():.4f} ± {incremental_results[col].std():.4f}"
            for label, col in (("AUC:", 'auc'), ("Precision:", 'precision'), ("Recall:", 'recall'))
        ),
        sep="\n",
    )

    # Values taken from the last row, i.e. the final evaluated window.
    print("\n\nModel Knowledge Accumulation:")
    print(
        f"  Total quarters processed: {incremental_results['quarter_idx'].max()}",
        f"  Final training iterations: {incremental_results['training_iterations'].iloc[-1]}",
        f"  Cumulative edges by final quarter: {incremental_results['cumulative_edges'].iloc[-1]:,}",
        sep="\n",
    )