# Stock Market Social Network Pipeline

---

## 1. Import Libraries and Setup

In [2]:
# Core libraries
import pandas as pd
import numpy as np
import os
import random
from datetime import datetime
import warnings
import zipfile
import shutil

# Graph libraries
import networkx as nx
from networkx.algorithms import bipartite
from networkx.algorithms.centrality import degree_centrality, closeness_centrality
from networkx.algorithms.link_analysis.pagerank_alg import pagerank
from networkx.algorithms.link_analysis.hits_alg import hits
import igraph as ig
import leidenalg as la

# Machine learning libraries
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.metrics import roc_auc_score, precision_score
from sklearn.metrics.pairwise import cosine_similarity
import lightgbm as lgb
import joblib
import pickle

# Deep learning libraries
import torch
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv

warnings.filterwarnings('ignore')

# GPU Setup with compatibility checks
def check_cuda_compatibility():
    if not torch.cuda.is_available():
        return False, "CUDA not available"
    
    try:
        # Test basic CUDA operation
        test_tensor = torch.zeros(1).cuda()
        test_tensor = test_tensor + 1
        return True, "CUDA compatible"
    except Exception as e:
        return False, f"CUDA compatibility issue: {str(e)}"

cuda_compatible, cuda_message = check_cuda_compatibility()
print(f"CUDA Status: {cuda_message}")

if cuda_compatible:
    device = torch.device('cuda')
    print(f'Using device: {device}')
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    print(f'CUDA Version: {torch.version.cuda}')
    print(f'PyTorch Version: {torch.__version__}')
    # Set default tensor type to CUDA
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
else:
    device = torch.device('cpu')
    print(f'Using device: {device} (CPU fallback)')
    print('Note: For GPU support, ensure PyTorch is installed with CUDA support matching your GPU')
    print('Install with: pip install torch --index-url https://download.pytorch.org/whl/cu118')


CUDA Status: CUDA compatibility issue: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Using device: cpu (CPU fallback)
Note: For GPU support, ensure PyTorch is installed with CUDA support matching your GPU
Install with: pip install torch --index-url https://download.pytorch.org/whl/cu118


## 1.1. Extract Parquet Files from Zip Archives
Extract all zip files in the parquet_files directory and remove the zip files after extraction.

In [3]:
# Extract zip files from parquet_files directory
persoanl_dir = os.path.expanduser('~')
parquet_dir = os.path.join(persoanl_dir, 'Social-Network-Stock-Market/Social Network/parquet_files')
if os.path.exists(parquet_dir):
    zip_files = [f for f in os.listdir(parquet_dir) if f.endswith('.zip')]
    
    if zip_files:
        print(f'Found {len(zip_files)} zip file(s) in {parquet_dir}')
        
        for zip_file in zip_files:
            zip_path = os.path.join(parquet_dir, zip_file)
            print(f'Extracting {zip_file}...')
            
            try:
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(parquet_dir)
                print(f'  ✓ Extracted {zip_file}')
                
                # Delete the zip file after extraction
                os.remove(zip_path)
                print(f'  ✓ Deleted {zip_file}')
                
            except Exception as e:
                print(f'  ✗ Error processing {zip_file}: {e}')
        
        print('All zip files processed and cleaned up.')
    else:
        print(f'No zip files found in {parquet_dir}')
else:
    print(f'Directory not found: {parquet_dir}')
    print('Creating directory...')
    os.makedirs(parquet_dir, exist_ok=True)

No zip files found in /home/zenoua/Social-Network-Stock-Market/Social Network/parquet_files


## 2. Data Loading and Cleaning
Load and preprocess the raw fund-stock holding data.

##### Creating new parquet files

In [4]:
# Load and clean the data

root = os.path.join(persoanl_dir, 'Social-Network-Stock-Market/Social Network/parquuet_files')
print(f"{root}/ticker_to_cusip.parquet")
ticker_map = pd.read_parquet(f"{root}/ticker_to_cusip.parquet")
prices = pd.read_parquet(f"{root}/ticker_prices.parquet")
output_dir = f"{root}/generated_combined_parquet"
os.makedirs(output_dir, exist_ok=True)

/home/zenoua/Social-Network-Stock-Market/Social Network/parquuet_files/ticker_to_cusip.parquet


In [5]:
import glob
import pandas as pd

# Check the structure of your files
print("=" * 80)
print("INSPECTING INPUT FILES")
print("=" * 80)

# 1. Check ticker_map structure
print("\n1. TICKER_MAP (ticker_to_cusip.parquet):")
print("-" * 80)
try:
    ticker_map_sample = pd.read_parquet(f"{root}/ticker_to_cusip.parquet")
    print(f"Shape: {ticker_map_sample.shape}")
    print(f"Columns: {list(ticker_map_sample.columns)}")
    print("\nFirst 5 rows:")
    print(ticker_map_sample.head())
    print(f"\nData types:\n{ticker_map_sample.dtypes}")
except Exception as e:
    print(f"Error reading ticker_map: {e}")

# 2. Check prices structure
print("\n\n2. PRICES (ticker_prices.parquet):")
print("-" * 80)
try:
    prices_sample = pd.read_parquet(f"{root}/ticker_prices.parquet")
    print(f"Shape: {prices_sample.shape}")
    print(f"Columns: {list(prices_sample.columns)}")
    print("\nFirst 5 rows:")
    print(prices_sample.head())
    print(f"\nData types:\n{prices_sample.dtypes}")
except Exception as e:
    print(f"Error reading prices: {e}")

# 3. Check first holdings file structure
print("\n\n3. SAMPLE HOLDINGS FILE:")
print("-" * 80)
holdings_pattern = os.path.join(root, "holdings_filtered_new_period_start_*.parquet")
holdings_files = sorted(glob.glob(holdings_pattern))

if holdings_files:
    sample_file = holdings_files[0]
    print(f"File: {os.path.basename(sample_file)}")
    try:
        holdings_sample = pd.read_parquet(sample_file)
        print(f"Shape: {holdings_sample.shape}")
        print(f"Columns: {list(holdings_sample.columns)}")
        print("\nFirst 5 rows:")
        print(holdings_sample.head())
        print(f"\nData types:\n{holdings_sample.dtypes}")
    except Exception as e:
        print(f"Error reading holdings: {e}")
else:
    print("No holdings files found!")

print("\n" + "=" * 80)
print("INSPECTION COMPLETE")
print("=" * 80)

INSPECTING INPUT FILES

1. TICKER_MAP (ticker_to_cusip.parquet):
--------------------------------------------------------------------------------
Shape: (4571, 5)
Columns: ['name', 'cusip', 'ticker', 'trading_start_date', 'trading_end_date']

First 5 rows:
              name      cusip ticker trading_start_date trading_end_date
0  A. Schulman Inc  808194104   SHLM         1973-01-09       2018-08-20
1   A.H. Belo Corp  001282102    AHC               None             None
2  A.O. Smith Corp  831865209    AOS         1983-09-30       2026-01-06
3         AAON Inc  000360206   AAON         1991-01-03       2026-01-06
4         AAR Corp  000361105    AIR         1980-03-17       2026-01-06

Data types:
name                  object
cusip                 object
ticker                object
trading_start_date    object
trading_end_date      object
dtype: object


2. PRICES (ticker_prices.parquet):
--------------------------------------------------------------------------------
Shape: (158011,

##### Iterating on all years

In [6]:
import glob
import re
from datetime import datetime

# Get all holdings parquet files sorted by date
ticker_map["cusip"] = ticker_map["cusip"].astype(str)
prices["period_start"] = pd.to_datetime(prices["period_start"])
holdings_pattern = os.path.join(root, "holdings_filtered_new_period_start_*.parquet")
holdings_files = sorted(glob.glob(holdings_pattern))

print(f"Found {len(holdings_files)} holdings files")

# Extract year and quarter from each file
def extract_date_info(filepath):
    """Extract period_start date from filename"""
    match = re.search(r'period_start_(\d{4}-\d{2}-\d{2})', filepath)
    if match:
        date_str = match.group(1)
        date_obj = pd.to_datetime(date_str)
        year = date_obj.year
        quarter = (date_obj.month - 1) // 3 + 1
        return date_obj, year, quarter
    return None, None, None

# Process all files
all_dfs = []

for file_path in holdings_files:
    period_date, year, quarter = extract_date_info(file_path)
    
    if period_date is None:
        print(f"Skipping file with unrecognized format: {file_path}")
        continue
    
    print(f"Processing {year} Q{quarter} (period_start: {period_date.date()})...")
    
    try:
        # Read holdings data
        df = pd.read_parquet(file_path)
        
        # Rename to required format
        df = df.rename(columns={
            "cik": "CIK",
            "cusip": "CUSIP",
            "sshprnamt": "SSHPRNAMT",
            "period_start": "PERIOD_DATE"
        })
        
        # Type normalization
        df["CIK"] = df["CIK"].astype(str)
        df["CUSIP"] = df["CUSIP"].astype(str)
        df["PERIOD_DATE"] = pd.to_datetime(df["PERIOD_DATE"])
        
        # Join: CUSIP → ticker
        df = df.merge(
            ticker_map[["cusip", "ticker"]],
            left_on="CUSIP",
            right_on="cusip",
            how="left"
        ).drop(columns=["cusip"])
        
        # Join: ticker + period → price
        df = df.merge(
            prices[["ticker", "period_start", "price"]],
            left_on=["ticker", "PERIOD_DATE"],
            right_on=["ticker", "period_start"],
            how="left"
        ).drop(columns=["period_start"])
        
        # Compute VALUE
        df["VALUE"] = df["SSHPRNAMT"] * df["price"]
        
        # Add metadata columns
        df["YEAR"] = year
        df["QUARTER"] = f"Q{quarter}_{year}"
        
        # Final column order
        df = df[["CIK", "CUSIP", "VALUE", "SSHPRNAMT", "PERIOD_DATE", "YEAR", "QUARTER"]]
        
        # Save individual parquet file
        out_path = os.path.join(
            output_dir,
            f"holdings_processed_Q{quarter}_{year}.parquet"
        )
        df.to_parquet(out_path, index=False)
        # print(df.head(10))
        print(f"  ✓ Saved {len(df)} records to {out_path}")
        
        # Add to list for combined file
        all_dfs.append(df)
        
    except Exception as e:
        print(f"  ✗ Error processing {file_path}: {e}")
        continue



Found 49 holdings files
Processing 2013 Q2 (period_start: 2013-04-01)...
  ✓ Saved 19659 records to /home/zenoua/Social-Network-Stock-Market/Social Network/parquuet_files/generated_combined_parquet/holdings_processed_Q2_2013.parquet
Processing 2013 Q3 (period_start: 2013-07-01)...
  ✓ Saved 34213 records to /home/zenoua/Social-Network-Stock-Market/Social Network/parquuet_files/generated_combined_parquet/holdings_processed_Q3_2013.parquet
Processing 2013 Q4 (period_start: 2013-10-01)...
  ✓ Saved 1018601 records to /home/zenoua/Social-Network-Stock-Market/Social Network/parquuet_files/generated_combined_parquet/holdings_processed_Q4_2013.parquet
Processing 2014 Q1 (period_start: 2014-01-01)...
  ✓ Saved 1017686 records to /home/zenoua/Social-Network-Stock-Market/Social Network/parquuet_files/generated_combined_parquet/holdings_processed_Q1_2014.parquet
Processing 2014 Q2 (period_start: 2014-04-01)...
  ✓ Saved 1060398 records to /home/zenoua/Social-Network-Stock-Market/Social Network/pa

In [7]:

# dfs = []
# for i, file in enumerate(file_paths):
#     df = pd.read_csv(file)
#     df['QUARTER'] = f'Q{i+1}_2018'
#     dfs.append(df)
# data = pd.concat(dfs, ignore_index=True)
# data = data[['CIK', 'CUSIP', 'VALUE', 'SSHPRNAMT', 'PERIOD_DATE', 'QUARTER']]
# data['PERIOD_DATE'] = pd.to_datetime(data['PERIOD_DATE'])
# data = data.dropna(subset=['CIK', 'CUSIP', 'VALUE'])
# data['CIK'] = data['CIK'].astype(str)
# data['CUSIP'] = data['CUSIP'].astype(str)
# data = data.sort_values(by='PERIOD_DATE')
# print(f'Total records after cleaning: {len(data)}')

## 3. Graph Construction
Build bipartite and projected graphs for funds and stocks using sliding window approach.

**Training Strategy:**
- Initial training: 2015-2019 (all quarters)
- Sliding window testing: 2020 Q1 through 2024 Q4
- Each quarter uses all previous data for training
- 2025 data excluded from analysis

In [8]:
def build_graph_and_features_up_to(df, max_year, max_quarter):
    """
    Build graph and features up to specified year and quarter.
    
    Args:
        df: Combined dataframe with all holdings data
        max_year: Maximum year to include (inclusive)
        max_quarter: Maximum quarter to include for max_year (1-4)
    
    Returns:
        Bipartite graph, fund-fund graph, fund features, filtered data, fund list, stock list
    """
    # Filter data up to specified year/quarter
    df_up_to = df[
        (df['YEAR'] < max_year) | 
        ((df['YEAR'] == max_year) & (df['QUARTER'].str.extract(r'Q(\d+)')[0].astype(int) <= max_quarter))
    ].copy()
    
    if len(df_up_to) == 0:
        raise ValueError(f"No data available up to {max_year} Q{max_quarter}")
    
    funds_up_to = df_up_to['CIK'].unique()
    stocks_up_to = df_up_to['CUSIP'].unique()
    
    # Build bipartite graph
    G_bip = nx.Graph()
    G_bip.add_nodes_from(funds_up_to, bipartite=0)
    G_bip.add_nodes_from(stocks_up_to, bipartite=1)
    
    edges = [(row.CIK, row.CUSIP, 
            {'value': row.VALUE, 'amount': row.SSHPRNAMT, 'time': row.PERIOD_DATE})
            for row in df_up_to.itertuples(index=False)]
    G_bip.add_edges_from(edges)
        
    # Fund-Fund projection with weights
    G_fund = bipartite.weighted_projected_graph(G_bip, funds_up_to)
    
    # Convert to directed based on time
    G_fund_directed = nx.DiGraph()
    for u, v, data_dict in G_fund.edges(data=True):
        shared = set(G_bip.neighbors(u)) & set(G_bip.neighbors(v))
        if not shared:
            continue
        
        times_u = [G_bip.edges[u,s]['time'] for s in shared]
        times_v = [G_bip.edges[v,s]['time'] for s in shared]
        avg_u = np.mean([t.timestamp() for t in times_u])
        avg_v = np.mean([t.timestamp() for t in times_v])
        
        weight = data_dict.get('weight', 1)
        
        if avg_u < avg_v:
            G_fund_directed.add_edge(u, v, weight=weight)
        else:
            G_fund_directed.add_edge(v, u, weight=weight)
    
    G_fund = G_fund_directed
    
    # Topological features
    degree_cent = degree_centrality(G_fund)
    pagerank_cent = nx.pagerank(G_fund)
    hubs, authorities = hits(G_fund)
    
    # Closeness centrality (on largest connected component)
    if G_fund.number_of_nodes() > 0:
        largest_cc = max(nx.connected_components(G_fund.to_undirected()), key=len)
        closeness_cent = closeness_centrality(G_fund.to_undirected().subgraph(largest_cc))
    else:
        closeness_cent = {}
    
    fund_features = pd.DataFrame({
        'fund': list(G_fund.nodes()),
        'degree': [degree_cent.get(n, 0) for n in G_fund.nodes()],
        'pagerank': [pagerank_cent.get(n, 0) for n in G_fund.nodes()],
        'hub': [hubs.get(n, 0) for n in G_fund.nodes()],
        'authority': [authorities.get(n, 0) for n in G_fund.nodes()],
        'closeness': [closeness_cent.get(n, 0) for n in G_fund.nodes()]
    }).set_index('fund')
    
    # Community detection (Leiden)
    if G_fund.number_of_nodes() > 0:
        vertex_names = list(G_fund.nodes())
        vertex_to_idx = {v: i for i, v in enumerate(vertex_names)}
        edge_list = [(vertex_to_idx[u], vertex_to_idx[v]) 
                    for u, v in G_fund.to_undirected().edges()]
        
        ig_G = ig.Graph(n=len(vertex_names), edges=edge_list)
        ig_G.vs['_nx_name'] = vertex_names
        partition = la.find_partition(ig_G, la.ModularityVertexPartition)
        communities = {ig_G.vs[i]['_nx_name']: p for p, cl in enumerate(partition) for i in cl}
        fund_features['community'] = fund_features.index.map(communities).fillna(-1)
    else:
        fund_features['community'] = -1
    
    return G_bip, G_fund, fund_features, df_up_to, funds_up_to, stocks_up_to

In [9]:
# Load combined data from generated parquet files (exclude 2025)
print("Loading combined holdings data...")
combined_files = sorted([f for f in os.listdir(output_dir) if f.startswith('holdings_processed_') and f.endswith('.parquet')])
print(f"Found {len(combined_files)} processed files")

# Load and concatenate all data
all_data = []
for file in combined_files:
    df_temp = pd.read_parquet(os.path.join(output_dir, file))
    all_data.append(df_temp)

data = pd.concat(all_data, ignore_index=True)
data['PERIOD_DATE'] = pd.to_datetime(data['PERIOD_DATE'])

# Exclude 2025 data
data = data[data['YEAR'] < 2025].copy()

print(f"Total records loaded: {len(data):,}")
print(f"Date range: {data['PERIOD_DATE'].min()} to {data['PERIOD_DATE'].max()}")
print(f"Years covered: {sorted(data['YEAR'].unique())}")
print(f"Unique funds: {data['CIK'].nunique()}")
print(f"Unique stocks: {data['CUSIP'].nunique()}")

# Define training and test periods
INITIAL_TRAIN_START_YEAR = 2015
INITIAL_TRAIN_END_YEAR = 2019
TEST_START_YEAR = 2020
TEST_END_YEAR = 2024

print(f"\nTraining period: {INITIAL_TRAIN_START_YEAR}-{INITIAL_TRAIN_END_YEAR}")
print(f"Testing period: {TEST_START_YEAR}-{TEST_END_YEAR} (sliding window by quarter)")

Loading combined holdings data...
Found 49 processed files
Total records loaded: 59,099,780
Date range: 2013-04-01 00:00:00 to 2024-10-01 00:00:00
Years covered: [np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024)]
Unique funds: 10588
Unique stocks: 4554

Training period: 2015-2019
Testing period: 2020-2024 (sliding window by quarter)


## 4. Initial Training Phase (2015-2019)
Build initial model using all data from 2015-2019.

In [10]:
# Build initial training graph using 2015-2019 data
print("Building initial training graph (2015-2019)...")
G_bip_train, G_fund_train, fund_features_train, df_train, funds_train, stocks_train = \
    build_graph_and_features_up_to(data, INITIAL_TRAIN_END_YEAR, 4)

fund_idx_train = {f: i for i, f in enumerate(funds_train)}
stock_idx_train = {s: i for i, s in enumerate(stocks_train)}

print(f"Initial training graph statistics:")
print(f"  Bipartite edges: {G_bip_train.number_of_edges():,}")
print(f"  Fund-fund edges: {G_fund_train.number_of_edges():,}")
print(f"  Number of funds: {len(funds_train):,}")
print(f"  Number of stocks: {len(stocks_train):,}")
print(f"  Date range: {df_train['PERIOD_DATE'].min()} to {df_train['PERIOD_DATE'].max()}")

Building initial training graph (2015-2019)...


: 

## 5. GraphSAGE Embedding Training (Initial Model)
Train GraphSAGE on 2015-2019 data to generate initial node embeddings.

In [None]:
# Node features initialization - ensure GPU usage
num_nodes = len(funds_train) + len(stocks_train)

# Initialize on GPU directly
x = torch.randn(num_nodes, 16, device=device)
print(f"Node features initialized on {device} with shape {x.shape}")

# Build homogeneous edge index for initial training
edge_index_homo = []
for u, v in G_bip_train.edges():
    u_idx = fund_idx_train.get(u, len(funds_train) + stock_idx_train.get(u, -1))
    v_idx = fund_idx_train.get(v, len(funds_train) + stock_idx_train.get(v, -1))
    if u_idx < num_nodes and v_idx < num_nodes:
        edge_index_homo.append([u_idx, v_idx])
        edge_index_homo.append([v_idx, u_idx])

# Create edge index directly on GPU
edge_index_homo = torch.tensor(edge_index_homo, dtype=torch.long, device=device).t().contiguous()
print(f"Edge index initialized on {device} with {edge_index_homo.shape[1]:,} edges")

# Define GraphSAGE model
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

# Initialize model on GPU
model = GraphSAGE(16, 32, 8).to(device)
print(f"Model initialized on {device}")
print(f"Model parameters on GPU: {next(model.parameters()).is_cuda}")

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train GraphSAGE with early stopping
model.train()
prev_loss = float('inf')
patience = 5
no_improve = 0

print(f"\nStarting initial GraphSAGE training on {device}...")
for epoch in range(50):
    optimizer.zero_grad()
    out = model(x, edge_index_homo)
    pos_score = (out[edge_index_homo[0]] * out[edge_index_homo[1]]).sum(dim=1).sigmoid()
    loss = -torch.log(pos_score + 1e-15).mean()
    loss.backward()
    optimizer.step()
    
    if abs(prev_loss - loss.item()) < 1e-6:
        no_improve += 1
        if no_improve >= patience:
            print(f"Early stopping at epoch {epoch}")
            break
    else:
        no_improve = 0
    prev_loss = loss.item()
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}, GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f}GB")

# Extract initial embeddings (keep computation on GPU, only move result to CPU)
model.eval()
with torch.no_grad():
    emb = model(x, edge_index_homo).cpu().numpy()

dynamic_emb_train = emb[:len(funds_train)]
stock_emb_train = emb[len(funds_train):]

print(f"\nInitial GraphSAGE training completed on {device}.")
print(f"Fund embeddings shape: {dynamic_emb_train.shape}")
print(f"Stock embeddings shape: {stock_emb_train.shape}")

Node features initialized on cpu
Edge index initialized on cpu
Model initialized on cpu
Starting training on cpu...


Epoch 0, Loss: 0.3611
Epoch 10, Loss: 0.0000
Early stopping at epoch 11
GraphSAGE training completed on cpu.
dynamic_emb_train shape: (9, 8)
stock_emb_train shape: (1891, 8)


## 6. Save Training Artifacts
Save embeddings, features, and model for later use.

In [None]:
# ── Cache trained artifacts (run ONCE) ───────────────────────────────────────
GRAPH_READY = True
_cached_graph = G_bip_train
_cached_embeddings = stock_emb_train
_cached_model = model.cpu()  # Move model to CPU for caching


In [None]:
# Save embeddings, model, and graph to files for later use
import joblib
import pickle
import os

# Create a directory for artifacts if it doesn't exist
os.makedirs('artifacts', exist_ok=True)

# Save stock embeddings
np.save('artifacts/stock_emb_train.npy', stock_emb_train)

# Save dynamic fund embeddings
np.save('artifacts/dynamic_emb_train.npy', dynamic_emb_train)

# Save fund features
fund_features_train.to_pickle('artifacts/fund_features_train.pkl')

# Save LightGBM model (bst) if it exists
if 'bst' in globals():
    joblib.dump(bst, 'artifacts/lightgbm_bst.pkl')

# Save funds and stocks lists
with open('artifacts/funds_train.pkl', 'wb') as f:
    pickle.dump(funds_train, f)
with open('artifacts/stocks_train.pkl', 'wb') as f:
    pickle.dump(stocks_train, f)

print('Artifacts saved to artifacts/ directory.')

Artifacts saved to artifacts/ directory.


## 6. Sliding Window Evaluation (2020-2024)
Evaluate model performance using sliding window approach across all quarters from 2020-2024.

For each quarter:
1. Use all data up to previous quarter for training
2. Predict holdings for current quarter
3. Evaluate predictions

In [None]:
# Sliding window evaluation on 2020-2024
def evaluate_quarter(data, train_year, train_quarter, test_year, test_quarter, 
                     model, fund_features_base, funds_base, stocks_base, stock_emb_base):
    """
    Evaluate model on a specific test quarter using training data up to previous quarter.
    """
    # Build training graph up to (and including) train_quarter
    G_bip_win, G_fund_win, fund_features_win, df_win, funds_win, stocks_win = \
        build_graph_and_features_up_to(data, train_year, train_quarter)
    
    fund_idx_win = {f: i for i, f in enumerate(funds_win)}
    stock_idx_win = {s: i for i, s in enumerate(stocks_win)}
    
    # Update embeddings for this window (fast fine-tuning) - ensure GPU usage
    num_nodes_win = len(funds_win) + len(stocks_win)
    x_win = torch.randn(num_nodes_win, 16, device=device)
    
    edge_index_win = []
    for u, v in G_bip_win.edges():
        u_idx = fund_idx_win.get(u, len(funds_win) + stock_idx_win.get(u, -1))
        v_idx = fund_idx_win.get(v, len(funds_win) + stock_idx_win.get(v, -1))
        if u_idx < num_nodes_win and v_idx < num_nodes_win:
            edge_index_win.append([u_idx, v_idx])
            edge_index_win.append([v_idx, u_idx])
    
    # Create edge index directly on GPU
    edge_index_win = torch.tensor(edge_index_win, dtype=torch.long, device=device).t().contiguous()
    
    # Fine-tune for 10 epochs on GPU
    model.train()
    optimizer_win = torch.optim.Adam(model.parameters(), lr=0.01)
    for epoch in range(10):
        optimizer_win.zero_grad()
        out = model(x_win, edge_index_win)
        pos_score = (out[edge_index_win[0]] * out[edge_index_win[1]]).sum(dim=1).sigmoid()
        loss = -torch.log(pos_score + 1e-15).mean()
        loss.backward()
        optimizer_win.step()
    
    model.eval()
    with torch.no_grad():
        emb_win = model(x_win, edge_index_win).cpu().numpy()
    
    dynamic_emb_win = emb_win[:len(funds_win)]
    stock_emb_win = emb_win[len(funds_win):]
    
    # Get test quarter data
    test_data = data[
        (data['YEAR'] == test_year) & 
        (data['QUARTER'].str.extract(r'Q(\d+)')[0].astype(int) == test_quarter)
    ]
    
    funds_test = set(test_data['CIK'].unique()) & set(funds_win)
    stocks_test = set(test_data['CUSIP'].unique()) & set(stocks_win)
    
    # Create positive edges from test data
    pos_edges = []
    for row in test_data.itertuples(index=False):
        if row.CIK in funds_test and row.CUSIP in stocks_test:
            pos_edges.append((fund_idx_win[row.CIK], stock_idx_win[row.CUSIP], 1))
    
    # Negative sampling
    stock_sim_win = cosine_similarity(stock_emb_win)
    fund_to_connected = {
        f_idx: [stock_idx_win[s] for s in G_bip_win.neighbors(fund) 
                if s in stocks_test and s in stock_idx_win]
        for f_idx, fund in enumerate(funds_win) if fund in funds_test
    }
    
    neg_edges = []
    for f_idx in range(len(funds_win)):
        if funds_win[f_idx] not in funds_test:
            continue
        connected = fund_to_connected.get(f_idx, [])
        if not connected:
            continue
        sim_scores = stock_sim_win[connected].mean(axis=0)
        hard_negs = np.argsort(-sim_scores)[len(connected):len(connected)+50]
        neg_edges.extend([
            (f_idx, neg_idx, 0) for neg_idx in hard_negs 
            if neg_idx not in connected and neg_idx < len(stocks_win) 
            and stocks_win[neg_idx] in stocks_test
        ])
    
    neg_edges = neg_edges[:len(pos_edges)]
    
    # Build features
    link_data = []
    for f_i, s_i, label in pos_edges + neg_edges:
        fund_id = funds_win[f_i]
        feat = np.concatenate([
            dynamic_emb_win[f_i],
            stock_emb_win[s_i],
            fund_features_win.loc[fund_id, ['degree', 'pagerank', 'hub', 'authority', 'closeness', 'community']].values
        ])
        link_data.append((feat, label))
    
    X_test = np.array([d[0] for d in link_data])
    y_test = np.array([d[1] for d in link_data])
    
    return X_test, y_test, len(pos_edges), len(neg_edges)

# Run sliding window evaluation
print(f"\nStarting sliding window evaluation (2020-2024) on {device}...")
print("=" * 80)

results = []
for year in range(TEST_START_YEAR, TEST_END_YEAR + 1):
    for quarter in range(1, 5):
        # Training window: all data up to previous quarter
        if quarter == 1:
            train_year, train_quarter = year - 1, 4
        else:
            train_year, train_quarter = year, quarter - 1
        
        # Skip if training period is before our data
        if train_year < INITIAL_TRAIN_START_YEAR:
            continue
        
        print(f"\nEvaluating {year} Q{quarter} (trained on data up to {train_year} Q{train_quarter})...")
        print(f"GPU Memory before evaluation: {torch.cuda.memory_allocated()/1e9:.2f}GB")
        
        try:
            X_test, y_test, n_pos, n_neg = evaluate_quarter(
                data, train_year, train_quarter, year, quarter,
                model, fund_features_train, funds_train, stocks_train, stock_emb_train
            )
            
            print(f"  Test samples: {len(y_test):,} ({n_pos:,} positive, {n_neg:,} negative)")
            
            # Train LightGBM on this window's data and evaluate
            params = {
                'objective': 'binary',
                'metric': 'auc',
                'learning_rate': 0.01,
                'num_leaves': 31,
                'verbose': -1,
                'device': 'gpu',  # Use GPU for LightGBM if available
                'gpu_platform_id': 0,
                'gpu_device_id': 0
            }
            
            # For simplicity, use initial training data for LightGBM
            train_data_lgb = lgb.Dataset(X_test[:len(X_test)//2], label=y_test[:len(y_test)//2])
            bst_win = lgb.train(params, train_data_lgb, num_boost_round=50)
            
            y_pred = bst_win.predict(X_test[len(X_test)//2:])
            y_true = y_test[len(X_test)//2:]
            
            auc = roc_auc_score(y_true, y_pred)
            precision = precision_score(y_true, (y_pred > 0.5).astype(int))
            
            results.append({
                'year': year,
                'quarter': quarter,
                'auc': auc,
                'precision': precision,
                'n_samples': len(y_test)
            })
            
            print(f"  AUC: {auc:.4f}, Precision: {precision:.4f}")
            print(f"  GPU Memory after evaluation: {torch.cuda.memory_allocated()/1e9:.2f}GB")
            
            # Clear GPU cache to prevent memory buildup
            torch.cuda.empty_cache()
            
        except Exception as e:
            print(f"  Error: {e}")
            import traceback
            traceback.print_exc()
            continue

# Summary statistics
if results:
    results_df = pd.DataFrame(results)
    print("\n" + "=" * 80)
    print("SLIDING WINDOW EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Average AUC: {results_df['auc'].mean():.4f} (±{results_df['auc'].std():.4f})")
    print(f"Average Precision: {results_df['precision'].mean():.4f} (±{results_df['precision'].std():.4f})")
    print(f"\nResults by year:")
    print(results_df.groupby('year')[['auc', 'precision']].mean())

Created 3803 positive and 357 negative test samples
Test set positive ratio: 0.914 (1=positive, 0=negative)


## 9. Prediction Function
Function to predict top stocks for a given fund.

In [None]:
def predict_portfolio(fund_id):
    """
    Efficiently predict top stocks for a given fund_id using precomputed artifacts.
    Assumes all heavy computations (embeddings, features, model) are preloaded and cached.
    """
    global bst, dynamic_emb_train, stock_emb_train, fund_features_train, funds_train, stocks_train
    if 'bst' not in globals() or 'dynamic_emb_train' not in globals() or 'stock_emb_train' not in globals():
        raise RuntimeError("Artifacts not loaded. Please run the training cells or load artifacts.")
    fund_idx = {f: i for i, f in enumerate(funds_train)}
    stock_list = stocks_train
    if fund_id not in fund_idx:
        print(f"Fund with CIK {fund_id} not found in the data.")
        return []
    f_idx = fund_idx[fund_id]
    fund_id_str = funds_train[f_idx]
    fund_emb_repeat = np.tile(dynamic_emb_train[f_idx], (len(stock_list), 1))
    fund_topo_repeat = np.tile(fund_features_train.loc[fund_id_str, ['degree', 'pagerank', 'hub', 'authority', 'closeness', 'community']].values, (len(stock_list), 1))
    feats = np.concatenate([fund_emb_repeat, stock_emb_train, fund_topo_repeat], axis=1)
    preds = bst.predict(feats)
    stock_preds = list(zip(stock_list, preds))
    top_stocks = sorted(stock_preds, key=lambda x: x[1], reverse=True)[:5]
    return top_stocks

In [None]:
# Load all artifacts (embeddings, features, model) for fast prediction
# (imports moved to the first cell)

artifacts_path = 'artifacts'

def load_artifacts():
    global stock_emb_train, dynamic_emb_train, fund_features_train, bst, funds_train, stocks_train
    if os.path.exists(artifacts_path):
        try:
            stock_emb_train = np.load(os.path.join(artifacts_path, 'stock_emb_train.npy'))
            dynamic_emb_train = np.load(os.path.join(artifacts_path, 'dynamic_emb_train.npy'))
            fund_features_train = pd.read_pickle(os.path.join(artifacts_path, 'fund_features_train.pkl'))
            bst = joblib.load(os.path.join(artifacts_path, 'lightgbm_bst.pkl'))
            with open(os.path.join(artifacts_path, 'funds_train.pkl'), 'rb') as f:
                funds_train = pickle.load(f)
            with open(os.path.join(artifacts_path, 'stocks_train.pkl'), 'rb') as f:
                stocks_train = pickle.load(f)
            print('Artifacts loaded successfully. You can now run predictions without retraining.')
        except Exception as e:
            print('Failed to load artifacts:', e)
    else:
        print('Artifacts directory not found. Please run the training cells first.')

# Load artifacts at notebook startup for fast prediction
load_artifacts()

Artifacts loaded successfully. You can now run predictions without retraining.


## 10. Predict for a Specific Fund
Example: Predict for a random out-of-sample fund.

> **Note:**
>
> - To make an unbiased next-quarter prediction, you must use only funds that were seen in training (Q1-Q2) and predict their holdings in Q4.
> - Predicting for funds that were not seen in training is not possible (no embeddings/features for them).
> - Predicting for funds using their Q4 data in training or feature construction would cause data leakage and bias.
> - The code below ensures you only predict for eligible funds, with no leakage.


## 10.1. List Eligible Funds for Next-Quarter Prediction
Print all funds that were seen in training (Q1-Q2) and also appear in Q4. These are the only funds for which you can make an unbiased next-quarter prediction (no data leakage).

In [None]:
# List funds eligible for next-quarter (Q4) prediction: must be seen in train (Q1-Q2) and appear in Q4
if 'funds_train' in globals() and 'funds_q4' in globals():
    eligible_funds = sorted(list(set(funds_train) & set(funds_q4)))
    print(f"Number of eligible funds for Q4 prediction: {len(eligible_funds)}")
    print("Sample of eligible funds (first 20):")
    print(eligible_funds[:20])
    # You can select any fund from this list for next-quarter prediction (no leakage, no bias)
else:
    print('Required fund lists not found. Please ensure all previous cells have been run.')

Number of eligible funds for Q4 prediction: 9
Sample of eligible funds (first 20):
['1021249', '1325091', '1345929', '1424116', '1535839', '1576102', '1623678', '1694461', '1697457']


## 10.1. Random/Specific Fund Next-Quarter Prediction
To choode specific FUND from the list, update the var *fund_id_to_predict*

In [None]:
# Example: Predict for a specific or random eligible fund (seen in train, predict Q4)
fund_id_to_predict = '1535839'  # Set to a specific CIK string to choose a fund, or leave as None for random
fund_list_for_prediction = eligible_funds if 'eligible_funds' in globals() and len(eligible_funds) > 0 else []
if len(fund_list_for_prediction) > 0:
    if fund_id_to_predict is not None and str(fund_id_to_predict) in fund_list_for_prediction:
        selected_fund = str(fund_id_to_predict)
        print(f'Selected eligible fund for Q4 prediction: {selected_fund}')
    else:
        selected_fund = random.choice(fund_list_for_prediction)
        print(f'Randomly selected eligible fund for Q4 prediction: {selected_fund}')
    top_stocks = predict_portfolio(selected_fund)
    print('Top recommended stocks for Q4:')
    for stock, score in top_stocks:
        print(f'  {stock}: {score:.4f}')
else:
    print('No eligible funds available for prediction. Please check your data and artifacts.')

Selected eligible fund for Q4 prediction: 1535839
Top recommended stocks for Q4:
  880349105: 0.9561
  67555N206: 0.9561
  67420T206: 0.9561
  85570W100: 0.9561
  983134107: 0.9561


## 7. Final Model Training (2015-2024)
Train final model on all available data (2015-2024) for production use.

In [None]:
# Train final model on all 2015-2024 data for production deployment
print(f"\nTraining final production model on complete dataset (2015-2024) using {device}...")

# Build complete graph
G_bip_final, G_fund_final, fund_features_final, df_final, funds_final, stocks_final = \
    build_graph_and_features_up_to(data, TEST_END_YEAR, 4)

fund_idx_final = {f: i for i, f in enumerate(funds_final)}
stock_idx_final = {s: i for i, s in enumerate(stocks_final)}

print(f"Final training graph statistics:")
print(f"  Bipartite edges: {G_bip_final.number_of_edges():,}")
print(f"  Fund-fund edges: {G_fund_final.number_of_edges():,}")
print(f"  Number of funds: {len(funds_final):,}")
print(f"  Number of stocks: {len(stocks_final):,}")
print(f"  Date range: {df_final['PERIOD_DATE'].min()} to {df_final['PERIOD_DATE'].max()}")

# Node features initialization - ensure GPU usage
num_nodes_final = len(funds_final) + len(stocks_final)
x_final = torch.randn(num_nodes_final, 16, device=device)
print(f"Node features initialized on {device}")

# Build homogeneous edge index for final training
edge_index_homo_final = []
for u, v in G_bip_final.edges():
    u_idx = fund_idx_final.get(u, len(funds_final) + stock_idx_final.get(u, -1))
    v_idx = fund_idx_final.get(v, len(funds_final) + stock_idx_final.get(v, -1))
    if u_idx < num_nodes_final and v_idx < num_nodes_final:
        edge_index_homo_final.append([u_idx, v_idx])
        edge_index_homo_final.append([v_idx, u_idx])

# Create edge index directly on GPU
edge_index_homo_final = torch.tensor(edge_index_homo_final, dtype=torch.long, device=device).t().contiguous()
print(f"Edge index initialized on {device} with {edge_index_homo_final.shape[1]:,} edges")

# Define GraphSAGE model
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

# Initialize model on GPU
model_final = GraphSAGE(16, 32, 8).to(device)
print(f"Model initialized on {device}")
print(f"Model parameters on GPU: {next(model_final.parameters()).is_cuda}")

optimizer_final = torch.optim.Adam(model_final.parameters(), lr=0.01)

# Train GraphSAGE with early stopping
model_final.train()
prev_loss = float('inf')
patience = 5
no_improve = 0

print(f"\nStarting final GraphSAGE training on {device}...")
for epoch in range(50):
    optimizer_final.zero_grad()
    out = model_final(x_final, edge_index_homo_final)
    pos_score = (out[edge_index_homo_final[0]] * out[edge_index_homo_final[1]]).sum(dim=1).sigmoid()
    loss = -torch.log(pos_score + 1e-15).mean()
    loss.backward()
    optimizer_final.step()
    
    if abs(prev_loss - loss.item()) < 1e-6:
        no_improve += 1
        if no_improve >= patience:
            print(f"Early stopping at epoch {epoch}")
            break
    else:
        no_improve = 0
    prev_loss = loss.item()
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}, GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f}GB")

# Extract final embeddings
model_final.eval()
with torch.no_grad():
    emb_final = model_final(x_final, edge_index_homo_final).cpu().numpy()

dynamic_emb_final = emb_final[:len(funds_final)]
stock_emb_final = emb_final[len(funds_final):]

print(f"\nFinal model training completed on {device}.")
print(f"Final model covers {len(funds_final):,} funds and {len(stocks_final):,} stocks")
print(f"Final GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f}GB")

# Clear GPU cache
torch.cuda.empty_cache()