# Stock Market Social Network Pipeline

---

## 1. Import Libraries and Setup

In [5]:
# Core libraries
import pandas as pd
import numpy as np
import os
import random
from datetime import datetime
import warnings
import zipfile
import shutil

# Graph libraries
import networkx as nx
from networkx.algorithms import bipartite
from networkx.algorithms.centrality import degree_centrality, closeness_centrality
from networkx.algorithms.link_analysis.pagerank_alg import pagerank
from networkx.algorithms.link_analysis.hits_alg import hits
import igraph as ig
import leidenalg as la

# Machine learning libraries
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.metrics import roc_auc_score, precision_score
from sklearn.metrics.pairwise import cosine_similarity
import lightgbm as lgb
import joblib
import pickle

# Deep learning libraries
import torch
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv

warnings.filterwarnings('ignore')

# GPU Setup with compatibility checks
def check_cuda_compatibility():
    if not torch.cuda.is_available():
        return False, "CUDA not available"
    
    try:
        # Test basic CUDA operation
        test_tensor = torch.zeros(1).cuda()
        test_tensor = test_tensor + 1
        return True, "CUDA compatible"
    except Exception as e:
        return False, f"CUDA compatibility issue: {str(e)}"

cuda_compatible, cuda_message = check_cuda_compatibility()
print(f"CUDA Status: {cuda_message}")

if cuda_compatible:
    device = torch.device('cuda')
    print(f'Using device: {device}')
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    print(f'CUDA Version: {torch.version.cuda}')
    print(f'PyTorch Version: {torch.__version__}')
else:
    device = torch.device('cpu')
    print(f'Using device: {device} (CPU fallback)')
    print('Note: For GPU support, ensure PyTorch is installed with CUDA support matching your GPU')
    print('Install with: pip install torch --index-url https://download.pytorch.org/whl/cu118')

CUDA Status: CUDA compatibility issue: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Using device: cpu (CPU fallback)
Note: For GPU support, ensure PyTorch is installed with CUDA support matching your GPU
Install with: pip install torch --index-url https://download.pytorch.org/whl/cu118


## 1.1. Extract Parquet Files from Zip Archives
Extract all zip files in the parquet_files directory and remove the zip files after extraction.

In [4]:
# Extract zip files from parquet_files directory
persoanl_dir = os.path.expanduser('~')
parquet_dir = os.path.join(persoanl_dir, 'Social-Network-Stock-Market/Social Network/parquet_files')
if os.path.exists(parquet_dir):
    zip_files = [f for f in os.listdir(parquet_dir) if f.endswith('.zip')]
    
    if zip_files:
        print(f'Found {len(zip_files)} zip file(s) in {parquet_dir}')
        
        for zip_file in zip_files:
            zip_path = os.path.join(parquet_dir, zip_file)
            print(f'Extracting {zip_file}...')
            
            try:
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(parquet_dir)
                print(f'  ✓ Extracted {zip_file}')
                
                # Delete the zip file after extraction
                os.remove(zip_path)
                print(f'  ✓ Deleted {zip_file}')
                
            except Exception as e:
                print(f'  ✗ Error processing {zip_file}: {e}')
        
        print('All zip files processed and cleaned up.')
    else:
        print(f'No zip files found in {parquet_dir}')
else:
    print(f'Directory not found: {parquet_dir}')
    print('Creating directory...')
    os.makedirs(parquet_dir, exist_ok=True)

Found 1 zip file(s) in /home/zenoua/Social-Network-Stock-Market/Social Network/parquuet_files
Extracting parquet_files.zip...
  ✓ Extracted parquet_files.zip
  ✓ Deleted parquet_files.zip
All zip files processed and cleaned up.


## 2. Data Loading and Cleaning
Load and preprocess the raw fund-stock holding data.

##### Creating new parquet files

In [12]:
# Load and clean the data

root = os.path.join(persoanl_dir, 'Social-Network-Stock-Market/Social Network/parquuet_files')
print(f"{root}/ticker_to_cusip.parquet")
ticker_map = pd.read_parquet(f"{root}/ticker_to_cusip.parquet")
prices = pd.read_parquet(f"{root}/ticker_prices.parquet")
output_dir = f"{root}/generated_combined_parquet"
os.makedirs(output_dir, exist_ok=True)

/home/zenoua/Social-Network-Stock-Market/Social Network/parquuet_files/ticker_to_cusip.parquet


In [14]:
import glob
import pandas as pd

# Check the structure of your files
print("=" * 80)
print("INSPECTING INPUT FILES")
print("=" * 80)

# 1. Check ticker_map structure
print("\n1. TICKER_MAP (ticker_to_cusip.parquet):")
print("-" * 80)
try:
    ticker_map_sample = pd.read_parquet(f"{root}/ticker_to_cusip.parquet")
    print(f"Shape: {ticker_map_sample.shape}")
    print(f"Columns: {list(ticker_map_sample.columns)}")
    print("\nFirst 5 rows:")
    print(ticker_map_sample.head())
    print(f"\nData types:\n{ticker_map_sample.dtypes}")
except Exception as e:
    print(f"Error reading ticker_map: {e}")

# 2. Check prices structure
print("\n\n2. PRICES (ticker_prices.parquet):")
print("-" * 80)
try:
    prices_sample = pd.read_parquet(f"{root}/ticker_prices.parquet")
    print(f"Shape: {prices_sample.shape}")
    print(f"Columns: {list(prices_sample.columns)}")
    print("\nFirst 5 rows:")
    print(prices_sample.head())
    print(f"\nData types:\n{prices_sample.dtypes}")
except Exception as e:
    print(f"Error reading prices: {e}")

# 3. Check first holdings file structure
print("\n\n3. SAMPLE HOLDINGS FILE:")
print("-" * 80)
holdings_pattern = os.path.join(root, "holdings_filtered_new_period_start_*.parquet")
holdings_files = sorted(glob.glob(holdings_pattern))

if holdings_files:
    sample_file = holdings_files[0]
    print(f"File: {os.path.basename(sample_file)}")
    try:
        holdings_sample = pd.read_parquet(sample_file)
        print(f"Shape: {holdings_sample.shape}")
        print(f"Columns: {list(holdings_sample.columns)}")
        print("\nFirst 5 rows:")
        print(holdings_sample.head())
        print(f"\nData types:\n{holdings_sample.dtypes}")
    except Exception as e:
        print(f"Error reading holdings: {e}")
else:
    print("No holdings files found!")

print("\n" + "=" * 80)
print("INSPECTION COMPLETE")
print("=" * 80)

INSPECTING INPUT FILES

1. TICKER_MAP (ticker_to_cusip.parquet):
--------------------------------------------------------------------------------
Shape: (4571, 5)
Columns: ['name', 'cusip', 'ticker', 'trading_start_date', 'trading_end_date']

First 5 rows:
              name      cusip ticker trading_start_date trading_end_date
0  A. Schulman Inc  808194104   SHLM         1973-01-09       2018-08-20
1   A.H. Belo Corp  001282102    AHC               None             None
2  A.O. Smith Corp  831865209    AOS         1983-09-30       2026-01-06
3         AAON Inc  000360206   AAON         1991-01-03       2026-01-06
4         AAR Corp  000361105    AIR         1980-03-17       2026-01-06

Data types:
name                  object
cusip                 object
ticker                object
trading_start_date    object
trading_end_date      object
dtype: object


2. PRICES (ticker_prices.parquet):
--------------------------------------------------------------------------------
Shape: (158011,

##### Iterating on all years

In [None]:
import glob
import re
from datetime import datetime

# Get all holdings parquet files sorted by date
ticker_map["cusip"] = ticker_map["cusip"].astype(str)
prices["period_start"] = pd.to_datetime(prices["period_start"])
holdings_pattern = os.path.join(root, "holdings_filtered_new_period_start_*.parquet")
holdings_files = sorted(glob.glob(holdings_pattern))

print(f"Found {len(holdings_files)} holdings files")

# Extract year and quarter from each file
def extract_date_info(filepath):
    """Extract period_start date from filename"""
    match = re.search(r'period_start_(\d{4}-\d{2}-\d{2})', filepath)
    if match:
        date_str = match.group(1)
        date_obj = pd.to_datetime(date_str)
        year = date_obj.year
        quarter = (date_obj.month - 1) // 3 + 1
        return date_obj, year, quarter
    return None, None, None

# Process all files
all_dfs = []

for file_path in holdings_files:
    period_date, year, quarter = extract_date_info(file_path)
    
    if period_date is None:
        print(f"Skipping file with unrecognized format: {file_path}")
        continue
    
    print(f"Processing {year} Q{quarter} (period_start: {period_date.date()})...")
    
    try:
        # Read holdings data
        df = pd.read_parquet(file_path)
        
        # Rename to required format
        df = df.rename(columns={
            "cik": "CIK",
            "cusip": "CUSIP",
            "sshprnamt": "SSHPRNAMT",
            "period_start": "PERIOD_DATE"
        })
        
        # Type normalization
        df["CIK"] = df["CIK"].astype(str)
        df["CUSIP"] = df["CUSIP"].astype(str)
        df["PERIOD_DATE"] = pd.to_datetime(df["PERIOD_DATE"])
        
        # Join: CUSIP → ticker
        df = df.merge(
            ticker_map[["cusip", "ticker"]],
            left_on="CUSIP",
            right_on="cusip",
            how="left"
        ).drop(columns=["cusip"])
        
        # Join: ticker + period → price
        df = df.merge(
            prices[["ticker", "period_start", "price"]],
            left_on=["ticker", "PERIOD_DATE"],
            right_on=["ticker", "period_start"],
            how="left"
        ).drop(columns=["period_start"])
        
        # Compute VALUE
        df["VALUE"] = df["SSHPRNAMT"] * df["price"]
        
        # Add metadata columns
        df["YEAR"] = year
        df["QUARTER"] = f"Q{quarter}_{year}"
        
        # Final column order
        df = df[["CIK", "CUSIP", "VALUE", "SSHPRNAMT", "PERIOD_DATE", "YEAR", "QUARTER"]]
        
        # Save individual parquet file
        out_path = os.path.join(
            output_dir,
            f"holdings_processed_Q{quarter}_{year}.parquet"
        )
        df.to_parquet(out_path, index=False)
        # print(df.head(10))
        print(f"  ✓ Saved {len(df)} records to {out_path}")
        
        # Add to list for combined file
        all_dfs.append(df)
        
    except Exception as e:
        print(f"  ✗ Error processing {file_path}: {e}")
        continue



Found 49 holdings files
Processing 2013 Q2 (period_start: 2013-04-01)...
          CIK      CUSIP         VALUE  SSHPRNAMT PERIOD_DATE  YEAR  QUARTER
0  0001011659  000360206  6.487250e+02       70.0  2013-04-01  2013  Q2_2013
1  0001564702  000360206  1.723570e+05    18598.0  2013-04-01  2013  Q2_2013
2  0001374170  000360206  1.611943e+06   173935.0  2013-04-01  2013  Q2_2013
3  0001121914  000360206  7.898690e+04     8523.0  2013-04-01  2013  Q2_2013
4  0000049205  000360206  3.228797e+04     3484.0  2013-04-01  2013  Q2_2013
5  0001163744  000360206  4.649041e+06   501650.0  2013-04-01  2013  Q2_2013
6  0001374170  000361105  7.380215e+06   347167.0  2013-04-01  2013  Q2_2013
7  0001593051  000361105  2.072694e+06    97500.0  2013-04-01  2013  Q2_2013
8  0001011659  00081T108  3.430474e+03      752.0  2013-04-01  2013  Q2_2013
9  0001564702  00081T108  5.013874e+04    10991.0  2013-04-01  2013  Q2_2013
  ✓ Saved 19659 records to /home/zenoua/Social-Network-Stock-Market/Social Netwo

In [None]:

dfs = []
for i, file in enumerate(file_paths):
    df = pd.read_csv(file)
    df['QUARTER'] = f'Q{i+1}_2018'
    dfs.append(df)
data = pd.concat(dfs, ignore_index=True)
data = data[['CIK', 'CUSIP', 'VALUE', 'SSHPRNAMT', 'PERIOD_DATE', 'QUARTER']]
data['PERIOD_DATE'] = pd.to_datetime(data['PERIOD_DATE'])
data = data.dropna(subset=['CIK', 'CUSIP', 'VALUE'])
data['CIK'] = data['CIK'].astype(str)
data['CUSIP'] = data['CUSIP'].astype(str)
data = data.sort_values(by='PERIOD_DATE')
print(f'Total records after cleaning: {len(data)}')

Total records after cleaning: 3996


## 3. Graph Construction
Build bipartite and projected graphs for funds and stocks.

In [9]:
def build_graph_and_features_up_to(max_date):
    df_up_to = data[data['PERIOD_DATE'] <= max_date].copy()
    
    funds_up_to = df_up_to['CIK'].unique()
    stocks_up_to = df_up_to['CUSIP'].unique()
    
    G_bip = nx.Graph()
    G_bip.add_nodes_from(funds_up_to, bipartite=0)
    G_bip.add_nodes_from(stocks_up_to, bipartite=1)
    
    edges = [(row.CIK, row.CUSIP, 
            {'value': row.VALUE, 'amount': row.SSHPRNAMT, 'time': row.PERIOD_DATE})
            for row in df_up_to.itertuples(index=False)]
    G_bip.add_edges_from(edges)
        
    # Fund-Fund projection with weights (weighted by shared stocks)
    G_fund = bipartite.weighted_projected_graph(G_bip, funds_up_to)
    
    # Convert to directed based on time, only for existing edges
    G_fund_directed = nx.DiGraph()
    for u, v, data_dict in G_fund.edges(data=True):
        shared = set(G_bip.neighbors(u)) & set(G_bip.neighbors(v))
        if not shared:
            continue
        
        times_u = [G_bip.edges[u,s]['time'] for s in shared]
        times_v = [G_bip.edges[v,s]['time'] for s in shared]
        avg_u = np.mean([t.timestamp() for t in times_u])
        avg_v = np.mean([t.timestamp() for t in times_v])
        
        weight = data_dict.get('weight', 1)
        
        if avg_u < avg_v:
            G_fund_directed.add_edge(u, v, weight=weight)
        else:
            G_fund_directed.add_edge(v, u, weight=weight)
    
    G_fund = G_fund_directed  # Replace with directed version
    
    # Topological features (על G_fund המכוון)
    degree_cent = degree_centrality(G_fund)
    pagerank_cent = nx.pagerank(G_fund)
    hubs, authorities = hits(G_fund)
    largest_cc = max(nx.connected_components(G_fund.to_undirected()), key=len)
    closeness_cent = closeness_centrality(G_fund.to_undirected().subgraph(largest_cc))
    
    fund_features = pd.DataFrame({
        'fund': list(G_fund.nodes()),
        'degree': [degree_cent.get(n, 0) for n in G_fund.nodes()],
        'pagerank': [pagerank_cent.get(n, 0) for n in G_fund.nodes()],
        'hub': [hubs.get(n, 0) for n in G_fund.nodes()],
        'authority': [authorities.get(n, 0) for n in G_fund.nodes()],
        'closeness': [closeness_cent.get(n, 0) for n in G_fund.nodes()]
    }).set_index('fund')
    
    # Community (Leiden)
    vertex_names = list(G_fund.nodes())
    vertex_to_idx = {v: i for i, v in enumerate(vertex_names)}

    # המר edges לאינדקסים
    edge_list = [(vertex_to_idx[u], vertex_to_idx[v]) 
                for u, v in G_fund.to_undirected().edges()]

    ig_G = ig.Graph(n=len(vertex_names), edges=edge_list)
    ig_G.vs['_nx_name'] = vertex_names
    partition = la.find_partition(ig_G, la.ModularityVertexPartition)
    communities = {ig_G.vs[i]['_nx_name']: p for p, cl in enumerate(partition) for i in cl}
    fund_features['community'] = fund_features.index.map(communities).fillna(-1)
    
    return G_bip, G_fund, fund_features, df_up_to, funds_up_to, stocks_up_to

In [10]:
# Build full graph and features for all data (for prediction on any fund)
full_max_date = data['PERIOD_DATE'].max()
G_full, G_fund_full, fund_features_full, df_full, funds_full, stocks_full = build_graph_and_features_up_to(full_max_date)
fund_idx_full = {f: i for i, f in enumerate(funds_full)}
stock_idx_full = {s: i for i, s in enumerate(stocks_full)}
# For Q4-only funds (unbiased prediction)
q4_min_date = data[data['QUARTER'] == 'Q4_2018']['PERIOD_DATE'].min()
q4_max_date = data[data['QUARTER'] == 'Q4_2018']['PERIOD_DATE'].max()
funds_q4 = set(data[(data['PERIOD_DATE'] >= q4_min_date) & (data['PERIOD_DATE'] <= q4_max_date)]['CIK'].unique())

## 4. Training Phase (Up to Q3)
Split the data temporally and prepare for model training.

### 

In [11]:
# Use only up to Q2 for training (temporal split)
train_max_date = pd.to_datetime('2018-06-30')  # End of Q2
G_bip_train, G_fund_train, fund_features_train, df_train, funds_train, stocks_train = build_graph_and_features_up_to(train_max_date)
fund_idx_train = {f: i for i, f in enumerate(funds_train)}
stock_idx_train = {s: i for i, s in enumerate(stocks_train)}
print(f"Bipartite Graph: {G_bip_train.number_of_edges()} edges")
print(f"Fund-Fund Graph: {G_fund_train.number_of_edges()} edges")

Bipartite Graph: 2850 edges
Fund-Fund Graph: 36 edges


## 5. GraphSAGE Embedding Training
Train GraphSAGE on the training graph to generate node embeddings.

In [12]:
# Node features - use simple random vectors (can be improved)
num_nodes = len(funds_train) + len(stocks_train)

# Try to move to GPU, fallback to CPU if error
try:
    x = torch.randn(num_nodes, 16).to(device)
    print(f"Node features initialized on {device}")
except Exception as e:
    print(f"Error moving to GPU, using CPU: {e}")
    device = torch.device('cpu')
    x = torch.randn(num_nodes, 16).to(device)

# Map indices for homogeneous graph (train only)
edge_index_homo = []
for u, v in G_bip_train.edges():
    u_idx = fund_idx_train[u] if u in fund_idx_train else stock_idx_train.get(u, -1)
    v_idx = fund_idx_train[v] if v in fund_idx_train else stock_idx_train.get(v, -1)
    if u_idx != -1 and v_idx != -1:
        edge_index_homo.append([u_idx, v_idx])
        edge_index_homo.append([v_idx, u_idx])  # undirected

try:
    edge_index_homo = torch.tensor(edge_index_homo, dtype=torch.long).t().contiguous().to(device)
    print(f"Edge index initialized on {device}")
except Exception as e:
    print(f"Error moving edge index to GPU, using CPU: {e}")
    device = torch.device('cpu')
    x = x.cpu()
    edge_index_homo = torch.tensor(edge_index_homo, dtype=torch.long).t().contiguous().to(device)

# Define GraphSAGE model
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

try:
    model = GraphSAGE(16, 32, 8).to(device)
    print(f"Model initialized on {device}")
except Exception as e:
    print(f"Error moving model to GPU, using CPU: {e}")
    device = torch.device('cpu')
    x = x.cpu()
    edge_index_homo = edge_index_homo.cpu()
    model = GraphSAGE(16, 32, 8).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train GraphSAGE with early stopping
model.train()
prev_loss = float('inf')
patience = 5
no_improve = 0

print(f"Starting training on {device}...")
for epoch in range(50):
    try:
        optimizer.zero_grad()
        out = model(x, edge_index_homo)
        # Loss on existing edges (link prediction style)
        pos_score = (out[edge_index_homo[0]] * out[edge_index_homo[1]]).sum(dim=1).sigmoid()
        loss = -torch.log(pos_score + 1e-15).mean()  # add epsilon to avoid log(0)
        loss.backward()
        optimizer.step()
        
        # Early stopping logic
        if abs(prev_loss - loss.item()) < 1e-6:
            no_improve += 1
            if no_improve >= patience:
                print(f"Early stopping at epoch {epoch}")
                break
        else:
            no_improve = 0
        prev_loss = loss.item()
        
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
    except RuntimeError as e:
        if 'CUDA' in str(e):
            print(f"CUDA error during training, switching to CPU: {e}")
            device = torch.device('cpu')
            x = x.cpu()
            edge_index_homo = edge_index_homo.cpu()
            model = model.cpu()
            optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
            print("Restarting training on CPU...")
            epoch = 0
            prev_loss = float('inf')
            no_improve = 0
            continue
        else:
            raise

# Extract embeddings and move to CPU for further processing
with torch.no_grad():
    emb = model(x, edge_index_homo).cpu().numpy()

# Split embeddings for funds and stocks
dynamic_emb_train = emb[:len(funds_train)]      # shape: [len(funds_train), 8]
stock_emb_train = emb[len(funds_train):]        # shape: [len(stocks_train), 8]

print(f"GraphSAGE training completed on {device}.")
print(f"dynamic_emb_train shape: {dynamic_emb_train.shape}")
print(f"stock_emb_train shape: {stock_emb_train.shape}")

Node features initialized on cpu
Edge index initialized on cpu
Model initialized on cpu
Starting training on cpu...


Epoch 0, Loss: 0.3611
Epoch 10, Loss: 0.0000
Early stopping at epoch 11
GraphSAGE training completed on cpu.
dynamic_emb_train shape: (9, 8)
stock_emb_train shape: (1891, 8)


## 6. Save Training Artifacts
Save embeddings, features, and model for later use.

In [13]:
# ── Cache trained artifacts (run ONCE) ───────────────────────────────────────
GRAPH_READY = True
_cached_graph = G_bip_train
_cached_embeddings = stock_emb_train
_cached_model = model.cpu()  # Move model to CPU for caching


In [14]:
# Save embeddings, model, and graph to files for later use
import joblib
import pickle
import os

# Create a directory for artifacts if it doesn't exist
os.makedirs('artifacts', exist_ok=True)

# Save stock embeddings
np.save('artifacts/stock_emb_train.npy', stock_emb_train)

# Save dynamic fund embeddings
np.save('artifacts/dynamic_emb_train.npy', dynamic_emb_train)

# Save fund features
fund_features_train.to_pickle('artifacts/fund_features_train.pkl')

# Save LightGBM model (bst) if it exists
if 'bst' in globals():
    joblib.dump(bst, 'artifacts/lightgbm_bst.pkl')

# Save funds and stocks lists
with open('artifacts/funds_train.pkl', 'wb') as f:
    pickle.dump(funds_train, f)
with open('artifacts/stocks_train.pkl', 'wb') as f:
    pickle.dump(stocks_train, f)

print('Artifacts saved to artifacts/ directory.')

Artifacts saved to artifacts/ directory.


## 7. Test Phase (Q4)
Prepare test data and features for evaluation.

In [15]:
# Test Phase (Q3) - Using only funds and stocks seen in train (Q1-Q2)
test_min_date = data[data['QUARTER'] == 'Q3_2018']['PERIOD_DATE'].min()
test_max_date = data[data['QUARTER'] == 'Q3_2018']['PERIOD_DATE'].max()
test_data = data[(data['PERIOD_DATE'] >= test_min_date) & (data['PERIOD_DATE'] <= test_max_date)]
# Only use funds and stocks that were seen in train
funds_test = set(test_data['CIK'].unique()) & set(funds_train)
stocks_test = set(test_data['CUSIP'].unique()) & set(stocks_train)
# positive edges: רק Q3, אבל רק עבור funds/stocks שמופיעים ב-train
pos_edges_test = []
for row in test_data.itertuples(index=False):
    cik = row.CIK
    cusip = row.CUSIP
    if cik in funds_test and cusip in stocks_test:
        pos_edges_test.append((fund_idx_train[cik], stock_idx_train[cusip], 1))
# negative sampling: רק על embeddings של train (שכבר חושבו ב-Cell 4!)
stock_sim_train = cosine_similarity(stock_emb_train)
fund_to_connected = {f_idx: [stock_idx_train[s] for s in G_bip_train.neighbors(fund) if s in stocks_test and s in stock_idx_train]
                     for f_idx, fund in enumerate(funds_train) if fund in funds_test}
neg_edges_test = []
for f_idx in range(len(funds_train)):
    if funds_train[f_idx] not in funds_test:
        continue
    connected = fund_to_connected.get(f_idx, [])
    if not connected:
        continue
    sim_scores = stock_sim_train[connected].mean(axis=0)
    hard_negs = np.argsort(-sim_scores)[len(connected):len(connected)+50]
    neg_edges_test.extend([(f_idx, neg_idx, 0) for neg_idx in hard_negs if neg_idx not in connected and neg_idx < len(stocks_train) and stocks_train[neg_idx] in stocks_test])
neg_edges_test = neg_edges_test[:len(pos_edges_test)]
# Link features
link_data = []
for f_i, s_i, label in pos_edges_test + neg_edges_test:
    fund_id = funds_train[f_i]
    feat = np.concatenate([
        dynamic_emb_train[f_i],
        stock_emb_train[s_i],
        fund_features_train.loc[fund_id, ['degree', 'pagerank', 'hub', 'authority', 'closeness', 'community']].values
    ])
    link_data.append((feat, label))
X = np.array([d[0] for d in link_data])
y = np.array([d[1] for d in link_data])
print(f"Created {len(pos_edges_test)} positive and {len(neg_edges_test)} negative test samples")
print(f"Test set positive ratio: {np.mean(y):.3f} (1=positive, 0=negative)")

Created 3803 positive and 357 negative test samples
Test set positive ratio: 0.914 (1=positive, 0=negative)


## 8. LightGBM Training & Evaluation
Train and evaluate the LightGBM model.

In [16]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import precision_score
import lightgbm as lgb
import numpy as np
import joblib
import os

# LightGBM parameters (define before use)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'verbose': -1
}

# Train/test/final model only ONCE!
if 'bst' not in globals():
    tscv = TimeSeriesSplit(n_splits=5)  # 5 folds, expanding train set each time
    cv_aucs = cross_val_score(lgb.LGBMClassifier(**params), X, y, cv=tscv, scoring='roc_auc')
    print(f"CV AUC mean: {cv_aucs.mean():.4f} (±{cv_aucs.std():.4f})")
    precisions = []
    for train_idx, test_idx in tscv.split(X):
        X_train_cv, X_test_cv = X[train_idx], X[test_idx]
        y_train_cv, y_test_cv = y[train_idx], y[test_idx]
        train_data_cv = lgb.Dataset(X_train_cv, label=y_train_cv)
        test_data_cv = lgb.Dataset(X_test_cv, label=y_test_cv, reference=train_data_cv)
        bst_cv = lgb.train(params, train_data_cv, num_boost_round=100, valid_sets=[test_data_cv])
        y_pred_cv = bst_cv.predict(X_test_cv)
        precision_cv = precision_score(y_test_cv, (y_pred_cv > 0.5).astype(int))
        precisions.append(precision_cv)
    print(f"CV Precision mean: {np.mean(precisions):.4f} (±{np.std(precisions):.4f})")
    train_data = lgb.Dataset(X, label=y)
    bst = lgb.train(params, train_data, num_boost_round=100)
    # --- Save model to artifacts folder ---
    os.makedirs('artifacts', exist_ok=True)
    joblib.dump(bst, os.path.join('artifacts', 'lightgbm_bst.pkl'))
    print('Final LightGBM model trained and saved to artifacts/lightgbm_bst.pkl.')
else:
    print('LightGBM model already trained and loaded in memory. Skipping retraining.')

CV AUC mean: nan (±nan)
CV Precision mean: 0.8970 (±0.2061)
Final LightGBM model trained and saved to artifacts/lightgbm_bst.pkl.


## 9. Prediction Function
Function to predict top stocks for a given fund.

In [17]:
def predict_portfolio(fund_id):
    """
    Efficiently predict top stocks for a given fund_id using precomputed artifacts.
    Assumes all heavy computations (embeddings, features, model) are preloaded and cached.
    """
    global bst, dynamic_emb_train, stock_emb_train, fund_features_train, funds_train, stocks_train
    if 'bst' not in globals() or 'dynamic_emb_train' not in globals() or 'stock_emb_train' not in globals():
        raise RuntimeError("Artifacts not loaded. Please run the training cells or load artifacts.")
    fund_idx = {f: i for i, f in enumerate(funds_train)}
    stock_list = stocks_train
    if fund_id not in fund_idx:
        print(f"Fund with CIK {fund_id} not found in the data.")
        return []
    f_idx = fund_idx[fund_id]
    fund_id_str = funds_train[f_idx]
    fund_emb_repeat = np.tile(dynamic_emb_train[f_idx], (len(stock_list), 1))
    fund_topo_repeat = np.tile(fund_features_train.loc[fund_id_str, ['degree', 'pagerank', 'hub', 'authority', 'closeness', 'community']].values, (len(stock_list), 1))
    feats = np.concatenate([fund_emb_repeat, stock_emb_train, fund_topo_repeat], axis=1)
    preds = bst.predict(feats)
    stock_preds = list(zip(stock_list, preds))
    top_stocks = sorted(stock_preds, key=lambda x: x[1], reverse=True)[:5]
    return top_stocks

In [18]:
# Load all artifacts (embeddings, features, model) for fast prediction
# (imports moved to the first cell)

artifacts_path = 'artifacts'

def load_artifacts():
    global stock_emb_train, dynamic_emb_train, fund_features_train, bst, funds_train, stocks_train
    if os.path.exists(artifacts_path):
        try:
            stock_emb_train = np.load(os.path.join(artifacts_path, 'stock_emb_train.npy'))
            dynamic_emb_train = np.load(os.path.join(artifacts_path, 'dynamic_emb_train.npy'))
            fund_features_train = pd.read_pickle(os.path.join(artifacts_path, 'fund_features_train.pkl'))
            bst = joblib.load(os.path.join(artifacts_path, 'lightgbm_bst.pkl'))
            with open(os.path.join(artifacts_path, 'funds_train.pkl'), 'rb') as f:
                funds_train = pickle.load(f)
            with open(os.path.join(artifacts_path, 'stocks_train.pkl'), 'rb') as f:
                stocks_train = pickle.load(f)
            print('Artifacts loaded successfully. You can now run predictions without retraining.')
        except Exception as e:
            print('Failed to load artifacts:', e)
    else:
        print('Artifacts directory not found. Please run the training cells first.')

# Load artifacts at notebook startup for fast prediction
load_artifacts()

Artifacts loaded successfully. You can now run predictions without retraining.


## 10. Predict for a Specific Fund
Example: Predict for a random out-of-sample fund.

> **Note:**
>
> - To make an unbiased next-quarter prediction, you must use only funds that were seen in training (Q1-Q2) and predict their holdings in Q4.
> - Predicting for funds that were not seen in training is not possible (no embeddings/features for them).
> - Predicting for funds using their Q4 data in training or feature construction would cause data leakage and bias.
> - The code below ensures you only predict for eligible funds, with no leakage.


## 10.1. List Eligible Funds for Next-Quarter Prediction
Print all funds that were seen in training (Q1-Q2) and also appear in Q4. These are the only funds for which you can make an unbiased next-quarter prediction (no data leakage).

In [19]:
# List funds eligible for next-quarter (Q4) prediction: must be seen in train (Q1-Q2) and appear in Q4
if 'funds_train' in globals() and 'funds_q4' in globals():
    eligible_funds = sorted(list(set(funds_train) & set(funds_q4)))
    print(f"Number of eligible funds for Q4 prediction: {len(eligible_funds)}")
    print("Sample of eligible funds (first 20):")
    print(eligible_funds[:20])
    # You can select any fund from this list for next-quarter prediction (no leakage, no bias)
else:
    print('Required fund lists not found. Please ensure all previous cells have been run.')

Number of eligible funds for Q4 prediction: 9
Sample of eligible funds (first 20):
['1021249', '1325091', '1345929', '1424116', '1535839', '1576102', '1623678', '1694461', '1697457']


## 10.1. Random/Specific Fund Next-Quarter Prediction
To choode specific FUND from the list, update the var *fund_id_to_predict*

In [20]:
# Example: Predict for a specific or random eligible fund (seen in train, predict Q4)
fund_id_to_predict = '1535839'  # Set to a specific CIK string to choose a fund, or leave as None for random
fund_list_for_prediction = eligible_funds if 'eligible_funds' in globals() and len(eligible_funds) > 0 else []
if len(fund_list_for_prediction) > 0:
    if fund_id_to_predict is not None and str(fund_id_to_predict) in fund_list_for_prediction:
        selected_fund = str(fund_id_to_predict)
        print(f'Selected eligible fund for Q4 prediction: {selected_fund}')
    else:
        selected_fund = random.choice(fund_list_for_prediction)
        print(f'Randomly selected eligible fund for Q4 prediction: {selected_fund}')
    top_stocks = predict_portfolio(selected_fund)
    print('Top recommended stocks for Q4:')
    for stock, score in top_stocks:
        print(f'  {stock}: {score:.4f}')
else:
    print('No eligible funds available for prediction. Please check your data and artifacts.')

Selected eligible fund for Q4 prediction: 1535839
Top recommended stocks for Q4:
  880349105: 0.9561
  67555N206: 0.9561
  67420T206: 0.9561
  85570W100: 0.9561
  983134107: 0.9561
