In [1]:
import os
import glob
from concurrent.futures import ProcessPoolExecutor
import multiprocessing as mp

import tensorflow as tf
import pandas as pd
import numpy as np

# ────────────────────────────────────────────────────────────────────────────────
# Configuration
# ────────────────────────────────────────────────────────────────────────────────
CONFIG = {
    # Data location and column selection
    'data_dir': os.environ.get('DATA_DIR',
                               '/gpfs/home/zh283/StockPredictionDNN/Data'),
    'variant': 'pct',  # options: raw, pct, z, invn
    'label': 'ret_invn',  # options: ret_exc_lead1m, ret_pct, ret_z, ret_invn
    'weight': 'w_ew',  # options: w_ew, w_vw
    'feature_dim': 153,

    # Year splits. range() excludes its end value, so these evaluate to
    # 1971-1996 (train), 1998-2001 (val) and 2003-2023 (predict).
    # NOTE(review): 1997 and 2002 fall in no split — confirm whether the gaps
    # are intentional or the end years were meant to be inclusive.
    'train_years': list(range(1971, 1997)),
    'val_years': list(range(1998, 2002)),
    'predict_years': list(range(2003, 2024)),

    # Model hyperparameters
    'num_layers': 5,
    'neurons_per_layer': 64,
    'dropout': 0.1,
    'use_batch_norm': True,
    'loss': 'mse',  # options: mse, mae, huber
    'huber_delta': 1.0,
    'learning_rate': 1e-3,
    'lr_scheduler': 'reduce_on_plateau',  # options: none, exponential_decay, reduce_on_plateau
    'decay_steps': 10000,
    'decay_rate': 0.96,
    'reduce_patience': 5,
    'reduce_factor': 0.5,

    # Training parameters
    'train_batch_size': 1024,
    'eval_batch_size': 32976,
    'epochs': 50,
    'ensemble_size': 10,
}
CONFIG['tfrecord_dir'] = os.path.join(CONFIG['data_dir'], 'tfrecords',
                                      CONFIG['variant'])

# The training code looks up CONFIG['model_dir'], so register the default in
# CONFIG itself instead of using it only as a fallback for makedirs.
CONFIG.setdefault('model_dir', './chkpts_script')
os.makedirs(CONFIG['model_dir'], exist_ok=True)

# Force the "spawn" start method once, before any worker process is created.
mp.set_start_method("spawn", force=True)

# ────────────────────────────────────────────────────────────────────────────────
# GPU setup
# ────────────────────────────────────────────────────────────────────────────────
# Enumerate the GPUs TensorFlow can see and record how many there are.
PHYSICAL_GPUS = tf.config.list_physical_devices('GPU')
NUM_GPUS = len(PHYSICAL_GPUS)
# Request on-demand memory allocation on every listed GPU so that a process
# does not reserve the whole device up front.
for gpu in PHYSICAL_GPUS:
    tf.config.experimental.set_memory_growth(gpu, True)

# ────────────────────────────────────────────────────────────────────────────────
# Dataset utilities
# ────────────────────────────────────────────────────────────────────────────────


def parse_example_fn(feature_dim, mode='train'):
    """Create a parser for one serialized example record.

    Args:
        feature_dim: Length of the fixed-size float32 ``'feat'`` vector.
        mode: When equal to ``'predict'``, the extra identifier/metadata
            fields are parsed as well and returned as a fourth element.
            Any other value produces the three-element form.

    Returns:
        A function mapping a serialized record to
        ``(features, label, weight)`` or, in predict mode,
        ``(features, label, weight, meta)`` where ``meta`` is the dict of
        the remaining parsed fields.
    """
    with_meta = mode == 'predict'

    def _scalar(dtype):
        # Schema entry for a single scalar value of the given dtype.
        return tf.io.FixedLenFeature([], dtype)

    schema = {
        'feat': tf.io.FixedLenFeature([feature_dim], tf.float32),
        CONFIG['label']: _scalar(tf.float32),
        CONFIG['weight']: _scalar(tf.float32),
    }
    if with_meta:
        # Extra columns carried through to the prediction output.
        extra_fields = (
            ('permno', tf.int64),
            ('eom', tf.int64),
            ('me', tf.float32),
            ('size_grp', tf.string),
            ('crsp_exchcd', tf.int64),
            ('ret', tf.float32),
            ('ret_exc', tf.float32),
        )
        for key, dtype in extra_fields:
            schema[key] = _scalar(dtype)

    def _parse(record):
        parsed = tf.io.parse_single_example(record, schema)
        # Pop the model inputs so that only metadata remains in ``parsed``.
        triple = (parsed.pop('feat'), parsed.pop(CONFIG['label']),
                  parsed.pop(CONFIG['weight']))
        if with_meta:
            return triple + (parsed, )
        return triple

    return _parse


# ────────────────────────────────────────────────────────────────────────────────
# Dataset creation
# ────────────────────────────────────────────────────────────────────────────────
def make_dataset(years, mode='train'):
    """Build a batched ``tf.data`` pipeline over the TFRecords of ``years``.

    Args:
        years: Iterable of integer years; only files whose name contains
            ``year<YYYY>`` with a year in this collection are read.
        mode: ``'train'`` shuffles files and records and uses
            ``CONFIG['train_batch_size']``; any other value reads files in
            sorted order and uses ``CONFIG['eval_batch_size']``. The value is
            also forwarded to ``parse_example_fn``.

    Returns:
        A batched, prefetched ``tf.data.Dataset`` of parsed examples.

    Raises:
        FileNotFoundError: If no TFRecord file matches the requested years.
    """
    # NOTE(review): CONFIG['tfrecord_dir'] already ends with the variant name,
    # so this pattern looks inside a nested "<variant>/<variant>/" directory —
    # confirm that this matches the on-disk layout.
    pattern = os.path.join(CONFIG['tfrecord_dir'], CONFIG['variant'],
                           f"{CONFIG['variant']}-year*.tfrecord")
    wanted_years = set(years)
    files = sorted(
        f for f in glob.glob(pattern)
        if int(os.path.basename(f).split('year')[1].split('.')[0]) in
        wanted_years)

    # Fail early with the searched pattern instead of silently producing an
    # empty dataset that only breaks later during fit/predict.
    if not files:
        raise FileNotFoundError(
            f"No TFRecord files for mode={mode!r} matched pattern "
            f"{pattern!r} for years {sorted(wanted_years)}")

    # Use from_generator to avoid early tf.constant (prevents CUDA init errors)
    files_ds = tf.data.Dataset.from_generator(
        lambda: files,
        output_signature=tf.TensorSpec(shape=(), dtype=tf.string))

    if mode == 'train' and len(files) > 1:
        files_ds = files_ds.shuffle(len(files))
        cycle_length = min(16, len(files))
    else:
        # A single reader keeps records in file order for evaluation.
        cycle_length = 1

    # Parallel interleave reading TFRecords
    ds = files_ds.interleave(tf.data.TFRecordDataset,
                             cycle_length=cycle_length,
                             num_parallel_calls=tf.data.AUTOTUNE)

    parse_fn = parse_example_fn(CONFIG['feature_dim'], mode)
    ds = ds.map(parse_fn, num_parallel_calls=tf.data.AUTOTUNE)
    # Cache parsed records so later epochs skip reading and parsing.
    ds = ds.cache()
    if mode == 'train':
        ds = ds.shuffle(10000)

    batch_size = (CONFIG['train_batch_size']
                  if mode == 'train' else CONFIG['eval_batch_size'])
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# ────────────────────────────────────────────────────────────────────────────────
# Model definition
# ────────────────────────────────────────────────────────────────────────────────
# Loss objects keyed by name; the keys mirror the options listed for
# CONFIG['loss'], and the Huber delta is taken from CONFIG['huber_delta'].
LOSS_MAP = dict(
    mse=tf.keras.losses.MeanSquaredError(),
    mae=tf.keras.losses.MeanAbsoluteError(),
    huber=tf.keras.losses.Huber(delta=CONFIG['huber_delta']),
)
# Zero-argument factories for the learning rate, keyed by the option names
# listed for CONFIG['lr_scheduler']. Each factory reads CONFIG when called.
LR_SCHEDULES = {
    'none':
    lambda: CONFIG['learning_rate'],
    'exponential_decay':
    lambda: tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=CONFIG['learning_rate'],
        decay_steps=CONFIG['decay_steps'],
        decay_rate=CONFIG['decay_rate'],
        staircase=True),
    # 'reduce_on_plateau' is applied through a ReduceLROnPlateau training
    # callback, so the starting value is the plain base rate. Without this
    # entry, a lookup by the configured default scheduler name has no match.
    'reduce_on_plateau':
    lambda: CONFIG['learning_rate'],
}


def train_single(idx: int):
    """Train one ensemble member and save it to disk.

    NOTE: a second ``train_single`` defined later in this file rebinds the
    name, so this version is not the one that runs.

    Args:
        idx: Ensemble member index; selects the GPU (round-robin over the
            listed GPUs) and names the saved model file.

    Returns:
        Tuple ``(path, history)`` with the saved model path and the
        ``History.history`` dict returned by ``fit``.
    """
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        # Restrict this process to a single GPU chosen by index.
        tf.config.set_visible_devices(gpus[idx % len(gpus)], 'GPU')
        tf.config.experimental.set_memory_growth(gpus[idx % len(gpus)], True)

    train_ds = make_dataset(CONFIG['train_years'], 'train')
    val_ds = make_dataset(CONFIG['val_years'], 'val')

    mdl = build_model()
    cbs = [
        tf.keras.callbacks.EarlyStopping('val_loss',
                                         patience=5,
                                         restore_best_weights=True)
    ]
    if CONFIG['lr_scheduler'] == 'reduce_on_plateau':
        cbs.append(
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                patience=CONFIG['reduce_patience'],
                factor=CONFIG['reduce_factor'],
                verbose=1))

    hist = mdl.fit(train_ds,
                   validation_data=val_ds,
                   epochs=CONFIG['epochs'],
                   callbacks=cbs,
                   verbose=2)

    # CONFIG may not define 'model_dir'; fall back to the same default the
    # module uses when it creates the checkpoint directory.
    model_dir = CONFIG.get('model_dir', './chkpts_script')
    os.makedirs(model_dir, exist_ok=True)
    path = os.path.join(model_dir, f'ensemble_{idx}.keras')
    mdl.save(path, save_format='keras_v3')
    return path, hist.history


# ────────────────────────────────────────────────────────────────────────────────
# Single-model training function (spawn-safe)
# ────────────────────────────────────────────────────────────────────────────────
def train_single(idx: int):
    """Train one ensemble member in the current process and save it.

    Intended to be submitted to a worker process; it returns only a path
    string and a plain history dict.

    NOTE(review): ``build_model`` and ``make_lr_callback`` are not defined in
    the visible portion of this file — confirm they are in scope for the
    worker processes.

    Args:
        idx: Ensemble member index; selects the GPU (round-robin over the
            listed GPUs) and names the saved model file.

    Returns:
        Tuple ``(mdl_path, history)`` with the saved model path and the
        ``History.history`` dict returned by ``fit``.
    """
    # isolate one GPU
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        tf.config.set_visible_devices(gpus[idx % len(gpus)], 'GPU')
        for g in gpus:
            tf.config.experimental.set_memory_growth(g, True)

    train_ds = make_dataset(CONFIG['train_years'], 'train')
    val_ds = make_dataset(CONFIG['val_years'], 'val')

    model = build_model()
    callbacks = [
        tf.keras.callbacks.EarlyStopping('val_loss',
                                         patience=5,
                                         restore_best_weights=True)
    ]
    if CONFIG['lr_scheduler'] == 'reduce_on_plateau':
        callbacks.append(make_lr_callback())

    hist = model.fit(train_ds,
                     validation_data=val_ds,
                     epochs=CONFIG['epochs'],
                     verbose=2,
                     callbacks=callbacks)

    # CONFIG may not define 'model_dir'; fall back to the same default the
    # module uses when it creates the checkpoint directory.
    model_dir = CONFIG.get('model_dir', './chkpts_script')
    os.makedirs(model_dir, exist_ok=True)
    mdl_path = os.path.join(model_dir, f'ensemble_{idx}.keras')
    model.save(mdl_path, save_format='keras_v3')
    return mdl_path, hist.history  # both are trivially picklable


# --------------------------------------------------------------------------- #
# 5. Main
# --------------------------------------------------------------------------- #
def main():
    """Train the ensemble in worker processes and write averaged predictions.

    One ``train_single`` task is submitted per ensemble member. The saved
    models are then loaded in this process, their outputs are averaged over
    the prediction dataset, and the result is written together with the
    per-row metadata to ``ensemble_predictions.parquet``.

    NOTE: the worker pool pickles ``train_single`` by reference, so worker
    processes must be able to import the module that defines it. When this
    code is run from an interactive session, that requirement may not hold
    and the pool can fail with ``BrokenProcessPool``.
    """
    workers = CONFIG['ensemble_size']
    with ProcessPoolExecutor(max_workers=workers) as ex:
        futures = [ex.submit(train_single, i) for i in range(workers)]
        results = [f.result() for f in futures]  # wait & collect

    model_paths, histories = zip(*results)
    ensemble = [tf.keras.models.load_model(p) for p in model_paths]

    pred_ds = make_dataset(CONFIG['predict_years'], 'predict')
    frames = []
    for feats, _, _, meta in pred_ds:
        member_preds = tf.stack(
            [m(feats, training=False) for m in ensemble], axis=0)
        preds = tf.reduce_mean(member_preds, axis=0).numpy().ravel()
        # Assemble the whole batch at once instead of one dict per row.
        frame = pd.DataFrame({k: v.numpy() for k, v in meta.items()})
        frame['prediction'] = preds.astype(np.float64)
        frames.append(frame)

    if frames:
        out_df = pd.concat(frames, ignore_index=True)
        # Row-wise construction from scalars yields float64 columns; upcast so
        # the written file keeps that column type.
        f32_cols = out_df.select_dtypes(include='float32').columns
        out_df = out_df.astype({col: 'float64' for col in f32_cols})
    else:
        out_df = pd.DataFrame()

    out_df.to_parquet('ensemble_predictions.parquet', index=False)
    print(out_df.head())
    # Histories are only collected in memory here; nothing writes them out.
    print(f"\nCollected training histories from {len(histories)} workers "
          "(not written to disk).")


# Entry-point guard: with the "spawn" start method, worker processes import
# the main module again, and an unguarded main() call would run in each one.
if __name__ == "__main__":
    main()

2025-05-06 06:11:02.090570: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746526262.108156 1678293 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746526262.114092 1678293 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746526262.128410 1678293 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746526262.128424 1678293 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746526262.128426 1678293 computation_placer.cc:177] computation placer alr

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.