In [1]:
import os
import shutil
import glob
import pandas as pd
import tensorflow as tf
from multiprocessing import Pool, cpu_count

# Paths (single source of truth)
# DATA_DIR can be overridden through the environment; every other path is
# derived from it.
DATA_DIR = os.environ.get('DATA_DIR',
                          '/gpfs/home/zh283/StockPredictionDNN/Data')
PARQUET_DIR = os.path.join(DATA_DIR, 'parquet')
FACTOR_XLSX = os.path.join(DATA_DIR, 'factors_list.xlsx')
TFRECORD_DIR = os.path.join(DATA_DIR, 'tfrecords')

# Cleanup
# Wipe previous output only in the launching process. Under the 'spawn' and
# 'forkserver' start methods every Pool worker re-imports this module (as
# '__mp_main__'); an unguarded rmtree here would then delete shards that
# other workers are in the middle of writing.
if __name__ == '__main__' and os.path.exists(TFRECORD_DIR):
    shutil.rmtree(TFRECORD_DIR)
os.makedirs(TFRECORD_DIR, exist_ok=True)

# Feature & schema
# The factor spreadsheet lists the feature columns; rows without an
# 'abr_jkp' entry are skipped.
chars = pd.read_excel(FACTOR_XLSX)
FEATURE_COLS = chars.loc[chars['abr_jkp'].notna(), 'abr_jkp'].tolist()
# Scalar columns written next to the packed feature vector.
META_COLS = [
    'permno', 'eom', 'me', 'size_grp', 'crsp_exchcd', 'ret', 'ret_exc'
]
WEIGHT_COLS = ['w_ew', 'w_vw']
LABEL_COLS = ['ret_exc_lead1m', 'ret_pct', 'ret_z', 'ret_invn']


def _float_feature(v):
    """Wrap an iterable of floats *v* in a ``tf.train.Feature``."""
    float_list = tf.train.FloatList(value=v)
    return tf.train.Feature(float_list=float_list)


def _int64_feature(v):
    """Wrap an iterable of integers *v* in a ``tf.train.Feature``."""
    int64_list = tf.train.Int64List(value=v)
    return tf.train.Feature(int64_list=int64_list)


def _bytes_feature(value):
    """Wrap a single bytes object in a ``tf.train.Feature``.

    Unlike the float/int64 helpers, this one takes one scalar and builds
    the one-element list itself.
    """
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)


def serialize_example(rec):
    """Serialize one record into a ``tf.train.Example`` byte string.

    Args:
        rec: Mapping from column name to value. It must contain every name
            in FEATURE_COLS, META_COLS, WEIGHT_COLS and LABEL_COLS.

    Returns:
        The serialized ``tf.train.Example``. The FEATURE_COLS values are
        packed, in order, into a single float-list feature named 'feat';
        every other column becomes its own single-value feature.
    """
    # Feature vector: one float list under the key 'feat'.
    feature_map = {
        'feat': _float_feature([rec[name] for name in FEATURE_COLS]),
    }

    # Scalars: choose the proto field from the Python type of the value.
    for name in META_COLS + WEIGHT_COLS + LABEL_COLS:
        value = rec[name]
        if isinstance(value, str):
            # e.g. 'size_grp' might be a categorical string
            encoded = _bytes_feature(value.encode('utf-8'))
        elif isinstance(value, int):
            encoded = _int64_feature([value])
        else:
            # Anything else is coerced through float().
            encoded = _float_feature([float(value)])
        feature_map[name] = encoded

    example = tf.train.Example(
        features=tf.train.Features(feature=feature_map))
    return example.SerializeToString()


def write_variant_year(args):
    """Convert one parquet file into one TFRecord shard.

    Takes a single tuple so it can be passed straight to ``Pool.map``.

    Args:
        args: ``(parquet_file, variant, out_dir)`` where ``parquet_file``
            sits in a folder named ``year=YYYY``, ``variant`` is used as
            the shard-name prefix, and ``out_dir`` is the existing
            directory the shard is written into.

    Raises:
        IndexError: If the parent folder name of ``parquet_file`` contains
            no ``=``.
    """
    parquet_file, variant, out_dir = args
    # derive year from folder name "year=YYYY"
    year = os.path.basename(os.path.dirname(parquet_file)).split('=')[1]
    shard_path = os.path.join(out_dir, f"{variant}-year{year}.tfrecord")
    # NOTE(review): the shard name depends only on variant and year, so two
    # parquet files in the same year folder would target the same shard --
    # confirm each partition holds exactly one file.

    # Read first so a bad input does not leave an empty shard behind.
    df = pd.read_parquet(parquet_file)
    # The context manager closes (and flushes) the writer even when
    # serialising a row raises.
    with tf.io.TFRecordWriter(shard_path) as writer:
        for _, row in df.iterrows():
            writer.write(serialize_example(row.to_dict()))

if __name__ == '__main__':
    # Build one (parquet_file, variant, out_dir) task per input file, then
    # convert them in parallel.
    variants = ['raw', 'pct', 'z', 'invn']
    tasks = []

    # Create one subfolder per variant
    for variant in variants:
        variant_out = os.path.join(TFRECORD_DIR, variant)
        os.makedirs(variant_out, exist_ok=True)

        pattern = os.path.join(PARQUET_DIR, variant, 'year=*', '*.parquet')
        # The pattern has no '**', so recursive globbing is unnecessary;
        # sorting makes the task order reproducible between runs.
        tasks.extend(
            (pf, variant, variant_out) for pf in sorted(glob.glob(pattern))
        )

    # Pool(processes=0) raises an opaque ValueError; fail with a message
    # that points at the likely cause instead.
    if not tasks:
        raise FileNotFoundError(
            f"No parquet files found under {PARQUET_DIR!r} for variants "
            f"{variants}; check DATA_DIR."
        )

    # Use up to one worker per task or per core
    workers = min(len(tasks), cpu_count())
    with Pool(processes=workers) as pool:
        pool.map(write_variant_year, tasks)

2025-05-06 07:39:36.222915: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746531576.240507 1686425 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746531576.246176 1686425 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746531576.264015 1686425 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746531576.264058 1686425 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746531576.264060 1686425 computation_placer.cc:177] computation placer alr