In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

def create_saved_dataset(input_csv, output_tfrecords):
    df = pd.read_csv(input_csv)
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    bins = np.linspace(0.005 / 25, 0.005, 25)
    df['target'] = np.digitize(df['target'], bins).astype(np.int64)

    df['weekday'] = df['date'].dt.weekday
    df['hour'] = df['date'].dt.hour
    df['month'] = df['date'].dt.month

    hour_type_columns = [col for col in df.columns if 'hourType' in col]

    exclude_columns = ['date', 'target', 'weekday', 'hour', 'month'] + hour_type_columns
    instrument_columns = [col for col in df.columns if col not in exclude_columns]

    df[instrument_columns] = df[instrument_columns].fillna(method='ffill').fillna(0)

    if hour_type_columns:
        df[hour_type_columns] = df[hour_type_columns].astype(str)
    else:
        df['hourType'] = 'unknown'  

    # Write to TFRecord
    with tf.io.TFRecordWriter(output_tfrecords) as writer:
        for _, row in df.iterrows():
            feature = {
                'tickers': tf.train.Feature(float_list=tf.train.FloatList(value=row[instrument_columns].values.astype(np.float32))),
                'hourType': tf.train.Feature(bytes_list=tf.train.BytesList(value=[row[col].encode() for col in hour_type_columns])),
                'weekday': tf.train.Feature(int64_list=tf.train.Int64List(value=[row['weekday']])),
                'hour': tf.train.Feature(int64_list=tf.train.Int64List(value=[row['hour']])),
                'month': tf.train.Feature(int64_list=tf.train.Int64List(value=[row['month']])),
                'target': tf.train.Feature(int64_list=tf.train.Int64List(value=[row['target']])),
            }
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())

if __name__ == "__main__":
    create_saved_dataset("gradAppML-assignment1-dataset.csv", "dataset.tfrecord")


2025-03-04 22:38:52.944084: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741145933.034505   16647 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741145933.060270   16647 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-04 22:38:53.249422: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  df[instrument_columns] = df[instrument_columns].fillna(method='ffill').fillna(0)
