In [41]:
# -----------------------------
# 1. Imports
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from dotenv import load_dotenv
import os


In [42]:
# Locate the project root and load its .env file.
# The search starts in the current working directory and climbs one directory
# at a time until a directory containing a ".env" file is found.
current_dir = os.getcwd()
project_root = current_dir

while True:
    env_file = os.path.join(project_root, ".env")
    if os.path.isfile(env_file):
        break
    level_up = os.path.abspath(os.path.join(project_root, os.pardir))
    if level_up == project_root:
        # The parent resolves to the same directory: nothing further up to search.
        raise FileNotFoundError(".env file not found in any parent directory")
    project_root = level_up

# Load the variables from the discovered file. Kept as the last expression so
# the cell displays load_dotenv's return value.
load_dotenv(env_file)


True

In [None]:

# -----------------------------
# 2. Load Cleaned Data
# -----------------------------
# Resolve the processed-data directory first so that a missing variable fails
# with a clear message instead of a TypeError from `None + "/..."`.
data_dir_processed = os.getenv("DATA_DIR_PROCESSED")
if not data_dir_processed:
    raise RuntimeError("DATA_DIR_PROCESSED is not set; check the .env file")

df = pd.read_parquet(os.path.join(data_dir_processed, "USD_ZAR_2025_07.parquet"))
df['date_time'] = pd.to_datetime(df['date_time'])
# Use a stable sort: ticks that share a timestamp keep their stored order.
# The default algorithm (quicksort) does not guarantee this, which would make
# the every-Nth-tick downsampling below depend on an arbitrary tie order.
df = df.sort_values('date_time', kind='mergesort').reset_index(drop=True)


# -----------------------------
# Subset 1 day
# -----------------------------
start_date = '2025-07-01'
end_date = '2025-07-02'  # exclusive
in_window = (df['date_time'] >= start_date) & (df['date_time'] < end_date)
df = df[in_window].reset_index(drop=True)
print("Number of ticks after date filtering:", len(df))

# -----------------------------
# Downsample every 50th tick
# -----------------------------
downsample_step = 50  # keep rows 0, 50, 100, ...
df = df.iloc[::downsample_step].reset_index(drop=True)
print("Number of ticks after downsampling:", len(df))


Number of ticks after date filtering: 272319
Number of ticks after downsampling: 5447


In [44]:

# -----------------------------
# 3. Base Features
# -----------------------------
# All engineered columns are derived from the bid/ask quotes and added in a
# single chained expression:
#   mid_price    - average of bid and ask
#   spread       - ask minus bid
#   delta_*      - first difference (row-to-row change) of bid, ask and mid
#   roll_mean_N  - rolling mean of the mid-price over N rows
#   roll_std_50  - rolling standard deviation of the mid-price over 50 rows
bid_quote = df['bid']
ask_quote = df['ask']
mid_quote = (bid_quote + ask_quote) / 2

df = (
    df.assign(
        mid_price=mid_quote,
        spread=ask_quote - bid_quote,
        delta_bid=bid_quote.diff(),
        delta_ask=ask_quote.diff(),
        delta_mid=mid_quote.diff(),
        roll_mean_5=mid_quote.rolling(5).mean(),
        roll_std_50=mid_quote.rolling(50).std(),
        roll_mean_10=mid_quote.rolling(10).mean(),
    )
    # diff() and the rolling windows leave NaNs in the leading rows; dropna()
    # removes every row that has a NaN in any column.
    .dropna()
    .reset_index(drop=True)
)



In [45]:
# -----------------------------
# 4. Normalize Features
# -----------------------------
# Columns that are standardized here and later stacked into the sequence
# windows; the list order is the feature order of those windows.
features = [
    'bid', 'ask', 'mid_price', 'spread',
    'delta_bid', 'delta_ask', 'delta_mid',
    'roll_mean_5', 'roll_mean_10', 'roll_std_50',
]

# NOTE(review): the scaler is fitted on every row of df, so the statistics used
# to scale early ticks also include later ticks - confirm this is acceptable
# for the downstream RL setup.
scaler = StandardScaler().fit(df[features])
df[features] = scaler.transform(df[features])



In [46]:
# -----------------------------
# 5. Create sequence windows
# -----------------------------
# Builds overlapping windows of `sequence_length` consecutive rows over the
# feature columns. Each window is labelled with the date_time of its last row.
sequence_length = 50

# Extract the feature matrix once. Selecting df[features] inside the loop
# rebuilt the column subset on every iteration, which made the loop cost grow
# with len(df) squared.
feature_matrix = df[features].values
date_times = df['date_time']

# NOTE(review): the range stops one short of the final full window (the one
# ending on the last row). Presumably intentional so every window has a
# following tick - confirm before changing.
window_starts = range(len(df) - sequence_length)

X_sequences = [feature_matrix[start:start + sequence_length] for start in window_starts]
timestamps = [date_times.iloc[start + sequence_length - 1] for start in window_starts]

X_sequences = np.array(X_sequences)
timestamps = np.array(timestamps)

print("Sequences shape:", X_sequences.shape)
print("Timestamps shape:", timestamps.shape)

Sequences shape: (5348, 50, 10)
Timestamps shape: (5348,)


In [47]:
# -----------------------------
# 6. Add Portfolio / Budget Placeholder
# -----------------------------
# The RL state is also meant to carry the current cash and USD holdings.
# They are only placeholders here: one all-zero row per sequence, to be
# updated in the RL environment.
num_samples = len(X_sequences)
portfolio_features = np.zeros(shape=(num_samples, 2))  # columns: [cash, usd_holdings]

# Whether these are concatenated onto the sequence features or fed in as a
# separate input depends on the RL implementation.



In [48]:
# -----------------------------
# 7. Save Feature Dataset
# -----------------------------
# Resolve the output directory once so that a missing variable fails with a
# clear message instead of a TypeError from `None + "/..."`.
data_dir_processed = os.getenv("DATA_DIR_PROCESSED")
if not data_dir_processed:
    raise RuntimeError("DATA_DIR_PROCESSED is not set; check the .env file")

# NOTE(review): if `timestamps` is an object-dtype array, np.save pickles it
# and np.load will need allow_pickle=True to read it back - confirm.
np.save(os.path.join(data_dir_processed, "X_sequences.npy"), X_sequences)
np.save(os.path.join(data_dir_processed, "timestamps.npy"), timestamps)
np.save(os.path.join(data_dir_processed, "portfolio_features.npy"), portfolio_features)

print("Feature sequences saved in data/processed/")


Feature sequences saved in data/processed/
