In [9]:
import numpy as np
import pandas as pd
import yfinance as yf
import pandas_ta as ta
import matplotlib.pyplot as plt
import tabulate as tb

In [10]:
TICKER = "AAPL" # ^GSPC, ^NDX, AAPL, GOOGL
START_DATE = "2023-12-21"
END_DATE = None
TIMEFRAME = "1h"    # 1d, 1h, 15m, 5m
PATH = f'./../data/{TICKER}_{TIMEFRAME}.csv'
THRESHOLD_DELTA = 0.004

In [11]:
df = yf.download(
    TICKER, 
    start=START_DATE,
    end=END_DATE,
    interval=TIMEFRAME,
    progress=False
)

if isinstance(df.columns, pd.MultiIndex):
    df.columns = [col[0] for col in df.columns]

df.dropna(inplace=True)
print(tb.tabulate(df.tail(), headers='keys', tablefmt='psql'))

  df = yf.download(


+---------------------------+---------+---------+---------+---------+-------------+
| Datetime                  |   Close |    High |     Low |    Open |      Volume |
|---------------------------+---------+---------+---------+---------+-------------|
| 2025-11-26 19:30:00+00:00 |  278.08 | 278.94  | 278.01  | 278.26  | 1.95325e+06 |
| 2025-11-26 20:30:00+00:00 |  277.47 | 278.3   | 277.15  | 278.07  | 9.116e+06   |
| 2025-11-28 14:30:00+00:00 |  276.26 | 278.239 | 276.02  | 277.26  | 4.04046e+06 |
| 2025-11-28 15:30:00+00:00 |  276.25 | 276.92  | 275.99  | 276.29  | 1.74384e+06 |
| 2025-11-28 16:30:00+00:00 |  277.05 | 277.16  | 275.987 | 276.245 | 1.79141e+06 |
+---------------------------+---------+---------+---------+---------+-------------+


In [12]:
df["rsi_14"] = ta.rsi(df["Close"], length=14)
df["rsi_28"] = ta.rsi(df["Close"], length=28)
df["rsi_50"] = ta.rsi(df["Close"], length=50)
df["rsi_7"] = ta.rsi(df["Close"], length=7)

macd = ta.macd(df["Close"])
# df = df.join(macd)
df["macd"] = macd["MACD_12_26_9"]

df["ema_10"] = ta.ema(df["Close"], length=10)
df["ema_20"] = ta.ema(df["Close"], length=20)
df["ema_50"] = ta.ema(df["Close"], length=50)
df["ema_100"] = ta.ema(df["Close"], length=100)
df["ema_200"] = ta.ema(df["Close"], length=50)

# Stochastic Oscillator
stoch = ta.stoch(df["High"], df["Low"], df["Close"])
df["stoch_k"] = stoch["STOCHk_14_3_3"]
df["stoch_d"] = stoch["STOCHd_14_3_3"]

df["roc"] = ta.roc(close=df["Close"], length=10)

adx = ta.adx(
    high=df["High"],
    low=df["Low"],
    close=df["Close"],
    length=14
)
df["adx"]  = adx["ADX_14"]
df["di_plus"]  = adx["DMP_14"] 
df["di_minus"] = adx["DMN_14"]  

df["atr_14"] = ta.atr(
    high=df["High"],
    low=df["Low"],
    close=df["Close"],
    length=14
)
df["atr_20"] = ta.atr(
    high=df["High"],
    low=df["Low"],
    close=df["Close"],
    length=20
)

df["close_pos"] = (df["Close"] - df["Low"]) / (df["High"] - df["Low"])

df["body_range_ratio"] = (df["Close"] - df["Open"]).abs() / (df["High"] - df["Low"])

# --- Volume Z-Score(50) ---
df["volume_zscore_50"] = (df["Volume"] - df["Volume"].rolling(50).mean()) / df["Volume"].rolling(50).std()

# --- Bollinger Bands(20) ---
bb = ta.bbands(df["Close"], length=20, std=2)

col_lower  = next(c for c in bb.columns if c.startswith("BBL_20"))
col_middle = next(c for c in bb.columns if c.startswith("BBM_20"))
col_upper  = next(c for c in bb.columns if c.startswith("BBU_20"))

df["bb_lower_20"] = bb[col_lower]
df["bb_middle_20"] = bb[col_middle]
df["bb_upper_20"]  = bb[col_upper]

# --- Bollinger Bands Width (20) ---
df["bb_width_20"] = (df["bb_upper_20"] - df["bb_lower_20"]) / df["bb_middle_20"]

# --- On-Balance Volume (OBV) ---
df["obv"] = ta.obv(df["Close"], df["Volume"])

# ATR measures market volatility over a period
df["atr_5"] = ta.atr(df["High"], df["Low"], df["Close"], length=5)

# --- B. Momentum/Change: Price Difference over Time Step (Log Return) ---
# This explicitly models the speed of price movement
df["log_returns"] = np.log(df["Close"] / df["Close"].shift(1))

# --- C. Relative High/Low Position (Rolling Max/Min) ---
# Measures how far the current price is from its recent high/low
df["rolling_max_20"] = df["Close"].rolling(window=20).max()
df["rolling_min_20"] = df["Close"].rolling(window=20).min()
df["price_from_20d_high"] = df["Close"] / df["rolling_max_20"]

df["returns"] = df["Close"].pct_change()

def define_ternary_target(returns, delta):
    if returns >= delta:
        return 2  # Up
    elif returns <= -delta:
        return 0  # Down
    else:
        return 1  # Flat / Hold

df['direction'] = df['returns'].apply(lambda x: define_ternary_target(x, THRESHOLD_DELTA))

df.dropna(inplace=True)

target_counts = df['direction'].value_counts(normalize=True).sort_index()
print(f"\nDirection Class Distribution:\n{tb.tabulate(target_counts.reset_index(), headers=['Class (0=Down, 1=Flat, 2=Up)', 'Proportion'], tablefmt='psql')}")


print(tb.tabulate(df.head(), headers='keys', tablefmt='psql'))
print(tb.tabulate(df.tail(), headers='keys', tablefmt='psql'))

df.to_csv(PATH)


Direction Class Distribution:
+----+--------------------------------+--------------+
|    |   Class (0=Down, 1=Flat, 2=Up) |   Proportion |
|----+--------------------------------+--------------|
|  0 |                              0 |     0.142553 |
|  1 |                              1 |     0.707585 |
|  2 |                              2 |     0.149863 |
+----+--------------------------------+--------------+
+---------------------------+---------+---------+--------+---------+-------------+----------+----------+----------+---------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+------------+---------+-----------+------------+----------+----------+-------------+--------------------+--------------------+---------------+----------------+---------------+---------------+--------------+----------+---------------+------------------+------------------+-----------------------+--------------+-------------+
| Datetime                  |   Close |   