# Feature Engineering v3 (5DAY TARGETED, Reduced Features)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Load the processed QQQ price data, indexed by the parsed "Date" column.
df = pd.read_csv(
    "../data/processed/new_QQQ_data.csv",
    index_col="Date",
    parse_dates=["Date"],
)
# Row-over-row percentage change of Close; several features below build on it.
df["returns"] = df["Close"].pct_change(periods=1)

In [3]:
def compute_atr(df, window=5):
    """Return the rolling mean of the true range over ``window`` rows.

    The true range of a row is the largest of: High - Low,
    |High - previous Close| and |Low - previous Close|.

    Args:
        df: DataFrame with "High", "Low" and "Close" columns.
        window: Number of rows averaged by the rolling mean.

    Returns:
        Series aligned with ``df``; the first ``window - 1`` entries are NaN.
    """
    prev_close = df["Close"].shift()
    candidates = [
        df["High"] - df["Low"],
        (df["High"] - prev_close).abs(),
        (df["Low"] - prev_close).abs(),
    ]
    # The row-wise max skips NaN, so the first row (which has no previous
    # close) falls back to its plain High - Low range.
    true_range = pd.concat(candidates, axis=1).max(axis=1)
    return true_range.rolling(window).mean()

In [4]:
# Cell 3 - short-term features (5-day horizon focus)
# Volatility
df["ATR5"] = compute_atr(df, 5)                       # 5-row mean true range
df["std5"] = df["returns"].rolling(5).std()           # 5-row std of returns
df["norm_tr"] = (df["High"] - df["Low"]) / df["Close"]  # bar range relative to Close
# Guard the denominator the same way candle_body_pct does: a zero std5 would
# otherwise produce +/-inf, which dropna() in the export cell does not remove.
# NOTE(review): ATR5 is in price units while std5 is in return units, so this
# ratio scales with the price level - confirm that is intended.
df["vol_ratio_5"] = df["ATR5"] / df["std5"].replace(0, np.nan)

# Momentum (fast)
df["roc2"] = df["Close"].pct_change(2)
df["roc5"] = df["Close"].pct_change(5)
# rsi7 is assigned right after compute_rsi is defined below; the former
# placeholder `df["rsi7"] = None` was a dead store and has been removed.
def compute_rsi(series, window=7):
    """Return a simple-moving-average RSI of ``series``.

    Args:
        series: Series of prices (or any numeric values).
        window: Number of rows used for the average gain and average loss.

    Returns:
        Series aligned with ``series`` holding 100 - 100 / (1 + RS), where RS
        is the rolling mean gain divided by the rolling mean loss. Entries
        without a full window of differences are NaN; a window with no losses
        yields 100, and a window with neither gains nor losses yields NaN.
    """
    step = series.diff()

    def _rolling_avg(moves):
        # Keep only the positive part of the moves, then average it.
        return moves.clip(lower=0).rolling(window).mean()

    avg_gain = _rolling_avg(step)
    avg_loss = _rolling_avg(-step)
    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))
# 7-row RSI of the close, using compute_rsi defined just above.
df["rsi7"] = compute_rsi(df["Close"], 7)

# Short trend bias
# NOTE(review): MA5/MA10 are raw rolling means of Close (price levels, not
# ratios to Close) - confirm that is what the model should see.
df["MA5"] = df["Close"].rolling(5).mean()
df["MA10"] = df["Close"].rolling(10).mean()

# Candle & volume
# |Close - Open| as a fraction of the High-Low range. A zero range is turned
# into NaN to avoid dividing by zero, and any resulting NaN is filled with 0.
df["candle_body_pct"] = ((df["Close"] - df["Open"]).abs() / (df["High"] - df["Low"]).replace(0, np.nan)).fillna(0)
df["vol_ma5"] = df["Volume"].rolling(5).mean()
# Current volume relative to its 5-row rolling mean.
df["vol_spike_5"] = df["Volume"] / df["vol_ma5"]

# Lagged returns (keep only 1-day and 3-day maybe; we will drop redundant later)
# return_3d is computed here but is not listed in final_features in the next cell.
df["return_1d"] = df["Close"].pct_change(1)
df["return_3d"] = df["Close"].pct_change(3)


In [5]:
# Cell 4 - 5-day target (classification) and drop NA
# Forward return over the next 5 rows: Close[t+5] / Close[t] - 1.
# The last 5 rows have no future close, so their value is NaN.
df["future_5d_return"] = df["Close"].shift(-5) / df["Close"] - 1
# 1 when the forward return is positive, otherwise 0. NaN compares as False and
# would be labelled 0, which is why future_5d_return takes part in the dropna()
# below: rows without a known outcome are removed instead of mislabelled.
df["target"] = (df["future_5d_return"] > 0).astype(int)

# Final feature list (reduced); "target" is included so the saved file carries
# the label alongside the features.
final_features = [
    "ATR5", "std5", "norm_tr", "vol_ratio_5",
    "roc2", "roc5", "rsi7",
    "MA5", "MA10",
    "candle_body_pct",
    "vol_spike_5",
    "return_1d",
    "target",
]

# Drop warm-up rows (rolling windows) and the tail rows with no 5-day outcome.
df_clean = df[final_features + ["future_5d_return"]].dropna().copy()
df_final = df_clean[final_features].copy()  # keep target + features

# quick checks
print("Rows: ", df_final.shape[0])
print(df_final.target.value_counts(normalize=True))

# save
# Report the path that was actually written; the previous message printed a
# hard-coded, different file name than the one passed to to_csv().
output_path = "../data/processed/qqq_features_v3_5d.csv"
df_final.to_csv(output_path)
print(output_path)


Rows:  6701
target
1    0.576033
0    0.423967
Name: proportion, dtype: float64
C:/Users/USER/Documents/QQQ Project/data/processed/qqq_features_classificationv3.csv
