In [53]:
# Block 1: Setup and Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

print("Setup complete.")

Setup complete.


In [68]:
# Block 2: Load and concatenate CSV data
import os
from glob import glob

# Replace with the folder where your CSVs are stored
data_folder = "."

# Grab all CSVs in that folder
csv_files = sorted(glob(os.path.join(data_folder, "*.csv")))

# Read and concatenate all CSVs
all_data = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

# Convert timestamp to datetime
all_data["timestamp"] = pd.to_datetime(all_data["timestamp"])

df = add_technical_indicators(df)

# Sort just in case
all_data = all_data.sort_values("timestamp").reset_index(drop=True)

print("Data loaded:", all_data["timestamp"].min(), "to", all_data["timestamp"].max())

KeyError: 'close_935'

In [55]:
# Block 3 Create a new DataFrame to hold daily rows
daily_rows = []

# Group by date
for date, group in all_data.groupby(all_data["timestamp"].dt.date):
    # Get all times for this day
    group = group.sort_values("timestamp")
    
    # Extract 9:30 and 9:35 data
    open_row = group[group["timestamp"].dt.time == pd.to_datetime("09:30").time()]
    after_5min_row = group[group["timestamp"].dt.time == pd.to_datetime("09:35").time()]
    
    # Only include rows where both exist
    if not open_row.empty and not after_5min_row.empty:
        row = {
            "date": pd.to_datetime(date),
            "close_930": open_row.iloc[0]["close"],
            "close_935": after_5min_row.iloc[0]["close"],
            "volume": after_5min_row.iloc[0]["volume"]
        }
        daily_rows.append(row)

# Create model_df
model_df = pd.DataFrame(daily_rows)

# Target label: did it go up?
model_df["went_up"] = (model_df["close_935"] > model_df["close_930"]).astype(int)

# Preview
print(model_df.head())

        date  close_930  close_935   volume  went_up
0 2020-05-11     291.85     291.94   4901.0        1
1 2020-05-13     286.14     286.39  12385.0        1
2 2020-05-14     281.69     281.74   4600.0        1
3 2020-05-15     285.72     285.45   3130.0        0
4 2020-05-18     288.97     289.00  23876.0        1


In [67]:
# Block 4: Calendar Features + Target
from pandas.tseries.holiday import USFederalHolidayCalendar
model_df = build_calendar_features(all_data)

df = all_data.copy()
df["date"] = df["timestamp"].dt.date
df["time"] = df["timestamp"].dt.time

# Extract 9:30 and 9:35 closes
df_930 = df[df["timestamp"].dt.time == pd.to_datetime("09:30").time()]
df_935 = df[df["timestamp"].dt.time == pd.to_datetime("09:35").time()]
df_930 = df_930.groupby("date")["close"].first().reset_index().rename(columns={"close": "close_930"})
df_935 = df_935.groupby("date")["close"].first().reset_index().rename(columns={"close": "close_935"})
model_df = pd.merge(df_930, df_935, on="date", how="inner")

# Calendar features
model_df["date"] = pd.to_datetime(model_df["date"])
model_df["day_of_week"] = model_df["date"].dt.weekday
model_df["is_monday"] = (model_df["day_of_week"] == 0).astype(int)
model_df["is_friday"] = (model_df["day_of_week"] == 4).astype(int)
model_df["month"] = model_df["date"].dt.month
model_df["year"] = model_df["date"].dt.year
model_df["quarter"] = model_df["date"].dt.quarter
model_df["is_month_start"] = model_df["date"].dt.is_month_start.astype(int)
model_df["is_month_end"] = model_df["date"].dt.is_month_end.astype(int)
model_df["is_quarter_start"] = model_df["date"].dt.is_quarter_start.astype(int)
model_df["is_quarter_end"] = model_df["date"].dt.is_quarter_end.astype(int)

# Season encoding
def get_season(month):
    if month in [12, 1, 2]: return "Winter"
    elif month in [3, 4, 5]: return "Spring"
    elif month in [6, 7, 8]: return "Summer"
    else: return "Fall"
model_df["season"] = model_df["month"].apply(get_season)
model_df = pd.get_dummies(model_df, columns=["season"], prefix="season")

# Holiday awareness
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start=model_df["date"].min(), end=model_df["date"].max())
model_df["is_holiday"] = model_df["date"].isin(holidays).astype(int)
model_df["prev_day"] = model_df["date"] - pd.Timedelta(days=1)
model_df["is_after_holiday"] = model_df["prev_day"].isin(holidays).astype(int)

# First trading day of each month
model_df["date_only"] = model_df["date"].dt.date
first_of_month = model_df.groupby(model_df["date"].dt.to_period("M"))["date"].min().values
model_df["is_first_trading_day"] = model_df["date"].isin(first_of_month).astype(int)

# Target
model_df["went_up"] = (model_df["close_935"] > model_df["close_930"]).astype(int)

# Drop unused
model_df.drop(columns=["date_only", "prev_day"], inplace=True)

NameError: name 'build_calendar_features' is not defined

In [64]:
# Block 6: Add Previous Day Features

# Step 1: Create previous day reference
model_df["prev_day"] = model_df["date"].shift(1)

# Step 2: Create a DataFrame with previous day's open and close
df_prev = model_df[["date", "close_930"]].copy()
df_prev.columns = ["prev_day", "prev_day_open"]

df_prev["prev_day_close"] = model_df["close_935"].shift(1).values

# Step 3: Merge previous day data
model_df = pd.merge(model_df, df_prev, on="prev_day", how="left")

# Step 4: Create previous day return
model_df["prev_day_return"] = (
    (model_df["prev_day_close"] - model_df["prev_day_open"]) / model_df["prev_day_open"]
)

# Step 5: Drop early NaNs
model_df = model_df.dropna(subset=["prev_day_return"]).reset_index(drop=True)

# Confirm columns
print("Columns after adding prev day features:", model_df.columns.tolist())

Columns after adding prev day features: ['date', 'close_930', 'close_935', 'day_of_week', 'is_monday', 'is_friday', 'month', 'year', 'quarter', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'season_Fall', 'season_Spring', 'season_Summer', 'season_Winter', 'is_holiday', 'is_after_holiday', 'is_first_trading_day', 'went_up', 'prev_day', 'prev_day_open', 'prev_day_close', 'prev_day_return']


In [65]:
# Block 7: Add Technical Indicators

def add_technical_indicators(df):
    # RSI
    delta = df["close_935"].diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    avg_gain = gain.rolling(window=14).mean()
    avg_loss = loss.rolling(window=14).mean()
    rs = avg_gain / avg_loss
    df["rsi_14"] = 100 - (100 / (1 + rs))

    # SMA and ratios
    df["sma_5"] = df["close_935"].rolling(window=5).mean()
    df["sma_20"] = df["close_935"].rolling(window=20).mean()
    df["sma_ratio"] = df["sma_5"] / df["sma_20"]
    df["sma_distance"] = (df["close_935"] - df["sma_20"]) / df["sma_20"]

    # Volatility
    df["prior_volatility"] = df["close_930"].rolling(window=5).std()

    # Bollinger Bands
    df["bb_middle"] = df["sma_20"]
    df["bb_std"] = df["close_935"].rolling(window=20).std()
    df["bb_upper"] = df["bb_middle"] + 2 * df["bb_std"]
    df["bb_lower"] = df["bb_middle"] - 2 * df["bb_std"]
    df["bollinger_width"] = (df["bb_upper"] - df["bb_lower"]) / df["bb_middle"]
    df["bollinger_position"] = (df["close_935"] - df["bb_lower"]) / (df["bb_upper"] - df["bb_lower"])

    # MACD
    ema12 = df["close_935"].ewm(span=12, adjust=False).mean()
    ema26 = df["close_935"].ewm(span=26, adjust=False).mean()
    df["macd"] = ema12 - ema26
    df["macd_signal"] = df["macd"].ewm(span=9, adjust=False).mean()
    df["macd_diff"] = df["macd"] - df["macd_signal"]

    # Overnight gap
    df["overnight_gap"] = (df["close_930"] - df["prev_day_close"]) / df["prev_day_close"]

    # Previous day movement
    df["prev_day_change"] = (df["prev_day_close"] - df["prev_day_open"]) / df["prev_day_open"]
    df["prev_day_range_pct"] = (df["prev_day_close"] - df["prev_day_open"]).abs() / df["prev_day_open"]

    return df

model_df = add_technical_indicators(model_df)
model_df = model_df.dropna().reset_index(drop=True)

print("Technical indicators added. Columns now:", model_df.columns.tolist())

Technical indicators added. Columns now: ['date', 'close_930', 'close_935', 'day_of_week', 'is_monday', 'is_friday', 'month', 'year', 'quarter', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'season_Fall', 'season_Spring', 'season_Summer', 'season_Winter', 'is_holiday', 'is_after_holiday', 'is_first_trading_day', 'went_up', 'prev_day', 'prev_day_open', 'prev_day_close', 'prev_day_return', 'rsi_14', 'sma_5', 'sma_20', 'sma_ratio', 'sma_distance', 'prior_volatility', 'bb_middle', 'bb_std', 'bb_upper', 'bb_lower', 'bollinger_width', 'bollinger_position', 'macd', 'macd_signal', 'macd_diff', 'overnight_gap', 'prev_day_change', 'prev_day_range_pct']


In [59]:
from glob import glob
import os
import pandas as pd

data_folder = "."  # or your actual folder name
csv_files = sorted(glob(os.path.join(data_folder, "*.csv")))

# Read and concatenate all CSVs
full_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

# Convert timestamp
full_df["timestamp"] = pd.to_datetime(full_df["timestamp"])
full_df["date"] = full_df["timestamp"].dt.date
full_df["time"] = full_df["timestamp"].dt.strftime("%H:%M")

# Filter to keep only 9:30 and 9:35 rows
full_df = full_df[full_df["time"].isin(["09:30", "09:35"])].reset_index(drop=True)

print("Filtered to 9:30 and 9:35 AM rows only:", full_df.shape)

Filtered to 9:30 and 9:35 AM rows only: (4856, 10)


In [60]:
model_df["date"] = pd.to_datetime(model_df["date"])
full_df["date"] = pd.to_datetime(full_df["date"])

In [61]:
# Block 8: Add Volume-Based Features

# Step 1: Compute volume rolling stats using only 09:35 rows
volume_df = full_df[full_df["time"] == "09:35"].copy()
volume_df = volume_df.sort_values("timestamp")  # make sure it's sorted
volume_df["volume_5day_avg"] = volume_df["volume"].rolling(window=5).mean()
volume_df["volume_5day_ratio"] = volume_df["volume"] / volume_df["volume_5day_avg"]

# Step 2: Now this is our model_df (the one we’ll use from now on)
model_df = volume_df.reset_index(drop=True)

# Step 3: Drop early NaNs due to rolling window
model_df = model_df.dropna(subset=["volume_5day_avg", "volume_5day_ratio"])

print("Volume features added. Columns now:", model_df.columns.tolist())

Volume features added. Columns now: ['volume', 'vw', 'open', 'close', 'high', 'low', 'timestamp', 'trades', 'date', 'time', 'volume_5day_avg', 'volume_5day_ratio']


In [66]:
# Block 9: Merge engineered features into 09:35 filtered model_df

# Step 1: Define all engineered columns we want to include
calendar_and_tech_cols = [
    "date", "close_930", "close_935", "day_of_week", "is_monday", "is_friday",
    "month", "year", "quarter", "is_month_start", "is_month_end",
    "is_quarter_start", "is_quarter_end", "season_Winter", "season_Spring",
    "season_Summer", "season_Fall", "is_holiday", "is_after_holiday",
    "is_first_trading_day", "went_up", "prev_day", "prev_day_open",
    "prev_day_close", "prev_day_return", "rsi_14", "sma_5", "sma_20",
    "sma_ratio", "sma_distance", "prior_volatility", "bb_middle", "bb_std",
    "bb_upper", "bb_lower", "bollinger_width", "bollinger_position",
    "macd", "macd_signal", "macd_diff", "overnight_gap",
    "prev_day_change", "prev_day_range_pct"
]

# Step 2: Merge the engineered feature data from `df` into `model_df` (which is only 9:35 rows)
model_df = pd.merge(
    model_df,
    df[calendar_and_tech_cols],
    on="date",
    how="left"
)

# Step 3: Drop any rows with missing values
model_df = model_df.dropna().reset_index(drop=True)

# Check structure
print("Merged feature-rich model_df shape:", model_df.shape)
print("Columns now:", model_df.columns.tolist())

KeyError: "['close_930', 'close_935', 'day_of_week', 'is_monday', 'is_friday', 'month', 'year', 'quarter', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'season_Winter', 'season_Spring', 'season_Summer', 'season_Fall', 'is_holiday', 'is_after_holiday', 'is_first_trading_day', 'went_up', 'prev_day', 'prev_day_open', 'prev_day_close', 'prev_day_return', 'rsi_14', 'sma_5', 'sma_20', 'sma_ratio', 'sma_distance', 'prior_volatility', 'bb_middle', 'bb_std', 'bb_upper', 'bb_lower', 'bollinger_width', 'bollinger_position', 'macd', 'macd_signal', 'macd_diff', 'overnight_gap', 'prev_day_change', 'prev_day_range_pct'] not in index"

In [48]:
# Step 1: Choose refined feature set
refined_features = [
    "is_quarter_start", "is_month_start", "is_quarter_end", "is_month_end",
    "day_of_week", "month", "year", "season_Winter", "season_Spring",
    "season_Summer", "season_Fall", "is_holiday", "is_after_holiday",
    "is_first_trading_day", "overnight_gap", "sma_ratio", "sma_distance",
    "prior_volatility", "rsi_14", "bollinger_width", "bollinger_position",
    "macd_diff", "prev_day_change", "prev_day_range_pct", "volume_5day_ratio"
]

# Step 2: Filter model_df to include only selected features + target
model_df_final = model_df[refined_features + ["went_up"]].dropna()

# Step 3: Create X (features) and y (target)
X = model_df_final[refined_features]
y = model_df_final["went_up"]

# Step 4: Train/test split (by date order, 80/20)
split_index = int(len(model_df_final) * 0.8)
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

# Preview
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

KeyError: "['is_quarter_start', 'is_month_start', 'is_quarter_end', 'is_month_end', 'day_of_week', 'month', 'year', 'season_Winter', 'season_Spring', 'season_Summer', 'season_Fall', 'is_holiday', 'is_after_holiday', 'is_first_trading_day', 'overnight_gap', 'sma_ratio', 'sma_distance', 'prior_volatility', 'rsi_14', 'bollinger_width', 'bollinger_position', 'macd_diff', 'prev_day_change', 'prev_day_range_pct', 'went_up'] not in index"