In [2]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

# Fixed seed for reproducible runs.
# NOTE(review): presumably passed as `random_state` to the splitters/estimators
# used later in the notebook — usage is not visible in this section; confirm.
RANDOM_STATE = 42

## Section 1: Load Dataset

In [3]:
# Read the modelling table from a parquet file (path is relative to the
# notebook's working directory).
df_model = pd.read_parquet("df_model_top3_by_sector.parquet")

# Quick sanity check: dimensions, the column index, then the first rows
# as the cell's rich output.
print(f"df_model shape: {df_model.shape}")
print(df_model.columns)
df_model.head()

df_model shape: (39549, 75)
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits', 'Ticker', 'SMA_5', 'SMA_10', 'SMA_20', 'SMA_50',
       'EMA_12', 'EMA_26', 'MACD', 'MACD_Signal', 'MACD_Histogram', 'RSI',
       'BB_Middle', 'BB_Upper', 'BB_Lower', 'BB_Width', 'BB_Position',
       'Volatility', 'Price_Change', 'Price_Change_5d', 'High_Low_Ratio',
       'Open_Close_Ratio', 'Volume_SMA', 'Volume_Ratio', 'Close_lag_1',
       'Close_lag_2', 'Close_lag_3', 'Close_lag_5', 'Close_lag_10',
       'Volume_lag_1', 'Volume_lag_2', 'Volume_lag_3', 'Volume_lag_5',
       'Volume_lag_10', 'Price_Change_lag_1', 'Price_Change_lag_2',
       'Price_Change_lag_3', 'Price_Change_lag_5', 'Price_Change_lag_10',
       'RSI_lag_1', 'RSI_lag_2', 'RSI_lag_3', 'RSI_lag_5', 'RSI_lag_10',
       'MACD_lag_1', 'MACD_lag_2', 'MACD_lag_3', 'MACD_lag_5', 'MACD_lag_10',
       'Volatility_lag_1', 'Volatility_lag_2', 'Volatility_lag_3',
       'Volatility_lag_5', 'Volatility_

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,SMA_5,...,Future_Up_5d,Future_Category_5d,Future_Return_10d,Future_Up_10d,Future_Category_10d,Future_Return_20d,Future_Up_20d,Future_Category_20d,Sector,Dollar_volume
0,2020-07-15 00:00:00-04:00,75.70745,76.358544,74.445025,75.391846,31026000,0.0,0.0,GOOGL,75.622864,...,1,3.0,0.004371,1,2.0,-0.006355,0,1.0,Communication Services,2339107000.0
1,2020-07-16 00:00:00-04:00,74.627436,75.343646,73.969881,75.294441,26484000,0.0,0.0,GOOGL,75.585687,...,1,2.0,0.015479,1,2.0,0.001142,1,2.0,Communication Services,1994098000.0
2,2020-07-17 00:00:00-04:00,75.393331,75.717391,74.458439,75.39035,34264000,0.0,0.0,GOOGL,75.365408,...,0,1.0,-0.019053,0,1.0,-0.008056,0,1.0,Communication Services,2583175000.0
3,2020-07-20 00:00:00-04:00,75.298402,77.982803,74.687071,77.725845,30166000,0.0,0.0,GOOGL,75.878429,...,0,0.0,-0.051847,0,0.0,-0.030438,0,0.0,Communication Services,2344678000.0
4,2020-07-21 00:00:00-04:00,78.779035,78.879431,77.168189,77.332207,27486000,0.0,0.0,GOOGL,76.226938,...,0,0.0,-0.0531,0,0.0,-9e-05,0,1.0,Communication Services,2125553000.0


### 1.2: Data integrity and ordering

In [4]:
# Order rows by Ticker, then by Date within each ticker, and renumber the
# index from 0. The sort is skipped (frame left untouched) when either key
# column is missing.
# NOTE(review): rows end up grouped by ticker first, so purely positional
# splits (e.g. TimeSeriesSplit) would not follow calendar time across
# tickers — confirm how folds are built downstream.
if all(key in df_model.columns for key in ("Date", "Ticker")):
    df_model = df_model.sort_values(by=["Ticker", "Date"], ignore_index=True)

df_model.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,SMA_5,...,Future_Up_5d,Future_Category_5d,Future_Return_10d,Future_Up_10d,Future_Category_10d,Future_Return_20d,Future_Up_20d,Future_Category_20d,Sector,Dollar_volume
0,2020-07-15 00:00:00-04:00,96.225135,96.475443,93.794962,94.995468,153198000,0.0,0.0,AAPL,93.694342,...,0,1.0,-0.027475,0,0.0,0.158493,1,3.0,Technology,14553120000.0
1,2020-07-16 00:00:00-04:00,93.865429,94.684396,93.226293,93.826546,110577600,0.0,0.0,AAPL,93.844038,...,0,0.0,-0.003445,0,1.0,0.193684,1,3.0,Technology,10375110000.0
2,2020-07-17 00:00:00-04:00,94.278547,94.434074,93.163092,93.636978,92186800,0.0,0.0,AAPL,93.923262,...,0,0.0,0.103112,1,3.0,0.195034,1,3.0,Technology,8632093000.0
3,2020-07-20 00:00:00-04:00,93.724481,95.748812,93.379394,95.610291,90318000,0.0,0.0,AAPL,94.483176,...,0,0.0,0.107567,1,3.0,0.167314,1,3.0,Technology,8635330000.0
4,2020-07-21 00:00:00-04:00,96.402524,96.477858,94.040395,94.290703,103433200,0.0,0.0,AAPL,94.471997,...,0,0.0,0.130567,1,3.0,0.193514,1,3.0,Technology,9752789000.0


### 1.3: Feature and target selection

## Section 2: Motivation and baseline model
### 2.1: Motivation
Logistic Regression provides an interpretable linear baseline. If its out-of-sample performance is similar to that of the Random Forest, this suggests that the feature–target relationship contains little non-linear structure for the more flexible model to exploit.

### 2.2: 