In [2]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

RANDOM_STATE = 42

## Section 1: Load Dataset

In [3]:
df_model = pd.read_parquet("df_model_top3_by_sector.parquet")

print("df_model shape:", df_model.shape)
print(df_model.columns)
df_model.head()

df_model shape: (39549, 75)
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits', 'Ticker', 'SMA_5', 'SMA_10', 'SMA_20', 'SMA_50',
       'EMA_12', 'EMA_26', 'MACD', 'MACD_Signal', 'MACD_Histogram', 'RSI',
       'BB_Middle', 'BB_Upper', 'BB_Lower', 'BB_Width', 'BB_Position',
       'Volatility', 'Price_Change', 'Price_Change_5d', 'High_Low_Ratio',
       'Open_Close_Ratio', 'Volume_SMA', 'Volume_Ratio', 'Close_lag_1',
       'Close_lag_2', 'Close_lag_3', 'Close_lag_5', 'Close_lag_10',
       'Volume_lag_1', 'Volume_lag_2', 'Volume_lag_3', 'Volume_lag_5',
       'Volume_lag_10', 'Price_Change_lag_1', 'Price_Change_lag_2',
       'Price_Change_lag_3', 'Price_Change_lag_5', 'Price_Change_lag_10',
       'RSI_lag_1', 'RSI_lag_2', 'RSI_lag_3', 'RSI_lag_5', 'RSI_lag_10',
       'MACD_lag_1', 'MACD_lag_2', 'MACD_lag_3', 'MACD_lag_5', 'MACD_lag_10',
       'Volatility_lag_1', 'Volatility_lag_2', 'Volatility_lag_3',
       'Volatility_lag_5', 'Volatility_

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,SMA_5,...,Future_Up_5d,Future_Category_5d,Future_Return_10d,Future_Up_10d,Future_Category_10d,Future_Return_20d,Future_Up_20d,Future_Category_20d,Sector,Dollar_volume
0,2020-07-15 00:00:00-04:00,75.70745,76.358544,74.445025,75.391846,31026000,0.0,0.0,GOOGL,75.622864,...,1,3.0,0.004371,1,2.0,-0.006355,0,1.0,Communication Services,2339107000.0
1,2020-07-16 00:00:00-04:00,74.627436,75.343646,73.969881,75.294441,26484000,0.0,0.0,GOOGL,75.585687,...,1,2.0,0.015479,1,2.0,0.001142,1,2.0,Communication Services,1994098000.0
2,2020-07-17 00:00:00-04:00,75.393331,75.717391,74.458439,75.39035,34264000,0.0,0.0,GOOGL,75.365408,...,0,1.0,-0.019053,0,1.0,-0.008056,0,1.0,Communication Services,2583175000.0
3,2020-07-20 00:00:00-04:00,75.298402,77.982803,74.687071,77.725845,30166000,0.0,0.0,GOOGL,75.878429,...,0,0.0,-0.051847,0,0.0,-0.030438,0,0.0,Communication Services,2344678000.0
4,2020-07-21 00:00:00-04:00,78.779035,78.879431,77.168189,77.332207,27486000,0.0,0.0,GOOGL,76.226938,...,0,0.0,-0.0531,0,0.0,-9e-05,0,1.0,Communication Services,2125553000.0


### 1.2 Data integrity and ordering

In [4]:
# sort by time (recommended)
if "Date" in df_model.columns and "Ticker" in df_model.columns:
    df_model = df_model.sort_values(["Ticker", "Date"]).reset_index(drop=True)

df_model.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,SMA_5,...,Future_Up_5d,Future_Category_5d,Future_Return_10d,Future_Up_10d,Future_Category_10d,Future_Return_20d,Future_Up_20d,Future_Category_20d,Sector,Dollar_volume
0,2020-07-15 00:00:00-04:00,96.225135,96.475443,93.794962,94.995468,153198000,0.0,0.0,AAPL,93.694342,...,0,1.0,-0.027475,0,0.0,0.158493,1,3.0,Technology,14553120000.0
1,2020-07-16 00:00:00-04:00,93.865429,94.684396,93.226293,93.826546,110577600,0.0,0.0,AAPL,93.844038,...,0,0.0,-0.003445,0,1.0,0.193684,1,3.0,Technology,10375110000.0
2,2020-07-17 00:00:00-04:00,94.278547,94.434074,93.163092,93.636978,92186800,0.0,0.0,AAPL,93.923262,...,0,0.0,0.103112,1,3.0,0.195034,1,3.0,Technology,8632093000.0
3,2020-07-20 00:00:00-04:00,93.724481,95.748812,93.379394,95.610291,90318000,0.0,0.0,AAPL,94.483176,...,0,0.0,0.107567,1,3.0,0.167314,1,3.0,Technology,8635330000.0
4,2020-07-21 00:00:00-04:00,96.402524,96.477858,94.040395,94.290703,103433200,0.0,0.0,AAPL,94.471997,...,0,0.0,0.130567,1,3.0,0.193514,1,3.0,Technology,9752789000.0


## Section 2: Logistic Regression, Motivation and baseline model
### 2.1: Motivation
Logistic Regression is used as a linear baseline model to benchmark the performance of more complex approaches such as Random Forest. As a parametric model, Logistic Regression assumes a linear relationship between the input features and the log-odds of the target variable.

In financial prediction tasks, Logistic Regression is widely used due to its simplicity, interpretability, and robustness. If Logistic Regression performs comparably to Random Forest, this suggests that the underlying relationship between features and the target is largely linear and that additional model complexity provides limited benefit.

### 2.2: Feature and target selection

In [6]:
X = df_model[["Price_Change_5d"]].copy()
Y = df_model["Future_Up_5d"].copy()

# Remove missing values
tmp = pd.concat([X, Y], axis=1).dropna()
X = tmp[["Price_Change_5d"]]
Y = tmp["Future_Up_5d"]

print("Class balance:")
print(Y.value_counts(normalize=True))

Class balance:
Future_Up_5d
1    0.547245
0    0.452755
Name: proportion, dtype: float64


### 2.3: Train test split
A chronological train–test split is used to avoid look-ahead bias and to reflect real-world forecasting conditions.

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.30, shuffle=False
)

print(X_train.shape, X_test.shape)

(27684, 1) (11865, 1)


### 2.4: Baseline logistic regression model
Because Logistic Regression is sensitive to feature scaling, a standardization step is included using a pipeline.

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

lr = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(
        penalty="l2",
        solver="lbfgs",
        max_iter=2000,
        random_state=42
    ))
])

lr.fit(X_train, Y_train)

0,1,2
,steps,"[('scaler', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,2000


### 2.5 Baseline model evaluation

In [9]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

Y_pred = lr.predict(X_test)
Y_prob = lr.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(Y_test, Y_pred)
roc_auc = roc_auc_score(Y_test, Y_prob)

print("Accuracy:", accuracy)
print("ROC-AUC:", roc_auc)
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_pred))

Accuracy: 0.5527180783817952
ROC-AUC: 0.4957464673039969
Confusion Matrix:
 [[   0 5307]
 [   0 6558]]


The baseline Logistic Regression model achieves an accuracy of approximately 55%, which corresponds closely to the proportion of positive class observations in the dataset. However, the ROC-AUC is close to 0.5, indicating that the model has almost no ability to discriminate between upward and downward price movements.

The confusion matrix reveals that the model predicts the positive class for all observations, effectively collapsing to a majority-class classifier. This behavior suggests that the selected feature does not provide sufficient linear separation between classes.

### 2.6 Model interpretation

In [10]:
coef = lr.named_steps["model"].coef_[0][0]
intercept = lr.named_steps["model"].intercept_[0]

print("Intercept:", intercept)
print("Coefficient (Price_Change_5d):", coef)

Intercept: 0.18006685289274818
Coefficient (Price_Change_5d): -0.00020878456024218936


The estimated coefficient for Price_Change_5d is close to zero, indicating that recent 5-day price changes have negligible linear influence on the probability of a positive 5-day future return. In contrast, the intercept term dominates the prediction, reflecting the overall class imbalance in the data.

This confirms that Logistic Regression fails to identify a meaningful linear decision boundary based on the available feature.

### 2.7 Time-series cross validation
To assess robustness across different time periods, time-series cross-validation is applied using a walk-forward validation scheme.

In [11]:
from sklearn.model_selection import TimeSeriesSplit, cross_validate

tscv = TimeSeriesSplit(n_splits=5)

scoring = {
    "accuracy": "accuracy",
    "roc_auc": "roc_auc",
    "f1": "f1"
}

cv_results = cross_validate(
    lr, X, Y,
    cv=tscv,
    scoring=scoring,
    return_train_score=True
)

print("CV Accuracy: {:.4f} ± {:.4f}".format(
    cv_results["test_accuracy"].mean(),
    cv_results["test_accuracy"].std()
))
print("CV ROC-AUC: {:.4f} ± {:.4f}".format(
    cv_results["test_roc_auc"].mean(),
    cv_results["test_roc_auc"].std()
))

CV Accuracy: 0.5528 ± 0.0058
CV ROC-AUC: 0.4919 ± 0.0114


Time-series cross-validation yields highly stable accuracy and ROC-AUC values across folds. While accuracy remains around the majority-class baseline, the ROC-AUC consistently stays close to 0.5, confirming the absence of discriminative power.

The stability of the results across time periods indicates that the observed behavior is structural rather than due to overfitting or a particular data split.

 ### 2.8: Hyper Parameter tuning

In [12]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    "model__C": np.logspace(-3, 3, 50)
}

search = RandomizedSearchCV(
    estimator=lr,
    param_distributions=param_dist,
    n_iter=15,
    scoring="roc_auc",
    cv=tscv,
    random_state=42,
    n_jobs=-1
)

search.fit(X, Y)

print("Best parameters:", search.best_params_)
print("Best CV ROC-AUC:", search.best_score_)

best_lr = search.best_estimator_

Best parameters: {'model__C': np.float64(0.03906939937054617)}
Best CV ROC-AUC: 0.4919269045196387


Hyperparameter tuning selects a very small regularization parameter, corresponding to strong coefficient shrinkage. This indicates that the model benefits from aggressive regularization in the presence of noisy financial data.

Despite tuning, cross-validated ROC-AUC does not improve meaningfully, suggesting that model complexity is not the limiting factor. Instead, the weak performance reflects the lack of a strong linear relationship between the feature and the target.

### 2.9: Tune parameter eval

In [13]:
Y_pred_tuned = best_lr.predict(X_test)
Y_prob_tuned = best_lr.predict_proba(X_test)[:, 1]

accuracy_tuned = accuracy_score(Y_test, Y_pred_tuned)
roc_auc_tuned = roc_auc_score(Y_test, Y_prob_tuned)

print("Tuned Accuracy:", accuracy_tuned)
print("Tuned ROC-AUC:", roc_auc_tuned)

Tuned Accuracy: 0.5527180783817952
Tuned ROC-AUC: 0.5042535326960031


The tuned Logistic Regression model achieves similar accuracy to the baseline model and only a marginal improvement in ROC-AUC on the hold-out test set. The confusion matrix remains dominated by positive class predictions, indicating that tuning does not alter the model’s qualitative behavior.

These results confirm that Logistic Regression remains unable to extract meaningful predictive structure from the data, even after optimization.

### 2.10: Conclusion
Overall, Logistic Regression proves to be an ineffective predictive model for 5-day stock price direction when using recent price changes as input. The model consistently collapses to a majority-class predictor, with ROC-AUC values near random guessing across validation and test sets.

This outcome highlights the limitations of linear models in capturing weak and potentially non-linear relationships in financial time series data and motivates the use of more flexible modeling approaches.

In [None]:
import pandas as pd
results = pd.DataFrame([{
    "model": "Random Forest",
    "roc_auc_mean": 0.5173,
    "roc_auc_std": 0.011566	,
    "accuracy_mean": 0.552966,
    "f1_mean": 0.698033
}])

results.to_csv("logreg_results.csv", index=False)