In [1]:
# Attach Google Drive to the Colab runtime; the dataset path used later in
# this notebook lives underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
import warnings

# Silence every warning category for the remainder of the session.
# NOTE(review): this blanket filter also hides any convergence warning the
# classifier might emit — consider narrowing it to specific categories.
warnings.simplefilter("ignore")

# Read the dataset and order rows by symbol, then chronologically within symbol
file_path = '/content/drive/MyDrive/MRP/final_dataset.csv'
df = (
    pd.read_csv(file_path, parse_dates=['date'])
      .sort_values(['symbol', 'date'])
      .reset_index(drop=True)
)

# Lag feature: the previous row's 1-day return within the same symbol
# (relies on the sort above so that "previous row" means "previous date")
df['return_1d_lag1'] = df.groupby('symbol')['return_1d'].shift(1)
# Keep only rows that have a lag value; this removes the first row of every
# symbol and any row whose preceding return_1d was itself missing
has_lag = df['return_1d_lag1'].notna()
df = df[has_lag].reset_index(drop=True)

# Time-ordered partitions: train up to TRAIN_END (inclusive), validation up to
# VAL_END (inclusive), test strictly after VAL_END
TRAIN_END = '2021-12-31'
VAL_END = '2022-12-31'

in_train = df['date'] <= TRAIN_END
in_val = (df['date'] > TRAIN_END) & (df['date'] <= VAL_END)
in_test = df['date'] > VAL_END

train = df.loc[in_train].copy()
# NOTE(review): `val` is not referenced again in the visible part of this notebook
val = df.loc[in_val].copy()
test = df.loc[in_test].copy()

print(f"Train: {len(train)} rows | Val: {len(val)} rows | Test: {len(test)} rows")

# Column roles: label column, categorical weekday column, continuous inputs
target = 'target'
weekday = ['day_of_week']
price_feats = [
    'adj close', 'log_volume', 'ma_10',
    'vol_30', 'rsi_14', 'return_1d_lag1',
]

# Preprocessing steps, one per column group
# - continuous features (lagged return included) are standardised
scale_step = ('scale_price', StandardScaler(), price_feats)
# - weekday is one-hot encoded; the first category is dropped to avoid collinearity
weekday_step = (
    'onehot_weekday',
    OneHotEncoder(drop='first', sparse_output=False),
    weekday,
)

# Any column not listed in a step above is discarded (remainder='drop')
preprocessor = ColumnTransformer(
    transformers=[scale_step, weekday_step],
    remainder='drop',
)

# Classifier with a fixed seed and a raised iteration cap
logreg = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)

# End-to-end model: preprocessing followed by logistic regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', logreg),
])

# Input columns shared by the training and test design matrices
feature_cols = price_feats + weekday

# Train the pipeline; the preprocessing steps are fitted on the training rows only
X_train = train[feature_cols]
y_train = train[target]
pipeline.fit(X_train, y_train)

# Score the held-out test period
X_test = test[feature_cols]
y_test = test[target]
y_pred = pipeline.predict(X_test)
# Column 1 of predict_proba is the probability of the second class in the
# classifier's class ordering (the positive label if target is 0/1 — TODO confirm)
y_proba = pipeline.predict_proba(X_test)[:, 1]

# (label, metric function, scores passed to it); ROC AUC takes probabilities,
# every other metric takes the hard class predictions
metric_rows = (
    ("Accuracy", accuracy_score, y_pred),
    ("Precision", precision_score, y_pred),
    ("Recall", recall_score, y_pred),
    ("F1 Score", f1_score, y_pred),
    ("ROC AUC", roc_auc_score, y_proba),
)

print("\nLogistic Regression Performance on Test Set:")
for metric_label, metric_fn, scores in metric_rows:
    # Labels are left-padded to 11 characters so the colons line up
    print(f"  {metric_label:<11}: {metric_fn(y_test, scores):.4f}")
print("  Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Train: 3636876 rows | Val: 644115 rows | Test: 571664 rows

Logistic Regression Performance on Test Set:
  Accuracy   : 0.5201
  Precision  : 0.5152
  Recall     : 0.8151
  F1 Score   : 0.6314
  ROC AUC    : 0.5203
  Confusion Matrix:
[[ 62424 221018]
 [ 53302 234920]]
