In [1]:
# Attach Google Drive to the Colab runtime; later file paths are rooted at this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
import warnings

warnings.filterwarnings("ignore")

# Location of the dataset on the mounted Drive.
file_path = '/content/drive/MyDrive/MRP/final_dataset.csv'

# Single chain: read -> order by (symbol, date) -> add the previous row's return
# within each symbol -> discard rows that have no previous return.
df = (
    pd.read_csv(file_path, parse_dates=['date'])
    .sort_values(by=['symbol', 'date'])
    .reset_index(drop=True)
    # shift(1) inside each symbol group, so the first row of every symbol gets NaN
    .assign(return_1d_lag1=lambda frame: frame.groupby('symbol')['return_1d'].shift(1))
    .dropna(subset=['return_1d_lag1'])
    .reset_index(drop=True)
)

# Time-ordered split on the 'date' column:
#   train : dates up to and including 2021-12-31
#   val   : dates after 2021-12-31, up to and including 2022-12-31
#   test  : dates after 2022-12-31
date_col = df['date']
train_mask = date_col <= '2021-12-31'
val_mask = (date_col > '2021-12-31') & (date_col <= '2022-12-31')
test_mask = date_col > '2022-12-31'

# Independent copies so later column assignments do not touch `df`.
train = df.loc[train_mask].copy()
val = df.loc[val_mask].copy()
test = df.loc[test_mask].copy()

print(f"Train: {len(train)} rows | Val: {len(val)} rows | Test: {len(test)} rows")

# Column roles: features forwarded as-is, the weekday column to be one-hot
# encoded, and the label column.
price_feats = [
    'adj close',
    'log_volume',
    'ma_10',
    'vol_30',
    'rsi_14',
    'return_1d_lag1',
]
weekday = ['day_of_week']
target = 'target'

# Weekday -> dense one-hot columns with the first category dropped; every other
# input column (i.e. price_feats) is passed through untouched.
weekday_encoder = OneHotEncoder(drop='first', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[('onehot_weekday', weekday_encoder, weekday)],
    remainder='passthrough',
)

# 100-tree random forest, fixed seed, using all available cores.
forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Preprocessing followed by the classifier, exposed as a single estimator.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', forest),
])

# Train the whole pipeline on the training split only.
X_train, y_train = train.loc[:, price_feats + weekday], train.loc[:, target]
pipeline.fit(X_train, y_train)

# Evaluate on test set
X_test = test[price_feats + weekday]
y_test = test[target]

# Run inference once. A random forest's predict() is defined as the class with the
# highest mean predicted probability, so the hard labels are taken from the
# probability matrix instead of paying for a second full pass (preprocessing plus
# all trees) over the test rows. The result is identical to pipeline.predict(X_test).
# NOTE: revisit this shortcut if the final step is swapped for an estimator whose
# predict() is not the argmax of predict_proba().
test_proba = pipeline.predict_proba(X_test)
y_pred = pipeline.classes_[test_proba.argmax(axis=1)]
# Score for the second entry of classes_; assumes that entry is the positive
# label (1) -- TODO confirm against the target encoding.
y_proba = test_proba[:, 1]

# Print performance metrics
print("\nRandom Forest (+ return_1d_lag1) Performance on Test Set:")
# (line prefix, metric function, predictions it consumes). ROC AUC is the only
# metric fed with scores rather than hard labels.
metric_rows = (
    ("  Accuracy      : ", accuracy_score, y_pred),
    ("  Precision     : ", precision_score, y_pred),
    ("  Recall        : ", recall_score, y_pred),
    ("  F1 Score      : ", f1_score, y_pred),
    ("  ROC AUC       : ", roc_auc_score, y_proba),
)
for prefix, metric_fn, y_hat in metric_rows:
    print(f"{prefix}{metric_fn(y_test, y_hat):.4f}")
print("  Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Train: 3636876 rows | Val: 644115 rows | Test: 571664 rows

Random Forest (+ return_1d_lag1) Performance on Test Set:
  Accuracy      : 0.5089
  Precision     : 0.5121
  Recall        : 0.5490
  F1 Score      : 0.5299
  ROC AUC       : 0.5131
  Confusion Matrix:
[[132719 150723]
 [130001 158221]]
