In [1]:
# Attach Google Drive to the Colab filesystem; the dataset path used later lives under this mount.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
import warnings

warnings.filterwarnings("ignore")

# Read the CSV (parsing `date` as datetimes) and order rows by symbol, then by date,
# so that a per-symbol shift below walks each symbol's rows in date order.
file_path = '/content/drive/MyDrive/MRP/final_dataset.csv'
df = (
    pd.read_csv(file_path, parse_dates=['date'])
    .sort_values(by=['symbol', 'date'])
    .reset_index(drop=True)
)

# Lag feature: the previous row's `return_1d` within the same symbol.
# Rows whose lag is missing (e.g. each symbol's first row) are removed.
lag_col = 'return_1d_lag1'
prev_return = df.groupby('symbol')['return_1d'].shift(1)
df[lag_col] = prev_return
df = df.dropna(subset=[lag_col]).reset_index(drop=True)

# Date-based split with no shuffling:
#   train -> dates up to and including 2021-12-31
#   val   -> dates after 2021-12-31, up to and including 2022-12-31
#   test  -> dates after 2022-12-31
train_end, val_end = '2021-12-31', '2022-12-31'
dates = df['date']

train = df.loc[dates <= train_end].copy()
val = df.loc[(dates > train_end) & (dates <= val_end)].copy()
test = df.loc[dates > val_end].copy()

print(f"Train: {train.shape[0]} rows | Val: {val.shape[0]} rows | Test: {test.shape[0]} rows")

# Column groups fed to the model, plus the label column.
price_feats = [
    'adj close',
    'log_volume',
    'ma_10',
    'vol_30',
    'rsi_14',
    'return_1d_lag1',
]
news_feats = [
    'avg_sentiment',
    'avg_sentiment_confidence',
    'sentiment_std_7',
]
weekday = ['day_of_week']  # kept as a list so it can be concatenated with the other groups
target = 'target'

# Column-wise preprocessing: one-hot encode the weekday column (first category dropped,
# dense output) and pass every other input column through unchanged.
weekday_encoder = OneHotEncoder(drop='first', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[('onehot_wd', weekday_encoder, weekday)],
    remainder='passthrough',
)

# LightGBM classifier with fixed hyperparameters; seeded for repeatable runs and
# allowed to use all available cores.
lgbm_params = {
    'n_estimators': 200,
    'learning_rate': 0.05,
    'random_state': 42,
    'n_jobs': -1,
}
lgbm = lgb.LGBMClassifier(**lgbm_params)

# Chain preprocessing and the classifier so both are applied together at fit and predict time.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', lgbm),
])

# Fit the pipeline on the training rows, using the three feature groups as inputs.
train_columns = [*price_feats, *news_feats, *weekday]
X_train, y_train = train[train_columns], train[target]
pipeline.fit(X_train, y_train)

# Score the test rows: hard class labels from predict(), and column 1 of
# predict_proba() as the continuous score.
test_columns = [*price_feats, *news_feats, *weekday]
X_test, y_test = test[test_columns], test[target]
y_pred = pipeline.predict(X_test)
class_probabilities = pipeline.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

# Report test-set metrics. Each row is (label, metric function, predictions to score):
# label-based metrics use y_pred, ROC AUC uses the probability scores in y_proba.
print("\nLightGBM Performance on Test Set (with return_1d_lag1):")
report_rows = (
    ("  Accuracy      : ", accuracy_score, y_pred),
    ("  Precision     : ", precision_score, y_pred),
    ("  Recall        : ", recall_score, y_pred),
    ("  F1 Score      : ", f1_score, y_pred),
    ("  ROC AUC       : ", roc_auc_score, y_proba),
)
for label, metric_fn, estimate in report_rows:
    # Metrics are computed one at a time, right before printing, in the listed order.
    print(f"{label}{metric_fn(y_test, estimate):.4f}")
print("  Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Train: 3636876 rows | Val: 644115 rows | Test: 571664 rows
[LightGBM] [Info] Number of positive: 1862573, number of negative: 1774303
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.103165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2134
[LightGBM] [Info] Number of data points in the train set: 3636876, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.512135 -> initscore=0.048551
[LightGBM] [Info] Start training from score 0.048551

LightGBM Performance on Test Set (with return_1d_lag1):
  Accuracy      : 0.5220
  Precision     : 0.5178
  Recall        : 0.7560
  F1 Score      : 0.6146
  ROC AUC       : 0.5244
  Confusion Matrix:
[[ 80523 202919]
 [ 70328 217894]]
