# Testing Notebook for Logistic Regression Modeling

In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LogisticRegression

import sys 
sys.path.append('..')
from src.DataFetcher import DataFetcher as DF
from src.TechnicalFeatures import TechnicalFeatures as TF

In [2]:
# Initialize 
# Fetch data using secrets.json file 
data_fetcher = DF(secrets_path='../secrets/secrets.json')

# Add technical features to the data (in dataframe)
tech_features = TF(symbols=data_fetcher.tickers, data_dir='../data/')
tech_features.populate_dfs()
tech_features.add_technical_features()

secrets.json file found at ../secrets/secrets.json. Beginning initialization of DataProcessor class.
Binance client initialized successfully.
All required fields loaded successfully from secrets.json.
5 tickers loaded successfully.
Frequency: Hourly
Starting date: 2020-04-10
Ending date: 2025-01-10
Base currency: USDT
Tickers: ['BTC', 'ETH', 'ADA', 'SOL', 'XRP']
Initialization of DataProcessor class completed successfully.


Fetching Crypto Data: 100%|[31m██████████████████████████████[0m| 5/5 [00:00<00:00]


Data for BTCUSDT already exists at /Users/devrajkatkoria/Documents/AlphaPort/data/BTCUSDT.csv. Skipping download.
Data for ETHUSDT already exists at /Users/devrajkatkoria/Documents/AlphaPort/data/ETHUSDT.csv. Skipping download.
Data for ADAUSDT already exists at /Users/devrajkatkoria/Documents/AlphaPort/data/ADAUSDT.csv. Skipping download.
Data for SOLUSDT already exists at /Users/devrajkatkoria/Documents/AlphaPort/data/SOLUSDT.csv. Skipping download.
Data for XRPUSDT already exists at /Users/devrajkatkoria/Documents/AlphaPort/data/XRPUSDT.csv. Skipping download.
All Historical Data Fetched and Saved Successfully.


Fetching Test Crypto Data: 100%|[34m██████████████████████████████[0m| 5/5 [00:00<00:00]

Testing Data for BTCUSDT already exists at /Users/devrajkatkoria/Documents/AlphaPort/data/BTCUSDT_TESTING.csv. Skipping download.
Testing Data for ETHUSDT already exists at /Users/devrajkatkoria/Documents/AlphaPort/data/ETHUSDT_TESTING.csv. Skipping download.
Testing Data for ADAUSDT already exists at /Users/devrajkatkoria/Documents/AlphaPort/data/ADAUSDT_TESTING.csv. Skipping download.
Testing Data for SOLUSDT already exists at /Users/devrajkatkoria/Documents/AlphaPort/data/SOLUSDT_TESTING.csv. Skipping download.
Testing Data for XRPUSDT already exists at /Users/devrajkatkoria/Documents/AlphaPort/data/XRPUSDT_TESTING.csv. Skipping download.
Live Historical Data Fetched and Saved Successfully.





Technical features added for all symbols.


In [3]:
# Print Symbols 
print(f"Symbols: {tech_features.dfs.keys()}")

Symbols: dict_keys(['BTCUSDT', 'ETHUSDT', 'ADAUSDT', 'SOLUSDT', 'XRPUSDT'])


In [4]:
# Print preview of BTCUSDT data
tech_features.dfs['BTCUSDT'].head()

Unnamed: 0_level_0,Open,Close,Volume,Low,High,Log_Return,SMA_12,EMA_12,RSI_14,Volatility_12,ATR_12,OBV,MACD_HIST,bb_upper,bb_lower,Forward_Return,Label
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-04-10 19:00:00,6913.49,6900.31,3843.491265,6853.9,6948.0,-0.274629,-1.496735,-1.494963,-0.444699,0.284129,-0.941011,-1.739723,0.038392,-1.485558,-1.499106,0.201321,1
2020-04-10 20:00:00,6900.13,6909.99,1795.505006,6874.73,6919.9,0.201321,-1.496768,-1.494987,-0.393062,0.288304,-0.953283,-1.735546,0.089497,-1.488173,-1.498257,-0.347668,0
2020-04-10 21:00:00,6909.99,6894.43,2047.355175,6878.05,6931.93,-0.347668,-1.496879,-1.495121,-0.273229,0.294648,-0.961259,-1.740309,0.115767,-1.490873,-1.497246,-0.870157,0
2020-04-10 22:00:00,6894.28,6855.01,2131.689652,6822.8,6916.41,-0.870157,-1.497196,-1.495525,-0.49026,0.330858,-0.951756,-1.745268,0.10856,-1.494089,-1.49584,0.076449,1
2020-04-10 23:00:00,6855.41,6858.92,2002.468602,6818.88,6875.27,0.076449,-1.497353,-1.495838,-0.492095,0.296766,-0.948368,-1.740609,0.110052,-1.496825,-1.494545,0.9686,1


## Baseline
Attempt logistic regression fit on BTC 

In [5]:
sym = 'BTCUSDT'
df = tech_features.dfs[sym].copy()

# Drop columns we do not want as features
drop_cols = ["Forward_Return", "Label"]
feature_cols = [c for c in df.columns if c not in drop_cols]

X = df[feature_cols].values
y = df['Label'].astype(int).values

# 5-fold time series cross-validation
tscv = TimeSeriesSplit(n_splits=5)

In [6]:
# Logistic Regression Model
model = LogisticRegression(
    solver="saga",
    penalty="l2",
    C=1.0,
    max_iter=1000,
    n_jobs=-1
)

In [7]:
# Accuracy score using cross-validation
scores = cross_val_score(
    model, X, y,
    cv=tscv,
    scoring="roc_auc",
    n_jobs=-1
)



In [8]:
print("BTC Direction ROC-AUC by fold:", scores)
print("Mean OOS ROC-AUC:", scores.mean())

BTC Direction ROC-AUC by fold: [0.53505872 0.52129166 0.54720056 0.54862079 0.5532616 ]
Mean OOS ROC-AUC: 0.5410866663772932


## Hyperparameter Tuning & Model Comparison

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, f1_score

In [10]:
f1_macro = make_scorer(f1_score, average='macro')

In [11]:
# Create Logistic Regression Grid 
lr = LogisticRegression(solver="saga", max_iter=1000)
lr_params = {"C":[0.01,0.1,1,10], "penalty":["l1","l2"]}
lr_grid = GridSearchCV(lr, lr_params, cv=tscv, scoring=f1_macro, n_jobs=-1)
lr_grid.fit(X, y)
print("LR best:", lr_grid.best_params_, "F1:", lr_grid.best_score_)



LR best: {'C': 0.01, 'penalty': 'l2'} F1: 0.4925747403232066




In [12]:
# Create Random Forest Grid 
rf = RandomForestClassifier(n_jobs=-1)
rf_params = {
    "n_estimators":[50,100,200],
    "max_depth":[3,5,10,None],
    "min_samples_leaf":[1,5,10]
}
rf_grid = GridSearchCV(rf, rf_params, cv=tscv, scoring=f1_macro, n_jobs=-1)
rf_grid.fit(X, y)
print("RF best:", rf_grid.best_params_, "F1:", rf_grid.best_score_)

RF best: {'max_depth': 5, 'min_samples_leaf': 10, 'n_estimators': 50} F1: 0.5153579004567934


In [13]:
# Create XGBoost Grid
xgb = XGBClassifier(
    objective="binary:logistic",
    use_label_encoder=False,
    eval_metric="logloss",
    n_jobs=-1
)
xgb_params = {
    "n_estimators":[50,100,200],
    "max_depth":[3,5,7],
    "learning_rate":[0.01,0.1,0.2]
}
xgb_grid = GridSearchCV(xgb, xgb_params, cv=tscv, scoring=f1_macro, n_jobs=-1)
xgb_grid.fit(X, y)
print("XGB best:", xgb_grid.best_params_, "F1:", xgb_grid.best_score_)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


XGB best: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200} F1: 0.5249192642310583


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [14]:
best_model = xgb_grid.best_estimator_
importance = best_model.feature_importances_
feat_imp = sorted(zip(feature_cols, importance), key=lambda x: x[1], reverse=True)
print("Feature Importances:")
for feat, imp in feat_imp:
    print(f"{feat}: {imp:.4f}")

Feature Importances:
Log_Return: 0.2234
RSI_14: 0.1044
MACD_HIST: 0.0921
Low: 0.0631
ATR_12: 0.0584
OBV: 0.0570
bb_upper: 0.0565
Close: 0.0555
Volatility_12: 0.0542
EMA_12: 0.0503
Volume: 0.0482
High: 0.0459
bb_lower: 0.0459
Open: 0.0450
SMA_12: 0.0000
