In [136]:
from pathlib import Path
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import json

In [137]:
# --- Safe path for GitHub or Streamlit deployment ---
BASE_DIR = Path().resolve().parent  # adjust `.parent` depending on notebook location
DATA_PATH = BASE_DIR / "data"
MODEL_PATH = BASE_DIR / "models"
DATA_PATH.mkdir(exist_ok=True)
MODEL_PATH.mkdir(exist_ok=True)

In [138]:
# Step 1: Load Engineered dataset
df= pd.read_csv(DATA_PATH/"btc_feature.csv", parse_dates=['timestamp'], index_col= 'timestamp')

# Create target variable (binary classification)
# If next hour's price > current, target = 1 else 0
df['target']= (df['future_price'] > df['price']).astype(int)

df.tail()

Unnamed: 0_level_0,price,return_1h,rolling_mean_3h,rolling_mean_6h,rolling_std_3h,future_price,target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2025-06-14 10:00:00,104657.304688,0.001559,104672.854167,104853.514323,186.701119,104915.703125,1
2025-06-14 11:00:00,104915.703125,0.002469,104689.140625,104828.365885,212.441201,104736.453125,0
2025-06-14 12:00:00,104736.453125,-0.001709,104769.820312,104772.953125,132.391333,104908.320312,1
2025-06-14 13:00:00,104908.320312,0.001641,104853.492188,104763.173177,101.425998,105445.15625,1
2025-06-14 14:00:00,105445.15625,0.005117,105029.976562,104859.558594,369.682583,105513.789062,1


In [139]:
# Step 2: Define Features & target
features = ['return_1h', 'rolling_mean_3h', 'rolling_mean_6h', 'rolling_std_3h']
X = df[features]
y = df['target']

In [140]:
# Step 3: Train/test split(80/20)
X_train, X_test, y_train, y_test = train_test_split(X,y, shuffle= False, test_size= 0.2)

## Logistic Regression

In [141]:
# Step 4: Train LogisticRegression model
model = LogisticRegression()
model.fit(X_train, y_train)

In [142]:
# Step 5: Predict
df['prediction'] = model.predict(X)

In [143]:
# Step 6: Save prediction
df.to_csv(MODEL_PATH / "btc_logreg_model.csv")

In [144]:
# Step 7: Evaluate performance
print("✅ Logistic Regression Model Trained. Evaluation::")
print(classification_report(y_test, model.predict(X_test)))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, model.predict(X_test)))

✅ Logistic Regression Model Trained. Evaluation::
              precision    recall  f1-score   support

           0       0.50      0.67      0.57        15
           1       0.62      0.44      0.52        18

    accuracy                           0.55        33
   macro avg       0.56      0.56      0.54        33
weighted avg       0.56      0.55      0.54        33


Confusion Matrix:
[[10  5]
 [10  8]]


In [145]:
# Step 8: Save model (Optional)
joblib.dump(model, MODEL_PATH/'btc_logreg_model.pkl')

['/Users/badboihy/btc_ml_dashboard/models/btc_logreg_model.pkl']

## Random Forest Classifier

In [146]:
# Step 1: Train Random Forest
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

In [147]:
# Step 2: Predict and store
df['prediction'] = model.predict(X)

In [148]:
# Step 3: Save predictions
df.to_csv(MODEL_PATH/"btc_rf_model.csv")
joblib.dump(model, MODEL_PATH/'btc_rf_model.pkl')

['/Users/badboihy/btc_ml_dashboard/models/btc_rf_model.pkl']

In [149]:
# Step 4: Evaluation
print("✅ Random Forest Model Trained. Evaluation:")
print(classification_report(y_test, model.predict(X_test)))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, model.predict(X_test)))

✅ Random Forest Model Trained. Evaluation:
              precision    recall  f1-score   support

           0       0.67      0.27      0.38        15
           1       0.59      0.89      0.71        18

    accuracy                           0.61        33
   macro avg       0.63      0.58      0.55        33
weighted avg       0.63      0.61      0.56        33


Confusion Matrix:
[[ 4 11]
 [ 2 16]]


### Fine-Tune Random Forest

In [150]:
# Step 1: Define grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5],
    'class_weight': [None, 'balanced']
}

In [151]:
# Step 2: Grid search
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='f1', n_jobs=-1)
grid.fit(X_train, y_train)

In [152]:
# Step 3: Best model
model = grid.best_estimator_
df['prediction'] = model.predict(X)

In [153]:
# Step 4: Save results
df.to_csv(MODEL_PATH/"btc_rf_tuned_model.csv")
joblib.dump(model, MODEL_PATH/'btc_rf_tuned_model.pkl')

['/Users/badboihy/btc_ml_dashboard/models/btc_rf_tuned_model.pkl']

In [154]:
# Step 5: Evaluation
print("✅ Tuned RF Model. Best Params:")
print(grid.best_params_)
print("\nClassification Report:")
print(classification_report(y_test, model.predict(X_test)))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, model.predict(X_test)))

✅ Tuned RF Model. Best Params:
{'class_weight': 'balanced', 'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 50}

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.27      0.38        15
           1       0.59      0.89      0.71        18

    accuracy                           0.61        33
   macro avg       0.63      0.58      0.55        33
weighted avg       0.63      0.61      0.56        33


Confusion Matrix:
[[ 4 11]
 [ 2 16]]


## XGBOOST

In [155]:
# Step 1: Train XGBoost model
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [156]:
# Step 2: Predict
df['prediction'] = model.predict(X)

In [157]:
# Step 3: Save output
df.to_csv(MODEL_PATH/"btc_xgb_model.csv")
joblib.dump(model, MODEL_PATH/'btc_xgb_model.pkl')

['/Users/badboihy/btc_ml_dashboard/models/btc_xgb_model.pkl']

In [158]:
# Step 4: Evaluate
print("✅ XGBoost Model Trained.")
print("\nClassification Report:")
print(classification_report(y_test, model.predict(X_test)))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, model.predict(X_test)))

✅ XGBoost Model Trained.

Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.33      0.42        15
           1       0.58      0.78      0.67        18

    accuracy                           0.58        33
   macro avg       0.57      0.56      0.54        33
weighted avg       0.57      0.58      0.55        33


Confusion Matrix:
[[ 5 10]
 [ 4 14]]


### Fine-Tuning XGBoost

In [159]:
# Step 1: Grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'scale_pos_weight': [1, float(sum(y == 0)) / sum(y == 1)]
}

grid = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 54 candidates, totalling 162 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


In [160]:
# Step 4: Best model
model = grid.best_estimator_
df['prediction'] = model.predict(X)

In [161]:
# Step 5: Save
df.to_csv(MODEL_PATH/"btc_xgb_tuned_model.csv")
joblib.dump(model, MODEL_PATH/"btc_xgb_tuned_model.pkl")

['/Users/badboihy/btc_ml_dashboard/models/btc_xgb_tuned_model.pkl']

In [162]:
# Step 6: Evaluate
print("✅ Tuned XGBoost Model")
print("Best Params:", grid.best_params_)
print("\nClassification Report:")
print(classification_report(y_test, model.predict(X_test)))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, model.predict(X_test)))

✅ Tuned XGBoost Model
Best Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'scale_pos_weight': 0.9397590361445783}

Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.53      0.50        15
           1       0.56      0.50      0.53        18

    accuracy                           0.52        33
   macro avg       0.52      0.52      0.51        33
weighted avg       0.52      0.52      0.52        33


Confusion Matrix:
[[8 7]
 [9 9]]


In [163]:
with open(MODEL_PATH / 'xgb_best_params.json', 'w') as f:
    json.dump(grid.best_params_, f)
