In [88]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [119]:
# Step 1: Load Engineered dataset
df= pd.read_csv("btc_feature.csv", parse_dates=['timestamp'], index_col= 'timestamp')

# Create target variable (binary classification)
# If next hour's price > current, target = 1 else 0
df['target']= (df['future_price'] > df['price']).astype(int)

df.tail()

Unnamed: 0_level_0,price,return_1h,rolling_mean_3h,rolling_mean_6h,rolling_std_3h,future_price,target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2025-06-12 01:00:00,107724.859375,0.001052,107754.84375,107768.661458,160.250689,107529.992188,0
2025-06-12 02:00:00,107529.992188,-0.001809,107622.184896,107711.768229,97.855536,107310.179688,0
2025-06-12 03:00:00,107310.179688,-0.002044,107521.677083,107640.934896,207.464856,106926.75,0
2025-06-12 04:00:00,106926.75,-0.003573,107255.640625,107505.242188,305.296849,106977.46875,1
2025-06-12 05:00:00,106977.46875,0.000474,107071.466146,107346.825521,208.281574,106839.140625,0


In [120]:
# Step 2: Define Features & target
features = ['return_1h', 'rolling_mean_3h', 'rolling_mean_6h', 'rolling_std_3h']
X = df[features]
y = df['target']

In [121]:
# Step 3: Train/test split(80/20)
X_train, X_test, y_train, y_test = train_test_split(X,y, shuffle= False, test_size= 0.2)

## Logistic Regression

In [122]:
# Step 4: Train LogisticRegression model
model = LogisticRegression()
model.fit(X_train, y_train)

In [123]:
# Step 5: Predict
df['prediction'] = model.predict(X)

In [124]:
# Step 6: Save prediction
df.to_csv("btc_model_output_logreg.csv")

In [144]:
# Step 7: Evaluate performance
print("✅ Logistic Regression Model Trained. Evaluation::")
print(classification_report(y_test, model.predict(X_test)))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, model.predict(X_test)))

✅ Logistic Regression Model Trained. Evaluation::
              precision    recall  f1-score   support

           0       0.67      0.21      0.32        19
           1       0.40      0.83      0.54        12

    accuracy                           0.45        31
   macro avg       0.53      0.52      0.43        31
weighted avg       0.56      0.45      0.41        31


Confusion Matrix:
[[ 4 15]
 [ 2 10]]


In [126]:
# Step 8: Save model (Optional)
joblib.dump(model, 'btc_price_model.pkl')

['btc_price_model.pkl']

## Random Forest Classifier

In [127]:
# Step 1: Train Random Forest
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

In [128]:
# Step 2: Predict and store
df['prediction'] = model.predict(X)

In [129]:
# Step 3: Save predictions
df.to_csv("btc_model_output_rf.csv")
joblib.dump(model, 'btc_rf_model.pkl')

['btc_rf_model.pkl']

In [130]:
# Step 4: Evaluation
print("✅ Random Forest Model Trained. Evaluation:")
print(classification_report(y_test, model.predict(X_test)))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, model.predict(X_test)))

✅ Random Forest Model Trained. Evaluation:
              precision    recall  f1-score   support

           0       0.60      0.32      0.41        19
           1       0.38      0.67      0.48        12

    accuracy                           0.45        31
   macro avg       0.49      0.49      0.45        31
weighted avg       0.52      0.45      0.44        31


Confusion Matrix:
[[ 6 13]
 [ 4  8]]


### Fine-Tune Random Forest

In [131]:
# Step 1: Define grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5],
    'class_weight': [None, 'balanced']
}

In [132]:
# Step 2: Grid search
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='f1', n_jobs=-1)
grid.fit(X_train, y_train)



In [133]:
# Step 3: Best model
model = grid.best_estimator_
df['prediction'] = model.predict(X)

In [134]:
# Step 4: Save results
df.to_csv("btc_model_output_rf_tuned.csv")
joblib.dump(model, 'btc_rf_tuned.pkl')

['btc_rf_tuned.pkl']

In [135]:
# Step 5: Evaluation
print("✅ Tuned RF Model. Best Params:")
print(grid.best_params_)
print("\nClassification Report:")
print(classification_report(y_test, model.predict(X_test)))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, model.predict(X_test)))

✅ Tuned RF Model. Best Params:
{'class_weight': None, 'max_depth': 3, 'min_samples_split': 5, 'n_estimators': 50}

Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.37      0.45        19
           1       0.37      0.58      0.45        12

    accuracy                           0.45        31
   macro avg       0.48      0.48      0.45        31
weighted avg       0.50      0.45      0.45        31


Confusion Matrix:
[[ 7 12]
 [ 5  7]]


## XGBOOST

In [136]:
# Step 1: Train XGBoost model
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [137]:
# Step 2: Predict
df['prediction'] = model.predict(X)

In [138]:
# Step 3: Save output
df.to_csv("btc_model_output_xgb.csv")
joblib.dump(model, 'btc_xgb_model.pkl')

['btc_xgb_model.pkl']

In [139]:
# Step 4: Evaluate
print("✅ XGBoost Model Trained.")
print("\nClassification Report:")
print(classification_report(y_test, model.predict(X_test)))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, model.predict(X_test)))

✅ XGBoost Model Trained.

Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.37      0.45        19
           1       0.37      0.58      0.45        12

    accuracy                           0.45        31
   macro avg       0.48      0.48      0.45        31
weighted avg       0.50      0.45      0.45        31


Confusion Matrix:
[[ 7 12]
 [ 5  7]]


### Fine-Tuning XGBoost

In [140]:
# Step 1: Grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'scale_pos_weight': [1, float(sum(y == 0)) / sum(y == 1)]
}

grid = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 54 candidates, totalling 162 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

In [141]:
# Step 4: Best model
model = grid.best_estimator_
df['prediction'] = model.predict(X)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

In [142]:
# Step 5: Save
df.to_csv("btc_model_output_xgb_tuned.csv")
joblib.dump(model, "btc_xgb_tuned.pkl")

['btc_xgb_tuned.pkl']

In [143]:
# Step 6: Evaluate
print("✅ Tuned XGBoost Model")
print("Best Params:", grid.best_params_)
print("\nClassification Report:")
print(classification_report(y_test, model.predict(X_test)))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, model.predict(X_test)))

✅ Tuned XGBoost Model
Best Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'scale_pos_weight': 1}

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.21      0.32        19
           1       0.40      0.83      0.54        12

    accuracy                           0.45        31
   macro avg       0.53      0.52      0.43        31
weighted avg       0.56      0.45      0.41        31


Confusion Matrix:
[[ 4 15]
 [ 2 10]]
