# Crypto Regime Classifier - Model Evaluation

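This notebook trains several regime classification models, evaluates them on a held-out test split (accuracy, macro F1, confusion matrix, feature importance, prediction stability), compares them with an ensemble, and saves the best-performing model.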

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    f1_score,
    accuracy_score,
    precision_recall_fscore_support
)

from src.features import FeatureExtractor
from src.labeling import RegimeLabeler, RegimeType
from src.models import RegimeClassifier, EnsembleClassifier
from src.utils.data import load_ohlcv, prepare_training_data

# Apply the seaborn-style white grid theme to every figure created below.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Load Data and Prepare Features

In [None]:
# Update path to your data file
DATA_PATH = "../data/BTC.csv"

try:
    df = load_ohlcv(DATA_PATH)
    print(f"Loaded {len(df)} rows")
except FileNotFoundError:
    print(f"Data file not found at {DATA_PATH}")
    # Fall back to a synthetic random-walk series so the remaining cells can
    # still be executed end to end for demonstration purposes.
    n_rows = 1000
    np.random.seed(42)  # fixed seed keeps the fallback data reproducible
    dates = pd.date_range('2020-01-01', periods=n_rows, freq='D')

    # Close follows a geometric random walk; open is a noisy copy of close.
    price = 10000 * np.exp(np.cumsum(np.random.randn(n_rows) * 0.02))
    open_price = price * (1 + np.random.randn(n_rows) * 0.01)

    # Anchor the wicks to the candle body so every row is a valid candle:
    # high >= max(open, close) and low <= min(open, close). Scaling `price`
    # alone would let `open` fall outside the [low, high] range.
    body_top = np.maximum(open_price, price)
    body_bottom = np.minimum(open_price, price)
    high_price = body_top * (1 + np.abs(np.random.randn(n_rows)) * 0.02)
    low_price = body_bottom * (1 - np.abs(np.random.randn(n_rows)) * 0.02)

    df = pd.DataFrame({
        'open': open_price,
        'high': high_price,
        'low': low_price,
        'close': price,
        'volume': np.random.lognormal(20, 1, n_rows)
    }, index=dates)
    print("Using synthetic data for demonstration")

In [None]:
# Feature pipeline and regime-labelling rule used to build the dataset
extractor = FeatureExtractor()
labeler = RegimeLabeler(trend_threshold=0.02, vol_percentile=80)

# Build the train/test feature matrices and label vectors from the price frame
split = prepare_training_data(
    df=df,
    feature_extractor=extractor,
    labeler=labeler,
    test_size=0.2,
)
X_train, X_test, y_train, y_test = split

# Report the size of each split
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

## 2. Train and Evaluate Models

In [None]:
# Candidate classifiers: display name -> model_type passed to RegimeClassifier
model_types = {
    'Random Forest': 'random_forest',
    'Gradient Boosting': 'gradient_boosting',
    'Logistic Regression': 'logistic',
}
models = {
    label: RegimeClassifier(model_type=kind)
    for label, kind in model_types.items()
}

# Per-model fitted estimator, test-set metrics and test-set predictions
results = {}

for name, model in models.items():
    # Section banner for this model's output
    print(f"\n{'='*50}")
    print(f"Training {name}")
    print('='*50)

    # Fit on the training split, then score on the held-out test split
    model.fit(X_train, y_train, verbose=False)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')

    results[name] = dict(
        model=model,
        accuracy=acc,
        f1_macro=f1,
        predictions=y_pred,
    )

    print(f"Accuracy: {acc:.4f}")
    print(f"F1 (macro): {f1:.4f}")

In [None]:
# One row per model, one column per metric
metric_rows = {
    name: {'Accuracy': r['accuracy'], 'F1 (macro)': r['f1_macro']}
    for name, r in results.items()
}
comparison_df = pd.DataFrame.from_dict(metric_rows, orient='index')

print("Model Comparison:")
print(comparison_df.round(4))

In [None]:
# Grouped bar chart: one (accuracy, macro-F1) pair of bars per model
fig, ax = plt.subplots(figsize=(10, 5))

width = 0.35
x = np.arange(len(comparison_df))

bars1 = ax.bar(x - width/2, comparison_df['Accuracy'], width, label='Accuracy')
bars2 = ax.bar(x + width/2, comparison_df['F1 (macro)'], width, label='F1 (macro)')

ax.set(ylabel='Score', title='Model Performance Comparison', ylim=(0, 1))
ax.set_xticks(x)
ax.set_xticklabels(comparison_df.index)
ax.legend()

# Write each bar's value just above its top edge
for container in (bars1, bars2):
    for bar in container:
        height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            f'{height:.3f}',
            xy=(bar_center, height),
            xytext=(0, 3),
            textcoords="offset points",
            ha='center',
            va='bottom',
            fontsize=9,
        )

plt.tight_layout()
plt.show()

## 3. Confusion Matrix Analysis

In [None]:
# Select the model with the highest macro-F1 on the test split
best_model_name, best_result = max(
    results.items(),
    key=lambda item: item[1]['f1_macro'],
)

print(f"Best Model: {best_model_name}")
print(f"\nClassification Report:")
report = classification_report(y_test, best_result['predictions'])
print(report)

In [None]:
# Confusion matrix heatmaps (raw counts and row-normalized) for the best model
best_predictions = best_result['predictions']

# Use the union of true and predicted classes and pass it explicitly, so the
# matrix rows/columns and the tick labels are guaranteed to line up even when
# a class is predicted but never occurs in y_test (or vice versa).
labels = sorted(set(y_test) | set(best_predictions))
cm = confusion_matrix(y_test, best_predictions, labels=labels)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Raw counts
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels, ax=axes[0])
axes[0].set_title(f'{best_model_name} - Confusion Matrix (Counts)')
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('Actual')

# Normalized by the number of true samples per class (each row sums to 1).
# Rows with no true samples are left at 0 instead of dividing by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals > 0,
)
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues',
            xticklabels=labels, yticklabels=labels, ax=axes[1])
axes[1].set_title(f'{best_model_name} - Confusion Matrix (Normalized)')
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('Actual')

plt.tight_layout()
plt.show()

## 4. Feature Importance

In [None]:
# Importance scores reported by the fitted Random Forest model
top_n = 20
rf_model = results['Random Forest']['model']
importance = rf_model.get_feature_importance()

# Plot the first `top_n` entries
# (assumes the result is already ordered most-important-first - TODO confirm)
top_features = importance.head(top_n)

fig, ax = plt.subplots(figsize=(10, 8))
top_features.plot.barh(ax=ax)
ax.set(
    xlabel='Importance',
    title=f'Top {top_n} Feature Importance (Random Forest)',
)
ax.invert_yaxis()  # first entry at the top of the chart
plt.tight_layout()
plt.show()

## 5. Regime Prediction Stability

In [None]:
# Check regime persistence of the best model's test-set predictions
predictions = best_result['predictions']

# Work on a plain array so this works for both pandas Series and ndarrays.
pred_values = np.asarray(predictions)
total_days = len(pred_values)

# Count only real transitions between consecutive predictions. Comparing with
# shift(1) would flag the first row as a switch (label != NaN) and overstate
# the count by one.
switches = int(np.count_nonzero(pred_values[1:] != pred_values[:-1]))
switch_rate = switches / total_days if total_days else 0.0

# N switches split the series into N + 1 uninterrupted runs, so the mean run
# length is total_days / (switches + 1); this is also safe when switches == 0.
avg_duration = total_days / (switches + 1)

print(f"Regime switches: {switches}")
print(f"Total days: {total_days}")
print(f"Switch rate: {switch_rate:.2%}")
print(f"Average regime duration: {avg_duration:.1f} days")

In [None]:
# Colour used for each regime label in the timeline panels
regime_colors = {
    'BULL_TREND': 'green',
    'BEAR_TREND': 'red',
    'SIDEWAYS': 'gray',
    'HIGH_VOL': 'orange'
}


def _shade_regimes(ax, regime_series, ylabel):
    """Shade `ax` with one coloured band per regime present in `regime_series`.

    `regime_series` is compared against each `RegimeType` value and its index
    is used as the x axis; regimes that never occur are skipped.
    """
    for regime in RegimeType:
        mask = regime_series == regime.value
        if not mask.any():
            continue
        ax.fill_between(
            regime_series.index, 0, 1,
            where=mask,
            alpha=0.7,
            color=regime_colors[regime.value],
            label=regime.value
        )
    ax.set_ylabel(ylabel)
    ax.legend(loc='upper right', ncol=4)


# Closing prices restricted to the rows of the test split
test_prices = df.loc[y_test.index, 'close']

fig, axes = plt.subplots(3, 1, figsize=(14, 10), sharex=True)
price_ax, actual_ax, predicted_ax = axes

# Top panel: price
price_ax.plot(test_prices.index, test_prices.values, color='black')
price_ax.set_ylabel('Price')
price_ax.set_title('Test Period Price')

# Middle panel: labelled regimes; bottom panel: predicted regimes
_shade_regimes(actual_ax, y_test, 'Actual')
_shade_regimes(predicted_ax, predictions, 'Predicted')

plt.tight_layout()
plt.show()

## 6. Ensemble Model

In [None]:
# Soft-voting ensemble built from the two tree-based model types
ensemble = EnsembleClassifier(
    voting='soft',
    model_types=['random_forest', 'gradient_boosting'],
)
ensemble.fit(X_train, y_train, verbose=False)

# Score the ensemble on the held-out test split
y_pred_ensemble = ensemble.predict(X_test)
f1 = f1_score(y_test, y_pred_ensemble, average='macro')
acc = accuracy_score(y_test, y_pred_ensemble)

print(f"Ensemble Performance:")
print(f"  Accuracy: {acc:.4f}")
print(f"  F1 (macro): {f1:.4f}")
print(f"\nClassification Report:")
ensemble_report = classification_report(y_test, y_pred_ensemble)
print(ensemble_report)

In [None]:
# Per-sample outputs of the ensemble's member models on the test split
agreements = ensemble.get_model_agreements(X_test)

# Share of samples on which the first two columns hold the same value
first_votes = agreements.iloc[:, 0]
second_votes = agreements.iloc[:, 1]
agreement_rate = first_votes.eq(second_votes).mean()
print(f"Model agreement rate: {agreement_rate:.2%}")

## 7. Save Best Model

In [None]:
# Persist the best single model (chosen by macro-F1) under ../models/
import os

output_dir = '../models/'
model_path = f'{output_dir}regime_classifier_best.pkl'

# Create the target directory on first run; a no-op if it already exists
os.makedirs(output_dir, exist_ok=True)

best_model = results[best_model_name]['model']
best_model.save(model_path)
print(f"Saved best model ({best_model_name}) to {model_path}")

## 8. Summary

In [None]:
# Final summary of data sizes, best single model, ensemble and target checks.

# Minimum scores the best single model must exceed to pass
ACCURACY_TARGET = 0.6
F1_TARGET = 0.55

best_metrics = results[best_model_name]

# Recompute the ensemble scores from its own predictions under dedicated
# names. The generic `acc` / `f1` globals are also assigned by the model
# training loop, so reading them here would report whichever cell ran last.
ensemble_acc = accuracy_score(y_test, y_pred_ensemble)
ensemble_f1 = f1_score(y_test, y_pred_ensemble, average='macro')

rule = "=" * 60
print(rule)
print("EVALUATION SUMMARY")
print(rule)
print(f"\nData: {len(df)} total samples")
print(f"  Training: {len(X_train)}")
print(f"  Test: {len(X_test)}")
print(f"\nBest Model: {best_model_name}")
print(f"  Accuracy: {best_metrics['accuracy']:.4f}")
print(f"  F1 (macro): {best_metrics['f1_macro']:.4f}")
print(f"\nEnsemble Performance:")
print(f"  Accuracy: {ensemble_acc:.4f}")
print(f"  F1 (macro): {ensemble_f1:.4f}")
print(f"\nTarget Metrics:")
print(f"  Accuracy > {ACCURACY_TARGET:.0%}: {'PASS' if best_metrics['accuracy'] > ACCURACY_TARGET else 'FAIL'}")
print(f"  F1 > {F1_TARGET}: {'PASS' if best_metrics['f1_macro'] > F1_TARGET else 'FAIL'}")