# Crypto Regime Classifier - Exploratory Data Analysis

This notebook explores the OHLCV data and examines feature distributions across different market regimes.

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.features import FeatureExtractor
from src.labeling import RegimeLabeler, RegimeType
from src.utils.data import load_ohlcv, validate_data

# Plot configuration: apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet
# to every figure below and ask IPython to render figures inline in the notebook.
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline

## 1. Load Data

In [None]:
# Update path to your data file
DATA_PATH = "../data/BTC.csv"

try:
    df = load_ohlcv(DATA_PATH)
    print(f"Loaded {len(df)} rows")
    print(f"Date range: {df.index.min()} to {df.index.max()}")
except FileNotFoundError:
    print(f"Data file not found at {DATA_PATH}")
    print("Please update DATA_PATH to point to your OHLCV data")
    # Fall back to a seeded random-walk price series so the rest of the
    # notebook can still be executed end to end.
    np.random.seed(42)
    n_rows = 1000
    dates = pd.date_range('2020-01-01', periods=n_rows, freq='D')
    price = 10000 * np.exp(np.cumsum(np.random.randn(n_rows) * 0.02))
    # Noise is drawn in the order open -> high -> low -> volume, matching the
    # order in which the columns are defined, so the seeded stream is stable.
    open_price = price * (1 + np.random.randn(n_rows) * 0.01)
    upper_wick = np.abs(np.random.randn(n_rows)) * 0.02
    lower_wick = np.abs(np.random.randn(n_rows)) * 0.02
    df = pd.DataFrame({
        'open': open_price,
        # Anchor the wicks to the candle body so every bar is a valid candle:
        # high >= max(open, close) and low <= min(open, close).
        'high': np.maximum(open_price, price) * (1 + upper_wick),
        'low': np.minimum(open_price, price) * (1 - lower_wick),
        'close': price,
        'volume': np.random.lognormal(20, 1, n_rows)
    }, index=dates)
    print("Using synthetic data for demonstration")

In [None]:
# Run the project's validation helper on the loaded frame and print its report.
validation = validate_data(df)

print("Data Validation:")
print(f"  Total rows: {validation['total_rows']}")
print(f"  Missing values: {validation['missing_values']}")
# A falsy 'issues' entry (e.g. an empty collection) is reported as a clean result.
print(f"  Issues: {validation['issues']}" if validation['issues'] else "  No issues found")

In [None]:
# Preview the first rows of the loaded (or synthetic) OHLCV frame.
df.head()

## 2. Price Overview

In [None]:
# Two stacked panels sharing the x-axis: close price on top, volume below.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(14, 8), sharex=True)
price_ax, volume_ax = axes

# Top panel: closing price as a line.
price_ax.plot(df.index, df['close'], label='Close Price')
price_ax.set_ylabel('Price')
price_ax.set_title('Price History')
price_ax.legend()

# Bottom panel: volume as semi-transparent bars.
volume_ax.bar(df.index, df['volume'], alpha=0.7)
volume_ax.set_ylabel('Volume')
volume_ax.set_title('Volume History')

plt.tight_layout()
plt.show()

## 3. Extract Features

In [None]:
# Build the feature table from the OHLCV frame using the project's extractor
# with its default settings.
extractor = FeatureExtractor()
features = extractor.transform(df)

print(f"Extracted {len(features.columns)} features from {len(features)} samples")
print("\nFeature names:")
# Numbered listing (1-based) of every feature column.
for position, column in enumerate(features.columns, start=1):
    print(f"  {position:2d}. {column}")

In [None]:
# Summary statistics for the extracted feature columns.
features.describe()

## 4. Generate Regime Labels

In [None]:
# Label the price history with market regimes, then summarise the result.
labeler = RegimeLabeler(trend_threshold=0.02, vol_percentile=80)
labels = labeler.label(df)
stats = labeler.get_regime_stats(labels)

print("Regime Distribution:")
for regime, count in stats['regime_counts'].items():
    pct = stats['regime_percentages'][regime]
    # A regime without a recorded average duration is reported as 0.
    avg_dur = stats['avg_duration'].get(regime, 0)
    report_line = f"  {regime}: {count} samples ({pct:.1f}%), avg duration: {avg_dur:.1f} days"
    print(report_line)

In [None]:
# Background colour for each regime label; the feature-distribution cell
# further down reuses this mapping.
regime_colors = dict(
    BULL_TREND='green',
    BEAR_TREND='red',
    SIDEWAYS='gray',
    HIGH_VOL='orange',
)

fig, ax = plt.subplots(figsize=(14, 6))

# Thin, faded price line so the shaded regime bands stay readable.
ax.plot(df.index, df['close'], color='black', alpha=0.5, linewidth=0.5)

# Every band spans the full observed close-price range.
band_bottom = df['close'].min()
band_top = df['close'].max()

for regime in RegimeType:
    in_regime = labels == regime.value
    if not in_regime.any():
        # Regime never occurs: draw nothing and keep it out of the legend.
        continue
    ax.fill_between(
        df.index,
        band_bottom,
        band_top,
        # Labels may not cover every row of df; unlabeled rows are left unshaded.
        where=in_regime.reindex(df.index, fill_value=False),
        alpha=0.3,
        color=regime_colors[regime.value],
        label=regime.value,
    )

ax.set_title('Price with Regime Labels')
ax.set_ylabel('Price')
ax.legend(loc='upper left')
plt.tight_layout()
plt.show()

## 5. Feature Distribution by Regime

In [None]:
# Align features and labels on their shared index so both refer to the same rows.
common_idx = features.index.intersection(labels.index)
features_aligned = features.loc[common_idx]
labels_aligned = labels.loc[common_idx]

# Key features to visualize; any that the extractor did not produce are skipped.
key_features = ['return_20d', 'volatility', 'rsi', 'ma_alignment', 'atr_percentile', 'trend_strength']
key_features = [f for f in key_features if f in features_aligned.columns]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for ax, feature in zip(axes, key_features):
    # Use one set of bin edges per feature, computed over all regimes, so the
    # overlaid histograms share bin widths and their bar heights are comparable.
    bin_edges = np.histogram_bin_edges(features_aligned[feature].dropna(), bins=30)
    has_histogram = False
    for regime in RegimeType:
        mask = labels_aligned == regime.value
        if mask.any():
            data = features_aligned.loc[mask, feature].dropna()
            ax.hist(data, bins=bin_edges, alpha=0.5, label=regime.value, color=regime_colors[regime.value])
            has_histogram = True
    ax.set_title(feature)
    # Calling legend() on an axes without labelled artists only emits a warning.
    if has_histogram:
        ax.legend()

# Hide grid panels that have no feature assigned (fewer than six key features
# were available) instead of leaving empty axes in the figure.
for unused_ax in axes[len(key_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 6. Feature Correlations

In [None]:
# Pairwise correlations are computed over the numeric feature columns only.
numeric_features = features_aligned.select_dtypes(include=[np.number])
corr_matrix = numeric_features.corr()

# Hide the upper triangle (diagonal included) and draw only the lower half.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

heatmap_options = dict(
    mask=mask,
    annot=False,
    cmap='RdBu_r',
    center=0,  # diverging colormap centred on zero correlation
    square=True,
)

fig, ax = plt.subplots(figsize=(16, 14))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## 7. Regime Transitions

In [None]:
# Calculate transition matrix.
# Pair every label with the one on the row before it; the first row has no
# predecessor and is dropped together with any other missing previous label.
previous_regime = labels_aligned.shift(1).dropna()
current_regime = labels_aligned.loc[previous_regime.index]

transitions = pd.crosstab(
    previous_regime,
    current_regime,
    # Both inputs inherit the same Series name; explicit, distinct axis names
    # avoid duplicate row/column names and make the printed table readable.
    rownames=['from_regime'],
    colnames=['to_regime'],
    # Each row sums to 1: probability of the next regime given the current one.
    normalize='index',
)

print("Regime Transition Probabilities:")
print(transitions.round(3))

In [None]:
# Annotated heatmap of the transition table: rows come from the shifted
# (previous) labels, columns from the current labels.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(transitions, annot=True, fmt='.2f', cmap='Blues', ax=ax)
ax.set(
    title='Regime Transition Probabilities',
    xlabel='To Regime',
    ylabel='From Regime',
)
plt.tight_layout()
plt.show()

## 8. Summary Statistics by Regime

In [None]:
# Combine features with labels (copy so features_aligned itself is not modified).
df_analysis = features_aligned.copy()
df_analysis['regime'] = labels_aligned

# Aggregate numeric feature columns only, consistent with the correlation
# analysis above: mean/std/median are undefined for non-numeric columns and
# recent pandas versions raise instead of silently dropping them.
numeric_cols = (
    df_analysis
    .drop(columns='regime')
    .select_dtypes(include=[np.number])
    .columns
    .tolist()
)

# Summary statistics: one column per regime, one row per (feature, statistic).
summary = df_analysis.groupby('regime')[numeric_cols].agg(['mean', 'std', 'median']).T
summary